In [1]:
import pandas as pd
import numpy as np
from sklearn.dummy import DummyClassifier
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import f1_score,accuracy_score
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier
from sklearn.cluster import KMeans
from sklearn.decomposition import PCA
import altair as alt
import seaborn as sns
import re

In [2]:
#Make sure numpy version is < 1.20
np.version.version

'1.18.5'

In [3]:
#Install known version of numpy that works
!python -m pip install numpy==1.18.5



In [4]:
#Install gensim
!python -m pip install gensim



In [5]:
import gensim
from gensim.models.word2vec import Word2Vec
from tqdm.notebook import tqdm
from nltk.tokenize import sent_tokenize, word_tokenize
from collections import Counter
from nltk.corpus import stopwords
import nltk
from nltk.stem import WordNetLemmatizer
nltk.download('stopwords')
nltk.download('punkt')
nltk.download('wordnet')

[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\mryua\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package punkt to
[nltk_data]     C:\Users\mryua\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package wordnet to
[nltk_data]     C:\Users\mryua\AppData\Roaming\nltk_data...
[nltk_data]   Package wordnet is already up-to-date!


True

In [6]:
RANDOM_SEED=694

## 1.Loading data

### Training Dataset

In [15]:
train_path = 'Data/WikiLarge_Train.csv'
df = pd.read_csv(train_path, skiprows=0, skipfooter=0, engine='python')
df.head()

Unnamed: 0,original_text,label
0,There is manuscript evidence that Austen conti...,1
1,"In a remarkable comparative analysis , Mandaea...",1
2,"Before Persephone was released to Hermes , who...",1
3,Cogeneration plants are commonly found in dist...,1
4,"Geneva -LRB- , ; , ; , ; ; -RRB- is the second...",1


In [16]:
len(df[df['label']==1])/len(df) # the dataset label is well balanced 

0.5

In [17]:
#Adding a column of text length for exploration purpose only
df['text_length'] = df['original_text'].apply(len)

In [18]:
#Inspecting data with different label and text length combinations
df[(df['label']==0) & (df['text_length']==5)].head()

Unnamed: 0,original_text,label,text_length
208709,Pages,0,5
208988,Plain,0,5
209004,Drama,0,5
209374,Child,0,5
209606,equal,0,5


In [19]:
df.head()

Unnamed: 0,original_text,label,text_length
0,There is manuscript evidence that Austen conti...,1,216
1,"In a remarkable comparative analysis , Mandaea...",1,156
2,"Before Persephone was released to Hermes , who...",1,248
3,Cogeneration plants are commonly found in dist...,1,246
4,"Geneva -LRB- , ; , ; , ; ; -RRB- is the second...",1,202


In [20]:
df['original_text'].apply(lambda x: len(x)).mean()
# This means all texts are considered short text, which allows us to use dense representations, 
# as dense representations work well with short text.
# Gensim.KeyedVectors.load('assets/wikipedia.100.word-vecs.kv')??? How to generate and use this???
# Maybe we should train word2vec model on the entire corpus. Just training data? TOP 100 word-vectors(features)
# Alternatively we could use bag-of-words model, which is term-document matrix representation, having much more features

117.921906192414

In [21]:
X = df['original_text']
y = df['label']
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

### Testing Dataset

In [29]:
test_path = 'Data/WikiLarge_Test.csv'
test_df = pd.read_csv(test_path, skiprows=0, skipfooter=0, engine='python')
test_df.head()

Unnamed: 0,id,original_text,label
0,0,-2011.0,
1,1,-2011.0,
2,2,-2000.0,
3,3,-1997.0,
4,4,1.636,


### Sample Submission

In [28]:
samplesubmission_path = 'Data/sampleSubmission.csv'
samplesubmission_df = pd.read_csv(samplesubmission_path, skiprows=0, skipfooter=0, engine='python')
samplesubmission_df.head()

Unnamed: 0,id,label
0,0,0
1,1,0
2,2,1
3,3,1
4,4,0


To conclude, the dataframes we are working with are:

`df` (the training data), `test_df`, `samplesubmission_df`

## 2. Data Preprocessing

In [32]:
# Using tf-idf model, to generate a vectorized representation of the documents.
vectorizer = TfidfVectorizer(min_df=10,stop_words='english',ngram_range=(1,2))
X_train_transform = vectorizer.fit_transform(X_train)
X_test_transform  = vectorizer.transform(X_test)

In [33]:
X_train_transform

<333414x57773 sparse matrix of type '<class 'numpy.float64'>'
	with 4071111 stored elements in Compressed Sparse Row format>

In [34]:
len(set(stopwords.words('english')))

179

== dale_chall.txt ==

This is the Dale Chall 3000 Word List, which is one definition of words that are considered "basic" English.

A summary is at https://www.readabilityformulas.com/articles/dale-chall-readability-word-list.php

In [35]:
#Basic english words
dalechall_path = 'Data/dale_chall.txt'
dale_chall = pd.read_csv(dalechall_path,delimiter='\t',header=None,names=['word'])
dale = set(dale_chall['word'].values)

In [36]:
len(dale)

2946

The 2946 words in dale can be combined with the nltk stopwords, as they are considered easy words. 

### We will use a geo dataset to add city and country names to the stopwords library

In [37]:
!python -m pip install datapackage



In [39]:
# Run a second time if this cell fails.
from datapackage import Package
package = Package('https://datahub.io/core/world-cities/datapackage.json')
# print list of all resources:
print(package.resource_names)

['validation_report', 'world-cities_csv', 'world-cities_json', 'world-cities_zip', 'world-cities_csv_preview', 'world-cities']


In [41]:
# Run a second time if this cell fails.
world_cities = []
for resource in package.resources:
    if resource.descriptor['datahub']['type'] == 'derived/csv':
        world_cities = resource.read()

In [44]:
world_cities_df = pd.DataFrame(world_cities, columns=['name', 'country', 'subcountry', 'geonameid'])

In [45]:
world_cities_df.head()

Unnamed: 0,name,country,subcountry,geonameid
0,les Escaldes,Andorra,Escaldes-Engordany,3040051
1,Andorra la Vella,Andorra,Andorra la Vella,3041563
2,Umm al Qaywayn,United Arab Emirates,Umm al Qaywayn,290594
3,Ras al-Khaimah,United Arab Emirates,Raʼs al Khaymah,291074
4,Khawr Fakkān,United Arab Emirates,Ash Shāriqah,291696


In [46]:
world_cities_df = world_cities_df.applymap(lambda s:s.lower() if type(s) == str else s)

In [47]:
world_cities_df[world_cities_df['country']=='france'].head()

Unnamed: 0,name,country,subcountry,geonameid
6633,yerres,france,île-de-france,2967245
6634,wittenheim,france,alsace-champagne-ardenne-lorraine,2967318
6635,wattrelos,france,nord-pas-de-calais-picardie,2967421
6636,wasquehal,france,nord-pas-de-calais-picardie,2967438
6637,voiron,france,auvergne-rhône-alpes,2967758


In [48]:
cities = set(world_cities_df['name'].unique())
countries = set(world_cities_df['country'].unique())
subcountries = set(world_cities_df['subcountry'].unique())

In [49]:
#We will add this to stopwords
geo_data = cities | countries | subcountries

In [53]:
len(geo_data)

23803

### We will use a language dataset to add language names to the stopwords library

In [54]:
language_package = Package('https://datahub.io/core/language-codes/datapackage.json')

# print list of all resources:
print(language_package.resource_names)

['validation_report', 'language-codes_csv', 'language-codes-3b2_csv', 'language-codes-full_csv', 'ietf-language-tags_csv', 'language-codes_json', 'language-codes-3b2_json', 'language-codes-full_json', 'ietf-language-tags_json', 'language-codes_zip', 'language-codes', 'language-codes-3b2', 'language-codes-full', 'ietf-language-tags']


In [55]:
languages_data = language_package.resources[1].read()

In [57]:
languages_df = pd.DataFrame(languages_data, columns=['alpha2', 'english'])
languages_df = languages_df.applymap(lambda s:s.lower() if type(s) == str else s)
languages_df.head()

Unnamed: 0,alpha2,english
0,aa,afar
1,ab,abkhazian
2,ae,avestan
3,af,afrikaans
4,ak,akan


In [58]:
languages = set(languages_df['english'].unique())

In [59]:
len(languages)

184

### We will use a nationality dataset to add nationality names to the stopwords library

In [60]:
nationality_path = 'Data/CH_Nationality_List_20171130_v1.csv'
nationality_df = pd.read_csv(nationality_path, skiprows=0, skipfooter=0, engine='python')
nationality_df = nationality_df.applymap(lambda s:s.lower() if type(s) == str else s)
nationality_df.head()

Unnamed: 0,Nationality
0,afghan
1,albanian
2,algerian
3,american
4,andorran


In [61]:
nationalities = set(nationality_df['Nationality'].unique())
len(nationalities)

225

### We will use a state name dataset to add state names to the stopwords library

In [63]:
states_path = 'Data/states.csv'
states_df = pd.read_csv(states_path, skiprows=0, skipfooter=0, engine='python')
states_df = states_df.applymap(lambda s:s.lower() if type(s) == str else s)
states_df.head()

Unnamed: 0,id,name,country_id,country_code,country_name,state_code,type,latitude,longitude
0,3901,badakhshan,1,af,afghanistan,bds,,36.734772,70.811995
1,3871,badghis,1,af,afghanistan,bdg,,35.167134,63.769538
2,3875,baghlan,1,af,afghanistan,bgl,,36.178903,68.745306
3,3884,balkh,1,af,afghanistan,bal,,36.75506,66.897537
4,3872,bamyan,1,af,afghanistan,bam,,34.810007,67.82121


In [64]:
states = set(states_df['name'].unique())
len(states)

4896

### We will use a continent name dataset to add continent names to the stopwords library

In [65]:
continents_path = 'Data/continents2.csv'
continents_df = pd.read_csv(continents_path, skiprows=0, skipfooter=0, engine='python')
continents_df = continents_df.applymap(lambda s:s.lower() if type(s) == str else s)
continents_df.head()

Unnamed: 0,ï»¿name,alpha-2,alpha-3,country-code,iso_3166-2,region,sub-region,intermediate-region,region-code,sub-region-code,intermediate-region-code
0,afghanistan,af,afg,4,iso 3166-2:af,asia,southern asia,,142.0,34.0,
1,ã…land islands,ax,ala,248,iso 3166-2:ax,europe,northern europe,,150.0,154.0,
2,albania,al,alb,8,iso 3166-2:al,europe,southern europe,,150.0,39.0,
3,algeria,dz,dza,12,iso 3166-2:dz,africa,northern africa,,2.0,15.0,
4,american samoa,as,asm,16,iso 3166-2:as,oceania,polynesia,,9.0,61.0,


In [66]:
continents = set(continents_df['region'].unique())
len(continents)

6

### We will use a firstname dataset to add first names to the stopwords library

In [69]:
firstname_path = 'Data/new-top-firstNames.csv'
firstname_df = pd.read_csv(firstname_path, skiprows=0, skipfooter=0, engine='python')
firstname_df = firstname_df.applymap(lambda s:s.lower() if type(s) == str else s)
firstname_df.head()

Unnamed: 0.1,Unnamed: 0,name,newPerct2013
0,1,michael,0.011577
1,2,james,0.010218
2,3,john,0.009675
3,4,robert,0.009493
4,5,david,0.008943


In [70]:
firstnames = set(firstname_df['name'].unique())
len(firstnames)

100

In [71]:
firstname_path2 = 'Data/babynames-clean.csv'
firstname_df2 = pd.read_csv(firstname_path2, header= None, skiprows=0, skipfooter=0, engine='python')
firstname_df2 = firstname_df2.applymap(lambda s:s.lower() if type(s) == str else s)
firstname_df2.head()

Unnamed: 0,0,1
0,john,boy
1,william,boy
2,james,boy
3,charles,boy
4,george,boy


In [72]:
firstnames2 = set(firstname_df2[0].unique())
len(firstnames2)

6782

In [73]:
firstnames = firstnames | firstnames2
len(firstnames)

6782

### We will use a surname dataset to add surnames to the stopwords library

In [74]:
surname_path = 'Data/new-top-surnames.csv'
surname_df = pd.read_csv(surname_path, skiprows=0, skipfooter=0, engine='python')
surname_df = surname_df.applymap(lambda s:s.lower() if type(s) == str else s)
surname_df.head()

Unnamed: 0.1,Unnamed: 0,name,perct2013
0,1,smith,0.007999
1,2,johnson,0.006346
2,3,williams,0.00533
3,4,brown,0.004724
4,5,jones,0.004676


In [75]:
surnames = set(surname_df['name'].unique())
len(surnames)

100

### We will add calendar words to the stopwords library

In [77]:
# Weekday and month names; `calendar` holds all of them lower-cased as a set
# so they can be merged into the stop-word set.
days = [
    'Monday', 'Tuesday', 'Wednesday', 'Thursday', 'Friday', 'Saturday', 'Sunday',
]
months = [
    'January', 'February', 'March', 'April', 'May', 'June',
    'July', 'August', 'September', 'October', 'November', 'December',
]
calendar = {name.lower() for name in days + months}

### Pre-process data for supervised learning

In [79]:
X_train

304501    1979-80 Buffalo Sabres NHL 32 1880 74 1 4 2.36...
162313    Diseases Lentils in culture Lentils are mentio...
336845    Railroads , like the Lehigh Valley Railroad , ...
150625    An example of this would be an individual anim...
40240     Both the Matanuska and Susitna Rivers have maj...
                                ...                        
259178    After the Germans invaded Norway in April 1940...
365838    July 28 - Henry Bennet , 1st Earl of Arlington...
131932    Pancake restaurants are popular family restaur...
146867                                 A cycling domestique
121958    David Boreanaz 's first paid acting appearance...
Name: original_text, Length: 333414, dtype: object

In [84]:
# Placeholders for the tokenized splits; both are assigned further down in this cell.
tokenized_text_train = []
tokenized_text_test = []
# Stop words: the NLTK English list plus every vocabulary set built in the cells above
# (Dale-Chall words, place names, languages, nationalities, names, calendar words).
stopWords = set(stopwords.words('english')).union(
    dale, geo_data, languages, nationalities, states,
    continents, firstnames, surnames, calendar,
)
# This cell will run 4 minutes
import gensim
from nltk.stem.porter import *
# Build the stemmer and lemmatizer once; the function is called for every token,
# so constructing them inside it repeats the same setup each time.
_STEMMER = PorterStemmer()
_LEMMATIZER = WordNetLemmatizer()


def lemmatize_stemming(text, use_stemming=True):
    """Normalize a single token.

    The token is lemmatized as a verb (``pos='v'``) with WordNet and then,
    unless ``use_stemming`` is false, Porter-stemmed.

    Parameters
    ----------
    text : str
        One token.
    use_stemming : bool, default True
        Pass False to return the lemma without stemming. This replaces the
        previous "un-hash this line" toggle.

    Returns
    -------
    str
        The normalized token.
    """
    lemma = _LEMMATIZER.lemmatize(text, pos='v')
    return _STEMMER.stem(lemma) if use_stemming else lemma

# Tokenize, filter, then lemmatize/stem
def preprocess(text):
    """Turn one raw text into a list of normalized tokens.

    The text is tokenized with ``gensim.utils.simple_preprocess``. Tokens that
    are in the module-level ``stopWords`` set, or that have three characters
    or fewer, are dropped; every remaining token is passed through
    ``lemmatize_stemming``.
    """
    return [
        lemmatize_stemming(token)
        for token in gensim.utils.simple_preprocess(text)
        if len(token) > 3 and token not in stopWords
    ]

tokenized_text_train = [preprocess(text) for text in X_train]
tokenized_text_test=[preprocess(text) for text in X_test]

#for text in tqdm(X_train):
#    tokens_in_text = word_tokenize(text)
#    tokens_in_text = [word for word in tokens_in_text if word.lower() not in stopWords]
#    tokenized_text_train.append(tokens_in_text)
    
#for text in tqdm(X_test):
#    tokens_in_text = word_tokenize(text)
#    tokens_in_text = [word for word in tokens_in_text if word.lower() not in stopWords]
#    tokenized_text_test.append(tokens_in_text)

In [85]:
len(stopWords)

36872

In [86]:
# Train a 100-dimensional Word2Vec model on the tokenized training split only.
# min_count=100 drops tokens seen fewer than 100 times from the vocabulary.
# NOTE(review): `seed` is fixed but workers=4 is used; gensim's documentation indicates
# that fully reproducible runs also need a single worker thread — confirm whether
# run-to-run variation matters for the results reported below.
model = Word2Vec(vector_size=100,window=2,min_count=100,seed= RANDOM_SEED,workers=4)
model.build_vocab(tokenized_text_train)
model.train(tokenized_text_train,total_examples=model.corpus_count,epochs=model.epochs)

(6205597, 9373610)

In [87]:
word_vectors = model.wv

In [88]:
#word_vectors.vocab

In [89]:
word_dict = word_vectors.key_to_index

In [90]:
words_in_vector = word_vectors.index_to_key
len(words_in_vector)

2718

# Adding word's difficulty to the vector

== Concreteness_ratings_Brysbaert_et_al_BRM.txt ==

This file contains concreteness ratings for 40 thousand English lemma words gathered via Amazon Mechanical Turk. The ratings come from a larger list of 63 thousand words and represent all English words known to 85% of the raters.

The file contains eight columns:
1. The word
2. Whether it is a single word or a two-word expression 
3. The mean concreteness rating
4. The standard deviation of the concreteness ratings
5. The number of persons indicating they did not know the word
6. The total number of persons who rated the word
7. Percentage participants who knew the word
8. The SUBTLEX-US frequency count (on a total of 51 million; Brysbaert & New, 2009) 
9. The dominant part-of-speech usage

Original source: http://crr.ugent.be/archives/1330

Brysbaert, M., Warriner, A.B., & Kuperman, V. (2014). Concreteness ratings for 40 thousand generally known English word lemmas. Behavior Research Methods, 46, 904-911.
http://crr.ugent.be/papers/Brysbaert_Warriner_Kuperman_BRM_Concreteness_ratings.pdf

In [91]:
#Concreteness rating - the higher Conc.M, the easier the word is.
concreteness_path = 'Data/Concreteness_ratings_Brysbaert_et_al_BRM.txt'
concrete_df = pd.read_csv(concreteness_path,delimiter='\t', keep_default_na=False)
concreteset=(concrete_df['Word'].values)

In [92]:
concrete_df.head()

Unnamed: 0,Word,Bigram,Conc.M,Conc.SD,Unknown,Total,Percent_known,SUBTLEX,Dom_Pos
0,roadsweeper,0,4.85,0.37,1,27,0.96,0,0
1,traindriver,0,4.54,0.71,3,29,0.9,0,0
2,tush,0,4.45,1.01,3,25,0.88,66,0
3,hairdress,0,3.93,1.28,0,29,1.0,1,0
4,pharmaceutics,0,3.77,1.41,4,26,0.85,0,0


In [93]:
# Stem words in concrete_df to match stemmed words in the vector
concrete_df['stem'] = concrete_df['Word'].apply(lemmatize_stemming)

In [94]:
concrete_df.head()

Unnamed: 0,Word,Bigram,Conc.M,Conc.SD,Unknown,Total,Percent_known,SUBTLEX,Dom_Pos,stem
0,roadsweeper,0,4.85,0.37,1,27,0.96,0,0,roadsweep
1,traindriver,0,4.54,0.71,3,29,0.9,0,0,traindriv
2,tush,0,4.45,1.01,3,25,0.88,66,0,tush
3,hairdress,0,3.93,1.28,0,29,1.0,1,0,hairdress
4,pharmaceutics,0,3.77,1.41,4,26,0.85,0,0,pharmaceut


In [346]:
np.min(concrete_df['Conc.M'])

1.04

In [347]:
np.max(concrete_df['Conc.M'])

5.0

### Concreteness values range from 1 to 5. We could possibly use the inverse of the concreteness value to scale it into a 0-1 range and give easier words less weight.

In [98]:
#concrete_words = list(concrete_df['Word'].values)
concrete_words = list(concrete_df['stem'].values)

In [99]:
len(concrete_words)

39954

In [106]:
# How many words are not covered by the concreteness dataset?
# `concrete_words` is a list of roughly 40k stems, so `in` on it rescans the list for
# every vocabulary word; test membership against a set instead.
concrete_word_set = set(concrete_words)
concrete_complement = [word for word in words_in_vector if word not in concrete_word_set]
len(concrete_complement)

255

In [107]:
# What are these words?
concrete_complement[:10]

['largest',
 'ndash',
 'picardi',
 'european',
 'aquitain',
 'disney',
 'britain',
 'alp',
 'oldest',
 'larger']

In [108]:
# How many words are covered by the concreteness dataset?
# Membership is tested against a set because `concrete_words` is a long list and
# `in` on a list is a linear scan per vocabulary word.
concrete_word_set = set(concrete_words)
concrete_intersect = [word for word in words_in_vector if word in concrete_word_set]
len(concrete_intersect)

2463

In [109]:
concrete_intersect[:10]

['unit',
 'commun',
 'depart',
 'region',
 'state',
 'includ',
 'call',
 'nation',
 'play',
 'area']

In [110]:
# Multiply each covered word vector by the inverse of its mean concreteness rating.
# Several dataset words can share one stem, so the ratings are averaged per stem once
# up front instead of re-filtering the whole table for every word.
# NOTE: `word_vectors` is rescaled in place, so re-running this cell applies the
# factor a second time; retrain or reload the vectors before re-running.
mean_concreteness = concrete_df.groupby('stem')['Conc.M'].mean()
for word in concrete_intersect:
    word_vectors[word] = word_vectors[word] * 1/mean_concreteness[word]

== AoA_51715_words.csv ==

This file contains "Age of Acquisition" (AoA) estimates for about 51k English words, which refers to the approximate age (in years) when a word was learned. Early words, being more basic, have lower average AoA.

The main columns you will be interested in are "Word" and "AoA_Kup_lem". But the others may be useful too.

The file contains these columns:

Word :: The word in question
Alternative.spelling :: if the Word may be spelled frequently in another form	
Freq_pm	:: Freq of the Word in general English (larger -> more common)
Dom_PoS_SUBTLEX	:: Dominant part of speech in general usage
Nletters :: number of letters 
Nphon :: number of phonemes
Nsyll :: number of syllables
Lemma_highest_PoS :: the "lemmatized" or "root" form of the word (in the dominant part of speech. e.g. The root form of the verb "abates" is "abate".
AoA_Kup	:: The AoA from a previous study by Kuperman et al.
Perc_known :: Percent of people who knew the word in the Kuperman et al. study
AoA_Kup_lem :: Estimated AoA based on Kuperman et al. study lemmatized words. THIS IS THE MAIN COLUMN OF INTEREST.
Perc_known_lem	:: Estimated percentage of people who would know this form of the word in the Kuperman study.
AoA_Bird_lem :: AoA reported in previous study by Bird (2001) 
AoA_Bristol_lem	:: AoA reported in previous study from Bristol Univ. (2006)
AoA_Cort_lem :: AoA reported in previous study by Cortese & Khanna (2008)
AoA_Schock :: AoA reported in previous study by Schock (2012)

Original source : http://crr.ugent.be/archives/806

In [111]:
aoawords_path = 'Data/AoA_51715_words.csv'
AoA = pd.read_csv(aoawords_path,encoding = 'unicode_escape')
# Drop rows whose word is missing. `.copy()` makes AoA an independent frame, so the
# columns added or modified in later cells are not written to a slice of the raw table.
AoA = AoA[AoA['Word'].notna()].copy()
AoA_set = set(AoA['Word'].values)
AoA.head(5)

Unnamed: 0,Word,Alternative.spelling,Freq_pm,Dom_PoS_SUBTLEX,Nletters,Nphon,Nsyll,Lemma_highest_PoS,AoA_Kup,Perc_known,AoA_Kup_lem,Perc_known_lem,AoA_Bird_lem,AoA_Bristol_lem,AoA_Cort_lem,AoA_Schock
0,a,a,20415.27,Article,1,1,1,a,2.89,1.0,2.89,1.0,3.16,,,
1,aardvark,aardvark,0.41,Noun,8,7,2,aardvark,9.89,1.0,9.89,1.0,,,,
2,abacus,abacus,0.24,Noun,6,6,3,abacus,8.69,0.65,8.69,0.65,,,,
3,abacuses,abacuses,0.02,Noun,8,9,4,abacus,,,8.69,0.65,,,,
4,abalone,abalone,0.51,Verb,7,7,4,abalone,12.23,0.72,12.23,0.72,,,,


In [112]:
# Stem words in AoA to match stemmed words in the vector
AoA['stem'] = AoA['Word'].apply(lemmatize_stemming)

In [370]:
# Impute missing AoA_Kup_lem values with the column maximum (25 in this data),
# treating words without an estimate as hard words.
# The result is assigned back rather than using fillna(inplace=True) on the selected
# column: that form works on an intermediate Series and is not guaranteed to update AoA.
AoA['AoA_Kup_lem'] = AoA['AoA_Kup_lem'].fillna(AoA['AoA_Kup_lem'].max())

In [365]:
AoA.AoA_Kup_lem.min()

1.58

In [366]:
AoA.AoA_Kup_lem.max()

25.0

### AoA values range from about 1.6 to 25; the smaller the AoA value, the easier the word is. We could possibly use the AoA value to give easier words less weight.

In [113]:
aoa_words = list(AoA['stem'].values)

In [114]:
len(aoa_words)

51714

In [115]:
# Split the Word2Vec vocabulary by whether the stemmed word appears in the AoA data.
# `aoa_words` is a list of roughly 52k stems; a set avoids a linear scan per lookup.
aoa_word_set = set(aoa_words)
aoa_complement = [word for word in words_in_vector if word not in aoa_word_set]
aoa_intersect = [word for word in words_in_vector if word in aoa_word_set]

In [116]:
len(aoa_complement)

264

In [117]:
aoa_complement[:10]

['ndash',
 'picardi',
 'european',
 'commonli',
 'aquitain',
 'atlant',
 'lower',
 'disney',
 'throughout',
 'britain']

In [118]:
len(aoa_intersect)

2454

In [119]:
aoa_intersect[:10]

['unit',
 'commun',
 'depart',
 'region',
 'state',
 'includ',
 'call',
 'nation',
 'play',
 'area']

In [120]:
# How many words are covered in both AoA and concreteness dataset?
len([word for word in aoa_intersect if word in concrete_intersect])

2404

In [121]:
# Multiply each covered word vector by its mean AoA divided by 25 (the maximum AoA value).
# The per-stem means are computed once with groupby instead of re-filtering the whole
# table for every word. groupby's mean ignores missing ratings rather than propagating NaN.
# NOTE: `word_vectors` is rescaled in place, so re-running this cell applies the
# factor a second time.
mean_aoa = AoA.groupby('stem')['AoA_Kup_lem'].mean()
for word in aoa_intersect:
    word_vectors[word] = word_vectors[word] * mean_aoa[word]/25

### Generate 100 dense features to reduce dimensionality

In [123]:
def generate_dense_features(tokenized_text,word_vectors):
    """Average the word vectors of each document into one dense row.

    Parameters
    ----------
    tokenized_text : iterable of list of str
        One token list per document.
    word_vectors : object
        Must provide ``key_to_index`` (membership test), ``vector_size`` and
        lookup of a list of words via ``word_vectors[words]``.

    Returns
    -------
    numpy.ndarray
        One row per document: the mean vector of the document's in-vocabulary
        tokens, or a zero vector when none of its tokens are in the vocabulary.
    """
    def embed(tokens):
        known = [token for token in tokens if token in word_vectors.key_to_index]
        if not known:
            # No usable token: fall back to an all-zero row.
            return np.zeros(word_vectors.vector_size)
        return np.mean(word_vectors[known], axis=0)

    return np.array([embed(tokens) for tokens in tokenized_text])

In [124]:
X_train_wv = generate_dense_features(tokenized_text_train,word_vectors)
X_test_wv = generate_dense_features(tokenized_text_test,word_vectors)

In [125]:
X_train_wv.shape

(333414, 100)

## Bag of Words Model

In [128]:
# Identity function passed to TfidfVectorizer below as both tokenizer and preprocessor,
# because the documents handed to it are already lists of tokens.
def dummy_fun(doc):
    """Return ``doc`` unchanged."""
    return doc
# TF-IDF over the pre-tokenized documents: tokenizing and preprocessing are identity
# functions. token_pattern is not used when a tokenizer is supplied, so pass None
# explicitly instead of carrying an unused regular expression.
# NOTE: this rebinds `vectorizer`, `X_train_transform` and `X_test_transform`, which the
# earlier raw-text TF-IDF cell also assigns.
vectorizer = TfidfVectorizer(analyzer='word',tokenizer=dummy_fun, preprocessor=dummy_fun, token_pattern=None)
X_train_transform = vectorizer.fit_transform(tokenized_text_train)
X_test_transform  = vectorizer.transform(tokenized_text_test)

In [132]:
model_word = set(word_vectors.index_to_key) # vocabulary of the Word2Vec model (2718 words in the recorded run)

In [133]:
len(model_word)

2718

In [134]:
len(model_word.intersection(concreteset))

1495

In [136]:
# Build a lookup table: each Word2Vec vocabulary word, its WordNet lemma, and the
# Perc_known / AoA_Kup_lem values of the AoA row whose Word equals that lemma.
# NOTE: this rebinds `df`, which until here held the training data.
lemmatizer = WordNetLemmatizer()
word_list = [(word, lemmatizer.lemmatize(word.lower())) for word in model_word]
df = pd.DataFrame(word_list,columns=['Original','word'])
df = df.merge(AoA,left_on='word',right_on='Word',how='left')
# .copy() so the assignments below modify this frame rather than a view of the merge result.
df = df[['Original','word','Perc_known','AoA_Kup_lem']].copy()
word_not_matched = set(df[df['Perc_known'].isnull()].word.values)

# Words that start with a digit or are a single character get a fixed AoA of 3.
# A boolean mask with .loc replaces the row-by-row chained assignment
# (df['AoA_Kup_lem'][i] = 3), which is not guaranteed to write into df.
starts_with_digit = df['word'].str[0].isin(list('0123456789'))
single_character = df['word'].str.len() == 1
df.loc[starts_with_digit | single_character, 'AoA_Kup_lem'] = 3

# Remaining gaps (lemmas without an AoA match) are filled with the column mean.
mean_value = df['AoA_Kup_lem'].mean()
df['AoA_Kup_lem'] = df['AoA_Kup_lem'].fillna(mean_value)

In [137]:
#df.loc[df['Original']==['troops','weapons']]
df[df['Original'].isin(['troops','weapon'])]

Unnamed: 0,Original,word,Perc_known,AoA_Kup_lem
1916,weapon,weapon,1.0,6.95


In [138]:
def generate_perc_known(tokenized_text,df):
    """Compute one average ``AoA_Kup_lem`` value per document.

    Despite the name, the value averaged is the ``AoA_Kup_lem`` column of ``df``,
    not ``Perc_known``.

    Parameters
    ----------
    tokenized_text : iterable of list of str
        One token list per document.
    df : pandas.DataFrame
        Lookup table with columns ``'Original'`` (word) and ``'AoA_Kup_lem'``.

    Returns
    -------
    list
        For each document, the mean ``AoA_Kup_lem`` over the rows of ``df`` whose
        ``'Original'`` is one of the document's distinct in-vocabulary words
        (missing values ignored). Documents with no word in the module-level
        ``word_vectors`` vocabulary get 0; documents whose words have no usable
        row in ``df`` get NaN.
    """
    # Aggregate the lookup table once. Filtering the DataFrame with isin() for every
    # document repeats a full scan of the table per document.
    grouped = df.groupby('Original')['AoA_Kup_lem']
    aoa_sum = grouped.sum().to_dict()      # sum of non-missing values per word
    aoa_count = grouped.count().to_dict()  # number of non-missing values per word
    vocabulary = word_vectors.key_to_index

    perc_know_list=[]
    for tokens in tokenized_text:
        # Distinct in-vocabulary words in first-seen order: isin() ignores repeats,
        # and a fixed order keeps the floating-point sum deterministic.
        words = list(dict.fromkeys(word for word in tokens if word in vocabulary))
        if not words:
            perc_know_list.append(0)
            continue
        total = sum(aoa_sum.get(word, 0.0) for word in words)
        n_values = sum(aoa_count.get(word, 0) for word in words)
        perc_know_list.append(total / n_values if n_values else np.nan)

    return perc_know_list

In [139]:
df_train = pd.DataFrame(X_train_wv)
#df_train['year'] = generate_perc_known(tokenized_text_train,df)

In [140]:
df_test = pd.DataFrame(X_test_wv)
#df_test['year'] = generate_perc_known(tokenized_text_test,df)

In [142]:
df_test.head()

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,90,91,92,93,94,95,96,97,98,99
0,0.091324,0.005874,0.007535,0.00651,-0.128226,-0.094255,0.001027,-0.084303,0.016012,-0.002037,...,0.054801,-0.05838,0.041944,-0.044783,0.011489,0.052686,-0.000511,0.011446,0.00031,-0.045275
1,0.031129,-0.033987,-0.00287,-0.012246,0.031541,-0.071388,-0.130682,0.042675,-0.05329,0.105653,...,-0.094573,0.07067,0.046636,-0.037725,-0.00173,0.031297,0.066133,0.004233,0.144917,0.02977
2,-0.009796,0.011272,-0.010751,-0.037797,0.014127,0.004115,0.004441,-0.025115,-0.023149,-0.062626,...,-0.038165,-0.040126,0.029267,-0.058483,0.011886,-0.045043,0.013641,-0.012209,0.029171,-0.07038
3,0.024144,-0.017985,-0.008495,0.015597,0.006969,-0.092467,-0.032274,0.019717,0.002522,0.025245,...,0.046756,0.039036,-0.022688,0.005665,0.008009,0.011293,-0.00381,0.025832,0.03354,-0.021246
4,0.048582,0.063041,0.085754,-0.07376,-0.068502,-0.074329,-0.04267,0.043928,-0.009552,0.099189,...,-0.023131,0.05288,0.071288,-0.037294,-0.036126,0.026508,0.034064,0.093113,0.067002,0.010256


# 2. Supervised Learning

## Random Classifier

In [146]:
dummy_bow = DummyClassifier(strategy='uniform',random_state=RANDOM_SEED).fit(X_train_transform,y_train)

In [147]:
accuracy_score(y_test, dummy_bow.predict(X_test_transform))

0.5011277203253593

In [148]:
dummy_wv = DummyClassifier(strategy='uniform',random_state=RANDOM_SEED).fit(X_train_wv,y_train)

In [149]:
accuracy_score(y_test,dummy_wv.predict(X_test_wv))

0.5011277203253593

## Logistic Regression Classifier

In [150]:
lr_bow = LogisticRegression(random_state=RANDOM_SEED,max_iter=1000).fit(X_train_transform,y_train)

In [151]:
accuracy_score(y_test,lr_bow.predict(X_test_transform))

0.6433644456174868

In [152]:
lr_wv = LogisticRegression(random_state=RANDOM_SEED,max_iter=1000).fit(X_train_wv,y_train)

In [153]:
accuracy_score(y_test,lr_wv.predict(X_test_wv))

0.5794442978141421

## Random Forest Classifier

In [154]:
rf_bow = RandomForestClassifier(n_estimators=500,max_depth=5,random_state=RANDOM_SEED).fit(X_train_transform,y_train)

In [155]:
accuracy_score(y_test,rf_bow.predict(X_test_transform))

0.6336948436787677

In [156]:
rf_wv = RandomForestClassifier(n_estimators=100,max_depth=5,random_state=RANDOM_SEED).fit(X_train_wv,y_train)

In [157]:
accuracy_score(y_test,rf_wv.predict(X_test_wv))

0.6082131631355424

# 3. Unsupervised Learning

In [158]:
kmeans = KMeans(n_clusters=2,random_state=RANDOM_SEED).fit(X_train_transform)

In [159]:
cluster_df = pd.DataFrame({'cluster':kmeans.labels_,'y_label':y_train,'text':X_train})
cluster_df

Unnamed: 0,cluster,y_label,text
304501,1,0,1979-80 Buffalo Sabres NHL 32 1880 74 1 4 2.36...
162313,1,1,Diseases Lentils in culture Lentils are mentio...
336845,1,0,"Railroads , like the Lehigh Valley Railroad , ..."
150625,1,1,An example of this would be an individual anim...
40240,1,1,Both the Matanuska and Susitna Rivers have maj...
...,...,...,...
259178,1,0,After the Germans invaded Norway in April 1940...
365838,1,0,"July 28 - Henry Bennet , 1st Earl of Arlington..."
131932,1,1,Pancake restaurants are popular family restaur...
146867,1,1,A cycling domestique


### LDA Topic Modeling - Consider NMF to create a document-topic matrix

In [160]:
import gensim
from nltk.stem.porter import *
# Re-defines the helper from the preprocessing section with the same default behavior.
# The stemmer and lemmatizer are built once; the function is called for every token,
# so constructing them inside it repeats the same setup each time.
_STEMMER = PorterStemmer()
_LEMMATIZER = WordNetLemmatizer()


def lemmatize_stemming(text, use_stemming=True):
    """Normalize a single token.

    The token is lemmatized as a verb (``pos='v'``) with WordNet and then,
    unless ``use_stemming`` is false, Porter-stemmed.

    Parameters
    ----------
    text : str
        One token.
    use_stemming : bool, default True
        Pass False to return the lemma without stemming.

    Returns
    -------
    str
        The normalized token.
    """
    lemma = _LEMMATIZER.lemmatize(text, pos='v')
    return _STEMMER.stem(lemma) if use_stemming else lemma
# Tokenize, filter, then lemmatize/stem.
# NOTE: this replaces the `preprocess` defined in the preprocessing section; that one
# filters with the custom `stopWords` set, this one with gensim's built-in STOPWORDS.
def preprocess(text):
    """Turn one raw text into a list of normalized tokens for topic modeling.

    The text is tokenized with ``gensim.utils.simple_preprocess``. Tokens in
    ``gensim.parsing.preprocessing.STOPWORDS``, or with three characters or
    fewer, are dropped; every remaining token is passed through
    ``lemmatize_stemming``.
    """
    return [
        lemmatize_stemming(token)
        for token in gensim.utils.simple_preprocess(text)
        if len(token) > 3 and token not in gensim.parsing.preprocessing.STOPWORDS
    ]

In [268]:
# `df` is rebound to the word/AoA lookup table in the Bag of Words section, so read the
# raw training text through `X`, which was taken from the training frame's
# 'original_text' column before that happened.
X[0]

"There is manuscript evidence that Austen continued to work on these pieces as late as the period 1809 Ã¢ '' 11 , and that her niece and nephew , Anna and James Edward Austen , made further additions as late as 1814 ."

In [269]:
# Use `X` (the training 'original_text' column): `df` no longer holds the training
# data at this point because the Bag of Words section rebinds it.
preprocess(X[0])

['manuscript',
 'evidence',
 'austen',
 'continue',
 'work',
 'piece',
 'late',
 'period',
 'niece',
 'nephew',
 'anna',
 'jam',
 'edward',
 'austen',
 'additions',
 'late']

In [238]:
# This cell will run about 2 minutes
# Iterate over `X` (the training 'original_text' column): `df` is rebound to the
# word/AoA lookup table in the Bag of Words section and has no 'original_text' column.
processed_docs = [preprocess(text) for text in X]

In [239]:
dictionary = gensim.corpora.Dictionary(processed_docs)
dictionary

<gensim.corpora.dictionary.Dictionary at 0x1567b848520>

In [240]:
bow_corpus = [dictionary.doc2bow(doc) for doc in processed_docs]
#bow_corpus

In [241]:
len(bow_corpus)

416768

In [242]:
# This cell will run 10 minutes
#lda_model =  gensim.models.LdaMulticore(bow_corpus, 
#                                   num_topics = 8, 
#                                   id2word = dictionary,                                    
#                                   passes = 10,
#                                   workers = 2)

In [347]:
#for idx, topic in lda_model.print_topics(-1):
#    print("Topic: {} \nWords: {}".format(idx, topic ))
#    print("\n")