In [5]:
import pandas as pd
import numpy as np
from sklearn.dummy import DummyClassifier
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import f1_score,accuracy_score
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier
from sklearn.cluster import KMeans
from sklearn.decomposition import PCA
import altair as alt
import seaborn as sns
import re

In [152]:
# NOTE(review): stale cell executed out of order (In [152]). world_cities_df is only created
# further down in this notebook, so on a fresh Restart & Run All this line raises NameError.
# An identical lookup appears again after the frame has been built and lower-cased.
world_cities_df[world_cities_df['country']=='france']

Unnamed: 0,name,country,subcountry,geonameid


In [6]:
#Make sure numpy version is < 1.20
np.version.version

'1.18.5'

In [7]:
#Install known version of numpy that works
!python -m pip install numpy==1.18.5



In [8]:
#Install gensim
!python -m pip install gensim



In [32]:
import gensim
from gensim.models.word2vec import Word2Vec
from tqdm.notebook import tqdm
from nltk.tokenize import sent_tokenize, word_tokenize
from collections import Counter
from nltk.corpus import stopwords
import nltk
from nltk.stem import WordNetLemmatizer
nltk.download('stopwords')
nltk.download('punkt')
nltk.download('wordnet')

[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\mryua\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package punkt to
[nltk_data]     C:\Users\mryua\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package wordnet to
[nltk_data]     C:\Users\mryua\AppData\Roaming\nltk_data...
[nltk_data]   Package wordnet is already up-to-date!


True

In [33]:
RANDOM_SEED=694

Training Dataset

In [34]:
train_path = 'Data/WikiLarge_Train.csv'
df = pd.read_csv(train_path, skiprows=0, skipfooter=0, engine='python')
df

Unnamed: 0,original_text,label
0,There is manuscript evidence that Austen conti...,1
1,"In a remarkable comparative analysis , Mandaea...",1
2,"Before Persephone was released to Hermes , who...",1
3,Cogeneration plants are commonly found in dist...,1
4,"Geneva -LRB- , ; , ; , ; ; -RRB- is the second...",1
...,...,...
416763,A Duke Nukem 3D version has been sold for Xbox...,0
416764,"However , it is becoming replaced as a method ...",0
416765,There are hand gestures in both Hindu and Budd...,0
416766,"If it is necessary to use colors , try to choo...",0


In [35]:
len(df[df['label']==1])/len(df) # the dataset label is well balanced 

0.5

In [36]:
df.iloc[50]['original_text']

'He studied in Armenia and Istanbul , then at Wisconsin University which he finished in 1915 .'

In [37]:
df['original_text'].apply(lambda x: len(x)).mean()
# This means all texts are considered short text, which allows us to use dense representations, 
# as dense representations work well with short text.
# Gensim.KeyedVectors.load('assets/wikipedia.100.word-vecs.kv')??? How to generate and use this???
# Maybe we should train word2vec model on the entire corpus. Just training data? TOP 100 word-vectors(features)
# Alternatively we could use bag-of-words model, which is term-document matrix representation, having much more features

117.921906192414

In [38]:
# Features are the raw sentences; the target is the 0/1 `label` column.
X = df['original_text']
y = df['label']
# Hold out 20% of the labelled rows for evaluation.
# NOTE(review): the split is seeded with a literal 42 rather than the RANDOM_SEED constant
# defined earlier in the notebook — confirm this is intentional.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

Testing Dataset

In [39]:
test_path = 'Data/WikiLarge_Test.csv'
test_df = pd.read_csv(test_path, skiprows=0, skipfooter=0, engine='python')
test_df

Unnamed: 0,id,original_text,label
0,0,-2011,
1,1,-2011,
2,2,-2000,
3,3,-1997,
4,4,1.636,
...,...,...,...
119087,119087,#NAME?,
119088,119088,#NAME?,
119089,119089,#NAME?,
119090,119090,#NAME?,


In [40]:
test_df.iloc[10000]

id                                                           10000
original_text    An atheist would say that this argument proves...
label                                                          NaN
Name: 10000, dtype: object

Sample Submission

In [41]:
samplesubmission_path = 'Data/sampleSubmission.csv'
samplesubmission_df = pd.read_csv(samplesubmission_path, skiprows=0, skipfooter=0, engine='python')
samplesubmission_df

Unnamed: 0,id,label
0,0,0
1,1,0
2,2,1
3,3,1
4,4,0
...,...,...
119087,119087,0
119088,119088,1
119089,119089,1
119090,119090,1


To conclude, the dataframes we are working with are:

`df` (training data), `test_df`, `samplesubmission_df`, and — loaded further below — `dale_chall`, `concrete_df` and `AoA`.

# 1. Data Preprocessing

In [42]:
vectorizer = TfidfVectorizer(min_df=10,stop_words='english',ngram_range=(1,2))
X_train_transform = vectorizer.fit_transform(X_train)
X_test_transform  = vectorizer.transform(X_test)

In [43]:
X_train_transform

<333414x57773 sparse matrix of type '<class 'numpy.float64'>'
	with 4071111 stored elements in Compressed Sparse Row format>

## Word2Vec Model

In [44]:
len(stopwords.words('english'))

179

In [45]:
len(set(stopwords.words('english')))

179

In [46]:
#set(stopwords.words('english'))

== dale_chall.txt ==

This is the Dale Chall 3000 Word List, which is one definition of words that are considered "basic" English.

A summary is at https://www.readabilityformulas.com/articles/dale-chall-readability-word-list.php

In [47]:
#Basic english words
dalechall_path = 'Data/dale_chall.txt'
dale_chall = pd.read_csv(dalechall_path,delimiter='\t',header=None,names=['word'])
dale = set(dale_chall['word'].values)

In [48]:
len(dale)

2946

### The 2946 words in dale can be combined with the nltk stopwords.
### We could maybe assign an arbitrary score to each dale_chall word. - for reference only

### We will use a geo dataset to add city and country names to the stopwords library

In [134]:
!python -m pip install datapackage



In [135]:
from datapackage import Package
package = Package('https://datahub.io/core/world-cities/datapackage.json')
# print list of all resources:
print(package.resource_names)

['validation_report', 'world-cities_csv', 'world-cities_json', 'world-cities_zip', 'world-cities_csv_preview', 'world-cities']


In [142]:
world_cities = []
for resource in package.resources:
    if resource.descriptor['datahub']['type'] == 'derived/csv':
        world_cities = resource.read()

In [144]:
len(world_cities)

23018

In [147]:
world_cities[:10]

[['les Escaldes', 'Andorra', 'Escaldes-Engordany', 3040051],
 ['Andorra la Vella', 'Andorra', 'Andorra la Vella', 3041563],
 ['Umm al Qaywayn', 'United Arab Emirates', 'Umm al Qaywayn', 290594],
 ['Ras al-Khaimah', 'United Arab Emirates', 'Raʼs al Khaymah', 291074],
 ['Khawr Fakkān', 'United Arab Emirates', 'Ash Shāriqah', 291696],
 ['Dubai', 'United Arab Emirates', 'Dubai', 292223],
 ['Dibba Al-Fujairah', 'United Arab Emirates', 'Al Fujayrah', 292231],
 ['Dibba Al-Hisn', 'United Arab Emirates', 'Al Fujayrah', 292239],
 ['Sharjah', 'United Arab Emirates', 'Ash Shāriqah', 292672],
 ['Ar Ruways', 'United Arab Emirates', 'Abu Dhabi', 292688]]

In [146]:
world_cities_df = pd.DataFrame(world_cities, columns=['name', 'country', 'subcountry', 'geonameid'])

In [148]:
world_cities_df

Unnamed: 0,name,country,subcountry,geonameid
0,les Escaldes,Andorra,Escaldes-Engordany,3040051
1,Andorra la Vella,Andorra,Andorra la Vella,3041563
2,Umm al Qaywayn,United Arab Emirates,Umm al Qaywayn,290594
3,Ras al-Khaimah,United Arab Emirates,Raʼs al Khaymah,291074
4,Khawr Fakkān,United Arab Emirates,Ash Shāriqah,291696
...,...,...,...,...
23013,Bulawayo,Zimbabwe,Bulawayo,894701
23014,Bindura,Zimbabwe,Mashonaland Central,895061
23015,Beitbridge,Zimbabwe,Matabeleland South,895269
23016,Epworth,Zimbabwe,Harare,1085510


In [155]:
world_cities_df = world_cities_df.applymap(lambda s:s.lower() if type(s) == str else s)

In [159]:
world_cities_df[world_cities_df['country']=='france']

Unnamed: 0,name,country,subcountry,geonameid
6633,yerres,france,île-de-france,2967245
6634,wittenheim,france,alsace-champagne-ardenne-lorraine,2967318
6635,wattrelos,france,nord-pas-de-calais-picardie,2967421
6636,wasquehal,france,nord-pas-de-calais-picardie,2967438
6637,voiron,france,auvergne-rhône-alpes,2967758
...,...,...,...,...
7261,marseille 15,france,provence-alpes-côte d'azur,7284896
7262,marseille 16,france,provence-alpes-côte d'azur,7284897
7263,la defense,france,île-de-france,8504417
7264,saint-quentin-en-yvelines,france,île-de-france,8533870


In [168]:
cities = set(world_cities_df['name'].unique())
countries = set(world_cities_df['country'].unique())
subcountries = set(world_cities_df['subcountry'].unique())

In [169]:
#We will add this to stopwords
geo_data = cities | countries | subcountries

In [171]:
len(cities)

21940

In [172]:
len(countries)

244

In [173]:
len(subcountries)

2594

In [170]:
len(geo_data)

23803

In [260]:
language_package = Package('https://datahub.io/core/language-codes/datapackage.json')

# print list of all resources:
print(language_package.resource_names)

['validation_report', 'language-codes_csv', 'language-codes-3b2_csv', 'language-codes-full_csv', 'ietf-language-tags_csv', 'language-codes_json', 'language-codes-3b2_json', 'language-codes-full_json', 'ietf-language-tags_json', 'language-codes_zip', 'language-codes', 'language-codes-3b2', 'language-codes-full', 'ietf-language-tags']


In [263]:
# print processed tabular data (if exists any)
languages_data = []
#for resource in language_package.resources:
#    if resource.descriptor['datahub']['derivedFrom']=='language-codes':
#        print(resource.read())

In [264]:
# Index 1 corresponds to 'language-codes_csv' in the resource_names list printed above;
# its rows are [two-letter code, English name] pairs.
# NOTE(review): positional access depends on the package keeping this resource order.
languages_data = language_package.resources[1].read()

In [266]:
languages_data

[['aa', 'Afar'],
 ['ab', 'Abkhazian'],
 ['ae', 'Avestan'],
 ['af', 'Afrikaans'],
 ['ak', 'Akan'],
 ['am', 'Amharic'],
 ['an', 'Aragonese'],
 ['ar', 'Arabic'],
 ['as', 'Assamese'],
 ['av', 'Avaric'],
 ['ay', 'Aymara'],
 ['az', 'Azerbaijani'],
 ['ba', 'Bashkir'],
 ['be', 'Belarusian'],
 ['bg', 'Bulgarian'],
 ['bh', 'Bihari languages'],
 ['bi', 'Bislama'],
 ['bm', 'Bambara'],
 ['bn', 'Bengali'],
 ['bo', 'Tibetan'],
 ['br', 'Breton'],
 ['bs', 'Bosnian'],
 ['ca', 'Catalan; Valencian'],
 ['ce', 'Chechen'],
 ['ch', 'Chamorro'],
 ['co', 'Corsican'],
 ['cr', 'Cree'],
 ['cs', 'Czech'],
 ['cu',
  'Church Slavic; Old Slavonic; Church Slavonic; Old Bulgarian; Old Church Slavonic'],
 ['cv', 'Chuvash'],
 ['cy', 'Welsh'],
 ['da', 'Danish'],
 ['de', 'German'],
 ['dv', 'Divehi; Dhivehi; Maldivian'],
 ['dz', 'Dzongkha'],
 ['ee', 'Ewe'],
 ['el', 'Greek, Modern (1453-)'],
 ['en', 'English'],
 ['eo', 'Esperanto'],
 ['es', 'Spanish; Castilian'],
 ['et', 'Estonian'],
 ['eu', 'Basque'],
 ['fa', 'Persian'],
 ['

In [269]:
languages_df = pd.DataFrame(languages_data, columns=['alpha2', 'english'])
languages_df = languages_df.applymap(lambda s:s.lower() if type(s) == str else s)
languages_df

Unnamed: 0,alpha2,english
0,aa,afar
1,ab,abkhazian
2,ae,avestan
3,af,afrikaans
4,ak,akan
...,...,...
179,yi,yiddish
180,yo,yoruba
181,za,zhuang; chuang
182,zh,chinese


In [270]:
languages = set(languages_df['english'].unique())

In [271]:
len(languages)

184

In [324]:
nationality_path = 'Data/CH_Nationality_List_20171130_v1.csv'
nationality_df = pd.read_csv(nationality_path, skiprows=0, skipfooter=0, engine='python')
nationality_df = nationality_df.applymap(lambda s:s.lower() if type(s) == str else s)
nationality_df

Unnamed: 0,Nationality
0,afghan
1,albanian
2,algerian
3,american
4,andorran
...,...
220,wallisian
221,welsh
222,yemeni
223,zambian


In [327]:
nationalities = set(nationality_df['Nationality'].unique())
len(nationalities)

225

In [391]:
firstname_path = 'Data/new-top-firstNames.csv'
firstname_df = pd.read_csv(firstname_path, skiprows=0, skipfooter=0, engine='python')
firstname_df = firstname_df.applymap(lambda s:s.lower() if type(s) == str else s)
firstname_df

Unnamed: 0.1,Unnamed: 0,name,newPerct2013
0,1,michael,0.011577
1,2,james,0.010218
2,3,john,0.009675
3,4,robert,0.009493
4,5,david,0.008943
...,...,...,...
95,96,christina,0.001435
96,97,julie,0.001418
97,98,jordan,0.001416
98,99,kyle,0.001413


In [392]:
firstnames = set(firstname_df['name'].unique())
len(firstnames)

100

In [605]:
firstname_path2 = 'Data/babynames-clean.csv'
firstname_df2 = pd.read_csv(firstname_path2, header= None, skiprows=0, skipfooter=0, engine='python')
firstname_df2 = firstname_df2.applymap(lambda s:s.lower() if type(s) == str else s)
firstname_df2

Unnamed: 0,0,1
0,john,boy
1,william,boy
2,james,boy
3,charles,boy
4,george,boy
...,...,...
6777,laylah,girl
6778,carleigh,girl
6779,kenley,girl
6780,sloane,girl


In [607]:
firstnames2 = set(firstname_df2[0].unique())
len(firstnames2)

6782

In [609]:
firstnames = firstnames | firstnames2
len(firstnames)

6782

In [610]:
surname_path = 'Data/new-top-surnames.csv'
surname_df = pd.read_csv(surname_path, skiprows=0, skipfooter=0, engine='python')
surname_df = surname_df.applymap(lambda s:s.lower() if type(s) == str else s)
surname_df

Unnamed: 0.1,Unnamed: 0,name,perct2013
0,1,smith,0.007999
1,2,johnson,0.006346
2,3,williams,0.005330
3,4,brown,0.004724
4,5,jones,0.004676
...,...,...,...
95,96,vasquez,0.000760
96,97,sanders,0.000753
97,98,jimenez,0.000751
98,99,long,0.000747


In [611]:
surnames = set(surname_df['name'].unique())
len(surnames)

100

In [612]:
# Weekday and month names, kept as ordered lists.
days = [
    'Monday', 'Tuesday', 'Wednesday', 'Thursday', 'Friday', 'Saturday', 'Sunday',
]
months = [
    'January', 'February', 'March', 'April', 'May', 'June',
    'July', 'August', 'September', 'October', 'November', 'December',
]
# Lower-cased union of both lists; used later as part of the stop-word vocabulary.
calendar = {label.lower() for label in days + months}

In [613]:
calendar

{'april',
 'august',
 'december',
 'february',
 'friday',
 'january',
 'july',
 'june',
 'march',
 'may',
 'monday',
 'november',
 'october',
 'saturday',
 'september',
 'sunday',
 'thursday',
 'tuesday',
 'wednesday'}

In [614]:
X_train

304501    1979-80 Buffalo Sabres NHL 32 1880 74 1 4 2.36...
162313    Diseases Lentils in culture Lentils are mentio...
336845    Railroads , like the Lehigh Valley Railroad , ...
150625    An example of this would be an individual anim...
40240     Both the Matanuska and Susitna Rivers have maj...
                                ...                        
259178    After the Germans invaded Norway in April 1940...
365838    July 28 - Henry Bennet , 1st Earl of Arlington...
131932    Pancake restaurants are popular family restaur...
146867                                 A cycling domestique
121958    David Boreanaz 's first paid acting appearance...
Name: original_text, Length: 333414, dtype: object

In [615]:
X_train[304501]

'1979-80 Buffalo Sabres NHL 32 1880 74 1 4 2.36 20 8 4 0 0.000'

In [616]:
gensim.utils.simple_preprocess(X_train[304501])

['buffalo', 'sabres', 'nhl']

In [617]:
#gensim.parsing.preprocessing.STOPWORDS
#stopWords

In [618]:
# Combined stop-word vocabulary: NLTK stop words plus the Dale-Chall "basic" words, geographic
# names, language names, nationalities, first names, surnames and calendar words.
# Tokens are compared one at a time below, so only single-word entries can ever match;
# multi-word entries such as 'north valley stream' have no effect.
stopWords = set(stopwords.words('english')) | dale | geo_data | languages | nationalities | firstnames | surnames | calendar

# Explicit import instead of `from nltk.stem.porter import *`, which pulled an unknown set of
# names into the notebook namespace.
from nltk.stem.porter import PorterStemmer

# Build the lemmatizer and stemmer once; the original constructed new instances for every token.
_LEMMATIZER = WordNetLemmatizer()
_STEMMER = PorterStemmer()


def lemmatize_stemming(text, use_stemming=False):
    """Reduce a single token to its verb lemma, optionally followed by Porter stemming.

    Parameters
    ----------
    text : str
        A single token.
    use_stemming : bool, default False
        When True, the lemma is additionally passed through the Porter stemmer.
        The default reproduces the original behaviour (lemmatization only).

    Returns
    -------
    str
        The lemmatized (and optionally stemmed) token.
    """
    lemma = _LEMMATIZER.lemmatize(text, pos='v')
    return _STEMMER.stem(lemma) if use_stemming else lemma


def preprocess(text, min_len=3, normalize=True):
    """Tokenize a sentence, drop stop words and short tokens, and normalize what remains.

    Parameters
    ----------
    text : str
        Raw sentence.
    min_len : int, default 3
        Tokens are kept only when they are strictly longer than this many characters.
    normalize : bool, default True
        When True each kept token goes through lemmatize_stemming; when False the
        token is kept unchanged.

    Returns
    -------
    list of str
        Tokens produced by gensim.utils.simple_preprocess that are not in the global
        stopWords set and pass the length filter.
    """
    return [
        lemmatize_stemming(token) if normalize else token
        for token in gensim.utils.simple_preprocess(text)
        if token not in stopWords and len(token) > min_len
    ]


# This runs over the full train and test splits; the original cell was noted to take ~4 minutes.
tokenized_text_train = [preprocess(text) for text in X_train]
tokenized_text_test = [preprocess(text) for text in X_test]

#for text in tqdm(X_train):
#    tokens_in_text = word_tokenize(text)
#    tokens_in_text = [word for word in tokens_in_text if word.lower() not in stopWords]
#    tokenized_text_train.append(tokens_in_text)
    
#for text in tqdm(X_test):
#    tokens_in_text = word_tokenize(text)
#    tokens_in_text = [word for word in tokens_in_text if word.lower() not in stopWords]
#    tokenized_text_test.append(tokens_in_text)

In [619]:
len(stopWords)

33276

In [620]:
#tokenized_text_train

In [621]:
# Train a 100-dimensional Word2Vec model on the tokenized training sentences only.
# A seed is supplied to make the embedding repeatable, but gensim documents that a
# deterministic run additionally requires a single worker thread (multi-threaded training
# reorders updates between runs), so workers is set to 1 instead of 4. Reproducibility across
# interpreter launches also depends on PYTHONHASHSEED being fixed.
model = Word2Vec(vector_size=100,window=2,min_count=100,seed=RANDOM_SEED,workers=1)
model.build_vocab(tokenized_text_train)
model.train(tokenized_text_train,total_examples=model.corpus_count,epochs=model.epochs)

(5883773, 9470970)

In [622]:
word_vectors = model.wv

In [623]:
#word_vectors.vocab

In [624]:
word_dict = word_vectors.key_to_index

In [625]:
word_dict

{'unite': 0,
 'department': 1,
 'state': 2,
 'region': 3,
 'commune': 4,
 'include': 5,
 'call': 6,
 'play': 7,
 'national': 8,
 'district': 9,
 'release': 10,
 'years': 11,
 'name': 12,
 'locate': 13,
 'area': 14,
 'former': 15,
 'series': 16,
 'later': 17,
 'album': 18,
 'league': 19,
 'system': 20,
 'work': 21,
 'century': 22,
 'base': 23,
 'award': 24,
 'population': 25,
 'refer': 26,
 'largest': 27,
 'province': 28,
 'start': 29,
 'record': 30,
 'form': 31,
 'ndash': 32,
 'create': 33,
 'main': 34,
 'usually': 35,
 'president': 36,
 'international': 37,
 'force': 38,
 'produce': 39,
 'type': 40,
 'television': 41,
 'use': 42,
 'species': 43,
 'game': 44,
 'common': 45,
 'feature': 46,
 'professional': 47,
 'publish': 48,
 'found': 49,
 'municipality': 50,
 'although': 51,
 'currently': 52,
 'character': 53,
 'championship': 54,
 'modern': 55,
 'famous': 56,
 'develop': 57,
 'members': 58,
 'popular': 59,
 'video': 60,
 'tropical': 61,
 'time': 62,
 'serve': 63,
 'republic': 64,
 '

In [626]:
word_vectors[0] == word_vectors['unite']

array([ True,  True,  True,  True,  True,  True,  True,  True,  True,
        True,  True,  True,  True,  True,  True,  True,  True,  True,
        True,  True,  True,  True,  True,  True,  True,  True,  True,
        True,  True,  True,  True,  True,  True,  True,  True,  True,
        True,  True,  True,  True,  True,  True,  True,  True,  True,
        True,  True,  True,  True,  True,  True,  True,  True,  True,
        True,  True,  True,  True,  True,  True,  True,  True,  True,
        True,  True,  True,  True,  True,  True,  True,  True,  True,
        True,  True,  True,  True,  True,  True,  True,  True,  True,
        True,  True,  True,  True,  True,  True,  True,  True,  True,
        True,  True,  True,  True,  True,  True,  True,  True,  True,
        True])

In [627]:
words_in_vector = word_vectors.index_to_key
len(words_in_vector)

2896

# Word's Difficulty Considered

== Concreteness_ratings_Brysbaert_et_al_BRM.txt ==

This file contains concreteness ratings for 40 thousand English lemma words gathered via Amazon Mechanical Turk. The ratings come from a larger list of 63 thousand words and represent all English words known to 85% of the raters.

The file contains nine columns:
1. The word
2. Whether it is a single word or a two-word expression 
3. The mean concreteness rating
4. The standard deviation of the concreteness ratings
5. The number of persons indicating they did not know the word
6. The total number of persons who rated the word
7. Percentage participants who knew the word
8. The SUBTLEX-US frequency count (on a total of 51 million; Brysbaert & New, 2009) 
9. The dominant part-of-speech usage

Original source: http://crr.ugent.be/archives/1330

Brysbaert, M., Warriner, A.B., & Kuperman, V. (2014). Concreteness ratings for 40 thousand generally known English word lemmas. Behavior Research Methods, 46, 904-911.
http://crr.ugent.be/papers/Brysbaert_Warriner_Kuperman_BRM_Concreteness_ratings.pdf

In [628]:
#Concreteness rating - the higher Conc.M, the easier the word is.
concreteness_path = 'Data/Concreteness_ratings_Brysbaert_et_al_BRM.txt'
concrete_df = pd.read_csv(concreteness_path,delimiter='\t', keep_default_na=False)
concreteset=(concrete_df['Word'].values)

In [629]:
concrete_df

Unnamed: 0,Word,Bigram,Conc.M,Conc.SD,Unknown,Total,Percent_known,SUBTLEX,Dom_Pos
0,roadsweeper,0,4.85,0.37,1,27,0.96,0,0
1,traindriver,0,4.54,0.71,3,29,0.90,0,0
2,tush,0,4.45,1.01,3,25,0.88,66,0
3,hairdress,0,3.93,1.28,0,29,1.00,1,0
4,pharmaceutics,0,3.77,1.41,4,26,0.85,0,0
...,...,...,...,...,...,...,...,...,...
39949,unenvied,0,1.21,0.62,1,30,0.97,0,
39950,agnostically,0,1.20,0.50,2,27,0.93,0,
39951,conceptualistic,0,1.18,0.50,4,26,0.85,0,
39952,conventionalism,0,1.18,0.48,1,29,0.97,0,


In [630]:
concrete_df.Bigram.value_counts()

0    37058
1     2896
Name: Bigram, dtype: int64

In [631]:
concrete_df[concrete_df.Bigram==1]

Unnamed: 0,Word,Bigram,Conc.M,Conc.SD,Unknown,Total,Percent_known,SUBTLEX,Dom_Pos
28707,baking soda,1,5.00,0.00,0,30,1.00,0,
28709,baseball bat,1,5.00,0.00,0,29,1.00,0,
28710,bath towel,1,5.00,0.00,0,29,1.00,0,
28711,beach ball,1,5.00,0.00,0,28,1.00,0,
28712,bed sheet,1,5.00,0.00,0,28,1.00,0,
...,...,...,...,...,...,...,...,...,...
39619,tantamount to,1,1.52,0.85,4,27,0.85,0,
39857,chance on,1,1.38,0.75,2,28,0.93,0,
39871,free rein,1,1.37,0.63,2,29,0.93,0,
39899,by chance,1,1.34,0.72,1,30,0.97,0,


In [632]:
#There is no Nan value in Conc.M column
concrete_df[concrete_df['Conc.M'].isna()]

Unnamed: 0,Word,Bigram,Conc.M,Conc.SD,Unknown,Total,Percent_known,SUBTLEX,Dom_Pos


### Should we consider the bigrams in this dataset, given that they make up only a small fraction (~7%) of its entries?

In [633]:
np.min(concrete_df['Conc.M'])

1.04

In [634]:
np.max(concrete_df['Conc.M'])

5.0

### Concreteness values range from 1 to 5. We could possibly use the inverse of the concreteness value to scale it into the 0-1 range and give easier words less weight.

In [635]:
concrete_words = list(concrete_df['Word'].values)

In [636]:
len(concrete_words)

39954

In [637]:
# Vocabulary words that have no concreteness rating.
# Membership is tested against a set: the original scanned the ~40k-entry list once per word.
concrete_word_set = set(concrete_words)
concrete_complement = [word for word in words_in_vector if word not in concrete_word_set]

In [638]:
len(concrete_complement)

570

In [639]:
concrete_complement

['years',
 'largest',
 'ndash',
 'members',
 'aisne',
 'europe',
 'countries',
 'european',
 'calvados',
 'basse',
 'normandie',
 'islands',
 'areas',
 'loire',
 'gironde',
 'events',
 'africa',
 'picardie',
 'languages',
 'aquitaine',
 'songs',
 'systems',
 'others',
 'cities',
 'players',
 'animals',
 'disney',
 'britain',
 'oldest',
 'units',
 'websites',
 'windows',
 'larger',
 'mountains',
 'albums',
 'students',
 'alpes',
 'nations',
 'partement',
 'months',
 'nobel',
 'african',
 'things',
 'cells',
 'picardy',
 'towns',
 'sarthe',
 'regions',
 'humans',
 'elements',
 'prix',
 'korea',
 'inhabitants',
 'earlier',
 'greatest',
 'hours',
 'brothers',
 'friends',
 'vocals',
 'saturn',
 'counties',
 'problems',
 'examples',
 'users',
 'artists',
 'weeks',
 'products',
 'roles',
 'provence',
 'nazi',
 'isbn',
 'linux',
 'versions',
 'provinces',
 'earliest',
 'longest',
 'khan',
 'materials',
 'microsoft',
 'centuries',
 'rhine',
 'goals',
 'kilometres',
 'novels',
 'families',
 'dea

In [640]:
# Vocabulary words that do have a concreteness rating.
# Membership is tested against a set: the original scanned the ~40k-entry list once per word.
concrete_word_set = set(concrete_words)
concrete_intersect = [word for word in words_in_vector if word in concrete_word_set]

In [641]:
len(concrete_intersect)

2326

In [642]:
concrete_intersect[0]

'unite'

In [643]:
word_vectors['state']

array([-0.34864047, -0.96178025,  1.0424236 , -0.27514943, -1.125268  ,
        0.17379373, -0.5000698 ,  0.61671704,  0.3502842 , -0.0587092 ,
        0.11508509, -0.21317965,  1.1454185 , -0.40656507, -0.89661133,
       -1.2150089 ,  0.40510127, -0.758923  , -0.7322218 , -0.38091078,
        0.09221615,  0.01026862,  0.4988388 ,  0.6096505 ,  1.1464705 ,
       -0.46910125,  0.2855899 , -0.5065553 , -1.1206437 ,  0.49509016,
       -0.4553206 , -0.20433855,  0.1936442 ,  0.00710674,  1.4585135 ,
       -0.3271947 ,  0.09666988,  0.23133074,  0.59747565,  0.6051638 ,
       -0.821614  , -0.25832888, -0.08141857, -1.1627805 , -0.1859263 ,
       -0.74427146, -0.2904973 , -1.0154371 , -0.09791026, -0.28925666,
       -0.27558604, -0.28107327,  0.45664597, -0.73296994,  0.01105025,
       -0.03365638,  0.07879242, -0.16169213, -0.5991656 , -0.29063004,
        0.4216433 , -0.01379751, -0.18666983,  0.22629549, -1.3479476 ,
       -0.8134935 , -0.31116077,  0.22562763, -0.07893478,  0.58

In [644]:
concrete_df[concrete_df['Word']=='state']['Conc.M'].values

array([3.52])

In [645]:
# Scale each embedding by the inverse of the word's mean concreteness rating (Conc.M).
# A single dict lookup replaces the per-word boolean filter over the whole DataFrame.
# If a word were listed more than once, the last rating is used; the original expression
# would have failed on a shape mismatch in that case.
# NOTE: word_vectors is rewritten in place, so re-running this cell applies the scaling again.
conc_rating_by_word = dict(zip(concrete_df['Word'], concrete_df['Conc.M']))
for word in concrete_intersect:
    # Upcast to float64 first so the arithmetic matches the original array-based expression.
    word_vectors[word] = word_vectors[word].astype(np.float64) / conc_rating_by_word[word]

== AoA_51715_words.csv ==

This file contains "Age of Acquisition" (AoA) estimates for about 51k English words, which refers to the approximate age (in years) when a word was learned. Early words, being more basic, have lower average AoA.

The main columns you will be interested in are "Word" and "AoA_Kup_lem". But the others may be useful too.

The file contains these columns:

Word :: The word in question
Alternative.spelling :: if the Word may be spelled frequently in another form	
Freq_pm	:: Freq of the Word in general English (larger -> more common)
Dom_PoS_SUBTLEX	:: Dominant part of speech in general usage
Nletters :: number of letters 
Nphon :: number of phonemes
Nsyll :: number of syllables
Lemma_highest_PoS :: the "lemmatized" or "root" form of the word (in the dominant part of speech), e.g. the root form of the verb "abates" is "abate".
AoA_Kup	:: The AoA from a previous study by Kuperman et al.
Perc_known :: Percent of people who knew the word in the Kuperman et al. study
AoA_Kup_lem :: Estimated AoA based on Kuperman et al. study lemmatized words. THIS IS THE MAIN COLUMN OF INTEREST.
Perc_known_lem	:: Estimated percentage of people who would know this form of the word in the Kuperman study.
AoA_Bird_lem :: AoA reported in previous study by Bird (2001) 
AoA_Bristol_lem	:: AoA reported in previous study from Bristol Univ. (2006)
AoA_Cort_lem :: AoA reported in previous study by Cortese & Khanna (2008)
AoA_Schock :: AoA reported in previous study by Schock (2012)

Original source : http://crr.ugent.be/archives/806

In [646]:
#AoA
#Perc_known_lem, AoA_Kup_lem
aoawords_path = 'Data/AoA_51715_words.csv'
AoA = pd.read_csv(aoawords_path,encoding = 'unicode_escape')
AoA_set = set(AoA['Word'].values)
AoA.head(5)

Unnamed: 0,Word,Alternative.spelling,Freq_pm,Dom_PoS_SUBTLEX,Nletters,Nphon,Nsyll,Lemma_highest_PoS,AoA_Kup,Perc_known,AoA_Kup_lem,Perc_known_lem,AoA_Bird_lem,AoA_Bristol_lem,AoA_Cort_lem,AoA_Schock
0,a,a,20415.27,Article,1,1,1,a,2.89,1.0,2.89,1.0,3.16,,,
1,aardvark,aardvark,0.41,Noun,8,7,2,aardvark,9.89,1.0,9.89,1.0,,,,
2,abacus,abacus,0.24,Noun,6,6,3,abacus,8.69,0.65,8.69,0.65,,,,
3,abacuses,abacuses,0.02,Noun,8,9,4,abacus,,,8.69,0.65,,,,
4,abalone,abalone,0.51,Verb,7,7,4,abalone,12.23,0.72,12.23,0.72,,,,


In [647]:
len(AoA)

51715

In [648]:
AoA.AoA_Kup_lem.min()

1.58

In [649]:
AoA.AoA_Kup_lem.max()

25.0

In [650]:
AoA.sort_values(['AoA_Kup_lem'], ascending=False)

Unnamed: 0,Word,Alternative.spelling,Freq_pm,Dom_PoS_SUBTLEX,Nletters,Nphon,Nsyll,Lemma_highest_PoS,AoA_Kup,Perc_known,AoA_Kup_lem,Perc_known_lem,AoA_Bird_lem,AoA_Bristol_lem,AoA_Cort_lem,AoA_Schock
14878,eisteddfod,eisteddfod,,,10,8,3,eisteddfod,25.0,0.05,25.0,0.05,,,,
2084,architrave,architrave,0.04,Noun,10,8,3,architrave,21.0,0.05,21.0,0.05,,,,
6274,calceolaria,calceolaria,0.02,Noun,11,11,6,calceolaria,21.0,0.11,21.0,0.11,,,,
32931,penury,penury,0.02,Noun,6,7,3,penury,20.6,0.28,20.6,0.28,,,,
25243,kendo,kendo,0.37,Noun,5,5,2,kendo,20.5,0.11,20.5,0.11,,,,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
38932,rogation,rogation,,,8,7,3,rogation,,0.00,,0.00,,,,
42089,smilax,smilax,,,6,7,2,smilax,,0.00,,0.00,,,,
46368,thulium,thulium,,,7,6,3,thulium,,0.00,,0.00,,,,
50862,wickiup,wickiup,0.27,Noun,7,6,3,wickiup,,0.00,,0.00,,,,


In [651]:
len(AoA[AoA['AoA_Kup_lem'].isna()])

20

In [652]:
AoA[AoA['AoA_Kup_lem'].isna()]

Unnamed: 0,Word,Alternative.spelling,Freq_pm,Dom_PoS_SUBTLEX,Nletters,Nphon,Nsyll,Lemma_highest_PoS,AoA_Kup,Perc_known,AoA_Kup_lem,Perc_known_lem,AoA_Bird_lem,AoA_Bristol_lem,AoA_Cort_lem,AoA_Schock
442,actinium,actinium,,,8,8,4,actinium,,0.0,,0.0,,,,
1322,ambuscade,ambuscade,,,9,8,3,ambuscade,,0.0,,0.0,,,,
2306,ashlar,ashlar,,,6,5,2,ashlar,,0.0,,0.0,,,,
5095,bosky,bosky,,,5,4,2,bosky,,0.0,,0.0,,,,
6404,canaille,canaille,,,8,5,2,canaille,,0.0,,0.0,,,,
9004,compeer,compeer,,,7,6,3,compeer,,0.0,,0.0,,,,
9005,compeers,compeers,0.02,Noun,8,7,3,compeer,,,,0.0,,,,
16000,europium,europium,,,8,8,4,europium,,0.0,,0.0,,,,
19065,gallimaufry,gallimaufry,,,11,9,4,gallimaufry,,0.0,,0.0,,,,
22498,hutment,hutment,,,7,7,2,hutment,,0.0,,0.0,,,,


In [653]:
# Impute all NaN values in AoA_Kup_lem with the column maximum (25), as the unrated words
# appear to be hard words.
# The filled column is assigned back explicitly: calling fillna(..., inplace=True) on a column
# selection is not guaranteed to write through to AoA and is deprecated in recent pandas.
AoA['AoA_Kup_lem'] = AoA['AoA_Kup_lem'].fillna(AoA['AoA_Kup_lem'].max())

In [654]:
AoA.sort_values(['AoA_Kup_lem'], ascending=False)

Unnamed: 0,Word,Alternative.spelling,Freq_pm,Dom_PoS_SUBTLEX,Nletters,Nphon,Nsyll,Lemma_highest_PoS,AoA_Kup,Perc_known,AoA_Kup_lem,Perc_known_lem,AoA_Bird_lem,AoA_Bristol_lem,AoA_Cort_lem,AoA_Schock
2306,ashlar,ashlar,,,6,5,2,ashlar,,0.00,25.00,0.00,,,,
38932,rogation,rogation,,,8,7,3,rogation,,0.00,25.00,0.00,,,,
46368,thulium,thulium,,,7,6,3,thulium,,0.00,25.00,0.00,,,,
14878,eisteddfod,eisteddfod,,,10,8,3,eisteddfod,25.00,0.05,25.00,0.05,,,,
5095,bosky,bosky,,,5,4,2,bosky,,0.00,25.00,0.00,,,,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
27395,mamma,mamma,3.02,Noun,5,4,2,mama,,,1.89,1.00,,,,
27393,mamas,mamas,0.71,Noun,5,5,2,mama,,,1.89,1.00,,,,
27392,mama,mama,103.71,Noun,4,4,2,mama,1.89,1.00,1.89,1.00,,,,
29050,mommas,mommas,0.10,Noun,6,5,2,momma,,,1.58,1.00,,,,


### AoA values range from about 1.58 to 25; the smaller the AoA value, the easier the word is. We could possibly use the AoA value to give easier words less weight.

In [655]:
aoa_words = list(AoA['Word'].values)

In [656]:
len(aoa_words)

51715

In [657]:
# Split the Word2Vec vocabulary into words without / with an AoA entry.
# Membership is tested against a set: the original scanned the ~52k-entry list once per word.
aoa_word_set = set(aoa_words)
aoa_complement = [word for word in words_in_vector if word not in aoa_word_set]
aoa_intersect = [word for word in words_in_vector if word in aoa_word_set]

In [658]:
len(aoa_complement)

342

In [659]:
aoa_complement

['ndash',
 'usually',
 'aisne',
 'europe',
 'european',
 'calvados',
 'basse',
 'normandie',
 'commonly',
 'loire',
 'gironde',
 'africa',
 'picardie',
 'aquitaine',
 'generally',
 'atlantic',
 'officially',
 'lower',
 'especially',
 'disney',
 'throughout',
 'britain',
 'wikipedia',
 'approximately',
 'pacific',
 'mainly',
 'nintendo',
 'alpes',
 'partement',
 'typically',
 'nobel',
 'african',
 'widely',
 'picardy',
 'sarthe',
 'particularly',
 'formerly',
 'olympic',
 'jewish',
 'prix',
 'korea',
 'previously',
 'directly',
 'probably',
 'saturn',
 'online',
 'recently',
 'highly',
 'provence',
 'nazi',
 'isbn',
 'linux',
 'islamic',
 'microsoft',
 'unlike',
 'rhine',
 'relatively',
 'kilometres',
 'completely',
 'kong',
 'aube',
 'frequently',
 'municipalities',
 'azur',
 'shortly',
 'gregorian',
 'closely',
 'largely',
 'traditionally',
 'asian',
 'metres',
 'ardãƒ',
 'respectively',
 'caribbean',
 'tehsil',
 'smackdown',
 'abbottabad',
 'grammy',
 'yorkshire',
 'johann',
 'montre

In [675]:
cities

{'saitama',
 'shardara',
 'rapallo',
 'wazīrābād',
 'malvar',
 'vīsāvadar',
 'yōkaichiba',
 'fundación',
 'gondomar',
 'konya',
 'skadovs’k',
 'tulsa',
 'grenoble',
 'la fría',
 'masumbwe',
 'yangzhou',
 'belleville',
 'cortazar',
 'southall',
 'kolhāpur',
 'zürich (kreis 12)',
 'bhubaneshwar',
 'puurs',
 'challans',
 'kėdainiai',
 'kalach',
 'montebello',
 'north valley stream',
 'astorga',
 'pingliang',
 'ijūin',
 'joliette',
 'kwale',
 'nador',
 'chhala',
 'vikhorevka',
 'wellingborough',
 'asyūţ',
 'viçosa',
 'beveren',
 'ossett',
 'rustenburg',
 'santiago de los caballeros',
 'almansa',
 'santa maria capua vetere',
 'leer',
 'pilsen',
 'gaspar',
 'žiar nad hronom',
 'labé',
 'lys’va',
 'altagracia de orituco',
 'banning',
 'port moody',
 'cibinong',
 'deventer',
 'harīpur',
 'cần giờ',
 'kundla',
 'city center',
 'antibes',
 'diwek',
 'lake zurich',
 'kangaba',
 'maghār',
 'ashta',
 'pimentel',
 'linxia chengguanzhen',
 'jerez de la frontera',
 'arrentela',
 'breda',
 'supaul',
 '

In [677]:
countries

{'afghanistan',
 'aland islands',
 'albania',
 'algeria',
 'american samoa',
 'andorra',
 'angola',
 'anguilla',
 'antigua and barbuda',
 'argentina',
 'armenia',
 'aruba',
 'australia',
 'austria',
 'azerbaijan',
 'bahamas',
 'bahrain',
 'bangladesh',
 'barbados',
 'belarus',
 'belgium',
 'belize',
 'benin',
 'bermuda',
 'bhutan',
 'bolivia',
 'bonaire, saint eustatius and saba',
 'bosnia and herzegovina',
 'botswana',
 'brazil',
 'british virgin islands',
 'brunei',
 'bulgaria',
 'burkina faso',
 'burundi',
 'cambodia',
 'cameroon',
 'canada',
 'cape verde',
 'cayman islands',
 'central african republic',
 'chad',
 'chile',
 'china',
 'christmas island',
 'cocos islands',
 'colombia',
 'comoros',
 'cook islands',
 'costa rica',
 'croatia',
 'cuba',
 'curacao',
 'cyprus',
 'czech republic',
 'democratic republic of the congo',
 'denmark',
 'djibouti',
 'dominica',
 'dominican republic',
 'east timor',
 'ecuador',
 'egypt',
 'el salvador',
 'equatorial guinea',
 'eritrea',
 'estonia',


In [660]:
len(aoa_intersect)

2554

In [661]:
aoa_intersect[:20]

['unite',
 'department',
 'state',
 'region',
 'commune',
 'include',
 'call',
 'play',
 'national',
 'district',
 'release',
 'years',
 'name',
 'locate',
 'area',
 'former',
 'series',
 'later',
 'album',
 'league']

In [662]:
len([word for word in aoa_intersect if word in concrete_intersect])

2226

In [663]:
# Scale each embedding by the word's age-of-acquisition estimate divided by the maximum AoA,
# so that earlier-learned (easier) words get less weight.
AOA_MAX = 25  # largest AoA_Kup_lem value observed in the table
# A single dict lookup replaces the per-word boolean filter over the whole DataFrame.
# If a word were listed more than once, the last estimate is used; the original expression
# would have failed on a shape mismatch in that case.
# NOTE: word_vectors is rewritten in place, so re-running this cell applies the scaling again.
aoa_by_word = dict(zip(AoA['Word'], AoA['AoA_Kup_lem']))
for word in aoa_intersect:
    # Upcast to float64 first so the arithmetic matches the original array-based expression.
    word_vectors[word] = word_vectors[word].astype(np.float64) * aoa_by_word[word] / AOA_MAX

In [664]:
word_vectors['state']

array([-0.02531605, -0.06983836,  0.07569417, -0.0199796 , -0.08170979,
        0.0126198 , -0.03631189,  0.04478207,  0.02543541, -0.00426309,
        0.00835675, -0.01547975,  0.083173  , -0.02952217, -0.06510621,
       -0.08822621,  0.02941588, -0.05510816, -0.05316929, -0.02765932,
        0.00669615,  0.00074564,  0.0362225 ,  0.04426894,  0.0832494 ,
       -0.03406315,  0.02073772, -0.03678282, -0.08137402,  0.0359503 ,
       -0.03306248, -0.01483777,  0.01406121,  0.00051605,  0.10590797,
       -0.0237588 ,  0.00701955,  0.01679777,  0.04338488,  0.04394314,
       -0.05966038, -0.0187582 , -0.0059121 , -0.08443372, -0.01350078,
       -0.05404426, -0.02109407, -0.07373458, -0.00710962, -0.02100398,
       -0.02001131, -0.02040975,  0.03315873, -0.05322361,  0.0008024 ,
       -0.00244391,  0.0057214 , -0.01174105, -0.04350759, -0.0211037 ,
        0.03061705, -0.00100189, -0.01355478,  0.01643214, -0.09787938,
       -0.05907072, -0.02259452,  0.01638364, -0.00573174,  0.04

In [665]:
def generate_dense_features(tokenized_text,word_vectors):
    """Represent each tokenized document by the mean of its word vectors.

    Parameters
    ----------
    tokenized_text : iterable of iterables of str
        One token sequence per document.
    word_vectors : object
        Must expose ``key_to_index`` (membership test), ``vector_size`` and
        indexing with a list of words (presumably a gensim ``KeyedVectors``).

    Returns
    -------
    numpy.ndarray
        Array of shape ``(n_documents, word_vectors.vector_size)``.  A
        document with no in-vocabulary token is represented by a zero vector.
    """
    dense_list = []
    for tokens in tokenized_text:
        # Only words known to the embedding model can contribute to the mean.
        known_words = [word for word in tokens if word in word_vectors.key_to_index]

        if known_words:
            dense_list.append(np.mean(word_vectors[known_words], axis=0))
        else:
            dense_list.append(np.zeros(word_vectors.vector_size))

    if not dense_list:
        # Keep the result 2-D even when there are no documents at all, so
        # downstream estimators still see (n_samples, n_features).
        return np.empty((0, word_vectors.vector_size))
    return np.array(dense_list)

In [666]:
# Turn the tokenized train and test documents into averaged word-vector
# feature matrices (one row per document) via generate_dense_features.
X_train_wv = generate_dense_features(tokenized_text_train,word_vectors)
X_test_wv = generate_dense_features(tokenized_text_test,word_vectors)

In [667]:
X_train_wv.shape

(333414, 100)

In [668]:
lr_wv = LogisticRegression(random_state=RANDOM_SEED,max_iter=1000).fit(X_train_wv,y_train)

In [669]:
accuracy_score(y_test,lr_wv.predict(X_test_wv))

0.5863305900136766

## Bag of Words Model

In [670]:
def dummy_fun(doc):
    """Identity pass-through: return ``doc`` unchanged.

    Supplied as both ``tokenizer`` and ``preprocessor`` so that
    TfidfVectorizer consumes the already-tokenized documents as they are.
    """
    return doc


# The documents are already token lists, so the preprocessing and tokenizing
# steps are both replaced by the identity function.  token_pattern is only
# consulted when no tokenizer is given, so the regex previously passed here
# was never applied; None states that explicitly and avoids scikit-learn's
# "token_pattern will not be used" warning.
vectorizer = TfidfVectorizer(analyzer='word', tokenizer=dummy_fun, preprocessor=dummy_fun, token_pattern=None)
X_train_transform = vectorizer.fit_transform(tokenized_text_train)
X_test_transform  = vectorizer.transform(tokenized_text_test)

In [671]:
len(vectorizer.get_feature_names())

103167

In [672]:
vectorizer.get_feature_names()

['a_nx',
 'aabout',
 'aabye',
 'aach',
 'aafc',
 'aage',
 'aaiil',
 'aaliyahs',
 'aall',
 'aalto',
 'aames',
 'aamir',
 'aang',
 'aangã',
 'aapep',
 'aarberg',
 'aarburg',
 'aarc',
 'aarde',
 'aardman',
 'aardsma',
 'aardvark',
 'aardvarks',
 'aare',
 'aargauer',
 'aarhus',
 'aaroni',
 'aarons',
 'aarre',
 'aarseth',
 'aartselaar',
 'aarwangen',
 'aasen',
 'aashurah',
 'aast',
 'aastana',
 'aave',
 'ababa',
 'ababba',
 'ababda',
 'abac',
 'abacada',
 'abaci',
 'aback',
 'abacus',
 'abacuses',
 'abad',
 'abagnale',
 'abahutu',
 'abaj',
 'abajo',
 'abakanskoye',
 'abal',
 'abalo',
 'abalone',
 'abando',
 'abandon',
 'abandonded',
 'abandonment',
 'abarat',
 'abassi',
 'abate',
 'abattoirs',
 'abatutsi',
 'abauzit',
 'abavo',
 'abazhou',
 'abaãºj',
 'abba',
 'abbado',
 'abbadon',
 'abbados',
 'abbandando',
 'abbas',
 'abbasi',
 'abbasid',
 'abbasids',
 'abbasies',
 'abbass',
 'abbassid',
 'abbay',
 'abbaye',
 'abbe',
 'abbeydale',
 'abbeys',
 'abbiamo',
 'abbiati',
 'abbiss',
 'abbondanci

In [388]:
X_train_transform

<333414x106068 sparse matrix of type '<class 'numpy.float64'>'
	with 1944434 stored elements in Compressed Sparse Row format>

In [389]:
lr_bow = LogisticRegression(random_state=RANDOM_SEED,max_iter=1000).fit(X_train_transform,y_train)

In [390]:
accuracy_score(y_test,lr_bow.predict(X_test_transform))

0.6466876214698755

In [None]:
model_word = set(word_vectors.index_to_key) #around 6k words in the Word2Vec model

In [None]:
len(model_word)

In [None]:
len(model_word.intersection(concreteset))

In [None]:
word_vectors['live']

In [None]:
# Build a per-word age-of-acquisition (AoA) table for the words in model_word.
# NOTE(review): this rebinds `df`, the name the training data was loaded
# under at the top of the notebook; cells that read df['original_text'] must
# not be run after this one without reloading the training frame.
lemmatizer = WordNetLemmatizer()
word_list = [(word, lemmatizer.lemmatize(word.lower())) for word in model_word]
df = pd.DataFrame(word_list, columns=['Original', 'word'])
# Left join keeps every model word; lemmas absent from AoA get NaN ratings.
df = df.merge(AoA, left_on='word', right_on='Word', how='left')
df = df[['Original', 'word', 'Perc_known', 'AoA_Kup_lem']].copy()
word_not_matched = set(df[df['Perc_known'].isnull()].word.values)

# Lemmas that start with a digit or consist of a single character get a fixed
# AoA of 3.  A boolean mask with .loc writes into the frame itself, whereas
# the chained form df['AoA_Kup_lem'][i] = 3 may assign to a temporary copy.
starts_with_digit = df['word'].str[:1].isin(list('0123456789'))
is_single_char = df['word'].str.len() == 1
df.loc[starts_with_digit | is_single_char, 'AoA_Kup_lem'] = 3

# Every rating that is still missing falls back to the column mean.
mean_value = df['AoA_Kup_lem'].mean()
df['AoA_Kup_lem'] = df['AoA_Kup_lem'].fillna(mean_value)

In [None]:
#df.loc[df['Original']==['troops','weapons']]
df[df['Original'].isin(['troops','weapon'])]

In [None]:
def generate_perc_known(tokenized_text,df):
    """Return one average ``AoA_Kup_lem`` value per tokenized document.

    For each document, the tokens present in the module-level
    ``word_vectors.key_to_index`` are collected.  The result for that document
    is the mean of ``AoA_Kup_lem`` over the rows of ``df`` whose ``Original``
    value is one of those tokens.  Each matching row counts once, however
    often the token repeats in the document, and missing ratings are skipped.

    Parameters
    ----------
    tokenized_text : iterable of iterables of str
        One token sequence per document.
    df : pandas.DataFrame
        Must contain the columns ``Original`` and ``AoA_Kup_lem``.

    Returns
    -------
    list
        One value per document: the mean rating, NaN when no row of ``df``
        matches, or 0 when the document has no in-vocabulary token.

    NOTE(review): despite its name, this averages ``AoA_Kup_lem`` rather than
    ``Perc_known``, and it depends on the global ``word_vectors`` instead of
    taking the vocabulary as a parameter.
    """
    # Aggregate the per-word rating sum and non-null count once, so every
    # document costs a few dict lookups instead of a full-frame isin() scan.
    stats = df.groupby('Original')['AoA_Kup_lem'].agg(['sum', 'count'])
    rating_sum = stats['sum'].to_dict()
    rating_count = stats['count'].to_dict()

    perc_know_list = []
    for tokens in tokenized_text:
        # A set mirrors isin(): a repeated token matches the same rows once.
        words = {word for word in tokens if word in word_vectors.key_to_index}

        if words:
            n_rows = sum(rating_count.get(word, 0) for word in words)
            total = sum(rating_sum.get(word, 0.0) for word in words)
            perc_know_list.append(total / n_rows if n_rows else np.nan)
        else:
            perc_know_list.append(0)

    return perc_know_list

In [188]:
df_train = pd.DataFrame(X_train_wv)
#df_train['year'] = generate_perc_known(tokenized_text_train,df)

In [189]:
df_test = pd.DataFrame(X_test_wv)
#df_test['year'] = generate_perc_known(tokenized_text_test,df)

In [190]:
df_test

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,91,92,93,94,95,96,97,98,99,year
0,0.163066,0.066503,0.007967,-0.368197,-0.475971,0.174799,-0.226500,0.288741,-0.101118,0.251682,...,0.343301,0.449110,-0.301583,-0.318929,-0.027628,-0.003120,0.640888,0.395134,-0.211103,7.319698
1,0.098105,-0.697004,-0.067849,0.073167,0.001977,-0.519177,-0.064798,-0.384014,0.359658,-0.080730,...,0.100243,-0.152842,0.018108,-0.616458,0.208961,0.239500,-0.078117,0.907243,0.644744,8.900953
2,0.608009,-0.270855,-0.351858,-1.324698,0.509448,0.466696,-0.869674,0.316894,-0.832663,0.482958,...,-0.804097,-1.260673,-0.484280,-1.026836,-0.381989,0.006748,0.651532,0.502151,-1.543706,7.385000
3,-0.231419,-0.460309,-0.321846,-0.401228,-1.299778,-0.461486,0.002258,-0.175611,0.296010,0.373852,...,-0.068769,0.134842,0.026607,0.200088,0.376173,0.175164,-0.239718,0.463941,-0.541556,8.971588
4,-0.155188,0.110082,0.749716,-0.211680,-0.294006,-0.928232,0.095029,0.326077,0.020296,0.458989,...,-0.496064,0.562254,-0.161042,-0.556670,-0.152797,0.216482,-0.109737,1.134926,-0.073294,7.939948
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
83349,-0.212178,-0.577913,0.233901,-0.283749,-0.250686,-0.740940,-0.073741,0.163121,-0.268991,0.569778,...,-0.062439,0.181861,-0.294118,0.674152,-0.312687,-0.416863,0.006465,0.296494,0.144787,7.846061
83350,0.083994,-0.119798,0.014636,-0.046240,-0.176528,-0.371178,-0.049741,-0.063575,0.069346,0.097987,...,-0.106163,0.598239,-0.423099,-0.277646,0.249423,0.238795,-0.084238,0.325800,-0.371009,7.653076
83351,-0.027579,-0.583053,-0.212853,0.064448,-0.001676,-0.386104,-0.194504,0.125628,0.087920,0.013556,...,-0.087076,0.200042,0.022237,0.865286,0.345294,0.206362,-0.050420,0.287032,-0.024188,6.618984
83352,0.150752,-0.344787,0.016055,-0.438976,0.105028,-0.072180,-0.229091,0.010541,-0.065317,0.162644,...,-0.090400,-0.352687,-0.262663,-0.028436,0.180446,-0.053098,0.068634,-0.034959,0.074879,7.009195


In [191]:
lr = LogisticRegression(random_state=RANDOM_SEED,max_iter=1000).fit(df_train,y_train)

In [192]:
accuracy_score(y_test,lr.predict(df_test))

0.58372723564556

# 2. Supervised Learning

## Random Classifier

In [64]:
dummy_bow = DummyClassifier(strategy='uniform',random_state=RANDOM_SEED).fit(X_train_transform,y_train)

In [65]:
accuracy_score(y_test, dummy_bow.predict(X_test_transform))

0.5011277203253593

In [66]:
dummy_wv = DummyClassifier(strategy='uniform',random_state=RANDOM_SEED).fit(X_train_wv,y_train)

In [67]:
accuracy_score(y_test,dummy_wv.predict(X_test_wv))

0.5011277203253593

## Logistic Regression Classifier

In [68]:
lr_bow = LogisticRegression(random_state=RANDOM_SEED,max_iter=1000).fit(X_train_transform,y_train)

In [69]:
accuracy_score(y_test,lr_bow.predict(X_test_transform))

0.6465916452719725

In [70]:
lr_wv = LogisticRegression(random_state=RANDOM_SEED,max_iter=1000).fit(X_train_wv,y_train)

In [71]:
accuracy_score(y_test,lr_wv.predict(X_test_wv))

0.5640041269765098

## Random Forest Classifier

In [227]:
rf_bow = RandomForestClassifier(n_estimators=500,max_depth=5,random_state=RANDOM_SEED).fit(X_train_transform,y_train)

In [228]:
accuracy_score(y_test,rf_bow.predict(X_test_transform))

0.6416968591789236

In [None]:
rf_wv = RandomForestClassifier(n_estimators=100,max_depth=5,random_state=RANDOM_SEED).fit(X_train_wv,y_train)

In [None]:
accuracy_score(y_test,rf_wv.predict(X_test_wv))

# 3. Unsupervised Learning

In [None]:
kmeans = KMeans(n_clusters=2,random_state=RANDOM_SEED).fit(X_train_transform)

In [None]:
cluster_df = pd.DataFrame({'cluster':kmeans.labels_,'y_label':y_train,'text':X_train})
cluster_df

### LDA Topic Modeling - Consider NMF to create a document-topic matrix

In [267]:
import gensim
from nltk.stem.porter import *
def lemmatize_stemming(text, use_stemming=False):
    """Lemmatize a single token as a verb, optionally stemming the result.

    Parameters
    ----------
    text : str
        The token to normalise.
    use_stemming : bool, default False
        When True, the WordNet lemma is additionally passed through
        ``PorterStemmer().stem``.  The default returns the lemma only, which
        is what this function returned before the switch existed.

    Returns
    -------
    str
        The verb lemma of ``text`` (``pos='v'``), stemmed if requested.
    """
    lemma = WordNetLemmatizer().lemmatize(text, pos='v')
    if use_stemming:
        # The stemmer is only constructed when it is actually needed.
        return PorterStemmer().stem(lemma)
    return lemma
# Tokenize and lemmatize
def preprocess(text):
    """Tokenize ``text`` and return the normalised tokens worth keeping.

    The text is split with ``gensim.utils.simple_preprocess``.  Tokens that
    are gensim stopwords or have three characters or fewer are dropped, and
    each remaining token is passed through ``lemmatize_stemming``.
    """
    stop_words = gensim.parsing.preprocessing.STOPWORDS
    return [
        lemmatize_stemming(token)
        for token in gensim.utils.simple_preprocess(text)
        if len(token) > 3 and token not in stop_words
    ]

In [268]:
df['original_text'][0]

"There is manuscript evidence that Austen continued to work on these pieces as late as the period 1809 Ã¢ '' 11 , and that her niece and nephew , Anna and James Edward Austen , made further additions as late as 1814 ."

In [269]:
preprocess(df['original_text'][0])

['manuscript',
 'evidence',
 'austen',
 'continue',
 'work',
 'piece',
 'late',
 'period',
 'niece',
 'nephew',
 'anna',
 'jam',
 'edward',
 'austen',
 'additions',
 'late']

In [238]:
# This cell will run about 2 minutes
processed_docs = [preprocess(text) for text in df['original_text']]

In [239]:
dictionary = gensim.corpora.Dictionary(processed_docs)
dictionary

<gensim.corpora.dictionary.Dictionary at 0x1567b848520>

In [240]:
bow_corpus = [dictionary.doc2bow(doc) for doc in processed_docs]
#bow_corpus

In [241]:
len(bow_corpus)

416768

In [242]:
# This cell will run 10 minutes
#lda_model =  gensim.models.LdaMulticore(bow_corpus, 
#                                   num_topics = 8, 
#                                   id2word = dictionary,                                    
#                                   passes = 10,
#                                   workers = 2)

In [347]:
#for idx, topic in lda_model.print_topics(-1):
#    print("Topic: {} \nWords: {}".format(idx, topic ))
#    print("\n")