In [98]:
import pandas as pd
import numpy as np
from sklearn.dummy import DummyClassifier
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import f1_score,accuracy_score
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier
from sklearn.cluster import KMeans
from sklearn.decomposition import PCA
import altair as alt
import seaborn as sns
import re

In [99]:
pip install gensim

Note: you may need to restart the kernel to use updated packages.


In [100]:
from gensim.models.word2vec import Word2Vec
from tqdm.notebook import tqdm
from nltk.tokenize import sent_tokenize, word_tokenize
from collections import Counter
from nltk.corpus import stopwords
import nltk
from nltk.stem import WordNetLemmatizer
nltk.download('stopwords')
nltk.download('punkt')
nltk.download('wordnet')

[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\mryua\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package punkt to
[nltk_data]     C:\Users\mryua\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package wordnet to
[nltk_data]     C:\Users\mryua\AppData\Roaming\nltk_data...
[nltk_data]   Package wordnet is already up-to-date!


True

In [101]:
RANDOM_SEED=694

Training Dataset

In [102]:
train_path = 'Data/WikiLarge_Train.csv'
df = pd.read_csv(train_path, skiprows=0, skipfooter=0, engine='python')
df

Unnamed: 0,original_text,label
0,There is manuscript evidence that Austen conti...,1
1,"In a remarkable comparative analysis , Mandaea...",1
2,"Before Persephone was released to Hermes , who...",1
3,Cogeneration plants are commonly found in dist...,1
4,"Geneva -LRB- , ; , ; , ; ; -RRB- is the second...",1
...,...,...
416763,A Duke Nukem 3D version has been sold for Xbox...,0
416764,"However , it is becoming replaced as a method ...",0
416765,There are hand gestures in both Hindu and Budd...,0
416766,"If it is necessary to use colors , try to choo...",0


In [103]:
len(df[df['label']==1])/len(df) # the dataset label is well balanced 

0.5

In [104]:
df.iloc[50]['original_text']

'He studied in Armenia and Istanbul , then at Wisconsin University which he finished in 1915 .'

In [105]:
# Average length, in characters, of the training texts.
df['original_text'].map(len).mean()
# The average is small, so every text can be treated as "short text". That makes
# dense representations attractive, since they work well with short text.
# TODO: decide where the word vectors come from:
#   - load pre-trained ones, e.g. Gensim KeyedVectors.load('assets/wikipedia.100.word-vecs.kv')
#     (open question: how is that file generated and used?), or
#   - train our own word2vec model (entire corpus, or just the training data?)
#     and use the top 100 word-vectors as features.
# Alternative: a bag-of-words model, i.e. a term-document matrix representation,
# which has many more features.

117.921906192414

In [106]:
X = df['original_text']
y = df['label']
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

Testing Dataset

In [107]:
test_path = 'Data/WikiLarge_Test.csv'
test_df = pd.read_csv(test_path, skiprows=0, skipfooter=0, engine='python')
test_df

Unnamed: 0,id,original_text,label
0,0,-2011,
1,1,-2011,
2,2,-2000,
3,3,-1997,
4,4,1.636,
...,...,...,...
119087,119087,#NAME?,
119088,119088,#NAME?,
119089,119089,#NAME?,
119090,119090,#NAME?,


In [108]:
test_df.iloc[10000]

id                                                           10000
original_text    An atheist would say that this argument proves...
label                                                          NaN
Name: 10000, dtype: object

Sample Submission

In [109]:
samplesubmission_path = 'Data/sampleSubmission.csv'
samplesubmission_df = pd.read_csv(samplesubmission_path, skiprows=0, skipfooter=0, engine='python')
samplesubmission_df

Unnamed: 0,id,label
0,0,0
1,1,0
2,2,1
3,3,1
4,4,0
...,...,...
119087,119087,0
119088,119088,1
119089,119089,1
119090,119090,1


To conclude, the dataframes we are working with are:

`df` (training data), `test_df` and `samplesubmission_df`, loaded above, plus the word lists loaded further down: `dale_chall`, `concrete_df` and `AoA`.

### LDA Topic Modeling - Consider NMF to create a document-topic matrix

In [128]:
import gensim
from nltk.stem.porter import *
_STEMMER = None      # PorterStemmer shared by all calls, created on first use
_LEMMATIZER = None   # WordNetLemmatizer shared by all calls, created on first use

def lemmatize_stemming(text):
    """Lemmatize one token as a verb, then reduce it to its Porter stem.

    Parameters
    ----------
    text : str
        A single token (``preprocess`` calls this once per kept token).

    Returns
    -------
    str
        The Porter stem of the verb-lemmatized token.
    """
    # This function runs once per token of the corpus, so the stemmer and the
    # lemmatizer are built a single time and reused instead of being
    # re-instantiated on every call. Creation is lazy so that merely defining
    # the function does not require the nltk names to be resolvable yet.
    global _STEMMER, _LEMMATIZER
    if _STEMMER is None:
        _STEMMER = PorterStemmer()
    if _LEMMATIZER is None:
        _LEMMATIZER = WordNetLemmatizer()
    return _STEMMER.stem(_LEMMATIZER.lemmatize(text, pos='v'))
# Tokenize and lemmatize
def preprocess(text):
    """Turn one raw text into a list of normalized tokens.

    The text is tokenized with ``gensim.utils.simple_preprocess``; tokens that
    are gensim stop words or have 3 characters or fewer are discarded, and each
    remaining token is passed through ``lemmatize_stemming``.

    Parameters
    ----------
    text : str
        The raw text to process.

    Returns
    -------
    list
        The processed tokens, in their original order.
    """
    stop_words = gensim.parsing.preprocessing.STOPWORDS
    return [
        lemmatize_stemming(candidate)
        for candidate in gensim.utils.simple_preprocess(text)
        if candidate not in stop_words and len(candidate) > 3
    ]

In [129]:
df['original_text'][0]

"There is manuscript evidence that Austen continued to work on these pieces as late as the period 1809 Ã¢ '' 11 , and that her niece and nephew , Anna and James Edward Austen , made further additions as late as 1814 ."

In [130]:
preprocess(df['original_text'][0])

['manuscript',
 'evid',
 'austen',
 'continu',
 'work',
 'piec',
 'late',
 'period',
 'niec',
 'nephew',
 'anna',
 'jam',
 'edward',
 'austen',
 'addit',
 'late']

In [131]:
# This cell will run about 2 minutes
processed_docs = [preprocess(text) for text in df['original_text']]

In [135]:
dictionary = gensim.corpora.Dictionary(processed_docs)
dictionary

<gensim.corpora.dictionary.Dictionary at 0x20dcd558790>

In [134]:
# Convert every tokenized document to its bag-of-words form using `dictionary`.
bow_corpus = [dictionary.doc2bow(doc) for doc in processed_docs]
# Preview only the first few documents: echoing the whole list prints an entry
# for every document in the corpus and buries the rest of the notebook.
bow_corpus[:3]

[[(0, 1),
  (1, 1),
  (2, 2),
  (3, 1),
  (4, 1),
  (5, 1),
  (6, 1),
  (7, 2),
  (8, 1),
  (9, 1),
  (10, 1),
  (11, 1),
  (12, 1),
  (13, 1)],
 [(14, 1),
  (15, 1),
  (16, 1),
  (17, 1),
  (18, 1),
  (19, 2),
  (20, 1),
  (21, 1),
  (22, 1),
  (23, 1),
  (24, 1),
  (25, 1),
  (26, 1)],
 [(11, 1),
  (27, 1),
  (28, 1),
  (29, 1),
  (30, 1),
  (31, 1),
  (32, 1),
  (33, 1),
  (34, 1),
  (35, 1),
  (36, 1),
  (37, 1),
  (38, 1),
  (39, 1),
  (40, 1),
  (41, 1),
  (42, 1)],
 [(43, 1),
  (44, 1),
  (45, 1),
  (46, 1),
  (47, 1),
  (48, 2),
  (49, 1),
  (50, 1),
  (51, 1),
  (52, 1),
  (53, 1),
  (54, 1),
  (55, 3),
  (56, 1),
  (57, 1),
  (58, 1),
  (59, 1),
  (60, 1),
  (61, 1),
  (62, 1),
  (63, 1)],
 [(43, 2),
  (64, 1),
  (65, 1),
  (66, 2),
  (67, 1),
  (68, 1),
  (69, 1),
  (70, 2),
  (71, 1)],
 [(42, 1),
  (72, 1),
  (73, 1),
  (74, 1),
  (75, 1),
  (76, 1),
  (77, 1),
  (78, 1),
  (79, 1)],
 [(77, 1), (80, 1), (81, 1), (82, 1), (83, 1), (84, 1), (85, 1), (86, 1)],
 [(87, 1),
  (88

In [137]:
len(bow_corpus)

416768

In [142]:
# This cell will run 10 minutes
# Fit an 8-topic LDA model on the bag-of-words corpus.
# random_state ties the model's random initialisation to the notebook-wide
# RANDOM_SEED so the topics do not change arbitrarily between runs.
# NOTE(review): with workers > 1 the run may still not be bit-for-bit
# repeatable - confirm against the gensim documentation if that matters.
lda_model = gensim.models.LdaMulticore(bow_corpus,
                                       num_topics=8,
                                       id2word=dictionary,
                                       passes=10,
                                       workers=2,
                                       random_state=RANDOM_SEED)

KeyboardInterrupt: 

In [None]:
for idx, topic in lda_model.print_topics(-1):
    print("Topic: {} \nWords: {}".format(idx, topic ))
    print("\n")

# 1. Data Preprocessing

In [15]:
vectorizer = TfidfVectorizer(min_df=10,stop_words='english',ngram_range=(1,2))
X_train_transform = vectorizer.fit_transform(X_train)
X_test_transform  = vectorizer.transform(X_test)

In [16]:
X_train_transform

<333414x57773 sparse matrix of type '<class 'numpy.float64'>'
	with 4071111 stored elements in Compressed Sparse Row format>

## Word2Vec Model

In [144]:
len(stopwords.words('english'))

179

In [146]:
len(set(stopwords.words('english')))

179

== dale_chall.txt ==

This is the Dale Chall 3000 Word List, which is one definition of words that are considered "basic" English.

A summary is at https://www.readabilityformulas.com/articles/dale-chall-readability-word-list.php

In [147]:
#Basic english words
dalechall_path = 'Data/dale_chall.txt'
dale_chall = pd.read_csv(dalechall_path,delimiter='\t',header=None,names=['word'])
dale = set(dale_chall['word'].values)

In [148]:
len(dale)

2946

### The 2946 words in dale can be combined with the nltk stopwords.

In [149]:
stopWords = set(stopwords.words('english')) | dale

In [150]:
len(stopWords)

2986

In [153]:
X_train

304501    1979-80 Buffalo Sabres NHL 32 1880 74 1 4 2.36...
162313    Diseases Lentils in culture Lentils are mentio...
336845    Railroads , like the Lehigh Valley Railroad , ...
150625    An example of this would be an individual anim...
40240     Both the Matanuska and Susitna Rivers have maj...
                                ...                        
259178    After the Germans invaded Norway in April 1940...
365838    July 28 - Henry Bennet , 1st Earl of Arlington...
131932    Pancake restaurants are popular family restaur...
146867                                 A cycling domestique
121958    David Boreanaz 's first paid acting appearance...
Name: original_text, Length: 333414, dtype: object

In [158]:
X_train[304501]

'1979-80 Buffalo Sabres NHL 32 1880 74 1 4 2.36 20 8 4 0 0.000'

In [157]:
gensim.utils.simple_preprocess(X_train[304501])

['buffalo', 'sabres', 'nhl']

In [159]:
# This cell will run 4 minutes
# Tokenize, stop-word filter, lemmatize and stem every text of both splits.
#
# `preprocess` and its helper `lemmatize_stemming` (plus the `gensim` and Porter
# stemmer imports they need) are defined once, in the LDA topic-modeling section
# earlier in this notebook. They are reused here rather than re-declared, so
# there is a single definition to maintain and no silent shadowing.
#
# NOTE: `preprocess` filters with gensim's built-in STOPWORDS, not with the
# nltk + Dale-Chall `stopWords` set built a few cells above, so that set is not
# rebuilt in this cell.
tokenized_text_train = [preprocess(text) for text in X_train]
tokenized_text_test = [preprocess(text) for text in X_test]

#for text in tqdm(X_train):
#    tokens_in_text = word_tokenize(text)
#    tokens_in_text = [word for word in tokens_in_text if word.lower() not in stopWords]
#    tokenized_text_train.append(tokens_in_text)
    
#for text in tqdm(X_test):
#    tokens_in_text = word_tokenize(text)
#    tokens_in_text = [word for word in tokens_in_text if word.lower() not in stopWords]
#    tokenized_text_test.append(tokens_in_text)

In [160]:
# Show only the first few tokenized documents; displaying the whole list
# prints one entry per training text.
tokenized_text_train[:5]

[['buffalo', 'sabr'],
 ['diseas',
  'lentil',
  'cultur',
  'lentil',
  'mention',
  'time',
  'testament',
  'time',
  'recount',
  'incid',
  'jacob',
  'purchas',
  'birthright',
  'esau',
  'stew',
  'lentil',
  'genesi'],
 ['railroad',
  'like',
  'lehigh',
  'valley',
  'railroad',
  'import',
  'materi',
  'finish',
  'good',
  'creat',
  'job'],
 ['exampl',
  'individu',
  'anim',
  'learn',
  'bud',
  'seedl',
  'food',
  'crop',
  'destroy',
  'normal',
  'suppli',
  'food',
  'later',
  'avail',
  'matur',
  'plant'],
 ['matanuska', 'susitna', 'river', 'major', 'salmon', 'spawn', 'stream'],
 ['mettingen'],
 ['analog', 'natur', 'caus', 'great', 'number', 'problem'],
 ['cooper',
  'republ',
  'kazakhstan',
  'islam',
  'republ',
  'pakistan',
  'kazakhstan',
  'ministri',
  'foreign',
  'affair',
  'kazakhstan',
  'emerg',
  'market',
  'pakistani',
  'good'],
 ['import', 'philosophi'],
 ['extrem',
  'attain',
  'wind',
  'speed',
  'stretch',
  'mile',
  'stay',
  'grind',
  

In [161]:
model = Word2Vec(vector_size=100,window=2,min_count=100,seed= RANDOM_SEED,workers=4)
model.build_vocab(tokenized_text_train)
model.train(tokenized_text_train,total_examples=model.corpus_count,epochs=model.epochs)

(11794243, 15511955)

In [162]:
word_vectors = model.wv

In [163]:
def generate_dense_features(tokenized_text,word_vectors):
    """Build one dense feature vector per document by averaging word vectors.

    Parameters
    ----------
    tokenized_text : iterable of list of str
        The documents, each already split into tokens.
    word_vectors : object
        Word-vector lookup exposing ``key_to_index`` (membership test),
        ``vector_size`` and indexing by a list of words.

    Returns
    -------
    numpy.ndarray
        One row per document: the mean of the vectors of its in-vocabulary
        tokens, or an all-zero vector of length ``vector_size`` when none of
        its tokens are in the vocabulary.
    """
    doc_vectors = []
    for tokens in tokenized_text:
        # Keep only tokens the model knows; out-of-vocabulary tokens are ignored.
        known = [token for token in tokens if token in word_vectors.key_to_index]
        if not known:
            doc_vectors.append(np.zeros(word_vectors.vector_size))
            continue
        doc_vectors.append(np.mean(word_vectors[known], axis=0))
    return np.array(doc_vectors)

In [164]:
X_train_wv = generate_dense_features(tokenized_text_train,word_vectors)
X_test_wv = generate_dense_features(tokenized_text_test,word_vectors)

## Bag of Words Model

In [223]:
def dummy_fun(doc):
    """Identity function: return the already-tokenized document unchanged."""
    return doc

# The inputs are lists of tokens, so the identity function is used as both the
# preprocessor and the tokenizer. token_pattern only drives scikit-learn's
# built-in tokenizer; with a custom tokenizer it is never applied (and
# scikit-learn warns about it), so it is set to None explicitly instead of
# carrying a regex that has no effect.
vectorizer = TfidfVectorizer(analyzer='word', tokenizer=dummy_fun, preprocessor=dummy_fun, token_pattern=None)
X_train_transform = vectorizer.fit_transform(tokenized_text_train)
X_test_transform  = vectorizer.transform(tokenized_text_test)

In [224]:
X_train_transform

<333414x96682 sparse matrix of type '<class 'numpy.float64'>'
	with 2864404 stored elements in Compressed Sparse Row format>

# Word's Difficulty Considered

== Concreteness_ratings_Brysbaert_et_al_BRM.txt ==

This file contains concreteness ratings for 40 thousand English lemma words gathered via Amazon Mechanical Turk. The ratings come from a larger list of 63 thousand words and represent all English words known to 85% of the raters.

The file contains nine columns:
1. The word
2. Whether it is a single word or a two-word expression 
3. The mean concreteness rating
4. The standard deviation of the concreteness ratings
5. The number of persons indicating they did not know the word
6. The total number of persons who rated the word
7. Percentage participants who knew the word
8. The SUBTLEX-US frequency count (on a total of 51 million; Brysbaert & New, 2009) 
9. The dominant part-of-speech usage

Original source: http://crr.ugent.be/archives/1330

Brysbaert, M., Warriner, A.B., & Kuperman, V. (2014). Concreteness ratings for 40 thousand generally known English word lemmas. Behavior Research Methods, 46, 904-911.
http://crr.ugent.be/papers/Brysbaert_Warriner_Kuperman_BRM_Concreteness_ratings.pdf

In [165]:
# Concreteness rating - the higher Conc.M, the easier the word is.
concreteness_path = 'Data/Concreteness_ratings_Brysbaert_et_al_BRM.txt'
concrete_df = pd.read_csv(concreteness_path, delimiter='\t', keep_default_na=False)
# Build an actual set, as the name says and as is done for `dale` and
# `AoA_set`: membership tests become hash lookups, and set operations such as
# `model_word.intersection(concreteset)` keep working.
concreteset = set(concrete_df['Word'].values)

In [166]:
concrete_df

Unnamed: 0,Word,Bigram,Conc.M,Conc.SD,Unknown,Total,Percent_known,SUBTLEX,Dom_Pos
0,roadsweeper,0,4.85,0.37,1,27,0.96,0,0
1,traindriver,0,4.54,0.71,3,29,0.90,0,0
2,tush,0,4.45,1.01,3,25,0.88,66,0
3,hairdress,0,3.93,1.28,0,29,1.00,1,0
4,pharmaceutics,0,3.77,1.41,4,26,0.85,0,0
...,...,...,...,...,...,...,...,...,...
39949,unenvied,0,1.21,0.62,1,30,0.97,0,
39950,agnostically,0,1.20,0.50,2,27,0.93,0,
39951,conceptualistic,0,1.18,0.50,4,26,0.85,0,
39952,conventionalism,0,1.18,0.48,1,29,0.97,0,


In [167]:
concrete_df.Bigram.value_counts()

0    37058
1     2896
Name: Bigram, dtype: int64

In [168]:
concrete_df[concrete_df.Bigram==1]

Unnamed: 0,Word,Bigram,Conc.M,Conc.SD,Unknown,Total,Percent_known,SUBTLEX,Dom_Pos
28707,baking soda,1,5.00,0.00,0,30,1.00,0,
28709,baseball bat,1,5.00,0.00,0,29,1.00,0,
28710,bath towel,1,5.00,0.00,0,29,1.00,0,
28711,beach ball,1,5.00,0.00,0,28,1.00,0,
28712,bed sheet,1,5.00,0.00,0,28,1.00,0,
...,...,...,...,...,...,...,...,...,...
39619,tantamount to,1,1.52,0.85,4,27,0.85,0,
39857,chance on,1,1.38,0.75,2,28,0.93,0,
39871,free rein,1,1.37,0.63,2,29,0.93,0,
39899,by chance,1,1.34,0.72,1,30,0.97,0,


In [169]:
#There is no Nan value in Conc.M column
concrete_df[concrete_df['Conc.M'].isna()]

Unnamed: 0,Word,Bigram,Conc.M,Conc.SD,Unknown,Total,Percent_known,SUBTLEX,Dom_Pos


### Should we consider the bigrams in this dataset, given that they are only a small fraction (~7%) of the entries?

In [170]:
np.min(concrete_df['Conc.M'])

1.04

In [171]:
np.max(concrete_df['Conc.M'])

5.0

### Concreteness values range from 1 to 5; we could possibly use the inverse of the concreteness value to scale it to a 0-1 range and give easier words less weight.

== AoA_51715_words.csv ==

This file contains "Age of Acquisition" (AoA) estimates for about 51k English words, which refers to the approximate age (in years) when a word was learned. Early words, being more basic, have lower average AoA.

The main columns you will be interested in are "Word" and "AoA_Kup_lem". But the others may be useful too.

The file contains these columns:

Word :: The word in question
Alternative.spelling :: if the Word may be spelled frequently in another form	
Freq_pm	:: Freq of the Word in general English (larger -> more common)
Dom_PoS_SUBTLEX	:: Dominant part of speech in general usage
Nletters :: number of letters 
Nphon :: number of phonemes
Nsyll :: number of syllables
Lemma_highest_PoS :: the "lemmatized" or "root" form of the word (in the dominant part of speech), e.g. the root form of the verb "abates" is "abate".
AoA_Kup	:: The AoA from a previous study by Kuperman et al.
Perc_known :: Percent of people who knew the word in the Kuperman et al. study
AoA_Kup_lem :: Estimated AoA based on Kuperman et al. study lemmatized words. THIS IS THE MAIN COLUMN OF INTEREST.
Perc_known_lem	:: Estimated percentage of people who would know this form of the word in the Kuperman study.
AoA_Bird_lem :: AoA reported in previous study by Bird (2001) 
AoA_Bristol_lem	:: AoA reported in previous study from Bristol Univ. (2006)
AoA_Cort_lem :: AoA reported in previous study by Cortese & Khanna (2008)
AoA_Schock :: AoA reported in previous study by Schock (2012)

Original source : http://crr.ugent.be/archives/806

In [172]:
#AoA
#Perc_known_lem, AoA_Kup_lem
aoawords_path = 'Data/AoA_51715_words.csv'
AoA = pd.read_csv(aoawords_path,encoding = 'unicode_escape')
AoA_set = set(AoA['Word'].values)
AoA.head(5)

Unnamed: 0,Word,Alternative.spelling,Freq_pm,Dom_PoS_SUBTLEX,Nletters,Nphon,Nsyll,Lemma_highest_PoS,AoA_Kup,Perc_known,AoA_Kup_lem,Perc_known_lem,AoA_Bird_lem,AoA_Bristol_lem,AoA_Cort_lem,AoA_Schock
0,a,a,20415.27,Article,1,1,1,a,2.89,1.0,2.89,1.0,3.16,,,
1,aardvark,aardvark,0.41,Noun,8,7,2,aardvark,9.89,1.0,9.89,1.0,,,,
2,abacus,abacus,0.24,Noun,6,6,3,abacus,8.69,0.65,8.69,0.65,,,,
3,abacuses,abacuses,0.02,Noun,8,9,4,abacus,,,8.69,0.65,,,,
4,abalone,abalone,0.51,Verb,7,7,4,abalone,12.23,0.72,12.23,0.72,,,,


In [173]:
len(AoA)

51715

In [174]:
AoA.AoA_Kup_lem.min()

1.58

In [175]:
AoA.AoA_Kup_lem.max()

25.0

In [176]:
AoA.sort_values(['AoA_Kup_lem'], ascending=False)

Unnamed: 0,Word,Alternative.spelling,Freq_pm,Dom_PoS_SUBTLEX,Nletters,Nphon,Nsyll,Lemma_highest_PoS,AoA_Kup,Perc_known,AoA_Kup_lem,Perc_known_lem,AoA_Bird_lem,AoA_Bristol_lem,AoA_Cort_lem,AoA_Schock
14878,eisteddfod,eisteddfod,,,10,8,3,eisteddfod,25.0,0.05,25.0,0.05,,,,
2084,architrave,architrave,0.04,Noun,10,8,3,architrave,21.0,0.05,21.0,0.05,,,,
6274,calceolaria,calceolaria,0.02,Noun,11,11,6,calceolaria,21.0,0.11,21.0,0.11,,,,
32931,penury,penury,0.02,Noun,6,7,3,penury,20.6,0.28,20.6,0.28,,,,
25243,kendo,kendo,0.37,Noun,5,5,2,kendo,20.5,0.11,20.5,0.11,,,,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
38932,rogation,rogation,,,8,7,3,rogation,,0.00,,0.00,,,,
42089,smilax,smilax,,,6,7,2,smilax,,0.00,,0.00,,,,
46368,thulium,thulium,,,7,6,3,thulium,,0.00,,0.00,,,,
50862,wickiup,wickiup,0.27,Noun,7,6,3,wickiup,,0.00,,0.00,,,,


In [177]:
len(AoA[AoA['AoA_Kup_lem'].isna()])

20

In [178]:
AoA[AoA['AoA_Kup_lem'].isna()]

Unnamed: 0,Word,Alternative.spelling,Freq_pm,Dom_PoS_SUBTLEX,Nletters,Nphon,Nsyll,Lemma_highest_PoS,AoA_Kup,Perc_known,AoA_Kup_lem,Perc_known_lem,AoA_Bird_lem,AoA_Bristol_lem,AoA_Cort_lem,AoA_Schock
442,actinium,actinium,,,8,8,4,actinium,,0.0,,0.0,,,,
1322,ambuscade,ambuscade,,,9,8,3,ambuscade,,0.0,,0.0,,,,
2306,ashlar,ashlar,,,6,5,2,ashlar,,0.0,,0.0,,,,
5095,bosky,bosky,,,5,4,2,bosky,,0.0,,0.0,,,,
6404,canaille,canaille,,,8,5,2,canaille,,0.0,,0.0,,,,
9004,compeer,compeer,,,7,6,3,compeer,,0.0,,0.0,,,,
9005,compeers,compeers,0.02,Noun,8,7,3,compeer,,,,0.0,,,,
16000,europium,europium,,,8,8,4,europium,,0.0,,0.0,,,,
19065,gallimaufry,gallimaufry,,,11,9,4,gallimaufry,,0.0,,0.0,,,,
22498,hutment,hutment,,,7,7,2,hutment,,0.0,,0.0,,,,


In [179]:
# We are going to impute all NaN values in AoA_Kup_lem as the max AoA value 25, as they appear to be hard words.
# The filled column is assigned back explicitly: calling fillna(inplace=True)
# on a selected column is chained assignment, which pandas warns about and
# which does not update the frame when Copy-on-Write is enabled.
AoA['AoA_Kup_lem'] = AoA['AoA_Kup_lem'].fillna(AoA['AoA_Kup_lem'].max())

In [180]:
AoA.sort_values(['AoA_Kup_lem'], ascending=False)

Unnamed: 0,Word,Alternative.spelling,Freq_pm,Dom_PoS_SUBTLEX,Nletters,Nphon,Nsyll,Lemma_highest_PoS,AoA_Kup,Perc_known,AoA_Kup_lem,Perc_known_lem,AoA_Bird_lem,AoA_Bristol_lem,AoA_Cort_lem,AoA_Schock
2306,ashlar,ashlar,,,6,5,2,ashlar,,0.00,25.00,0.00,,,,
38932,rogation,rogation,,,8,7,3,rogation,,0.00,25.00,0.00,,,,
46368,thulium,thulium,,,7,6,3,thulium,,0.00,25.00,0.00,,,,
14878,eisteddfod,eisteddfod,,,10,8,3,eisteddfod,25.00,0.05,25.00,0.05,,,,
5095,bosky,bosky,,,5,4,2,bosky,,0.00,25.00,0.00,,,,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
27395,mamma,mamma,3.02,Noun,5,4,2,mama,,,1.89,1.00,,,,
27393,mamas,mamas,0.71,Noun,5,5,2,mama,,,1.89,1.00,,,,
27392,mama,mama,103.71,Noun,4,4,2,mama,1.89,1.00,1.89,1.00,,,,
29050,mommas,mommas,0.10,Noun,6,5,2,momma,,,1.58,1.00,,,,


### AoA values in this file range from 1.58 to 25; the smaller the AoA value, the easier the word is. We could possibly use the AoA value to give easier words less weight.

In [181]:
model_word = set(word_vectors.index_to_key) # vocabulary of the trained Word2Vec model (4031 words in the run recorded below)

In [182]:
len(model_word)

4031

In [183]:
len(model_word.intersection(concreteset))

2048

In [184]:
word_vectors['live']

array([-0.03942373,  1.7966397 , -0.14200447, -0.3486227 ,  0.03800169,
       -0.131528  , -0.35020527, -0.41108254,  1.6974189 ,  0.05236067,
       -0.00799712,  0.7041458 ,  0.03987895, -0.15628862, -0.01195999,
       -0.5276748 , -0.28911754, -0.59118384, -0.29640502, -0.05531655,
        0.05208854,  1.2201724 ,  0.05189531, -1.7594199 ,  0.1870284 ,
        0.26337144,  0.03394307,  0.18608569, -0.55623055, -0.96030605,
       -0.70980805,  0.70162207, -1.1859473 , -0.61812186, -0.6310784 ,
       -0.5761929 , -0.23991399,  0.7104913 ,  1.6777844 , -0.02619001,
        1.5803056 , -0.43163323, -0.37865758, -0.46269822,  1.1120914 ,
       -0.24402891,  0.1181486 , -1.052571  , -0.37906688,  1.0918018 ,
        0.18887055,  0.6383479 ,  0.99249554,  1.1353667 ,  0.57656294,
        0.21096966,  0.22929311, -0.43283883,  0.50271827,  0.54883325,
        0.30768827,  0.03781867, -0.242916  , -1.5159138 , -0.63193476,
        0.3270656 ,  1.6821493 , -0.28726235,  0.28426522,  0.38

In [185]:
# Build a per-word table with an Age-of-Acquisition (AoA) score for every word
# in the Word2Vec vocabulary.
# NOTE: this rebinds `df`, which until here held the training data; later cells
# pass this word-level table to generate_perc_known, so the name is kept.
lemmatizer = WordNetLemmatizer()
word_list = [(word, lemmatizer.lemmatize(word.lower())) for word in model_word]
df = pd.DataFrame(word_list, columns=['Original', 'word'])
df = df.merge(AoA, left_on='word', right_on='Word', how='left')
# .copy() makes the column subset an independent frame, so the assignments
# below modify it directly and do not raise SettingWithCopyWarning.
df = df[['Original', 'word', 'Perc_known', 'AoA_Kup_lem']].copy()
# Lemmas with no entry in the AoA table (captured before any imputation).
word_not_matched = set(df[df['Perc_known'].isnull()].word.values)

# Words that start with a digit or are a single character get a fixed AoA of 3.
# One .loc assignment replaces the row-by-row chained `df[col][i] = ...`, which
# pandas warns about and which is a no-op under Copy-on-Write. Slicing with
# [:1] also tolerates an empty string instead of raising IndexError.
digits = set('0123456789')
is_trivial = [w[:1] in digits or len(w) == 1 for w in df['word']]
df.loc[is_trivial, 'AoA_Kup_lem'] = 3

# Every word still without a score gets the mean of the known scores.
mean_value = df['AoA_Kup_lem'].mean()
df['AoA_Kup_lem'] = df['AoA_Kup_lem'].fillna(mean_value)

In [186]:
#df.loc[df['Original']==['troops','weapons']]
df[df['Original'].isin(['troops','weapon'])]

Unnamed: 0,Original,word,Perc_known,AoA_Kup_lem
3094,weapon,weapon,1.0,6.95


In [187]:
def generate_perc_known(tokenized_text,df,vocab=None):
    """Return, for each document, the mean AoA score of its in-vocabulary words.

    Despite the name, the value averaged is the 'AoA_Kup_lem' column of ``df``
    (age of acquisition), not a percent-known figure.

    Parameters
    ----------
    tokenized_text : iterable of list of str
        The documents, each already split into tokens.
    df : pandas.DataFrame
        Word table with an 'Original' column (the token) and an 'AoA_Kup_lem'
        column (its score).
    vocab : container of str, optional
        Tokens to take into account. When omitted, the vocabulary of the
        notebook-level ``word_vectors`` is used, as before.

    Returns
    -------
    list
        One entry per document: the mean score over the rows of ``df`` whose
        'Original' is one of the document's in-vocabulary tokens (a token that
        occurs several times counts once, and NaN scores are skipped); NaN if
        no such row carries a score; 0 when the document has no in-vocabulary
        token at all.
    """
    if vocab is None:
        vocab = word_vectors.key_to_index

    # Summarise the table once (sum and number of non-NaN scores per token)
    # instead of boolean-masking the whole frame again for every document.
    totals = {}
    counts = {}
    for word, score in zip(df['Original'], df['AoA_Kup_lem']):
        if score == score:  # False only for NaN
            totals[word] = totals.get(word, 0.0) + score
            counts[word] = counts.get(word, 0) + 1

    perc_know_list = []
    for tokens in tokenized_text:
        words = [word for word in tokens if word in vocab]
        if not words:
            perc_know_list.append(0)
            continue
        # dict.fromkeys de-duplicates while keeping a deterministic order.
        matched = [word for word in dict.fromkeys(words) if word in counts]
        n_scores = sum(counts[word] for word in matched)
        if n_scores:
            perc_know_list.append(sum(totals[word] for word in matched) / n_scores)
        else:
            perc_know_list.append(float('nan'))
    return perc_know_list

In [188]:
# Training features: the dense Word2Vec vector of each text plus its mean AoA score.
df_train = pd.DataFrame(X_train_wv)
df_train['year'] = generate_perc_known(tokenized_text_train,df)
# pd.DataFrame(ndarray) labels the embedding columns with integers while 'year'
# is a string; scikit-learn 1.2+ raises a TypeError for frames whose column
# labels mix str and non-str types, so every label is converted to a string.
df_train.columns = df_train.columns.astype(str)

In [189]:
# Held-out features: the dense Word2Vec vector of each text plus its mean AoA score.
df_test = pd.DataFrame(X_test_wv)
df_test['year'] = generate_perc_known(tokenized_text_test,df)
# pd.DataFrame(ndarray) labels the embedding columns with integers while 'year'
# is a string; scikit-learn 1.2+ raises a TypeError for frames whose column
# labels mix str and non-str types, so every label is converted to a string.
df_test.columns = df_test.columns.astype(str)

In [190]:
df_test

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,91,92,93,94,95,96,97,98,99,year
0,0.163066,0.066503,0.007967,-0.368197,-0.475971,0.174799,-0.226500,0.288741,-0.101118,0.251682,...,0.343301,0.449110,-0.301583,-0.318929,-0.027628,-0.003120,0.640888,0.395134,-0.211103,7.319698
1,0.098105,-0.697004,-0.067849,0.073167,0.001977,-0.519177,-0.064798,-0.384014,0.359658,-0.080730,...,0.100243,-0.152842,0.018108,-0.616458,0.208961,0.239500,-0.078117,0.907243,0.644744,8.900953
2,0.608009,-0.270855,-0.351858,-1.324698,0.509448,0.466696,-0.869674,0.316894,-0.832663,0.482958,...,-0.804097,-1.260673,-0.484280,-1.026836,-0.381989,0.006748,0.651532,0.502151,-1.543706,7.385000
3,-0.231419,-0.460309,-0.321846,-0.401228,-1.299778,-0.461486,0.002258,-0.175611,0.296010,0.373852,...,-0.068769,0.134842,0.026607,0.200088,0.376173,0.175164,-0.239718,0.463941,-0.541556,8.971588
4,-0.155188,0.110082,0.749716,-0.211680,-0.294006,-0.928232,0.095029,0.326077,0.020296,0.458989,...,-0.496064,0.562254,-0.161042,-0.556670,-0.152797,0.216482,-0.109737,1.134926,-0.073294,7.939948
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
83349,-0.212178,-0.577913,0.233901,-0.283749,-0.250686,-0.740940,-0.073741,0.163121,-0.268991,0.569778,...,-0.062439,0.181861,-0.294118,0.674152,-0.312687,-0.416863,0.006465,0.296494,0.144787,7.846061
83350,0.083994,-0.119798,0.014636,-0.046240,-0.176528,-0.371178,-0.049741,-0.063575,0.069346,0.097987,...,-0.106163,0.598239,-0.423099,-0.277646,0.249423,0.238795,-0.084238,0.325800,-0.371009,7.653076
83351,-0.027579,-0.583053,-0.212853,0.064448,-0.001676,-0.386104,-0.194504,0.125628,0.087920,0.013556,...,-0.087076,0.200042,0.022237,0.865286,0.345294,0.206362,-0.050420,0.287032,-0.024188,6.618984
83352,0.150752,-0.344787,0.016055,-0.438976,0.105028,-0.072180,-0.229091,0.010541,-0.065317,0.162644,...,-0.090400,-0.352687,-0.262663,-0.028436,0.180446,-0.053098,0.068634,-0.034959,0.074879,7.009195


In [191]:
lr = LogisticRegression(random_state=RANDOM_SEED,max_iter=1000).fit(df_train,y_train)

In [192]:
accuracy_score(y_test,lr.predict(df_test))

0.58372723564556

# 2. Supervised Learning

## Random Classifier

In [194]:
dummy_bow = DummyClassifier(strategy='uniform',random_state=RANDOM_SEED).fit(X_train_transform,y_train)

In [195]:
accuracy_score(y_test, dummy_bow.predict(X_test_transform))

0.5011277203253593

In [196]:
dummy_wv = DummyClassifier(strategy='uniform',random_state=RANDOM_SEED).fit(X_train_wv,y_train)

In [197]:
accuracy_score(y_test,dummy_wv.predict(X_test_wv))

0.5011277203253593

## Logistic Regression Classifier

In [225]:
lr_bow = LogisticRegression(random_state=RANDOM_SEED,max_iter=1000).fit(X_train_transform,y_train)

In [226]:
accuracy_score(y_test,lr_bow.predict(X_test_transform))

0.6570170597691772

In [209]:
lr_wv = LogisticRegression(random_state=RANDOM_SEED,max_iter=1000).fit(X_train_wv,y_train)

In [210]:
accuracy_score(y_test,lr_wv.predict(X_test_wv))

0.5745974998200446

## Random Forest Classifier

In [227]:
rf_bow = RandomForestClassifier(n_estimators=500,max_depth=5,random_state=RANDOM_SEED).fit(X_train_transform,y_train)

In [228]:
accuracy_score(y_test,rf_bow.predict(X_test_transform))

0.6416968591789236

In [None]:
rf_wv = RandomForestClassifier(n_estimators=100,max_depth=5,random_state=RANDOM_SEED).fit(X_train_wv,y_train)

In [None]:
accuracy_score(y_test,rf_wv.predict(X_test_wv))

# 3. Unsupervised Learning

In [None]:
kmeans = KMeans(n_clusters=2,random_state=RANDOM_SEED).fit(X_train_transform)

In [None]:
cluster_df = pd.DataFrame({'cluster':kmeans.labels_,'y_label':y_train,'text':X_train})
cluster_df