In [1]:
import pandas as pd
import numpy as np
from sklearn.dummy import DummyClassifier
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import f1_score,accuracy_score
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier
from sklearn.cluster import KMeans
from sklearn.decomposition import PCA
import altair as alt
import seaborn as sns
import re

In [2]:
pip install gensim

Note: you may need to restart the kernel to use updated packages.


In [3]:
from gensim.models.word2vec import Word2Vec
from tqdm.notebook import tqdm
from nltk.tokenize import sent_tokenize, word_tokenize
from collections import Counter
from nltk.corpus import stopwords
import nltk
from nltk.stem import WordNetLemmatizer
nltk.download('stopwords')
nltk.download('punkt')
nltk.download('wordnet')

[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\mryua\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package punkt to
[nltk_data]     C:\Users\mryua\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package wordnet to
[nltk_data]     C:\Users\mryua\AppData\Roaming\nltk_data...
[nltk_data]   Package wordnet is already up-to-date!


True

In [4]:
RANDOM_SEED=694

Training Dataset

In [6]:
train_path = 'Data/WikiLarge_Train.csv'
df = pd.read_csv(train_path, skiprows=0, skipfooter=0, engine='python')
df

Unnamed: 0,original_text,label
0,There is manuscript evidence that Austen conti...,1
1,"In a remarkable comparative analysis , Mandaea...",1
2,"Before Persephone was released to Hermes , who...",1
3,Cogeneration plants are commonly found in dist...,1
4,"Geneva -LRB- , ; , ; , ; ; -RRB- is the second...",1
...,...,...
416763,A Duke Nukem 3D version has been sold for Xbox...,0
416764,"However , it is becoming replaced as a method ...",0
416765,There are hand gestures in both Hindu and Budd...,0
416766,"If it is necessary to use colors , try to choo...",0


In [7]:
len(df[df['label']==1])/len(df) # the dataset label is well balanced 

0.5

In [8]:
df.iloc[50]['original_text']

'He studied in Armenia and Istanbul , then at Wisconsin University which he finished in 1915 .'

In [9]:
df['original_text'].apply(lambda x: len(x)).mean()
# This means all texts are considered short text, which allows us to use dense representations, 
# as dense representations work well with short text.
# Gensim.KeyedVectors.load('assets/wikipedia.100.word-vecs.kv')??? How to generate and use this???
# Maybe we should train word2vec model on the entire corpus. Just training data? TOP 100 word-vectors(features)
# Alternatively we could use bag-of-words model, which is term-document matrix representation, having much more features

117.921906192414

In [10]:
X = df['original_text']
y = df['label']
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

Testing Dataset

In [11]:
test_path = 'Data/WikiLarge_Test.csv'
test_df = pd.read_csv(test_path, skiprows=0, skipfooter=0, engine='python')
test_df

Unnamed: 0,id,original_text,label
0,0,-2011,
1,1,-2011,
2,2,-2000,
3,3,-1997,
4,4,1.636,
...,...,...,...
119087,119087,#NAME?,
119088,119088,#NAME?,
119089,119089,#NAME?,
119090,119090,#NAME?,


In [13]:
test_df.iloc[10000]

id                                                           10000
original_text    An atheist would say that this argument proves...
label                                                          NaN
Name: 10000, dtype: object

Sample Submission

In [14]:
samplesubmission_path = 'Data/sampleSubmission.csv'
samplesubmission_df = pd.read_csv(samplesubmission_path, skiprows=0, skipfooter=0, engine='python')
samplesubmission_df

Unnamed: 0,id,label
0,0,0
1,1,0
2,2,1
3,3,1
4,4,0
...,...,...
119087,119087,0
119088,119088,1
119089,119089,1
119090,119090,1


To conclude, the dataframes we are working with are:

dalechall_df, concreteness_df, aoawords_df, train_df, test_df, samplesubmission_df

# 1. Data Preprocessing

## Bag of Words Model

In [15]:
vectorizer = TfidfVectorizer(min_df=10,stop_words='english',ngram_range=(1,2))
X_train_transform = vectorizer.fit_transform(X_train)
X_test_transform  = vectorizer.transform(X_test)

In [16]:
X_train_transform

<333414x57773 sparse matrix of type '<class 'numpy.float64'>'
	with 4071111 stored elements in Compressed Sparse Row format>

## Word2Vec Model

In [17]:
tokenized_text_train=[]
tokenized_text_test=[]
stopWords = set(stopwords.words('english'))

for text in tqdm(X_train):
    tokens_in_text = word_tokenize(text)
    tokens_in_text = [word for word in tokens_in_text if word.lower() not in stopWords]
    tokenized_text_train.append(tokens_in_text)
    
for text in tqdm(X_test):
    tokens_in_text = word_tokenize(text)
    tokens_in_text = [word for word in tokens_in_text if word.lower() not in stopWords]
    tokenized_text_test.append(tokens_in_text)

HBox(children=(FloatProgress(value=0.0, max=333414.0), HTML(value='')))




HBox(children=(FloatProgress(value=0.0, max=83354.0), HTML(value='')))




In [18]:
model = Word2Vec(vector_size=100,window=2,min_count=100,seed= RANDOM_SEED,workers=4)
model.build_vocab(tokenized_text_train)
model.train(tokenized_text_train,total_examples=model.corpus_count,epochs=model.epochs)

(14267386, 24273560)

In [19]:
word_vectors = model.wv

In [20]:
def generate_dense_features(tokenized_text,word_vectors):
    dense_list=[]
    words=[]
    for _ in tokenized_text: 
        words =[word for word in _ if word in word_vectors.key_to_index]
        
        if len(words) >0:
            dense_list.append(np.mean(word_vectors[words],axis=0))
            
        else: 
            dense_list.append(np.zeros(word_vectors.vector_size))
            
    return np.array(dense_list)

In [21]:
X_train_wv = generate_dense_features(tokenized_text_train,word_vectors)
X_test_wv = generate_dense_features(tokenized_text_test,word_vectors)

# Word's Difficulty Considered

== dale_chall.txt ==

This is the Dale Chall 3000 Word List, which is one definition of words that are considered "basic" English.

A summary is at https://www.readabilityformulas.com/articles/dale-chall-readability-word-list.php

In [22]:
#Basic english words
dalechall_path = 'Data/dale_chall.txt'
dale_chall = pd.read_csv(dalechall_path,delimiter='\t',header=None,names=['word'])
dale = set(dale_chall['word'].values)

In [25]:
len(dale)

2946

### The 2946 words in dale can be combined with the nltk stopwords.

== Concreteness_ratings_Brysbaert_et_al_BRM.txt ==

This file contains concreteness ratings for 40 thousand English lemma words gathered via Amazon Mechanical Turk. The ratings come from a larger list of 63 thousand words and represent all English words known to 85% of the raters.

The file contains eight columns:
1. The word
2. Whether it is a single word or a two-word expression 
3. The mean concreteness rating
4. The standard deviation of the concreteness ratings
5. The number of persons indicating they did not know the word
6. The total number of persons who rated the word
7. Percentage participants who knew the word
8. The SUBTLEX-US frequency count (on a total of 51 million; Brysbaert & New, 2009) 
9. The dominant part-of-speech usage

Original source: http://crr.ugent.be/archives/1330

Brysbaert, M., Warriner, A.B., & Kuperman, V. (2014). Concreteness ratings for 40 thousand generally known English word lemmas. Behavior Research Methods, 46, 904-911.
http://crr.ugent.be/papers/Brysbaert_Warriner_Kuperman_BRM_Concreteness_ratings.pdf

In [26]:
#Concreteness rating - the higher Conc.M, the easier the word is.
concreteness_path = 'Data/Concreteness_ratings_Brysbaert_et_al_BRM.txt'
concrete_df = pd.read_csv(concreteness_path,delimiter='\t', keep_default_na=False)
concreteset=(concrete_df['Word'].values)

In [34]:
concrete_df

Unnamed: 0,Word,Bigram,Conc.M,Conc.SD,Unknown,Total,Percent_known,SUBTLEX,Dom_Pos
0,roadsweeper,0,4.85,0.37,1,27,0.96,0,0
1,traindriver,0,4.54,0.71,3,29,0.90,0,0
2,tush,0,4.45,1.01,3,25,0.88,66,0
3,hairdress,0,3.93,1.28,0,29,1.00,1,0
4,pharmaceutics,0,3.77,1.41,4,26,0.85,0,0
...,...,...,...,...,...,...,...,...,...
39949,unenvied,0,1.21,0.62,1,30,0.97,0,
39950,agnostically,0,1.20,0.50,2,27,0.93,0,
39951,conceptualistic,0,1.18,0.50,4,26,0.85,0,
39952,conventionalism,0,1.18,0.48,1,29,0.97,0,


In [33]:
concrete_df.Bigram.value_counts()

0    37058
1     2896
Name: Bigram, dtype: int64

In [40]:
concrete_df[concrete_df.Bigram==1]

Unnamed: 0,Word,Bigram,Conc.M,Conc.SD,Unknown,Total,Percent_known,SUBTLEX,Dom_Pos
28707,baking soda,1,5.00,0.00,0,30,1.00,0,
28709,baseball bat,1,5.00,0.00,0,29,1.00,0,
28710,bath towel,1,5.00,0.00,0,29,1.00,0,
28711,beach ball,1,5.00,0.00,0,28,1.00,0,
28712,bed sheet,1,5.00,0.00,0,28,1.00,0,
...,...,...,...,...,...,...,...,...,...
39619,tantamount to,1,1.52,0.85,4,27,0.85,0,
39857,chance on,1,1.38,0.75,2,28,0.93,0,
39871,free rein,1,1.37,0.63,2,29,0.93,0,
39899,by chance,1,1.34,0.72,1,30,0.97,0,


In [67]:
#There is no Nan value in Conc.M column
concrete_df[concrete_df['Conc.M'].isna()]

Unnamed: 0,Word,Bigram,Conc.M,Conc.SD,Unknown,Total,Percent_known,SUBTLEX,Dom_Pos


### Are we gonna consider bigrams in this dataset, given it's only a small fraction ~ 8% in size?

In [37]:
np.min(concrete_df['Conc.M'])

1.04

In [41]:
np.max(concrete_df['Conc.M'])

5.0

### Concreteness values range from 1 - 5, we could possible use the inverse value of concreteness to scale it to a 0-1 range and give easier words less weight.

== AoA_51715_words.csv ==

This file contains "Age of Acquisition" (AoA) estimates for about 51k English words, which refers to the approximate age (in years) when a word was learned. Early words, being more basic, have lower average AoA.

The main columns you will be interested in are "Word" and "AoA_Kup_lem". But the others may be useful too.

The file contains these columns:

Word :: The word in question
Alternative.spelling :: if the Word may be spelled frequently in another form	
Freq_pm	:: Freq of the Word in general English (larger -> more common)
Dom_PoS_SUBTLEX	:: Dominant part of speech in general usage
Nletters :: number of letters 
Nphon :: number of phonemes
Nsyll :: number of syllables
Lemma_highest_PoS :: the "lemmatized" or "root" form of the word (in the dominant part of speech. e.g. The root form of the verb "abates" is "abate".
AoA_Kup	:: The AoA from a previous study by Kuperman et al.
Perc_known :: Percent of people who knew the word in the Kuperman et al. study
AoA_Kup_lem :: Estimated AoA based on Kuperman et al. study lemmatized words. THIS IS THE MAIN COLUMN OF INTEREST.
Perc_known_lem	:: Estimated percentage of people who would know this form of the word in the Kuperman study.
AoA_Bird_lem :: AoA reported in previous study by Bird (2001) 
AoA_Bristol_lem	:: AoA reported in previous study from Bristol Univ. (2006)
AoA_Cort_lem :: AoA reported in previous study by Cortese & Khanna (2008)
AoA_Schock :: AoA reported in previous study by Schock (2012)

Original source : http://crr.ugent.be/archives/806

In [42]:
#AoA
#Perc_known_lem, AoA_Kup_lem
aoawords_path = 'Data/AoA_51715_words.csv'
AoA = pd.read_csv(aoawords_path,encoding = 'unicode_escape')
AoA_set = set(AoA['Word'].values)
AoA.head(5)

Unnamed: 0,Word,Alternative.spelling,Freq_pm,Dom_PoS_SUBTLEX,Nletters,Nphon,Nsyll,Lemma_highest_PoS,AoA_Kup,Perc_known,AoA_Kup_lem,Perc_known_lem,AoA_Bird_lem,AoA_Bristol_lem,AoA_Cort_lem,AoA_Schock
0,a,a,20415.27,Article,1,1,1,a,2.89,1.0,2.89,1.0,3.16,,,
1,aardvark,aardvark,0.41,Noun,8,7,2,aardvark,9.89,1.0,9.89,1.0,,,,
2,abacus,abacus,0.24,Noun,6,6,3,abacus,8.69,0.65,8.69,0.65,,,,
3,abacuses,abacuses,0.02,Noun,8,9,4,abacus,,,8.69,0.65,,,,
4,abalone,abalone,0.51,Verb,7,7,4,abalone,12.23,0.72,12.23,0.72,,,,


In [51]:
len(AoA)

51715

In [49]:
AoA.AoA_Kup_lem.min()

1.58

In [50]:
AoA.AoA_Kup_lem.max()

25.0

In [54]:
AoA.sort_values(['AoA_Kup_lem'], ascending=False)

Unnamed: 0,Word,Alternative.spelling,Freq_pm,Dom_PoS_SUBTLEX,Nletters,Nphon,Nsyll,Lemma_highest_PoS,AoA_Kup,Perc_known,AoA_Kup_lem,Perc_known_lem,AoA_Bird_lem,AoA_Bristol_lem,AoA_Cort_lem,AoA_Schock
14878,eisteddfod,eisteddfod,,,10,8,3,eisteddfod,25.0,0.05,25.0,0.05,,,,
2084,architrave,architrave,0.04,Noun,10,8,3,architrave,21.0,0.05,21.0,0.05,,,,
6274,calceolaria,calceolaria,0.02,Noun,11,11,6,calceolaria,21.0,0.11,21.0,0.11,,,,
32931,penury,penury,0.02,Noun,6,7,3,penury,20.6,0.28,20.6,0.28,,,,
25243,kendo,kendo,0.37,Noun,5,5,2,kendo,20.5,0.11,20.5,0.11,,,,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
38932,rogation,rogation,,,8,7,3,rogation,,0.00,,0.00,,,,
42089,smilax,smilax,,,6,7,2,smilax,,0.00,,0.00,,,,
46368,thulium,thulium,,,7,6,3,thulium,,0.00,,0.00,,,,
50862,wickiup,wickiup,0.27,Noun,7,6,3,wickiup,,0.00,,0.00,,,,


In [59]:
len(AoA[AoA['AoA_Kup_lem'].isna()])

20

In [60]:
AoA[AoA['AoA_Kup_lem'].isna()]

Unnamed: 0,Word,Alternative.spelling,Freq_pm,Dom_PoS_SUBTLEX,Nletters,Nphon,Nsyll,Lemma_highest_PoS,AoA_Kup,Perc_known,AoA_Kup_lem,Perc_known_lem,AoA_Bird_lem,AoA_Bristol_lem,AoA_Cort_lem,AoA_Schock
442,actinium,actinium,,,8,8,4,actinium,,0.0,,0.0,,,,
1322,ambuscade,ambuscade,,,9,8,3,ambuscade,,0.0,,0.0,,,,
2306,ashlar,ashlar,,,6,5,2,ashlar,,0.0,,0.0,,,,
5095,bosky,bosky,,,5,4,2,bosky,,0.0,,0.0,,,,
6404,canaille,canaille,,,8,5,2,canaille,,0.0,,0.0,,,,
9004,compeer,compeer,,,7,6,3,compeer,,0.0,,0.0,,,,
9005,compeers,compeers,0.02,Noun,8,7,3,compeer,,,,0.0,,,,
16000,europium,europium,,,8,8,4,europium,,0.0,,0.0,,,,
19065,gallimaufry,gallimaufry,,,11,9,4,gallimaufry,,0.0,,0.0,,,,
22498,hutment,hutment,,,7,7,2,hutment,,0.0,,0.0,,,,


In [61]:
# We are going to impute all Nan values in AoA_Kup_lem as the max AoA value 25, as they appear to be hard words.
AoA['AoA_Kup_lem'].fillna(value=AoA['AoA_Kup_lem'].max(), inplace=True)

In [69]:
AoA.sort_values(['AoA_Kup_lem'], ascending=False)

Unnamed: 0,Word,Alternative.spelling,Freq_pm,Dom_PoS_SUBTLEX,Nletters,Nphon,Nsyll,Lemma_highest_PoS,AoA_Kup,Perc_known,AoA_Kup_lem,Perc_known_lem,AoA_Bird_lem,AoA_Bristol_lem,AoA_Cort_lem,AoA_Schock
2306,ashlar,ashlar,,,6,5,2,ashlar,,0.00,25.00,0.00,,,,
38932,rogation,rogation,,,8,7,3,rogation,,0.00,25.00,0.00,,,,
46368,thulium,thulium,,,7,6,3,thulium,,0.00,25.00,0.00,,,,
14878,eisteddfod,eisteddfod,,,10,8,3,eisteddfod,25.00,0.05,25.00,0.05,,,,
5095,bosky,bosky,,,5,4,2,bosky,,0.00,25.00,0.00,,,,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
27395,mamma,mamma,3.02,Noun,5,4,2,mama,,,1.89,1.00,,,,
27393,mamas,mamas,0.71,Noun,5,5,2,mama,,,1.89,1.00,,,,
27392,mama,mama,103.71,Noun,4,4,2,mama,1.89,1.00,1.89,1.00,,,,
29050,mommas,mommas,0.10,Noun,6,5,2,momma,,,1.58,1.00,,,,


### AoA values range from 0 - 25, which means the smaller the AoA value, the easier the word is. We could possibly use the AoA value to give easier words less weight.

In [70]:
model_word = set(word_vectors.index_to_key) #around 6k words in the Word2Vec model

In [71]:
len(model_word)

5776

In [72]:
len(model_word.intersection(concreteset))

2623

In [77]:
word_vectors['live']

array([-0.6199677 ,  0.3468209 , -0.37597764, -0.84315234,  0.05326695,
       -0.74597925, -0.2600026 , -0.9814865 , -0.85832727, -0.8009238 ,
       -0.40282986, -0.7274299 , -0.46454006, -0.06327678, -0.6668875 ,
        0.68247014, -1.2197053 ,  0.33938536,  0.50376123, -1.0847578 ,
        0.24329747, -0.16985822,  0.36326942,  0.16396426, -0.4165022 ,
        0.6009113 ,  0.18639898, -0.3841828 , -0.6158585 ,  0.8503197 ,
        0.15161711, -0.357656  ,  0.4907954 ,  0.5068166 , -0.46216726,
       -0.10496143,  0.40876412,  0.43992898,  0.13667339,  0.13547929,
       -0.6998609 ,  0.41898432, -0.06581438,  1.3475554 ,  0.08873701,
        0.27507004,  0.5438392 , -2.0921082 ,  0.4102677 ,  0.52933025,
        0.45466015, -0.562415  ,  0.36320716,  0.23065574, -0.54838544,
        0.3496241 ,  0.3873599 , -0.06940398, -1.3185865 ,  0.21078272,
       -0.14478372,  1.3990853 , -0.7048181 , -0.6680918 , -0.6833189 ,
       -0.6162195 , -0.32513937, -0.02604951, -0.59235597, -0.26

In [74]:
lemmatizer = WordNetLemmatizer()
word_list = []
for word in model_word: 
    word_list.append((word,lemmatizer.lemmatize(word.lower())))
df = pd.DataFrame(word_list,columns=['Original','word'])
df = df.merge(AoA,left_on='word',right_on='Word',how='left')
df = df[['Original','word','Perc_known','AoA_Kup_lem']]
word_not_matched = set(df[df['Perc_known'].isnull()].word.values)

for i in range(len(df)):   
    if df['word'][i][0] in set(('0','1','2','3','4','5','6','7','8','9')) or len(df['word'][i])==1:
        df['AoA_Kup_lem'][i] = 3
mean_value = df['AoA_Kup_lem'].mean()
df['AoA_Kup_lem'].fillna(value=mean_value,inplace=True)

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df['AoA_Kup_lem'][i] = 3


In [75]:
#df.loc[df['Original']==['troops','weapons']]
df[df['Original'].isin(['troops','weapon'])]

Unnamed: 0,Original,word,Perc_known,AoA_Kup_lem
3604,troops,troop,1.0,8.35
5455,weapon,weapon,1.0,6.95


In [34]:
def generate_perc_known(tokenized_text,df):
    avg_perc_know=None
    perc_know_list=[]
    for _ in tokenized_text: 
        words =[word for word in _ if word in word_vectors.key_to_index]
        
        if len(words) >0:
            avg_perc_know = np.mean(df[df['Original'].isin(words)]['AoA_Kup_lem'])
            perc_know_list.append(avg_perc_know)
        else: 
            
            perc_know_list.append(0)
            
    return perc_know_list

In [35]:
df_train = pd.DataFrame(X_train_wv)
df_train['year'] = generate_perc_known(tokenized_text_train,df)

In [None]:
df_test = pd.DataFrame(X_test_wv)
df_test['year'] = generate_perc_known(tokenized_text_test,df)

In [None]:
df_test

In [None]:
lr = LogisticRegression(random_state=RANDOM_SEED,max_iter=1000).fit(df_train,y_train)

In [None]:
accuracy_score(y_test,lr.predict(df_test))

# 2. Supervised Learning

## Random Classifier

In [None]:
dummy_bow = DummyClassifier(strategy='uniform',random_state=RANDOM_SEED).fit(X_train_transform,y_train)

In [None]:
accuracy_score(y_test, dummy_bow.predict(X_test_transform))

In [None]:
dummy_wv = DummyClassifier(strategy='uniform',random_state=RANDOM_SEED).fit(X_train_wv,y_train)

In [None]:
accuracy_score(y_test,dummy_wv.predict(X_test_wv))

## Logistic Regression Classifier

In [None]:
lr_bow = LogisticRegression(random_state=RANDOM_SEED,max_iter=1000).fit(X_train_transform,y_train)

In [None]:
accuracy_score(y_test,lr_bow.predict(X_test_transform))

In [None]:
lr_wv = LogisticRegression(random_state=RANDOM_SEED,max_iter=1000).fit(X_train_wv,y_train)

In [None]:
accuracy_score(y_test,lr_wv.predict(X_test_wv))

## Random Forest Classifier

In [None]:
rf_bow = RandomForestClassifier(n_estimators=500,max_depth=5,random_state=RANDOM_SEED).fit(X_train_transform,y_train)

In [None]:
accuracy_score(y_test,rf_bow.predict(X_test_transform))

In [None]:
rf_wv = RandomForestClassifier(n_estimators=100,max_depth=5,random_state=RANDOM_SEED).fit(X_train_wv,y_train)

In [None]:
accuracy_score(y_test,rf_wv.predict(X_test_wv))

# 3. Unsupervised Learning

In [None]:
kmeans = KMeans(n_clusters=2,random_state=RANDOM_SEED).fit(X_train_transform)

In [None]:
cluster_df = pd.DataFrame({'cluster':kmeans.labels_,'y_label':y_train,'text':X_train})
cluster_df