In [1]:
import pandas as pd
import numpy as np
from sklearn.dummy import DummyClassifier
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import f1_score,accuracy_score
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier
from sklearn.cluster import KMeans
from sklearn.decomposition import PCA
import altair as alt
import re

In [2]:
pip install gensim

Collecting gensim
  Downloading gensim-4.1.2-cp38-cp38-win_amd64.whl (24.0 MB)
Collecting smart-open>=1.8.1
  Downloading smart_open-6.0.0-py3-none-any.whl (58 kB)
Collecting Cython==0.29.23
  Downloading Cython-0.29.23-cp38-cp38-win_amd64.whl (1.7 MB)
Installing collected packages: smart-open, Cython, gensim
  Attempting uninstall: Cython
    Found existing installation: Cython 0.29.21
    Uninstalling Cython-0.29.21:
      Successfully uninstalled Cython-0.29.21
Successfully installed Cython-0.29.23 gensim-4.1.2 smart-open-6.0.0
Note: you may need to restart the kernel to use updated packages.


In [3]:
from gensim.models.word2vec import Word2Vec
from tqdm.notebook import tqdm
from nltk.tokenize import sent_tokenize, word_tokenize
from collections import Counter
from nltk.corpus import stopwords
import nltk
from nltk.stem import WordNetLemmatizer
nltk.download('stopwords')

[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\mryua\AppData\Roaming\nltk_data...
[nltk_data]   Unzipping corpora\stopwords.zip.


True

In [4]:
RANDOM_SEED=694

In [5]:
df = pd.read_csv('WikiLarge_Train.csv')
len(df[df['label']==1])/len(df) # the dataset label is well balanced 

FileNotFoundError: [Errno 2] File WikiLarge_Train.csv does not exist: 'WikiLarge_Train.csv'

In [222]:
X = df['original_text']
y = df['label']
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# 1. Data Preprocessing

## Bag of Words Model

In [223]:
vectorizer = TfidfVectorizer(min_df=10,stop_words='english',ngram_range=(1,2))
X_train_transform = vectorizer.fit_transform(X_train)
X_test_transform  = vectorizer.transform(X_test)

In [224]:
X_train_transform

<333414x57516 sparse matrix of type '<class 'numpy.float64'>'
	with 4053454 stored elements in Compressed Sparse Row format>

## Word2Vec Model

In [225]:
tokenized_text_train=[]
tokenized_text_test=[]
stopWords = set(stopwords.words('english'))

for text in tqdm(X_train):
    tokens_in_text = word_tokenize(text)
    tokens_in_text = [word for word in tokens_in_text if word.lower() not in stopWords]
    tokenized_text_train.append(tokens_in_text)
    
for text in tqdm(X_test):
    tokens_in_text = word_tokenize(text)
    tokens_in_text = [word for word in tokens_in_text if word.lower() not in stopWords]
    tokenized_text_test.append(tokens_in_text)

HBox(children=(HTML(value=''), FloatProgress(value=0.0, max=333414.0), HTML(value='')))




HBox(children=(HTML(value=''), FloatProgress(value=0.0, max=83354.0), HTML(value='')))




In [273]:
model = Word2Vec(vector_size=100,window=2,min_count=100,seed= RANDOM_SEED,workers=4)
model.build_vocab(tokenized_text_train)
model.train(tokenized_text_train,total_examples=model.corpus_count,epochs=model.epochs)

(14257763, 24263135)

In [274]:
word_vectors = model.wv

In [275]:
def generate_dense_features(tokenized_text,word_vectors):
    dense_list=[]
    words=[]
    for _ in tokenized_text: 
        words =[word for word in _ if word in word_vectors.key_to_index]
        
        if len(words) >0:
            dense_list.append(np.mean(word_vectors[words],axis=0))
            
        else: 
            dense_list.append(np.zeros(word_vectors.vector_size))
            
    return np.array(dense_list)

In [276]:
X_train_wv = generate_dense_features(tokenized_text_train,word_vectors)
X_test_wv = generate_dense_features(tokenized_text_test,word_vectors)


# Word's Difficulty Considered

In [277]:
#Basic english words
dale_chall = pd.read_csv('dale_chall.txt',delimiter='\t',header=None,names=['word'])
dale = set(dale_chall['word'].values)

In [278]:
#Concreteness rating
concrete_df = pd.read_csv('Concreteness_ratings_Brysbaert_et_al_BRM.txt',delimiter='\t')
concreteset=(concrete_df['Word'].values)

In [279]:
#AoA
#Perc_known_lem, AoA_Kup_lem
AoA = pd.read_csv('AoA_51715_words.csv',encoding = 'unicode_escape')
AoA_set = set(AoA['Word'].values)
AoA.head(5)

Unnamed: 0,Word,Alternative.spelling,Freq_pm,Dom_PoS_SUBTLEX,Nletters,Nphon,Nsyll,Lemma_highest_PoS,AoA_Kup,Perc_known,AoA_Kup_lem,Perc_known_lem,AoA_Bird_lem,AoA_Bristol_lem,AoA_Cort_lem,AoA_Schock
0,a,a,20415.27,Article,1,1,1,a,2.89,1.0,2.89,1.0,3.16,,,
1,aardvark,aardvark,0.41,Noun,8,7,2,aardvark,9.89,1.0,9.89,1.0,,,,
2,abacus,abacus,0.24,Noun,6,6,3,abacus,8.69,0.65,8.69,0.65,,,,
3,abacuses,abacuses,0.02,Noun,8,9,4,abacus,,,8.69,0.65,,,,
4,abalone,abalone,0.51,Verb,7,7,4,abalone,12.23,0.72,12.23,0.72,,,,


In [280]:
model_word = set(word_vectors.index_to_key) #around 6k words in the Word2Vec model

In [281]:
len(model_word.intersection(concreteset))

2623

In [282]:
lemmatizer = WordNetLemmatizer()
word_list = []
for word in model_word: 
    word_list.append((word,lemmatizer.lemmatize(word.lower())))
df = pd.DataFrame(word_list,columns=['Original','word'])
df = df.merge(AoA,left_on='word',right_on='Word',how='left')
df = df[['Original','word','Perc_known','AoA_Kup_lem']]
word_not_matched = set(df[df['Perc_known'].isnull()].word.values)

for i in range(len(df)):   
    if df['word'][i][0] in set(('0','1','2','3','4','5','6','7','8','9')) or len(df['word'][i])==1:
        df['AoA_Kup_lem'][i] = 3
mean_value = df['AoA_Kup_lem'].mean()
df['AoA_Kup_lem'].fillna(value=mean_value,inplace=True)


A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df['AoA_Kup_lem'][i] = 3


In [283]:
#df.loc[df['Original']==['troops','weapons']]
df[df['Original'].isin(['troops','weapon'])]

Unnamed: 0,Original,word,Perc_known,AoA_Kup_lem
0,troops,troop,1.0,8.35
1,weapon,weapon,1.0,6.95


In [284]:
def generate_perc_known(tokenized_text,df):
    avg_perc_know=None
    perc_know_list=[]
    for _ in tokenized_text: 
        words =[word for word in _ if word in word_vectors.key_to_index]
        
        if len(words) >0:
            avg_perc_know = np.mean(df[df['Original'].isin(words)]['AoA_Kup_lem'])
            perc_know_list.append(avg_perc_know)
        else: 
            
            perc_know_list.append(0)
            
    return perc_know_list

In [285]:
df_train = pd.DataFrame(X_train_wv)
df_train['year'] = generate_perc_known(tokenized_text_train,df)

In [286]:
df_test = pd.DataFrame(X_test_wv)
df_test['year'] = generate_perc_known(tokenized_text_test,df)

In [287]:
df_test

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,91,92,93,94,95,96,97,98,99,year
0,0.311034,-0.044982,-0.188059,-0.036853,0.053964,-0.075969,-0.573179,0.142119,0.038524,0.011229,...,0.140362,0.229184,-0.046725,-0.456503,-0.757063,-0.267589,0.390128,0.055417,0.151433,5.156867
1,-0.100743,-0.236878,-0.186871,-0.475519,0.342335,0.070849,-0.316106,-0.209397,0.046328,0.050351,...,-0.414860,-0.051380,-0.257673,0.067684,-0.184270,-0.094727,0.147250,0.273635,-0.004352,6.040000
2,0.083303,-0.488813,0.342551,0.181661,-0.427759,-0.216358,-1.041624,-0.118173,-0.336361,-0.128601,...,0.152526,0.411480,0.557212,-0.095473,-0.539461,-0.470343,-0.006639,0.140359,0.200645,7.175730
3,-0.037667,-0.479091,-0.312405,-0.369330,-0.456307,0.190385,0.258977,0.020776,-0.173443,0.177531,...,0.201434,-0.380838,-0.132189,-0.206441,-0.454263,0.163056,-0.168300,0.022108,-0.615601,6.441667
4,0.142891,-0.380996,-0.318780,-0.123600,-0.229757,0.281040,-0.462546,0.183915,-0.537223,0.394007,...,0.420755,0.118566,-0.368935,-0.032998,-0.360825,0.115795,0.066207,0.037782,-0.053558,7.493333
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
83349,-0.119182,-0.004474,-0.032353,-0.331377,-0.243901,0.046180,-0.114272,-0.024914,-0.006346,0.419893,...,0.159321,-0.389408,-0.268235,-0.397462,-0.291322,0.215912,0.134387,0.355377,-0.154329,7.422743
83350,-0.088119,-0.660035,-0.288880,-0.008632,0.287710,-0.047573,-0.319038,-0.172171,-0.037463,0.367047,...,-0.007213,0.120631,-0.100991,-0.154148,-0.306328,0.017333,0.272786,0.166727,-0.075412,7.026164
83351,-0.088393,-0.268005,-0.301353,-0.239260,-0.152188,-0.250156,-0.128421,-0.140322,0.039296,0.029445,...,0.025348,-0.259513,-0.283999,-0.144472,-0.217558,0.018160,0.017978,0.210068,0.010174,6.400714
83352,0.021214,-0.067990,-0.190103,0.006632,-0.135657,0.150053,-0.263083,-0.235765,-0.327293,0.037924,...,-0.059949,0.160235,-0.085067,-0.065448,-0.389915,-0.297451,0.177708,-0.012893,-0.207058,5.905209


In [288]:
lr = LogisticRegression(random_state=RANDOM_SEED,max_iter=1000).fit(df_train,y_train)

In [289]:
accuracy_score(y_test,lr.predict(df_test))

0.6353384360678551

# 2. Supervised Learning

## Random Classifier

In [290]:
dummy_bow = DummyClassifier(strategy='uniform',random_state=RANDOM_SEED).fit(X_train_transform,y_train)

In [291]:
accuracy_score(y_test, dummy_bow.predict(X_test_transform))

0.5011277203253593

In [292]:
dummy_wv = DummyClassifier(strategy='uniform',random_state=RANDOM_SEED).fit(X_train_wv,y_train)

In [293]:
accuracy_score(y_test,dummy_wv.predict(X_test_wv))

0.5011277203253593

## Logistic Regression Classifier

In [294]:
lr_bow = LogisticRegression(random_state=RANDOM_SEED,max_iter=1000).fit(X_train_transform,y_train)

In [295]:
accuracy_score(y_test,lr_bow.predict(X_test_transform))

0.6847301869136454

In [296]:
lr_wv = LogisticRegression(random_state=RANDOM_SEED,max_iter=1000).fit(X_train_wv,y_train)

In [297]:
accuracy_score(y_test,lr_wv.predict(X_test_wv))

0.6249130215706504

## Random Forest Classifier

In [298]:
rf_bow = RandomForestClassifier(n_estimators=500,max_depth=5,random_state=RANDOM_SEED).fit(X_train_transform,y_train)

In [299]:
accuracy_score(y_test,rf_bow.predict(X_test_transform))

0.6215898457182619

In [300]:
rf_wv = RandomForestClassifier(n_estimators=100,max_depth=5,random_state=RANDOM_SEED).fit(X_train_wv,y_train)

In [301]:
accuracy_score(y_test,rf_wv.predict(X_test_wv))

0.6205940926650191

# 3. Unsupervised Learning

In [302]:
kmeans = KMeans(n_clusters=2,random_state=RANDOM_SEED).fit(X_train_transform)

In [303]:
cluster_df = pd.DataFrame({'cluster':kmeans.labels_,'y_label':y_train,'text':X_train})
cluster_df

Unnamed: 0,cluster,y_label,text
304501,0,0,1979-80 Buffalo Sabres NHL 32 1880 74 1 4 2.36...
162313,0,1,Diseases Lentils in culture Lentils are mentio...
336845,0,0,"Railroads , like the Lehigh Valley Railroad , ..."
150625,0,1,An example of this would be an individual anim...
40240,0,1,Both the Matanuska and Susitna Rivers have maj...
...,...,...,...
259178,0,0,After the Germans invaded Norway in April 1940...
365838,1,0,"July 28 - Henry Bennet , 1st Earl of Arlington..."
131932,0,1,Pancake restaurants are popular family restaur...
146867,0,1,A cycling domestique


In [304]:
kmeans

KMeans(n_clusters=2, random_state=694)