In [1]:
import pandas as pd
import numpy as np
from sklearn.dummy import DummyClassifier
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import f1_score,accuracy_score
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier
from sklearn.cluster import KMeans
from sklearn.decomposition import PCA
import altair as alt
import re

In [2]:
pip install gensim

Note: you may need to restart the kernel to use updated packages.


In [3]:
from gensim.models.word2vec import Word2Vec
from tqdm.notebook import tqdm
from nltk.tokenize import sent_tokenize, word_tokenize
from collections import Counter
from nltk.corpus import stopwords
import nltk
from nltk.stem import WordNetLemmatizer
nltk.download('stopwords')

[nltk_data] Downloading package stopwords to
[nltk_data]     /Users/zhouwei/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


True

In [4]:
RANDOM_SEED=694

In [5]:
df = pd.read_csv('WikiLarge_Train.csv')
len(df[df['label']==1])/len(df) # the dataset label is well balanced 

0.5

In [6]:
X = df['original_text']
y = df['label']
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

In [172]:
dale_chall = pd.read_csv('dale_chall.txt',delimiter='\t',header=None,names=['word'])
dale = set(dale_chall['word'].values)

# 1. Data Preprocessing

## Bag of Words Model

In [177]:
stop_words = list(set(stopwords.words('english'))|dale)
vectorizer = TfidfVectorizer(min_df=5,stop_words=stop_words,ngram_range=(1,2))
X_train_transform = vectorizer.fit_transform(X_train)
X_test_transform  = vectorizer.transform(X_test)



In [180]:
vectorizer.get_feature_names()

['00',
 '00 00',
 '00 04',
 '00 10',
 '00 30',
 '00 cdt',
 '00 edt',
 '00 households',
 '00 lrb',
 '00 on16',
 '00 pm',
 '00 rrb',
 '00 utc',
 '000',
 '000 00',
 '000 000',
 '000 001',
 '000 10',
 '000 100',
 '000 11',
 '000 12',
 '000 13',
 '000 150',
 '000 1989',
 '000 20',
 '000 200',
 '000 2001',
 '000 2005',
 '000 2006',
 '000 2008',
 '000 22',
 '000 24',
 '000 25',
 '000 30',
 '000 35',
 '000 40',
 '000 400',
 '000 500',
 '000 600',
 '000 70',
 '000 80',
 '000 according',
 '000 acre',
 '000 acres',
 '000 applicants',
 '000 armenian',
 '000 articles',
 '000 bangladesh',
 '000 bc',
 '000 bce',
 '000 births',
 '000 bp',
 '000 buildings',
 '000 burma',
 '000 casualties',
 '000 chinese',
 '000 chitral',
 '000 copies',
 '000 deaths',
 '000 described',
 '000 employees',
 '000 ethnic',
 '000 euros',
 '000 event',
 '000 fans',
 '000 fatalities',
 '000 ft',
 '000 haiti',
 '000 hectares',
 '000 highest',
 '000 homeless',
 '000 homes',
 '000 horses',
 '000 hours',
 '000 houses',
 '000 inhabi

## Word2Vec Model

In [9]:
tokenized_text_train=[]
tokenized_text_test=[]
stopWords = set(stopwords.words('english'))

for text in tqdm(X_train):
    tokens_in_text = word_tokenize(text)
    tokens_in_text = [word for word in tokens_in_text if word.lower() not in stopWords]
    tokenized_text_train.append(tokens_in_text)
    
for text in tqdm(X_test):
    tokens_in_text = word_tokenize(text)
    tokens_in_text = [word for word in tokens_in_text if word.lower() not in stopWords]
    tokenized_text_test.append(tokens_in_text)

HBox(children=(HTML(value=''), FloatProgress(value=0.0, max=333414.0), HTML(value='')))




HBox(children=(HTML(value=''), FloatProgress(value=0.0, max=83354.0), HTML(value='')))




In [10]:
model = Word2Vec(vector_size=100,window=2,min_count=100,seed= RANDOM_SEED,workers=4)
model.build_vocab(tokenized_text_train)
model.train(tokenized_text_train,total_examples=model.corpus_count,epochs=model.epochs)

(14256286, 24263135)

In [11]:
word_vectors = model.wv

In [12]:
def generate_dense_features(tokenized_text,word_vectors):
    dense_list=[]
    words=[]
    for _ in tokenized_text: 
        words =[word for word in _ if word in word_vectors.key_to_index]
        
        if len(words) >0:
            dense_list.append(np.mean(word_vectors[words],axis=0))
            
        else: 
            dense_list.append(np.zeros(word_vectors.vector_size))
            
    return np.array(dense_list)

In [13]:
X_train_wv = generate_dense_features(tokenized_text_train,word_vectors)
X_test_wv = generate_dense_features(tokenized_text_test,word_vectors)


# Word's Difficulty Considered

In [170]:
#Basic english words
dale_chall = pd.read_csv('dale_chall.txt',delimiter='\t',header=None,names=['word'])
dale = set(dale_chall['word'].values)
dale

{"couldn't",
 'dump',
 'burst',
 'guard',
 'pork',
 'proud',
 'rat',
 'toward',
 'runner',
 'proper',
 'firearm',
 'army',
 'still',
 'manger',
 'minute',
 'rice',
 'firecracker',
 'workman',
 'cover',
 'join',
 'balloon',
 'ought',
 'couch',
 'drag',
 'flutter',
 'hind',
 'loaf',
 'pint',
 'could',
 'sure',
 'frighten',
 'crossing',
 'us',
 'cardboard',
 'feather',
 'sign',
 'heart',
 'rug',
 'click',
 'tape',
 'lost',
 'apiece',
 'pull',
 'canoe',
 'cake',
 'elder',
 'magic',
 'April',
 'shaking',
 'taught',
 'has',
 'weaken',
 'billboard',
 'kick',
 'reader',
 'interesting',
 'unhappy',
 'sly',
 'soil',
 'defense',
 'earn',
 'wildcat',
 'coast',
 'quarter',
 'necktie',
 'visit',
 'both',
 'pave',
 'running',
 'doorbell',
 'thimble',
 'Monday',
 'sir',
 'sleeve',
 'mailman',
 'jacks',
 'shadow',
 'instead',
 'term',
 'month',
 'cowardly',
 'law',
 'scream',
 'third',
 'jellyfish',
 'wore',
 'possible',
 'reach',
 'vessel',
 'twenty',
 'say',
 'sail',
 'at',
 'fresh',
 'caterpillar',


In [15]:
#Concreteness rating
concrete_df = pd.read_csv('Concreteness_ratings_Brysbaert_et_al_BRM.txt',delimiter='\t')
concreteset=(concrete_df['Word'].values)

In [16]:
#AoA
#Perc_known_lem, AoA_Kup_lem
AoA = pd.read_csv('AoA_51715_words.csv',encoding = 'unicode_escape')
AoA_set = set(AoA['Word'].values)
AoA.head(5)

Unnamed: 0,Word,Alternative.spelling,Freq_pm,Dom_PoS_SUBTLEX,Nletters,Nphon,Nsyll,Lemma_highest_PoS,AoA_Kup,Perc_known,AoA_Kup_lem,Perc_known_lem,AoA_Bird_lem,AoA_Bristol_lem,AoA_Cort_lem,AoA_Schock
0,a,a,20415.27,Article,1,1,1,a,2.89,1.0,2.89,1.0,3.16,,,
1,aardvark,aardvark,0.41,Noun,8,7,2,aardvark,9.89,1.0,9.89,1.0,,,,
2,abacus,abacus,0.24,Noun,6,6,3,abacus,8.69,0.65,8.69,0.65,,,,
3,abacuses,abacuses,0.02,Noun,8,9,4,abacus,,,8.69,0.65,,,,
4,abalone,abalone,0.51,Verb,7,7,4,abalone,12.23,0.72,12.23,0.72,,,,


In [17]:
model_word = set(word_vectors.index_to_key) #around 6k words in the Word2Vec model

In [18]:
len(model_word.intersection(concreteset))

2623

In [19]:
lemmatizer = WordNetLemmatizer()
word_list = []
for word in model_word: 
    word_list.append((word,lemmatizer.lemmatize(word.lower())))
df = pd.DataFrame(word_list,columns=['Original','word'])
df = df.merge(AoA,left_on='word',right_on='Word',how='left')
df = df[['Original','word','Perc_known','AoA_Kup_lem']]
word_not_matched = set(df[df['Perc_known'].isnull()].word.values)

for i in range(len(df)):   
    if df['word'][i][0] in set(('0','1','2','3','4','5','6','7','8','9')) or len(df['word'][i])==1:
        df['AoA_Kup_lem'][i] = 3
mean_value = df['AoA_Kup_lem'].mean()
df['AoA_Kup_lem'].fillna(value=mean_value,inplace=True)


A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df['AoA_Kup_lem'][i] = 3


In [20]:
#df.loc[df['Original']==['troops','weapons']]
df[df['Original'].isin(['troops','weapon'])]

Unnamed: 0,Original,word,Perc_known,AoA_Kup_lem
1828,troops,troop,1.0,8.35
3521,weapon,weapon,1.0,6.95


In [21]:
def generate_perc_known(tokenized_text,df):
    avg_perc_know=None
    perc_know_list=[]
    for _ in tokenized_text: 
        words =[word for word in _ if word in word_vectors.key_to_index]
        
        if len(words) >0:
            avg_perc_know = np.mean(df[df['Original'].isin(words)]['AoA_Kup_lem'])
            perc_know_list.append(avg_perc_know)
        else: 
            
            perc_know_list.append(0)
            
    return perc_know_list

In [23]:
df_train = pd.DataFrame(X_train_wv)
df_train['year'] = generate_perc_known(tokenized_text_train,df)

KeyboardInterrupt: 

In [None]:
df_test = pd.DataFrame(X_test_wv)
df_test['year'] = generate_perc_known(tokenized_text_test,df)

In [None]:
df_test

In [None]:
lr = LogisticRegression(random_state=RANDOM_SEED,max_iter=1000).fit(df_train,y_train)

In [None]:
accuracy_score(y_test,lr.predict(df_test))

# 2. Supervised Learning

## Random Classifier

In [88]:
dummy_bow = DummyClassifier(strategy='uniform',random_state=RANDOM_SEED).fit(X_train_transform,y_train)

In [89]:
accuracy_score(y_test, dummy_bow.predict(X_test_transform))

0.5011277203253593

In [26]:
dummy_wv = DummyClassifier(strategy='uniform',random_state=RANDOM_SEED).fit(X_train_wv,y_train)

In [27]:
accuracy_score(y_test,dummy_wv.predict(X_test_wv))

0.5011277203253593

## Logistic Regression Classifier

In [175]:
lr_bow = LogisticRegression(random_state=RANDOM_SEED,max_iter=1000).fit(X_train_transform,y_train)

In [176]:
accuracy_score(y_test,lr_bow.predict(X_test_transform))

0.6690260815317801

In [72]:
lr_wv = LogisticRegression(random_state=RANDOM_SEED,max_iter=1000).fit(X_train_wv,y_train)

In [73]:
accuracy_score(y_test,lr_wv.predict(X_test_wv))

0.6191424526717374

# Ridge

In [117]:
from sklearn.linear_model import RidgeClassifier

In [178]:
clf_ridge = RidgeClassifier(alpha=0.5,random_state=RANDOM_SEED,max_iter=1000).fit(X_train_transform,y_train)

In [179]:
accuracy_score(y_test,clf_ridge.predict(X_test_transform))

0.6544376994505363

In [148]:
clf_ridge_wv = RidgeClassifier(alpha=0.5,random_state=RANDOM_SEED,max_iter=1000).fit(X_train_wv,y_train)

In [149]:
accuracy_score(y_test,clf_ridge_wv.predict(X_test_wv))

0.6191304556469995

# GradientBoosting Classifer

In [152]:
from sklearn.ensemble import GradientBoostingClassifier

In [154]:
clf_gb = GradientBoostingClassifier(n_estimators=100, learning_rate=1.0,max_depth=1, 
                                 random_state=0).fit(X_train_transform, y_train)

In [155]:
accuracy_score(y_test,clf_gb.predict(X_test_transform))

0.6165510953283586

In [157]:
clf_gb_wv = GradientBoostingClassifier(n_estimators=100, learning_rate=1.0,max_depth=1, 
                                 random_state=0).fit(X_train_wv, y_train)

KeyboardInterrupt: 

In [None]:
accuracy_score(y_test,clf_gb.predict(X_test_wv))

## Random Forest Classifier

In [80]:
rf_bow = RandomForestClassifier(n_estimators=500,max_depth=5,random_state=RANDOM_SEED).fit(X_train_transform,y_train)

In [81]:
accuracy_score(y_test,rf_bow.predict(X_test_transform))

0.6257408162775632

In [34]:
rf_wv = RandomForestClassifier(n_estimators=100,max_depth=5,random_state=RANDOM_SEED).fit(X_train_wv,y_train)

KeyboardInterrupt: 

In [None]:
accuracy_score(y_test,rf_wv.predict(X_test_wv))

# 3. Unsupervised Learning

# Topic Modeling 

In [35]:
from sklearn.decomposition import NMF

In [222]:
nmf = NMF(n_components=5,random_state=RANDOM_SEED)
W = nmf.fit_transform(X_train_transform)
H = nmf.components_

In [221]:
W.shape

(333414, 50)

In [225]:
W_test = nmf.transform(X_test_transform)

In [98]:
words = np.array(vectorizer.get_feature_names())
for i, topic in enumerate(H):
     print("Topic {}: {}".format(i + 1, ",".join([str(x) for x in words[topic.argsort()[-10:]]])))

Topic 1: commune pas,france,department,region,nord,nord pas,calais department,pas calais,calais,pas
Topic 2: lrb known,lrb called,km rrb,lrb died,german,km,ndash,rrb lrb,lrb,rrb
Topic 3: department,region,north,aisne,aisne department,department north,north france,picardie aisne,region picardie,picardie
Topic 4: united kingdom,county iowa,kentucky,kentucky united,city iowa,iowa,iowa united,united states,states,united
Topic 5: references reading,references websites,list,references external,links,external links,external,notes references,notes,references
Topic 6: basse,basse normandie,normandie,calvados department,calvados,northwest,normandie calvados,region basse,department northwest,northwest france
Topic 7: brazilian football,player plays,player currently,lrb born,rrb japanese,japanese football,japanese,football player,football,player
Topic 8: region france,la,aisne commune,romania,county romania,sur,le,saint,commune region,commune
Topic 9: department,sarthe department,sarthe,region,reg

In [223]:
lr_tm = LogisticRegression(random_state=RANDOM_SEED,max_iter=1000).fit(W,y_train)

In [226]:
accuracy_score(y_test,lr_tm.predict(W_test))

0.5311802672937112

In [46]:
from sklearn.decomposition import TruncatedSVD

In [237]:
svd = TruncatedSVD(n_components=100, n_iter=5,random_state=RANDOM_SEED)
X_LSI_train = svd.fit_transform(X_train_transform)

In [238]:
X_LSI_train.shape

(333414, 100)

In [239]:
X_LSI_test = svd.transform(X_test_transform)

In [240]:
lr_lsi = LogisticRegression(random_state=RANDOM_SEED,max_iter=1000).fit(X_LSI_train,y_train)

In [241]:
accuracy_score(y_test,lr_lsi.predict(X_LSI_test))

0.5800201550015596