In [1]:
import pandas as pd
import numpy as np
from sklearn.dummy import DummyClassifier
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import f1_score,accuracy_score
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier
from sklearn.cluster import KMeans
from sklearn.decomposition import PCA
import altair as alt
import re

In [2]:
pip install gensim

Note: you may need to restart the kernel to use updated packages.


In [15]:
from gensim.models.word2vec import Word2Vec
from tqdm.notebook import tqdm
from nltk.tokenize import sent_tokenize, word_tokenize
from collections import Counter
from nltk.corpus import stopwords
import nltk
from nltk.stem import WordNetLemmatizer
nltk.download('stopwords')
nltk.download('punkt')
nltk.download('wordnet')

[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\mryua\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package punkt to
[nltk_data]     C:\Users\mryua\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package wordnet to
[nltk_data]     C:\Users\mryua\AppData\Roaming\nltk_data...
[nltk_data]   Package wordnet is already up-to-date!


True

In [16]:
RANDOM_SEED=694

In [17]:
df = pd.read_csv('Data\WikiLarge_Train.csv')
len(df[df['label']==1])/len(df) # the dataset label is well balanced 

0.5

In [18]:
X = df['original_text']
y = df['label']
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# 1. Data Preprocessing

## Bag of Words Model

In [19]:
vectorizer = TfidfVectorizer(min_df=10,stop_words='english',ngram_range=(1,2))
X_train_transform = vectorizer.fit_transform(X_train)
X_test_transform  = vectorizer.transform(X_test)

In [20]:
X_train_transform

<333414x57516 sparse matrix of type '<class 'numpy.float64'>'
	with 4053454 stored elements in Compressed Sparse Row format>

## Word2Vec Model

In [11]:
tokenized_text_train=[]
tokenized_text_test=[]
stopWords = set(stopwords.words('english'))

for text in tqdm(X_train):
    tokens_in_text = word_tokenize(text)
    tokens_in_text = [word for word in tokens_in_text if word.lower() not in stopWords]
    tokenized_text_train.append(tokens_in_text)
    
for text in tqdm(X_test):
    tokens_in_text = word_tokenize(text)
    tokens_in_text = [word for word in tokens_in_text if word.lower() not in stopWords]
    tokenized_text_test.append(tokens_in_text)

HBox(children=(FloatProgress(value=0.0, max=333414.0), HTML(value='')))




HBox(children=(FloatProgress(value=0.0, max=83354.0), HTML(value='')))




In [21]:
model = Word2Vec(vector_size=100,window=2,min_count=100,seed= RANDOM_SEED,workers=4)
model.build_vocab(tokenized_text_train)
model.train(tokenized_text_train,total_examples=model.corpus_count,epochs=model.epochs)

(14256767, 24263135)

In [22]:
word_vectors = model.wv

In [23]:
def generate_dense_features(tokenized_text,word_vectors):
    dense_list=[]
    words=[]
    for _ in tokenized_text: 
        words =[word for word in _ if word in word_vectors.key_to_index]
        
        if len(words) >0:
            dense_list.append(np.mean(word_vectors[words],axis=0))
            
        else: 
            dense_list.append(np.zeros(word_vectors.vector_size))
            
    return np.array(dense_list)

In [24]:
X_train_wv = generate_dense_features(tokenized_text_train,word_vectors)
X_test_wv = generate_dense_features(tokenized_text_test,word_vectors)


# Word's Difficulty Considered

In [26]:
#Basic english words
dale_chall = pd.read_csv('Data\dale_chall.txt',delimiter='\t',header=None,names=['word'])
dale = set(dale_chall['word'].values)

In [27]:
#Concreteness rating
concrete_df = pd.read_csv('Data\Concreteness_ratings_Brysbaert_et_al_BRM.txt',delimiter='\t')
concreteset=(concrete_df['Word'].values)

In [28]:
#AoA
#Perc_known_lem, AoA_Kup_lem
AoA = pd.read_csv('Data\AoA_51715_words.csv',encoding = 'unicode_escape')
AoA_set = set(AoA['Word'].values)
AoA.head(5)

Unnamed: 0,Word,Alternative.spelling,Freq_pm,Dom_PoS_SUBTLEX,Nletters,Nphon,Nsyll,Lemma_highest_PoS,AoA_Kup,Perc_known,AoA_Kup_lem,Perc_known_lem,AoA_Bird_lem,AoA_Bristol_lem,AoA_Cort_lem,AoA_Schock
0,a,a,20415.27,Article,1,1,1,a,2.89,1.0,2.89,1.0,3.16,,,
1,aardvark,aardvark,0.41,Noun,8,7,2,aardvark,9.89,1.0,9.89,1.0,,,,
2,abacus,abacus,0.24,Noun,6,6,3,abacus,8.69,0.65,8.69,0.65,,,,
3,abacuses,abacuses,0.02,Noun,8,9,4,abacus,,,8.69,0.65,,,,
4,abalone,abalone,0.51,Verb,7,7,4,abalone,12.23,0.72,12.23,0.72,,,,


In [29]:
model_word = set(word_vectors.index_to_key) #around 6k words in the Word2Vec model

In [30]:
len(model_word.intersection(concreteset))

2623

In [31]:
lemmatizer = WordNetLemmatizer()
word_list = []
for word in model_word: 
    word_list.append((word,lemmatizer.lemmatize(word.lower())))
df = pd.DataFrame(word_list,columns=['Original','word'])
df = df.merge(AoA,left_on='word',right_on='Word',how='left')
df = df[['Original','word','Perc_known','AoA_Kup_lem']]
word_not_matched = set(df[df['Perc_known'].isnull()].word.values)

for i in range(len(df)):   
    if df['word'][i][0] in set(('0','1','2','3','4','5','6','7','8','9')) or len(df['word'][i])==1:
        df['AoA_Kup_lem'][i] = 3
mean_value = df['AoA_Kup_lem'].mean()
df['AoA_Kup_lem'].fillna(value=mean_value,inplace=True)


A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df['AoA_Kup_lem'][i] = 3


In [32]:
#df.loc[df['Original']==['troops','weapons']]
df[df['Original'].isin(['troops','weapon'])]

Unnamed: 0,Original,word,Perc_known,AoA_Kup_lem
8,troops,troop,1.0,8.35
1254,weapon,weapon,1.0,6.95


In [33]:
def generate_perc_known(tokenized_text,df):
    avg_perc_know=None
    perc_know_list=[]
    for _ in tokenized_text: 
        words =[word for word in _ if word in word_vectors.key_to_index]
        
        if len(words) >0:
            avg_perc_know = np.mean(df[df['Original'].isin(words)]['AoA_Kup_lem'])
            perc_know_list.append(avg_perc_know)
        else: 
            
            perc_know_list.append(0)
            
    return perc_know_list

In [None]:
df_train = pd.DataFrame(X_train_wv)
df_train['year'] = generate_perc_known(tokenized_text_train,df)

In [None]:
df_test = pd.DataFrame(X_test_wv)
df_test['year'] = generate_perc_known(tokenized_text_test,df)

In [None]:
df_test

In [None]:
lr = LogisticRegression(random_state=RANDOM_SEED,max_iter=1000).fit(df_train,y_train)

In [None]:
accuracy_score(y_test,lr.predict(df_test))

# 2. Supervised Learning

## Random Classifier

In [None]:
dummy_bow = DummyClassifier(strategy='uniform',random_state=RANDOM_SEED).fit(X_train_transform,y_train)

In [None]:
accuracy_score(y_test, dummy_bow.predict(X_test_transform))

In [None]:
dummy_wv = DummyClassifier(strategy='uniform',random_state=RANDOM_SEED).fit(X_train_wv,y_train)

In [None]:
accuracy_score(y_test,dummy_wv.predict(X_test_wv))

## Logistic Regression Classifier

In [None]:
lr_bow = LogisticRegression(random_state=RANDOM_SEED,max_iter=1000).fit(X_train_transform,y_train)

In [None]:
accuracy_score(y_test,lr_bow.predict(X_test_transform))

In [None]:
lr_wv = LogisticRegression(random_state=RANDOM_SEED,max_iter=1000).fit(X_train_wv,y_train)

In [None]:
accuracy_score(y_test,lr_wv.predict(X_test_wv))

## Random Forest Classifier

In [None]:
rf_bow = RandomForestClassifier(n_estimators=500,max_depth=5,random_state=RANDOM_SEED).fit(X_train_transform,y_train)

In [None]:
accuracy_score(y_test,rf_bow.predict(X_test_transform))

In [None]:
rf_wv = RandomForestClassifier(n_estimators=100,max_depth=5,random_state=RANDOM_SEED).fit(X_train_wv,y_train)

In [None]:
accuracy_score(y_test,rf_wv.predict(X_test_wv))

# 3. Unsupervised Learning

In [None]:
kmeans = KMeans(n_clusters=2,random_state=RANDOM_SEED).fit(X_train_transform)

In [None]:
cluster_df = pd.DataFrame({'cluster':kmeans.labels_,'y_label':y_train,'text':X_train})
cluster_df

In [None]:
kmeans