In [1]:
import pandas as pd
import numpy as np
from sklearn.dummy import DummyClassifier
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import f1_score,accuracy_score
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier
from sklearn.cluster import KMeans
from sklearn.decomposition import PCA
import altair as alt
import seaborn as sns
import re

In [2]:
pip install gensim

Note: you may need to restart the kernel to use updated packages.


In [18]:
from gensim.models.word2vec import Word2Vec
from tqdm.notebook import tqdm
from nltk.tokenize import sent_tokenize, word_tokenize
from collections import Counter
from nltk.corpus import stopwords
import nltk
from nltk.stem import WordNetLemmatizer
# Fetch the NLTK data used later in the notebook: the English stop-word list
# (read via stopwords.words('english')) and the 'punkt' tokenizer data
# (presumably what word_tokenize relies on -- confirm for the installed NLTK version).
nltk.download('stopwords')
nltk.download('punkt')

[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\mryua\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package punkt to
[nltk_data]     C:\Users\mryua\AppData\Roaming\nltk_data...
[nltk_data]   Unzipping tokenizers\punkt.zip.


True

In [4]:
# Notebook-wide seed for stochastic steps (passed to Word2Vec further down).
RANDOM_SEED = 694

In [7]:
# Age-of-Acquisition word norms, parsed with pandas' python engine.
aoawords_path = 'Data/AoA_51715_words.csv'
aoawords_df = pd.read_csv(
    aoawords_path,
    skiprows=0,
    skipfooter=0,
    engine='python',
)
aoawords_df

Unnamed: 0,Word,Alternative.spelling,Freq_pm,Dom_PoS_SUBTLEX,Nletters,Nphon,Nsyll,Lemma_highest_PoS,AoA_Kup,Perc_known,AoA_Kup_lem,Perc_known_lem,AoA_Bird_lem,AoA_Bristol_lem,AoA_Cort_lem,AoA_Schock
0,a,a,20415.27,Article,1,1,1,a,2.89,1.00,2.89,1.00,3.16,,,
1,aardvark,aardvark,0.41,Noun,8,7,2,aardvark,9.89,1.00,9.89,1.00,,,,
2,abacus,abacus,0.24,Noun,6,6,3,abacus,8.69,0.65,8.69,0.65,,,,
3,abacuses,abacuses,0.02,Noun,8,9,4,abacus,,,8.69,0.65,,,,
4,abalone,abalone,0.51,Verb,7,7,4,abalone,12.23,0.72,12.23,0.72,,,,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
51710,zucchini,zucchini,0.96,Noun,8,6,3,zucchini,8.57,1.00,6.79,1.00,,,,
51711,zucchinis,zucchinis,0.04,Noun,9,7,3,zucchini,,,6.79,1.00,,,,
51712,zwieback,zwieback,0.04,Noun,8,6,2,zwieback,16.10,0.53,16.10,0.53,,,,
51713,zygote,zygote,0.14,Noun,6,5,2,zygote,15.38,0.91,15.38,0.91,,,,


Training Dataset

In [9]:
# Labelled training sentences (columns shown below: original_text, label).
train_path = 'Data/WikiLarge_Train.csv'
df = pd.read_csv(
    train_path,
    skiprows=0,
    skipfooter=0,
    engine='python',
)
df

Unnamed: 0,original_text,label
0,There is manuscript evidence that Austen conti...,1
1,"In a remarkable comparative analysis , Mandaea...",1
2,"Before Persephone was released to Hermes , who...",1
3,Cogeneration plants are commonly found in dist...,1
4,"Geneva -LRB- , ; , ; , ; ; -RRB- is the second...",1
...,...,...
416763,A Duke Nukem 3D version has been sold for Xbox...,0
416764,"However , it is becoming replaced as a method ...",0
416765,There are hand gestures in both Hindu and Budd...,0
416766,"If it is necessary to use colors , try to choo...",0


In [10]:
# Fraction of rows labelled 1; 0.5 means the two classes are perfectly balanced.
len(df.loc[df['label'] == 1]) / len(df)

0.5

In [11]:
# Peek at one raw training sentence (row position 50).
df['original_text'].iloc[50]

'He studied in Armenia and Istanbul , then at Wisconsin University which he finished in 1915 .'

In [12]:
# Average character length of the training sentences.
df['original_text'].map(len).mean()
# Interpretation: at roughly 118 characters on average these are short texts, so
# dense representations (which work well with short text) are a reasonable choice.
# TODO: decide where the dense vectors come from -- load pre-trained vectors
#   (Gensim KeyedVectors.load('assets/wikipedia.100.word-vecs.kv'); how is that
#   file generated and used?) or train a Word2Vec model on our own corpus
#   (training data only?) with 100 word-vector features.
# Alternative: a bag-of-words model, i.e. a term-document matrix representation,
#   which has many more features.

117.921906192414

In [13]:
# Hold out 20% of the labelled sentences for evaluation.
# NOTE(review): the split is seeded with a literal 42 rather than RANDOM_SEED.
X, y = df['original_text'], df['label']
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=42,
)

Testing Dataset

In [57]:
# Test set; its label column is empty in the preview below.
# NOTE(review): the preview shows original_text values such as '#NAME?' and bare
# numbers -- confirm the CSV was not altered by a spreadsheet program before use.
test_path = 'Data/WikiLarge_Test.csv'
test_df = pd.read_csv(
    test_path,
    skiprows=0,
    skipfooter=0,
    engine='python',
)
test_df

Unnamed: 0,id,original_text,label
0,0,-2011,
1,1,-2011,
2,2,-2000,
3,3,-1997,
4,4,1.636,
...,...,...,...
119087,119087,#NAME?,
119088,119088,#NAME?,
119089,119089,#NAME?,
119090,119090,#NAME?,


Sample Submission

In [60]:
# Example submission file: one (id, label) row per test sentence.
samplesubmission_path = 'Data/sampleSubmission.csv'
samplesubmission_df = pd.read_csv(
    samplesubmission_path,
    skiprows=0,
    skipfooter=0,
    engine='python',
)
samplesubmission_df

Unnamed: 0,id,label
0,0,0
1,1,0
2,2,1
3,3,1
4,4,0
...,...,...
119087,119087,0
119088,119088,1
119089,119089,1
119090,119090,1


To summarize, the data objects loaded in this notebook are:

aoawords_df (loaded again later as AoA), df (the training data), test_df, samplesubmission_df, dale_chall, concrete_df

# 1. Data Preprocessing

## Bag of Words Model

In [14]:
# TF-IDF over unigrams and bigrams with English stop words removed and min_df=10.
# The vectorizer is fitted on the training split only and then applied to the
# held-out split.
vectorizer = TfidfVectorizer(
    ngram_range=(1, 2),
    stop_words='english',
    min_df=10,
)
X_train_transform = vectorizer.fit_transform(X_train)
X_test_transform = vectorizer.transform(X_test)

In [15]:
X_train_transform

<333414x57773 sparse matrix of type '<class 'numpy.float64'>'
	with 4071111 stored elements in Compressed Sparse Row format>

## Word2Vec Model

In [19]:
stopWords = set(stopwords.words('english'))


def _tokenize_without_stopwords(texts, stop_words):
    """Tokenize each text and drop stop words.

    Parameters
    ----------
    texts : iterable of str
        Raw sentences to tokenize.
    stop_words : set of str
        Lower-cased words to discard; tokens are compared case-insensitively.

    Returns
    -------
    list of list of str
        One token list per input text, keeping the tokens' original casing.
    """
    return [
        [token for token in word_tokenize(text) if token.lower() not in stop_words]
        for text in tqdm(texts)  # one progress bar per call, as before
    ]


# Same processing for both splits; a single helper replaces two copy-pasted loops.
tokenized_text_train = _tokenize_without_stopwords(X_train, stopWords)
tokenized_text_test = _tokenize_without_stopwords(X_test, stopWords)

HBox(children=(FloatProgress(value=0.0, max=333414.0), HTML(value='')))




HBox(children=(FloatProgress(value=0.0, max=83354.0), HTML(value='')))




In [20]:
# Train 100-dimensional Word2Vec embeddings on the training tokens only
# (window=2, min_count=100).
# NOTE(review): with workers=4 a fixed seed may not make training fully
# reproducible -- check the gensim Word2Vec documentation.
model = Word2Vec(
    vector_size=100,
    window=2,
    min_count=100,
    seed=RANDOM_SEED,
    workers=4,
)
model.build_vocab(tokenized_text_train)
model.train(
    tokenized_text_train,
    total_examples=model.corpus_count,
    epochs=model.epochs,
)

(14267494, 24273560)

In [21]:
# Shorthand for the trained model's `wv` attribute (its word vectors); this is what
# generate_dense_features reads from.
word_vectors = model.wv

In [22]:
def generate_dense_features(tokenized_text, word_vectors):
    """Turn tokenized documents into fixed-length dense feature vectors.

    Each document is represented by the mean of the vectors of its tokens that
    are present in ``word_vectors``. A document with no known tokens is
    represented by an all-zero vector.

    Parameters
    ----------
    tokenized_text : iterable of list of str
        One token list per document.
    word_vectors : object
        Must expose ``key_to_index`` (membership test), ``vector_size`` and
        list-of-keys indexing (``word_vectors[list_of_words]``).

    Returns
    -------
    numpy.ndarray
        Array of shape ``(n_documents, word_vectors.vector_size)``; for empty
        input the shape is ``(0, word_vectors.vector_size)``.
    """
    vocabulary = word_vectors.key_to_index
    dim = word_vectors.vector_size

    dense_list = []
    for tokens in tokenized_text:
        known = [word for word in tokens if word in vocabulary]
        if known:
            dense_list.append(np.mean(word_vectors[known], axis=0))
        else:
            # No in-vocabulary token: fall back to a zero vector.
            dense_list.append(np.zeros(dim))

    if not dense_list:
        # np.array([]) would be 1-D; keep the column count so callers always get 2-D.
        return np.empty((0, dim))
    return np.array(dense_list)

In [23]:
# Averaged Word2Vec features for the training and held-out splits.
X_train_wv, X_test_wv = (
    generate_dense_features(token_lists, word_vectors)
    for token_lists in (tokenized_text_train, tokenized_text_test)
)

# Word's Difficulty Considered

== dale_chall.txt ==

This is the Dale Chall 3000 Word List, which is one definition of words that are considered "basic" English.

A summary is at https://www.readabilityformulas.com/articles/dale-chall-readability-word-list.php

In [25]:
# Dale-Chall list of "basic" English words: one word per line, no header row.
dalechall_path = 'Data/dale_chall.txt'
dale_chall = pd.read_csv(
    dalechall_path,
    sep='\t',
    header=None,
    names=['word'],
)
# Set form for fast membership tests.
dale = set(dale_chall['word'].values)

== Concreteness_ratings_Brysbaert_et_al_BRM.txt ==

This file contains concreteness ratings for 40 thousand English lemma words gathered via Amazon Mechanical Turk. The ratings come from a larger list of 63 thousand words and represent all English words known to 85% of the raters.

The file contains nine columns:
1. The word
2. Whether it is a single word or a two-word expression 
3. The mean concreteness rating
4. The standard deviation of the concreteness ratings
5. The number of persons indicating they did not know the word
6. The total number of persons who rated the word
7. Percentage of participants who knew the word
8. The SUBTLEX-US frequency count (on a total of 51 million; Brysbaert & New, 2009) 
9. The dominant part-of-speech usage

Original source: http://crr.ugent.be/archives/1330

Brysbaert, M., Warriner, A.B., & Kuperman, V. (2014). Concreteness ratings for 40 thousand generally known English word lemmas. Behavior Research Methods, 46, 904-911.
http://crr.ugent.be/papers/Brysbaert_Warriner_Kuperman_BRM_Concreteness_ratings.pdf

In [26]:
# Concreteness ratings (Brysbaert et al.), tab-separated.
# keep_default_na=False stops pandas from converting entries that look like NA
# markers into missing values, so every entry in the Word column stays a string.
concreteness_path = 'Data/Concreteness_ratings_Brysbaert_et_al_BRM.txt'
concrete_df = pd.read_csv(concreteness_path, delimiter='\t', keep_default_na=False)
# Build an actual set (like `dale` and `AoA_set`) so `word in concreteset` is a
# hash lookup instead of a linear scan over a NumPy array.
concreteset = set(concrete_df['Word'].values)

== AoA_51715_words.csv ==

This file contains "Age of Acquisition" (AoA) estimates for about 51k English words, which refers to the approximate age (in years) when a word was learned. Early words, being more basic, have lower average AoA.

The main columns you will be interested in are "Word" and "AoA_Kup_lem". But the others may be useful too.

The file contains these columns:

Word :: The word in question
Alternative.spelling :: if the Word may be spelled frequently in another form	
Freq_pm	:: Freq of the Word in general English (larger -> more common)
Dom_PoS_SUBTLEX	:: Dominant part of speech in general usage
Nletters :: number of letters 
Nphon :: number of phonemes
Nsyll :: number of syllables
Lemma_highest_PoS :: the "lemmatized" or "root" form of the word (in the dominant part of speech), e.g. the root form of the verb "abates" is "abate".
AoA_Kup	:: The AoA from a previous study by Kuperman et al.
Perc_known :: Percent of people who knew the word in the Kuperman et al. study
AoA_Kup_lem :: Estimated AoA based on Kuperman et al. study lemmatized words. THIS IS THE MAIN COLUMN OF INTEREST.
Perc_known_lem	:: Estimated percentage of people who would know this form of the word in the Kuperman study.
AoA_Bird_lem :: AoA reported in previous study by Bird (2001) 
AoA_Bristol_lem	:: AoA reported in previous study from Bristol Univ. (2006)
AoA_Cort_lem :: AoA reported in previous study by Cortese & Khanna (2008)
AoA_Schock :: AoA reported in previous study by Schock (2012)

Original source : http://crr.ugent.be/archives/806

In [27]:
#AoA
#Perc_known_lem, AoA_Kup_lem
aoawords_path = 'Data/AoA_51715_words.csv'
AoA = pd.read_csv(aoawords_path,encoding = 'unicode_escape')
AoA_set = set(AoA['Word'].values)
AoA.head(5)

Unnamed: 0,Word,Alternative.spelling,Freq_pm,Dom_PoS_SUBTLEX,Nletters,Nphon,Nsyll,Lemma_highest_PoS,AoA_Kup,Perc_known,AoA_Kup_lem,Perc_known_lem,AoA_Bird_lem,AoA_Bristol_lem,AoA_Cort_lem,AoA_Schock
0,a,a,20415.27,Article,1,1,1,a,2.89,1.0,2.89,1.0,3.16,,,
1,aardvark,aardvark,0.41,Noun,8,7,2,aardvark,9.89,1.0,9.89,1.0,,,,
2,abacus,abacus,0.24,Noun,6,6,3,abacus,8.69,0.65,8.69,0.65,,,,
3,abacuses,abacuses,0.02,Noun,8,9,4,abacus,,,8.69,0.65,,,,
4,abalone,abalone,0.51,Verb,7,7,4,abalone,12.23,0.72,12.23,0.72,,,,


In [28]:
# Vocabulary of the trained Word2Vec model as a set (around 6k words).
model_word = set(word_vectors.index_to_key)