In [3]:
import pandas as pd 
import numpy as np 
import matplotlib.pyplot as plt
import nltk
import pandas as pd
from sklearn.feature_extraction.text import TfidfVectorizer
from nltk.corpus import stopwords
from nltk.stem import PorterStemmer
from nltk.tokenize import word_tokenize
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
nltk.download('punkt')
nltk.download('stopwords')
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score, roc_auc_score
from sklearn.model_selection import GridSearchCV
from sklearn.ensemble import RandomForestClassifier, GradientBoostingClassifier
from sklearn import svm



[nltk_data] Downloading package punkt to /opt/conda/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package stopwords to /opt/conda/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


In [4]:
train_df = pd.read_csv('.git/text-difficulty-prediction/dataset/WikiLarge_Train.csv')
test_df = pd.read_csv('.git/text-difficulty-prediction/dataset/WikiLarge_Test.csv')

In [5]:
X_train, X_val, y_train, y_val = train_test_split(train_df['original_text'], train_df['label'], test_size=0.15, random_state=42)

In [6]:
#making sure class balance is still okay
class_counts = y_train.value_counts()
class_counts

0    177224
1    177028
Name: label, dtype: int64

In [7]:
#set up tf-idf for machine learning on just training set
#then set up using features

In [8]:
X_val.head()

8521      Diego María de la Concepción Juan Nepomuceno E...
182810    Some of the 1930s trams are still in regular s...
275464                Emperor Go-Momozono -LRB- Japan -RRB-
176814    In other countries , potassium iodate is used ...
196293    Located in a region called Planalto Central , ...
Name: original_text, dtype: object

In [9]:

stemmer = PorterStemmer()
#need to provide a series or dataframe
def text_clean(text):
    #Tokenize
    tokens = text.apply(word_tokenize)
    #Stem
    stemmed = tokens.apply(text_stemmer)
    #rejoin from list to sentence
    cleaned_text = stemmed.apply(" ".join)
    return cleaned_text
def text_stemmer(token_text):
    stemmed_words = [stemmer.stem(word) for word in token_text]
    return stemmed_words

In [10]:
X_val_clean = text_clean(X_val)
X_train_clean = text_clean(X_train)

In [11]:
vectorizer = TfidfVectorizer(stop_words='english',min_df = 20, max_df = 0.90)

In [12]:


# # Now we convert the cleaned text to a TF-IDF representation
# vectorizer = TfidfVectorizer()
X = vectorizer.fit_transform(X_train_clean)
X_val = vectorizer.transform(X_val_clean)

In [13]:
#functions that do the following -- tokenize, stem, , vectorizer mindf maxdf stopword removal
# split data into train and validation

In [14]:
lr_clf = LogisticRegression(random_state=42).fit(X,y_train)

In [15]:
preds = lr_clf.predict(X_val)

In [16]:
accuracy = accuracy_score(y_val, preds)
precision = precision_score(y_val, preds)
recall = recall_score(y_val, preds)
f1 = f1_score(y_val, preds)
roc_auc = roc_auc_score(y_val, preds)
print(f'Accuracy: {accuracy}')
print(f'Precision: {precision}')
print(f'Recall: {recall}')
print(f'F1 Score: {f1}')
print(f'ROC AUC Score: {roc_auc}')

Accuracy: 0.6786902552946446
Precision: 0.6861844496579982
Recall: 0.662265595101416
F1 Score: 0.6740128856359241
ROC AUC Score: 0.6787419117997452


In [17]:
# #parameter tuning
# model = LogisticRegression()
# param_grid = {
#     'C': [0.001, 0.01, 0.1, 1, 10, 100, 1000],
#     'penalty': ['l1', 'l2'],
#     'solver': ['liblinear', 'saga']
# }

# # Create the GridSearchCV object
# grid_search = GridSearchCV(model, param_grid, cv=5)

# # Fit the GridSearchCV object to the data
# grid_search.fit(X,y_train)
# print('Best parameters:', grid_search.best_params_)
# preds = grid_search.predict(X_val)

In [18]:
# accuracy = accuracy_score(y_val, preds)
# precision = precision_score(y_val, preds)
# recall = recall_score(y_val, preds)
# f1 = f1_score(y_val, preds)
# roc_auc = roc_auc_score(y_val, preds)
# print(f'Accuracy: {accuracy}')
# print(f'Precision: {precision}')
# print(f'Recall: {recall}')
# print(f'F1 Score: {f1}')
# print(f'ROC AUC Score: {roc_auc}')

In [None]:
# Initialize the models
rf_model = RandomForestClassifier()
# gb_model = GradientBoostingClassifier()
# svm_model = svm.SVC()

# Fit the models
rf_model.fit(X, y_train)
# gb_model.fit(X, y_train)
# svm_model.fit(X, y_train)

# Make predictions on validation for now 
rf_predictions = rf_model.predict(X_val)
# gb_predictions = gb_model.predict(X_val)
# svm_predictions = svm_model.predict(X_val)



In [None]:

rf_f1 = f1_score(y_val, rf_predictions)
# gb_f1 = f1_score(y_val, gb_predictions)
# svm_f1 = f1_score(y_val, svm_predictions)

print(f'F1 Score for random forest: {rf_f1}')
# print(f'F1 Score for gradient boosted forest: {gb_f1}')
# print(f'F1 Score for SVM: {svm_f1}')

In [25]:
features_aoa = pd.read_csv('.git/text-difficulty-prediction/features/AoA_51715_words.csv')
features_concrete = pd.read_csv('.git/text-difficulty-prediction/features/Concreteness_ratings_Brysbaert_et_al_BRM.txt', delimiter = '\t')
features_dale_chall = pd.read_csv('.git/text-difficulty-prediction/features/dale_chall.txt',delimiter='\t')

In [26]:

"""
This file contains "Age of Acquisition" (AoA) estimates for about 51k English words, which refers to the approximate age (in years) when a word was learned. Early words, being more basic, have lower average AoA.

The main columns you will be interested in are "Word" and "AoA_Kup_lem". But the others may be useful too.

Word :: The word in question
Alternative.spelling :: if the Word may be spelled frequently in another form	
Freq_pm	:: Freq of the Word in general English (larger -> more common)
Dom_PoS_SUBTLEX	:: Dominant part of speech in general usage
Nletters :: number of letters 
Nphon :: number of phonemes
Nsyll :: number of syllables
Lemma_highest_PoS :: the "lemmatized" or "root" form of the word (in the dominant part of speech. e.g. The root form of the verb "abates" is "abate".
AoA_Kup	:: The AoA from a previous study by Kuperman et al.
Perc_known :: Percent of people who knew the word in the Kuperman et al. study
AoA_Kup_lem :: Estimated AoA based on Kuperman et al. study lemmatized words. THIS IS THE MAIN COLUMN OF INTEREST.
Perc_known_lem	:: Estimated percentage of people who would know this form of the word in the Kuperman study.
AoA_Bird_lem :: AoA reported in previous study by Bird (2001) 
AoA_Bristol_lem	:: AoA reported in previous study from Bristol Univ. (2006)
AoA_Cort_lem :: AoA reported in previous study by Cortese & Khanna (2008)
AoA_Schock :: AoA reported in previous study by Schock (2012)

"""
features_aoa.head()

Unnamed: 0,Word,Alternative.spelling,Freq_pm,Dom_PoS_SUBTLEX,Nletters,Nphon,Nsyll,Lemma_highest_PoS,AoA_Kup,Perc_known,AoA_Kup_lem,Perc_known_lem,AoA_Bird_lem,AoA_Bristol_lem,AoA_Cort_lem,AoA_Schock
0,a,a,20415.27,Article,1,1,1,a,2.89,1.0,2.89,1.0,3.16,,,
1,aardvark,aardvark,0.41,Noun,8,7,2,aardvark,9.89,1.0,9.89,1.0,,,,
2,abacus,abacus,0.24,Noun,6,6,3,abacus,8.69,0.65,8.69,0.65,,,,
3,abacuses,abacuses,0.02,Noun,8,9,4,abacus,,,8.69,0.65,,,,
4,abalone,abalone,0.51,Verb,7,7,4,abalone,12.23,0.72,12.23,0.72,,,,


In [27]:
"""
This file contains concreteness ratings for 40 thousand English lemma words gathered via Amazon Mechanical Turk. The ratings come from a larger list of 63 thousand words and represent all English words known to 85% of the raters.

The file contains eight columns:
1. The word
2. Whether it is a single word or a two-word expression 
3. The mean concreteness rating
4. The standard deviation of the concreteness ratings
5. The number of persons indicating they did not know the word
6. The total number of persons who rated the word
7. Percentage participants who knew the word
8. The SUBTLEX-US frequency count (on a total of 51 million; Brysbaert & New, 2009) 
9. The dominant part-of-speech usage

"""
features_concrete.head()

Unnamed: 0,Word,Bigram,Conc.M,Conc.SD,Unknown,Total,Percent_known,SUBTLEX,Dom_Pos
0,roadsweeper,0,4.85,0.37,1,27,0.96,0,0
1,traindriver,0,4.54,0.71,3,29,0.9,0,0
2,tush,0,4.45,1.01,3,25,0.88,66,0
3,hairdress,0,3.93,1.28,0,29,1.0,1,0
4,pharmaceutics,0,3.77,1.41,4,26,0.85,0,0


In [28]:
"""
This is the Dale Chall 3000 Word List, which is one definition of words that are considered "basic" English.
A summary is at https://www.readabilityformulas.com/articles/dale-chall-readability-word-list.php

"""
features_dale_chall.head()

Unnamed: 0,a
0,able
1,aboard
2,about
3,above
4,absent
