In [3]:
import numpy as np
import pandas as pd
import os
from sklearn.model_selection import train_test_split
from gensim.test.utils import common_texts
from nltk.tokenize import TweetTokenizer
from gensim.models.doc2vec import Doc2Vec, TaggedDocument
from gensim.test.utils import get_tmpfile
from sklearn.ensemble import RandomForestRegressor
from sklearn.metrics import mean_squared_error

In [5]:
!pip install ../input/textstat-pypi/Pyphen-0.9.3-py2.py3-none-any.whl
!pip install ../input/textstat-pypi/textstat-0.7.0-py3-none-any.whl

Processing /kaggle/input/textstat-pypi/Pyphen-0.9.3-py2.py3-none-any.whl
Installing collected packages: Pyphen
Successfully installed Pyphen-0.9.3
Processing /kaggle/input/textstat-pypi/textstat-0.7.0-py3-none-any.whl
Installing collected packages: textstat
Successfully installed textstat-0.7.0


In [6]:
import textstat

## Data Analysis

In [8]:
for dirname, _, filenames in os.walk('/kaggle/input/commonlitreadabilityprize'):
    for filename in filenames:
        print(os.path.join(dirname, filename))

/kaggle/input/commonlitreadabilityprize/sample_submission.csv
/kaggle/input/commonlitreadabilityprize/train.csv
/kaggle/input/commonlitreadabilityprize/test.csv


In [9]:
train_df = pd.read_csv('/kaggle/input/commonlitreadabilityprize/train.csv')
test_df = pd.read_csv('/kaggle/input/commonlitreadabilityprize/test.csv')
sample_submission_df = pd.read_csv('/kaggle/input/commonlitreadabilityprize/sample_submission.csv')

In [10]:
train_df.head()

Unnamed: 0,id,url_legal,license,excerpt,target,standard_error
0,c12129c31,,,When the young people returned to the ballroom...,-0.340259,0.464009
1,85aa80a4c,,,"All through dinner time, Mrs. Fayre was somewh...",-0.315372,0.480805
2,b69ac6792,,,"As Roger had predicted, the snow departed as q...",-0.580118,0.476676
3,dd1000b26,,,And outside before the palace a great garden w...,-1.054013,0.450007
4,37c1b32fb,,,Once upon a time there were Three Bears who li...,0.247197,0.510845


In [11]:
print('train data stats')
train_df.describe()

train data stats


Unnamed: 0,target,standard_error
count,2834.0,2834.0
mean,-0.959319,0.491435
std,1.033579,0.034818
min,-3.676268,0.0
25%,-1.69032,0.468543
50%,-0.91219,0.484721
75%,-0.20254,0.506268
max,1.71139,0.649671


In [12]:
test_df.head()

Unnamed: 0,id,url_legal,license,excerpt
0,c0f722661,,,My hope lay in Jack's promise that he would ke...
1,f0953f0a5,,,Dotty continued to go to Mrs. Gray's every nig...
2,0df072751,,,It was a bright and cheerful scene that greete...
3,04caf4e0c,https://en.wikipedia.org/wiki/Cell_division,CC BY-SA 3.0,Cell division is the process by which a parent...
4,0e63f8bea,https://en.wikipedia.org/wiki/Debugging,CC BY-SA 3.0,Debugging is the process of finding and resolv...


The test data doesn't have the standard_error column

Useful information from the Kaggle competition page:
- the hidden private test set includes only blank license / legal information
- the training set has excerpts from several time periods and a wide range of reading ease scores; however, the test set includes a slightly larger proportion of modern texts (the type of texts we want to generalize to) than the training set

In [13]:
sample_submission_df.head()

Unnamed: 0,id,target
0,c0f722661,0.0
1,f0953f0a5,0.0
2,0df072751,0.0
3,04caf4e0c,0.0
4,0e63f8bea,0.0


In [14]:
train_data = train_df[['id', 'excerpt', 'target']]  # not using standard_error for now
train_excerpts = list(train_data['excerpt'])
train_targets = list(train_data['target'])

# getting average number of words in excerpts
word_counts = [len(excerpt.split()) for excerpt in train_excerpts]
print('train excerpts word counts stats:')
print('min: {}, max: {}, average: {}'.format(np.min(word_counts), np.max(word_counts), np.mean(word_counts)))

train excerpts word counts stats:
min: 135, max: 205, average: 172.98200423429782


In [15]:
# split into train and validation sets
X_train, X_val, y_train, y_val = train_test_split(train_excerpts, train_targets, test_size=0.2, random_state=42)
print('# train examples: {}\n# validation examples: {}'.format(len(X_train), len(X_val)))

# train examples: 2267
# validation examples: 567


In [16]:
common_texts  # required format for gensim to train a Doc2Vec model

[['human', 'interface', 'computer'],
 ['survey', 'user', 'computer', 'system', 'response', 'time'],
 ['eps', 'user', 'interface', 'system'],
 ['system', 'human', 'system', 'eps'],
 ['user', 'response', 'time'],
 ['trees'],
 ['graph', 'trees'],
 ['graph', 'minors', 'trees'],
 ['graph', 'minors', 'survey']]

In [17]:
def tokenize(excerpt):
    tknzr = TweetTokenizer()
    return tknzr.tokenize(excerpt)

In [18]:
tokenized_X_train = [tokenize(x) for x in X_train]

In [19]:
all_train_words = list(set([x for sublist in tokenized_X_train for x in sublist]))
total_token_count = len(all_train_words)  # number of unique words (tokens) in the train corpus

# number of embedding dimensions = fourth root of number of possible values (rule of thumb)
num_embedding_dim = int(total_token_count ** (1/4))
print(num_embedding_dim, 'embedding dimensions')

13 embedding dimensions


In [20]:
print('sample: {}...'.format(tokenized_X_train[10][:15]))  # showing first 15 tokens in sample

sample: ['There', 'are', 'two', 'types', 'of', 'lithosphere', ':', 'Oceanic', 'lithosphere', ',', 'which', 'is', 'associated', 'with', 'oceanic']...


In [25]:
# train a doc2vec model
documents = [TaggedDocument(doc, [i]) for i, doc in enumerate(tokenized_X_train)]
doc2vec_model = Doc2Vec(documents, vector_size=num_embedding_dim, window=2, min_count=1, workers=4)

In [26]:
# persist doc2vec model
fname = get_tmpfile("/kaggle/working/doc2vec_model")
doc2vec_model.save(fname)

In [27]:
def generate_embedding_vector(tokenized_excerpt, doc2vec_model):
    return doc2vec_model.infer_vector(tokenized_excerpt)

In [28]:
# get train embeddings
doc2vec_model = Doc2Vec.load(fname)  # load persisted model
X_train_embeddings = [generate_embedding_vector(t, doc2vec_model) for t in tokenized_X_train]

In [29]:
print('sample embedding: {}'.format(X_train_embeddings[0]))

sample embedding: [ 0.41311997 -0.4820411   0.6738869  -0.16968697 -0.5235968  -0.05713882
  1.0323118  -1.3283008  -0.5420746   0.8708484   1.3804501  -1.1645277
 -1.0705831 ]


In [30]:
# training a random forest regressor
rf_regressor = RandomForestRegressor()
rf_regressor.fit(X_train_embeddings, y_train)

RandomForestRegressor()

In [31]:
def predict_readability(excerpt):
    tokenized = tokenize(excerpt)
    embedding = generate_embedding_vector(tokenized, doc2vec_model)
    return rf_regressor.predict([embedding])[0]

In [32]:
# get train, val set predictions
X_train_predictions = [predict_readability(x) for x in X_train]
X_val_predictions = [predict_readability(x) for x in X_val]

In [33]:
def rmse(actual_scores, predicted_scores):
    return mean_squared_error(actual_scores, predicted_scores, squared=False)

In [34]:
print('train set rmse: {}'.format(rmse(y_train, X_train_predictions)))
print('validation set rmse: {}'.format(rmse(y_val, X_val_predictions)))

train set rmse: 0.6968522710210939
validation set rmse: 0.8218857266510295


In [35]:
# make test set predictions
test_ids = list(test_df['id'])
test_excerpts = list(test_df['excerpt'])
test_predictions = [predict_readability(excerpt) for excerpt in test_excerpts]

In [36]:
# write to submission.csv file
submission_df = pd.DataFrame()
submission_df['id'] = test_ids
submission_df['target'] = test_predictions
print(submission_df.head())
submission_df.to_csv('/kaggle/working/submission.csv', index=False)

          id    target
0  c0f722661 -0.655321
1  f0953f0a5 -0.102582
2  0df072751  0.049701
3  04caf4e0c -1.190967
4  0e63f8bea -1.163791


In [42]:
# using textstat to add well-known readability scores as features
def compute_readability_scores(excerpt):
    scores = [textstat.flesch_reading_ease(excerpt),
              textstat.smog_index(excerpt),
              textstat.flesch_kincaid_grade(excerpt),
              textstat.coleman_liau_index(excerpt),
              textstat.automated_readability_index(excerpt),
              textstat.dale_chall_readability_score(excerpt),
              textstat.difficult_words(excerpt),
              textstat.linsear_write_formula(excerpt),
              textstat.gunning_fog(excerpt),
              textstat.text_standard(excerpt, float_output=True)
             ]
    return scores

In [48]:
def generate_record(excerpt):
    tokenized = tokenize(excerpt)
    embedding = generate_embedding_vector(tokenized, doc2vec_model)
    record = np.append(embedding, compute_readability_scores(excerpt))
    
    return record

In [49]:
generate_record(X_train[0])

array([ 0.40152466, -0.50437999,  0.827335  , -0.12957533, -0.77087545,
       -0.30823734,  0.81785798, -1.47232091, -0.49432743,  0.88223726,
        1.31406903, -1.05178201, -1.22974789, 45.8       , 14.8       ,
       13.2       , 11.84      , 15.3       ,  8.55      , 41.        ,
       16.5       , 14.64      , 15.        ])

In [50]:
X_train_records = [generate_record(excerpt) for excerpt in X_train]
X_val_records = [generate_record(excerpt) for excerpt in X_val]

In [51]:
new_rf_regressor = RandomForestRegressor()
new_rf_regressor.fit(X_train_records, y_train)

RandomForestRegressor()

In [52]:
def predict_readability_new(record):
    return new_rf_regressor.predict([record])[0]

In [53]:
# get train, val set predictions
X_train_predictions = [predict_readability_new(x) for x in X_train_records]
X_val_predictions = [predict_readability_new(x) for x in X_val_records]

In [54]:
print('train set rmse: {}'.format(rmse(y_train, X_train_predictions)))
print('validation set rmse: {}'.format(rmse(y_val, X_val_predictions)))

train set rmse: 0.2832265399038581
validation set rmse: 0.7545356816664058


In [55]:
test_records = [generate_record(excerpt) for excerpt in test_excerpts]
test_predictions = [predict_readability_new(record) for record in test_records]

In [56]:
# write to submission.csv file
submission_df = pd.DataFrame()
submission_df['id'] = test_ids
submission_df['target'] = test_predictions
print(submission_df.head())
submission_df.to_csv('/kaggle/working/submission.csv', index=False)

          id    target
0  c0f722661 -0.546765
1  f0953f0a5 -0.029065
2  0df072751 -0.660822
3  04caf4e0c -2.062113
4  0e63f8bea -2.055672
