## Table of Contents
* [Import and first glance](#import)
* [Target Exploration](#target)
* [Feature Engineering](#FE)
* [Correlations](#corr)
* [A simple linear model](#model)

In [None]:
# packages

# standard
import numpy as np 
import pandas as pd

# plots
import matplotlib.pyplot as plt
import seaborn as sns

# H2O
import h2o
from h2o.estimators.glm import H2OGeneralizedLinearEstimator

In [None]:
# show files
# IPython shell escape: long listing of the competition input directory
!ls -l '../input/learning-agency-lab-automated-essay-scoring-2/'

In [None]:
# config

# pandas display: show every column and never truncate cell content
pd.options.display.max_columns = None
pd.options.display.max_colwidth = None

# colours reused by the plots
default_color_1, default_color_2, default_color_3 = 'darkblue', 'darkgreen', 'darkred'

# seed handed to the model for reproducibility
my_random_seed = 123

<a id='import'></a>
# Import and first glance

In [None]:
# read files
# train, test and sample submission are loaded in this order
df_train, df_test, df_sub = (
    pd.read_csv(csv_path)
    for csv_path in (
        '../input/learning-agency-lab-automated-essay-scoring-2/train.csv',
        '../input/learning-agency-lab-automated-essay-scoring-2/test.csv',
        '../input/learning-agency-lab-automated-essay-scoring-2/sample_submission.csv',
    )
)

In [None]:
# remove redundant spaces
# trim leading/trailing whitespace of the essay text in both data sets
for essay_frame in (df_train, df_test):
    essay_frame['full_text'] = essay_frame['full_text'].str.strip()

In [None]:
# preview training data (first rows only; full essay text is shown because
# the display options above disable truncation)
df_train.head()

In [None]:
# format target as integer
# cast the score column in place so later counts/plots use integer labels
df_train['score'] = df_train['score'].astype(int)

In [None]:
# more details for training data: column dtypes, non-null counts, memory usage
df_train.info()

In [None]:
# show test set (whole frame is displayed, not just the head)
df_test

<a id='target'></a>
# Target Exploration

In [None]:
# plot target
# number of essays per score, ordered by score value
score_counts = df_train['score'].value_counts().sort_index()

plt.figure(figsize=(8, 4))
score_counts.plot.bar(color=default_color_3)
plt.title('Score')
plt.grid()
plt.show()

In [None]:
# basic summary statistics of the target
df_train.score.describe()

<a id='FE'></a>
# Feature Engineering

In [None]:
# add a few features
features_new = ['n_char', 'n_word', 'char_per_word']

# the same three length features are derived for train and test
for frame in (df_train, df_test):
    frame['n_char'] = frame['full_text'].str.len()             # characters per essay
    frame['n_word'] = frame['full_text'].str.split().map(len)  # whitespace-separated tokens
    frame['char_per_word'] = frame['n_char'] / frame['n_word']

In [None]:
# show summary statistics of the new features (training data only)
df_train[features_new].describe()

In [None]:
# plot distributions of new features - histogram
# one wide figure per feature, 50 bins each
for feature in features_new:
    plt.figure(figsize=(10, 3))
    df_train[feature].plot.hist(bins=50, color=default_color_1)
    plt.grid()
    plt.title(feature)
    plt.show()

In [None]:
# plot distributions of new features - boxplot
# one flat, horizontal boxplot per feature
for feature in features_new:
    plt.figure(figsize=(10, 1))
    plt.boxplot(x=df_train[feature], vert=False)
    plt.grid()
    plt.title(feature)
    plt.show()

In [None]:
# log features
features_log = ['log_n_char', 'log_n_word', 'log_char_per_word']

# base-10 log of each length feature, applied identically to train and test
for frame in (df_train, df_test):
    for raw_col, log_col in zip(('n_char', 'n_word', 'char_per_word'), features_log):
        frame[log_col] = np.log10(frame[raw_col])

In [None]:
# plot distributions of log features - histogram
# one wide figure per log feature, 50 bins each
for feature in features_log:
    plt.figure(figsize=(10, 3))
    df_train[feature].plot.hist(bins=50, color=default_color_1)
    plt.grid()
    plt.title(feature)
    plt.show()

In [None]:
# plot distributions of log features - boxplot
# one flat, horizontal boxplot per log feature
for feature in features_log:
    plt.figure(figsize=(10, 1))
    plt.boxplot(x=df_train[feature], vert=False)
    plt.grid()
    plt.title(feature)
    plt.show()

<a id='corr'></a>
# Correlations

In [None]:
# correlations
# Pearson correlation between the raw length features and the target
corr_pearson = df_train[
    ['n_char', 'n_word', 'char_per_word', 'score']
].corr(method='pearson')

fig = plt.figure(figsize=(5, 4))
sns.heatmap(
    corr_pearson,
    vmin=-1, vmax=+1, cmap='RdYlGn',       # fixed scale across the full [-1, 1] range
    annot=True, fmt='.3f',
    linewidths=0.5, linecolor='black',
)
plt.title('Pearson Correlation')
plt.show()

In [None]:
# correlations - log features
# Pearson correlation between the log-transformed features and the target
corr_pearson = df_train[
    ['log_n_char', 'log_n_word', 'log_char_per_word', 'score']
].corr(method='pearson')

fig = plt.figure(figsize=(5, 4))
sns.heatmap(
    corr_pearson,
    vmin=-1, vmax=+1, cmap='RdYlGn',       # fixed scale across the full [-1, 1] range
    annot=True, fmt='.3f',
    linewidths=0.5, linecolor='black',
)
plt.title('Pearson Correlation')
plt.show()

## Plot target vs features:

In [None]:
# target vs number of characters (log)
# joint plot of score against log10 character count
sns.jointplot(
    x='log_n_char',
    y='score',
    data=df_train,
    color=default_color_1,
)
plt.show()

In [None]:
# target vs number of words (log)
# joint plot of score against log10 word count
sns.jointplot(
    x='log_n_word',
    y='score',
    data=df_train,
    color=default_color_1,
)
plt.show()

In [None]:
# target vs characters per word (log)
# joint plot of score against log10 of the characters-per-word ratio
sns.jointplot(
    x='log_char_per_word',
    y='score',
    data=df_train,
    color=default_color_1,
)
plt.show()

In [None]:
# export prepared training data (all columns of df_train at this point) to the working directory
# NOTE(review): the DataFrame index is written as an extra unnamed column here,
# whereas the submission export passes index=False - confirm this is intended
df_train.to_csv('training_data.csv')

<a id='model'></a>
# A simple linear model

In [None]:
# start H2O
# must run before any H2OFrame is created or any model is trained
h2o.init(max_mem_size='8G', nthreads=4) # Use maximum of 8 GB RAM and 4 cores

In [None]:
# upload data in H2O environment
# only the id and the log features are sent; train additionally carries the target
col4upload = ['essay_id', *features_log]
train_hex = h2o.H2OFrame(df_train[[*col4upload, 'score']])
test_hex = h2o.H2OFrame(df_test[col4upload])

In [None]:
# define predictors
# (same list object as features_log, not a copy)
predictors = features_log

# force categorical target
# the score column is converted to a factor so it is modelled as class labels
train_hex['score'] = train_hex['score'].asfactor()

In [None]:
# define GLM
# multinomial GLM with elastic-net penalty and 5-fold cross validation
glm_model = H2OGeneralizedLinearEstimator(
    family='multinomial',
    alpha=0.5,  # 0:Ridge (L2 regularization), 1:LASSO (L1 regularization)
    standardize=True,
    nfolds=5,
    score_each_iteration=True,
    seed=my_random_seed,
)

# and train model
# (trailing ';' keeps the notebook from echoing the returned object)
glm_model.train(x=predictors, y='score', training_frame=train_hex);

In [None]:
# show cross validation results, converted to a pandas DataFrame for display
glm_model.cross_validation_metrics_summary().as_data_frame()

In [None]:
# show coefficients of the fitted GLM
glm_model.coef()

In [None]:
# variable importance plot
# (trailing ';' keeps the notebook from echoing the returned object)
glm_model.varimp_plot();

In [None]:
# predict on training data
# score the training frame in H2O and pull the result back into pandas
pred_train = glm_model.predict(train_hex).as_data_frame()
pred_train.head()

In [None]:
# summary of predictions
# count of predicted classes on train, ordered by class; printed and plotted
train_pred_counts = pred_train['predict'].value_counts().sort_index()
print(train_pred_counts)

plt.figure(figsize=(8, 4))
train_pred_counts.plot.bar(color=default_color_3)
plt.grid()
plt.title('Predictions - Train')
plt.show()

In [None]:
# confusion matrix
# rows: predicted class, columns: actual score (training data)
conf_train = pd.crosstab(index=pred_train['predict'], columns=df_train['score'])
sns.heatmap(
    conf_train,
    cmap='Reds',
    annot=True, fmt='.0f',
    linewidths=0.5, linecolor='black',
)
plt.title('Confusion Matrix - Training')
plt.show()

In [None]:
# scatterplot of probabilities and predictions
# pairwise plots over the columns of pred_train, coloured by the predicted class
# NOTE(review): presumably the remaining columns are per-class probabilities - confirm against the H2O output
sns.pairplot(data=pred_train, hue='predict')
plt.show()

In [None]:
# predict on test data
# score the test frame in H2O and pull the result back into pandas
pred_test = glm_model.predict(test_hex).as_data_frame()
pred_test.head()

In [None]:
# summary of predictions
# count of predicted classes on test, ordered by class; printed and plotted
test_pred_counts = pred_test['predict'].value_counts().sort_index()
print(test_pred_counts)

test_pred_counts.plot.bar(color=default_color_3)
plt.grid()
plt.title('Predictions - Test')
plt.show()

In [None]:
# prepare submission data
# Pair every prediction with the essay_id of the test row it was computed for.
# Assigning the prediction column onto the sample submission relies on pandas
# index alignment, i.e. on sample_submission.csv having exactly the same row
# order and length as test.csv; otherwise scores end up on the wrong essay
# or become NaN without any error.
df_sub = pd.DataFrame({
    'essay_id': df_test['essay_id'].to_numpy(),
    # predictions are class labels; make sure they are written as integers
    'score': pred_test['predict'].astype(int).to_numpy(),
})
# and save
df_sub.to_csv('submission.csv', index = False)