In [None]:
import data_loader
import numpy as np
import pandas as pd
import pickle
import os
import nltk
import re
import timeit

from torch.autograd import Variable
import torch

from sklearn import preprocessing, svm, metrics
from sklearn.model_selection import train_test_split, cross_val_score, StratifiedKFold
from sklearn.externals import joblib
from nltk.corpus import stopwords
from nltk.stem import PorterStemmer
from sklearn.model_selection import cross_val_score

from util.classification.lstm_pos_tagger import LSTMPOSTagger

# Load the train, dev and test splits in one call. `load_data` returns two
# parallel sequences: one with the metadata objects, one with the data objects.
meta_list, data_list = data_loader.load_data(
    load_train=True,
    load_dev=True,
    load_test=True,
)

# Each sequence unpacks into six items: a plain and a "*_corrected" variant
# for every split (presumably original vs. error-corrected text -- confirm
# against data_loader).
(
    train_meta, train_meta_corrected,
    dev_meta, dev_meta_corrected,
    test_meta, test_meta_corrected,
) = meta_list

(
    train_data, train_data_corrected,
    dev_data, dev_data_corrected,
    test_data, test_data_corrected,
) = data_list

In [16]:
# Per-language summary of the training metadata.
#
# The frame is grouped once by native language and the grouping is reused
# for every statistic, instead of building a fresh boolean mask per language
# per statistic. `sort=False` keeps the languages in order of first
# appearance, i.e. the same order `Series.unique()` yields.
# NOTE(review): groupby skips rows whose native_language is missing; confirm
# the column has no NaN if such rows should be reported.
by_language = train_meta.groupby("native_language", sort=False)

# Kept as a notebook-level name in case later cells refer to it.
languages = train_meta["native_language"].unique()

print("# of Sentence: {}".format(len(train_meta)))

# Number of rows (sentences) per language.
print("Sentence distribution:")
stats_df = by_language.size().to_frame("# of sentences").rename_axis(None)
print(stats_df)

# Number of distinct documents per language; a doc_id is treated as an author.
# Missing doc_id values are not counted by `nunique()`.
print("Author distribution:")
stats_df = by_language["doc_id"].nunique().to_frame("# of authors").rename_axis(None)
print(stats_df)

# Descriptive statistics of the exam score per language, restricted to the
# five columns of interest. `rename_axis(None)` drops the index label so the
# printed table has no extra header row.
print("Exam score stats:")
score_columns = ['count', 'mean', 'std', 'max', 'min']
stats_df = by_language["score"].describe()[score_columns].rename_axis(None)
print(stats_df)

# of Sentence: 4124
Sentence distribution:
            # of sentences
Russian                427
French                 401
Spanish                428
Japanese               407
Chinese                414
Turkish                404
Portuguese             407
Korean                 413
German                 400
Italian                423
Author distribution:
            # of authors
Russian               81
French               131
Spanish              175
Japanese              81
Chinese               66
Turkish               73
Portuguese            68
Korean                84
German                69
Italian               76
Exam score stats:
            count       mean       std   max   min
Russian     427.0  26.288056  6.179166  40.0   9.0
French      401.0  27.630923  4.666738  40.0  17.0
Spanish     428.0  26.789720  5.349402  40.0  11.0
Japanese    407.0  27.547912  5.040432  39.0  15.0
Chinese     414.0  26.268116  6.210832  40.0  14.0
Turkish     404.0  27.834158  5.494389  

In [17]:
# Load the per-document (post) metadata table.
post_df = data_loader.load_post_metadata()

# Documents per native language, doubled.
# NOTE(review): the factor 2 presumably reflects two answers (ans1, ans2)
# per document -- confirm.
post_df["native_language"].value_counts().mul(2)

Spanish       400
French        292
Korean        172
Russian       166
Japanese      162
Italian       152
Polish        152
Turkish       150
Greek         148
German        138
Portuguese    136
Chinese       132
Catalan       128
Thai          126
Swedish        30
Dutch           4
Name: native_language, dtype: int64

In [19]:
# Preview the first five rows of the post metadata.
post_df.head(n=5)

Unnamed: 0,doc_id,native_language,age_range,score,ans1,ans1_errors,ans1_score,ans2,ans2_errors,ans2_score
0,doc3139,Japanese,26-30,36.0,"<p>Dear Ms. Belinda Marriott,</p>\n <...","[RJ, MN, RP, RT, UD]",4.3,"<p>I had never taken birthdays <NS type=""DY""><...","[DY, RQ, RC, MA, FD, RV, AGA, S, FN, L, RT]",4.3
1,doc2280,French,16-20,31.0,<p>Dear Mr Robertson</p>\n <p>I am wr...,"[ID, TV, RT, AGV, RT, ID, UT, UD, DJ, UD, W, R...",4.3,"<p>Famous people, such as politicians and film...","[S, MP, SA, MD, FN, FN, FN, RV, ID, AGD, RN, R...",4.2
2,doc944,Thai,21-25,24.0,"<p>Dear Mr. Robertson,</p>\n <p>As we...","[S, RT, DN, RV, AGN, DJ, MA, R, RV, MP, TV, R,...",3.1,<p>I had never wanted to be a hero</p>\n ...,"[TV, RT, UQ, MP, CL, X, RT, DJ, TV, TV, UA, RT...",2.3
3,doc102,Korean,21-25,23.0,"<p>Dear Mr Robertson,</p>\n <p>Thank ...","[MD, MA, R, MA, RT, MT, RV, S, RT, MP, SX, MD,...",2.3,"<p>It was dangerous, but I knew I had to do it...","[S, RT, MD, RT, MD, M, RN, RY, RA, RV, UP]",3.1
4,doc941,French,16-20,38.0,"<p>Dear Mr Robertson,</p>\n <p>I am w...","[RT, MT, S, MT, RY, MP, UD, RP, MY]",5.1,<p>Terror in the Alps</p>\n <p>It was...,"[RP, RD, DY, RP, UY, RV, SX]",5.2
