In [None]:
import re
import warnings
warnings.filterwarnings("ignore")
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import numpy as np
from textblob import TextBlob

from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.linear_model import LogisticRegression as LogReg
from sklearn.linear_model import LogisticRegressionCV as LogRegCV

from sklearn.decomposition import PCA
from sklearn.pipeline import Pipeline
from sklearn.metrics import cohen_kappa_score
%matplotlib inline
import os

In [None]:
def append_regularized_scores(old_df):
    """Return a copy of ``old_df`` with a per-essay-set standardized ``std_score`` column.

    Each score is z-scored against the essays of the same ``essay_set``:
    ``(score - set mean) / set standard deviation``.  The population standard
    deviation (``ddof=0``) is used, which is what ``np.std`` computes.

    Parameters
    ----------
    old_df : pandas.DataFrame
        Must contain the columns ``essay_set`` and ``score``.  It is not modified.

    Returns
    -------
    pandas.DataFrame
        Copy of ``old_df`` with an extra float column ``std_score``.  A set whose
        scores are all identical has zero deviation and yields NaN for its rows.
    """
    new_df = old_df.copy()
    # Cast first so the transformed values are never squeezed back into an int dtype.
    scores_by_set = new_df['score'].astype(float).groupby(new_df['essay_set'])
    # transform() returns a Series aligned with the original index, unlike
    # groupby().apply(), whose result may carry the group key in its index.
    new_df['std_score'] = scores_by_set.transform(
        lambda scores: (scores - scores.mean()) / scores.std(ddof=0)
    )
    return new_df

def create_regularization_data(old_df):
    """Collect the per-essay-set score statistics used to undo standardization.

    Parameters
    ----------
    old_df : pandas.DataFrame
        Must contain the columns ``essay_set`` (positive integers) and ``score``.

    Returns
    -------
    list of [int, float, float]
        One ``[essay_set, mean, std]`` entry for every set id from 1 up to the
        largest id present, in ascending order, so entry ``k`` describes set
        ``k + 1``.  ``std`` is the population standard deviation (``np.std``).
    """
    max_essay_set = max(old_df['essay_set'])
    regularization_data = []
    # Set ids are 1-based; stop at the largest id actually present instead of
    # emitting a trailing NaN entry for a set that does not exist.
    for essay_set in range(1, max_essay_set + 1):
        set_scores = old_df.loc[old_df['essay_set'] == essay_set, 'score']
        regularization_data.append([essay_set, np.mean(set_scores), np.std(set_scores)])
    return regularization_data

In [None]:

# Load the training essays and collapse the two domain scores into one `score`.
train_cols = ['essay_id', 'essay_set', 'essay', 'domain1_score', 'domain2_score']
train_df = pd.read_csv('training_set_rel3.tsv', delimiter='\t', usecols=train_cols, encoding='iso-8859-1')

# Only essay set 2 is expected to carry a second domain score; its two scores
# are summed.  (Vectorized: DataFrame.get_value/set_value no longer exist in
# current pandas.)
has_domain2 = train_df['domain2_score'].notna()
assert (train_df.loc[has_domain2, 'essay_set'] == 2).all(), \
    "domain2_score is only expected for essay set 2"
train_df.loc[has_domain2, 'domain1_score'] = (
    train_df.loc[has_domain2, 'domain1_score'] + train_df.loc[has_domain2, 'domain2_score']
)

train_df = train_df.drop('domain2_score', axis=1)
train_df = train_df.rename(columns={'domain1_score': 'score'})



In [None]:
import re
def clean_str(string):
    """Normalise raw essay text into a lower-cased, space-separated token string.

    The substitutions run in a fixed order:

    1. every character other than letters, digits and ``( ) , . ! ? ' ``` is
       replaced by a space;
    2. the contraction suffixes ``'s 've 're 'd 'll`` are split off with a
       leading space (``n't`` is deliberately left attached);
    3. ``, ! ( ) ?`` are padded with a space on both sides;
    4. runs of two or more whitespace characters collapse to one space.

    The result is stripped of surrounding whitespace and lower-cased.
    """
    substitutions = (
        (r"[^A-Za-z0-9(),.!?\'\`]", " "),
        (r"\'s", " \'s"),
        (r"\'ve", " \'ve"),
        (r"\'re", " \'re"),
        (r"\'d", " \'d"),
        (r"\'ll", " \'ll"),
        (r",", " , "),
        (r"!", " ! "),
        (r"\(", " ( "),
        (r"\)", " ) "),
        (r"\?", " ? "),
        (r"\s{2,}", " "),
    )
    cleaned = string
    for pattern, replacement in substitutions:
        cleaned = re.sub(pattern, replacement, cleaned)
    return cleaned.strip().lower()


# cleaning training data

In [None]:
# Clean every training essay.  Assigning the whole column at once avoids the
# chained assignment `train_df.essay[i] = ...`, which may write to a temporary
# copy instead of the frame.
train_df['essay'] = [clean_str(essay) for essay in train_df['essay']]


In [None]:
################
# Per-set [essay_set, mean, std] statistics, kept so predictions can later be
# mapped back to the raw score scale.
regularization_data = create_regularization_data(train_df)
train_df = append_regularized_scores(train_df)

# Validate that the standardization works: within every essay set the
# standardized scores must have mean 0 and (population) standard deviation 1.
max_essay_set = max(train_df['essay_set'])
for essay_set in range(1, max_essay_set + 1):
    set_std_scores = train_df.loc[train_df["essay_set"] == essay_set, "std_score"]
    if set_std_scores.empty:
        continue  # set id not present in the data
    assert np.isclose(set_std_scores.mean(), 0.0, atol=1e-6), \
        "std_score mean is not 0 for essay set %d" % essay_set
    assert np.isclose(set_std_scores.std(ddof=0), 1.0, atol=1e-6), \
        "std_score deviation is not 1 for essay set %d" % essay_set
################

In [None]:
# Read in validation data
valid_cols = ['essay_id', 'essay_set', 'essay', 'domain1_predictionid', 'domain2_predictionid']
valid_df = pd.read_csv('valid_set.tsv', delimiter='\t', usecols=valid_cols, encoding='iso-8859-1')

# scores are stored in a separate data set, we'll put them in the same one
valid_scores = pd.read_csv('valid_sample_submission_5_column.csv', delimiter=',')

# prediction_id -> predicted_score lookup; when an id occurs more than once the
# first row wins.
score_by_prediction_id = (
    valid_scores.drop_duplicates(subset='prediction_id', keep='first')
    .set_index('prediction_id')['predicted_score']
)

# Every essay must have a domain-1 score.
assert valid_df['domain1_predictionid'].isin(score_by_prediction_id.index).all(), \
    "domain1_predictionid without a matching prediction_id"
domain1_scores = valid_df['domain1_predictionid'].map(score_by_prediction_id)

# Only essay set 2 is expected to have a second prediction id; its two scores are summed.
has_domain2 = valid_df['domain2_predictionid'].notna()
assert (valid_df.loc[has_domain2, 'essay_set'] == 2).all(), \
    "domain2_predictionid is only expected for essay set 2"
# The id column is float because of the NaNs; cast back before the lookup.
domain2_ids = valid_df.loc[has_domain2, 'domain2_predictionid'].astype(valid_scores['prediction_id'].dtype)
assert domain2_ids.isin(score_by_prediction_id.index).all(), \
    "domain2_predictionid without a matching prediction_id"
domain2_scores = domain2_ids.map(score_by_prediction_id)

# Rows without a second score contribute 0; keep the dtype of the source scores.
valid_df['score'] = (
    domain1_scores.add(domain2_scores, fill_value=0)
    .astype(valid_scores['predicted_score'].dtype)
)

valid_df = valid_df.drop(['domain1_predictionid', 'domain2_predictionid'], axis=1)


# cleaning validation data

In [None]:
# Clean every validation essay.  Assigning the whole column at once avoids the
# chained assignment `valid_df.essay[i] = ...`, which may write to a temporary
# copy instead of the frame.
valid_df['essay'] = [clean_str(essay) for essay in valid_df['essay']]


In [None]:
def append_standardized_column(train_df, valid_df, non_std_col_name):
    """Add a per-essay-set standardized copy of a feature column to both frames.

    The mean and population standard deviation (``ddof=0``) of
    ``non_std_col_name`` are computed per ``essay_set`` on ``train_df`` only and
    then applied to both frames, so the validation data is scaled with training
    statistics.

    The result is stored in ``"std_" + non_std_col_name``.  Where a set's
    deviation is zero or unavailable (constant feature, or a set absent from
    ``train_df``) the raw value is kept instead of dividing by zero.

    Parameters
    ----------
    train_df, valid_df : pandas.DataFrame
        Must contain ``essay_set`` and ``non_std_col_name``.  Both frames are
        modified in place (callers may ignore the return value).
    non_std_col_name : str
        Name of the numeric feature column to standardize.

    Returns
    -------
    (pandas.DataFrame, pandas.DataFrame)
        The same ``train_df`` and ``valid_df`` objects that were passed in.
    """
    std_col_name = "std_" + non_std_col_name

    train_values_by_set = train_df.groupby('essay_set')[non_std_col_name]
    set_means = train_values_by_set.mean()
    set_stds = train_values_by_set.std(ddof=0)

    for df in (train_df, valid_df):
        raw = df[non_std_col_name].astype(float)
        means = df['essay_set'].map(set_means)
        stds = df['essay_set'].map(set_stds)
        standardized = (raw - means) / stds
        # Standardize whenever the set has a usable spread; a NaN or zero
        # deviation fails the comparison and falls back to the raw value.
        df[std_col_name] = standardized.where(stds > 0, raw)

    return train_df, valid_df
                
           
                    
# returns a column to place data
def append_zeros_column(df, title):
    """Add (or overwrite) column ``title`` on ``df`` with 0.0 in every row.

    The frame is modified in place and the same object is returned.
    """
    zeros = [0.0 for _ in range(len(df))]
    df[title] = pd.Series(zeros, index=df.index)
    return df

In [None]:
def create_standardization_data(train_df, column_name):
    """Return ``[mean, std]`` of ``column_name`` for every essay set in ``train_df``.

    The list has one entry per set id from 1 up to the largest id found in
    ``train_df['essay_set']``, in ascending order, so entry ``k`` belongs to
    set ``k + 1``.  ``std`` is computed with ``np.std``.
    """
    essay_sets = train_df['essay_set']
    highest_set = max(essay_sets)

    def _stats_for(set_id):
        values = train_df[essay_sets == set_id][column_name]
        return [np.mean(values), np.std(values)]

    return [_stats_for(set_id) for set_id in range(1, highest_set + 1)]

# Feature Extraction

# COUNTING THE NUMBER OF UNIQUE WORDS

In [None]:
from collections import Counter



def _unique_word_ratio(essay):
    """Return the share of distinct whitespace-separated tokens in ``essay`` (0.0 if it has none)."""
    tokens = essay.split()
    if not tokens:
        # An essay that is empty after cleaning would otherwise divide by zero.
        return 0.0
    return float(len(set(tokens))) / len(tokens)


def fill_unique_words_column(train_df, valid_df):
    """Add the ``unique_words`` feature and its standardized version to both frames.

    ``unique_words`` is the number of distinct tokens divided by the total
    number of tokens of each essay (tokens are split on whitespace).  The column
    is written to both frames in place, and ``append_standardized_column`` then
    adds ``std_unique_words``.

    Parameters
    ----------
    train_df, valid_df : pandas.DataFrame
        Must contain a string column ``essay`` and the column ``essay_set``.

    Returns
    -------
    (pandas.DataFrame, pandas.DataFrame)
        The training and validation frames as returned by
        ``append_standardized_column``.
    """
    for frame in (train_df, valid_df):
        frame["unique_words"] = [_unique_word_ratio(essay) for essay in frame["essay"]]

    train_df, valid_df = append_standardized_column(train_df, valid_df, 'unique_words')

    return train_df, valid_df

In [None]:
# Keep the returned frames instead of relying on in-place mutation; this also
# stops the notebook from rendering both full frames as the cell output.
train_df, valid_df = fill_unique_words_column(train_df, valid_df)


# https://textblob.readthedocs.io/en/dev/quickstart.html

# Sentiment Analysis

In [None]:
# Float placeholder for the sentiment polarity of every essay.
train_df['sentiment'] = valid_df['sentiment'] = float(0)



In [None]:
# TextBlob sentiment polarity of every training essay.  The column is assigned
# in one step; the chained form `train_df['sentiment'][i] = ...` may write to a
# temporary copy instead of the frame.
train_df['sentiment'] = [TextBlob(essay).sentiment.polarity for essay in train_df['essay']]


In [None]:
# TextBlob sentiment polarity of every validation essay.  The column is assigned
# in one step; the chained form `valid_df.sentiment[i] = ...` may write to a
# temporary copy instead of the frame.
valid_df['sentiment'] = [TextBlob(essay).sentiment.polarity for essay in valid_df['essay']]


# sentence count and word count

In [None]:
# Integer placeholders for the sentence count (`sent_len`) and word count.
train_df['sent_len'] = train_df['word_count'] = 0
valid_df['sent_len'] = valid_df['word_count'] = 0
from textblob import TextBlob

In [None]:
def _word_and_sentence_counts(essay):
    """Return ``(number of words, number of sentences)`` of ``essay`` as seen by TextBlob."""
    blob = TextBlob(essay)
    return len(blob.words), len(blob.sentences)


# Whole-column assignment replaces the chained `df['col'][i] = ...` writes,
# which may land on a temporary copy instead of the frame.
for frame in (train_df, valid_df):
    essay_counts = [_word_and_sentence_counts(essay) for essay in frame['essay']]
    frame['word_count'] = [words for words, _ in essay_counts]
    frame['sent_len'] = [sentences for _, sentences in essay_counts]

# Spelling Correction


In [None]:
# Integer placeholder for the number of misspelled words per essay.
train_df['mistake_count'] = valid_df['mistake_count'] = 0


In [None]:
# Word -> "is misspelled" cache.  TextBlob's corrector is slow and essays share
# most of their vocabulary, so each distinct word is checked only once.
_misspelled_cache = {}


def _is_misspelled(word):
    """Return True when TextBlob's spelling corrector would change ``word``."""
    if word not in _misspelled_cache:
        _misspelled_cache[word] = str(TextBlob(word).correct()) != word
    return _misspelled_cache[word]


def _count_misspelled_words(essay):
    """Count the words of ``essay`` that TextBlob's spelling corrector would change."""
    # Iterate over words: list(essay) would split the text into single characters.
    return sum(1 for word in TextBlob(essay).words if _is_misspelled(str(word)))


# Whole-column assignment replaces the chained `df.mistake_count[i] = ...`
# writes, which may land on a temporary copy instead of the frame.
for frame in (train_df, valid_df):
    frame['mistake_count'] = [_count_misspelled_words(essay) for essay in frame['essay']]

# Grammar mistake check

In [None]:
# Create a LanguageTool checker configured for 'en-US'.
# NOTE(review): `tool` is rebound to a grammar_check 'en-GB' checker in the next
# cell before it is ever used, so this instance looks redundant - confirm which
# library and dialect are intended.
import language_check
tool = language_check.LanguageTool('en-US')

In [None]:
# Integer placeholder for the number of grammar issues found per essay.
train_df['gramm_mistake'] = valid_df['gramm_mistake'] = 0
import grammar_check
# British-English checker used by the grammar-mistake count.
tool = grammar_check.LanguageTool('en-GB')

In [None]:
# Number of issues the grammar checker reports for every essay.  Whole-column
# assignment replaces the chained `df.gramm_mistake[i] = ...` writes, which may
# land on a temporary copy instead of the frame.
for frame in (train_df, valid_df):
    frame['gramm_mistake'] = [len(tool.check(essay)) for essay in frame['essay']]


# Noun count

In [None]:
# Integer placeholder for the noun-phrase count of every essay.
train_df['noun_count'] = valid_df['noun_count'] = 0


In [None]:
# Download the NLTK 'brown' corpus.
# NOTE(review): presumably needed by TextBlob's noun-phrase extraction used for
# `noun_count` in the next cell - confirm.
import nltk
nltk.download('brown')

In [None]:
# `np_counts` maps each noun phrase to its frequency, so its length is the
# number of DISTINCT noun phrases in the essay.  Whole-column assignment
# replaces the chained `df['noun_count'][i] = ...` writes, which may land on a
# temporary copy instead of the frame.
for frame in (train_df, valid_df):
    frame['noun_count'] = [len(TextBlob(essay).np_counts) for essay in frame['essay']]

# avg word length and avg sentence length


In [None]:
# Float placeholders for the average word length and average sentence length.
train_df['avg_word_len'] = valid_df['avg_word_len'] = float(0)

train_df['avg_sent_len'] = valid_df['avg_sent_len'] = float(0)

In [None]:
def _mean_length(items):
    """Return the mean ``len()`` of ``items`` as a float, or 0.0 when there are none."""
    lengths = [len(item) for item in items]
    if not lengths:
        # Essays without any word/sentence would otherwise divide by zero.
        return 0.0
    return float(sum(lengths)) / len(lengths)


# avg_word_len: mean number of characters per word.
# avg_sent_len: mean number of characters per sentence (len() of a TextBlob
# sentence is the length of its text).
# The word-length feature used to be commented out (its inner loop reused the
# row index `i`), which left `avg_word_len` at 0 for every essay even though the
# column is standardized and fed to the models; it is computed again here.
for frame in (train_df, valid_df):
    blobs = [TextBlob(essay) for essay in frame['essay']]
    frame['avg_word_len'] = [_mean_length(blob.words) for blob in blobs]
    frame['avg_sent_len'] = [_mean_length(blob.sentences) for blob in blobs]
    

# Long word count (words longer than 7 characters)

In [None]:
# Integer placeholder for the number of long words per essay.
train_df['long_word_count'] = valid_df['long_word_count'] = 0



In [None]:
def _count_long_words(essay, min_length=8):
    """Count the words of ``essay`` with at least ``min_length`` characters (default: more than 7)."""
    return sum(1 for word in TextBlob(essay).words if len(word) >= min_length)


# The previous nested loops reused `i` for both the row and the word index, so
# each count was written to the row numbered like the essay's last word.
# Computing the whole column at once removes the index bookkeeping entirely.
for frame in (train_df, valid_df):
    frame['long_word_count'] = [_count_long_words(essay) for essay in frame['essay']]
    

# Vocab richness
https://swizec.com/blog/measuring-vocabulary-richness-with-python/swizec/2528

In [None]:
# Float placeholder for the vocabulary-richness score of every essay.
train_df['vocab_richness'] = valid_df['vocab_richness'] = float(0)




In [None]:
def _vocab_richness(words):
    """Return ``M1**2 / (M2 - M1)`` for a list of words, or 0.0 when undefined.

    ``M1`` is the total number of words and ``M2`` the sum of the squared
    frequencies of the distinct words.  The ratio is undefined when
    ``M2 == M1`` (no word repeats, or there are no words at all); 0.0 is
    returned in that case.
    """
    from collections import Counter

    frequencies = Counter(words).values()
    m1 = len(words)
    # Squared frequencies: `count ^ 2` is a bitwise XOR in Python, not a power.
    m2 = sum(frequency ** 2 for frequency in frequencies)
    denominator = m2 - m1
    if denominator == 0:
        return 0.0
    return float(m1 * m1) / denominator


# The previous nested loops reused `i` for both the row and the word index, so
# each value was written to the wrong row.  Computing the whole column at once
# removes the index bookkeeping and the chained assignment.
for frame in (train_df, valid_df):
    frame['vocab_richness'] = [
        _vocab_richness([str(word) for word in TextBlob(essay).words])
        for essay in frame['essay']
    ]

# proper noun and adjective count

In [None]:
# Tagger model download, then integer placeholders for the part-of-speech
# counts: first the proper-noun column, then the adjective column.
nltk.download('averaged_perceptron_tagger')
for pos_column in ('proper_noun_count', 'adj_count'):
    train_df[pos_column] = 0
    valid_df[pos_column] = 0


In [None]:
# Penn Treebank tags counted for each feature.
PROPER_NOUN_TAGS = ('NNP', 'NNPS')
# NOTE(review): only comparative (JJR) and superlative (JJS) adjectives are
# counted; plain adjectives (JJ) are excluded - confirm this is intended for a
# column named `adj_count`.
ADJECTIVE_TAGS = ('JJR', 'JJS')


def _count_tags(tags, wanted):
    """Count how many entries of ``tags`` are in ``wanted``."""
    return sum(1 for tag in tags if tag in wanted)


# Tag each essay once and derive both counts from it.  Whole-column assignment
# replaces the chained `df['col'][i] = ...` writes, which may land on a
# temporary copy instead of the frame.
for frame in (train_df, valid_df):
    essay_tags = [[tag for _, tag in TextBlob(essay).tags] for essay in frame['essay']]
    frame['proper_noun_count'] = [_count_tags(tags, PROPER_NOUN_TAGS) for tags in essay_tags]
    frame['adj_count'] = [_count_tags(tags, ADJECTIVE_TAGS) for tags in essay_tags]

# Standardizing columns


In [None]:
# Raw feature columns that get a per-essay-set standardized `std_*` twin.
columns_to_standardize = ['word_count', 'noun_count',
       'avg_word_len', 'avg_sent_len', 'long_word_count', 'vocab_richness',
       'proper_noun_count', 'adj_count', 'sent_len']
for column in columns_to_standardize:
    train_df, valid_df = append_standardized_column(train_df, valid_df, column)

# Columns not needed for modelling (`sent_len` is kept, as before).
# errors='ignore' because 'Unnamed: 0' is not among the columns read from the
# data files, so a strict drop would raise a KeyError.
columns_to_drop = ['word_count', 'essay', 'essay_id', 'Unnamed: 0', 'unique_words', 'noun_count',
       'avg_word_len', 'avg_sent_len', 'long_word_count', 'vocab_richness',
       'proper_noun_count', 'adj_count']
train_df = train_df.drop(columns=columns_to_drop, errors='ignore')
valid_df = valid_df.drop(columns=columns_to_drop, errors='ignore')


# Random Forest regression

In [None]:
# Predictor columns shared by the training and validation matrices, in model order.
model_feature_columns = [
    'std_unique_words', 'sentiment',
    'std_word_count', 'std_noun_count', 'std_avg_word_len',
    'std_avg_sent_len', 'std_long_word_count', 'std_vocab_richness',
    'std_proper_noun_count', 'std_adj_count', 'std_sent_len',
]
# Training data additionally carries the standardized target `std_score`.
train = train_df[['essay_set', 'score', 'std_score'] + model_feature_columns]
valid = valid_df[['essay_set', 'score'] + model_feature_columns]

In [None]:
from sklearn.ensemble import RandomForestRegressor
from sklearn.tree import DecisionTreeRegressor

# Predictors are everything except the raw and standardized target columns.
rf_train_features = train.drop(['score', 'std_score'], axis=1)
rf_valid_features = valid.drop('score', axis=1)

# Small seeded forest fitted on the standardized score.
regressor = RandomForestRegressor(n_estimators=6, random_state=0)
regressor.fit(rf_train_features, train.std_score)

# Standardized predictions; the column name is the one the following cells read.
y_pred = regressor.predict(rf_valid_features)
valid_df["Log_L2 predicted_scores"] = y_pred

In [None]:
# Map the standardized predictions back to the raw score scale of each essay
# set: prediction * set std + set mean, truncated to an integer with int()
# semantics.  Each regularization_data entry is [essay_set, mean, std].
# The statistics are looked up per row by essay_set, so the result stays
# aligned with valid_df even when its rows are not sorted by essay set, and the
# number of sets is no longer hard-coded.
# NOTE(review): truncation (rather than rounding) is kept from the original - confirm.
score_mean_by_set = {entry[0]: entry[1] for entry in regularization_data}
score_std_by_set = {entry[0]: entry[2] for entry in regularization_data}

denormalized_scores = (
    valid_df['Log_L2 predicted_scores'] * valid_df['essay_set'].map(score_std_by_set)
    + valid_df['essay_set'].map(score_mean_by_set)
)

# adding the denormalized predicted values to the valid_df dataset
valid_df['newly_predicted_scores_log_l2'] = denormalized_scores.astype(int)
stand_pred_values_l1 = valid_df['newly_predicted_scores_log_l2'].tolist()

In [None]:
# Absolute gap between the true and the predicted score of every validation
# essay, tabulated as "gap size -> number of essays".
score_gap = (valid_df['score'] - valid_df['newly_predicted_scores_log_l2']).abs()
mylist = score_gap.tolist()
uniq, counts = np.unique(mylist, return_counts=True)
pd.DataFrame({'Difference': uniq, 'Counts': counts})

# Gradient Boosting Regression

In [None]:
from sklearn.ensemble import GradientBoostingRegressor
from sklearn import metrics

# `loss` is left at its default, which is least squares in every scikit-learn
# release; the explicit spelling 'ls' is rejected by current versions.
# random_state is fixed so that re-running the notebook gives the same model.
gbrt = GradientBoostingRegressor(learning_rate=0.1, n_estimators=100, max_depth=3, random_state=0)
gbrt.fit(train.drop(['score', 'std_score'], axis=1), train.std_score)

# Standardized predictions; this overwrites the column written by the random
# forest cell because the following cells read this name.
y_pred = gbrt.predict(valid.drop('score', axis=1))
valid_df["Log_L2 predicted_scores"] = y_pred

In [None]:
# Map the standardized predictions back to the raw score scale of each essay
# set: prediction * set std + set mean, truncated to an integer with int()
# semantics.  Each regularization_data entry is [essay_set, mean, std].
# The statistics are looked up per row by essay_set, so the result stays
# aligned with valid_df even when its rows are not sorted by essay set, and the
# number of sets is no longer hard-coded.
# NOTE(review): truncation (rather than rounding) is kept from the original - confirm.
score_mean_by_set = {entry[0]: entry[1] for entry in regularization_data}
score_std_by_set = {entry[0]: entry[2] for entry in regularization_data}

denormalized_scores = (
    valid_df['Log_L2 predicted_scores'] * valid_df['essay_set'].map(score_std_by_set)
    + valid_df['essay_set'].map(score_mean_by_set)
)

# adding the denormalized predicted values to the valid_df dataset
valid_df['newly_predicted_scores_log_l2'] = denormalized_scores.astype(int)
stand_pred_values_l1 = valid_df['newly_predicted_scores_log_l2'].tolist()

In [None]:
# Absolute difference between actual and predicted score per validation essay,
# summarised as a table of how often each difference occurs.
mylist = [
    abs(actual - predicted)
    for actual, predicted in zip(valid_df['score'], valid_df['newly_predicted_scores_log_l2'])
]
uniq, counts = np.unique(mylist, return_counts=True)
pd.DataFrame({'Difference': uniq, 'Counts': counts})