## In this project, we build a model that grades unseen essays.
### Let's Import required libraries

In [0]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
%matplotlib inline

#### For Google Colab only

In [0]:
# `google.colab` is only importable on Google Colab; skip the upload prompt
# elsewhere so the notebook still runs top to bottom on a local kernel.
try:
    from google.colab import files
except ImportError:
    files = None
else:
    files.upload()

In [0]:
import language_check
from textblob import TextBlob

In [0]:
# Load the training essays; the file is tab-separated and decoded as ISO-8859-1.
data = pd.read_csv('./datas/training_set_rel3.tsv',sep='\t', encoding = "ISO-8859-1")

In [0]:
data.head(2)

Unnamed: 0,essay_id,essay_set,essay,rater1_domain1,rater2_domain1,rater3_domain1,domain1_score,rater1_domain2,rater2_domain2,domain2_score,...,rater2_trait3,rater2_trait4,rater2_trait5,rater2_trait6,rater3_trait1,rater3_trait2,rater3_trait3,rater3_trait4,rater3_trait5,rater3_trait6
0,1,1,"Dear local newspaper, I think effects computer...",4,4,,8,,,,...,,,,,,,,,,
1,2,1,"Dear @CAPS1 @CAPS2, I believe that using compu...",5,4,,9,,,,...,,,,,,,,,,


##### Selecting important features from dataset

In [0]:
col_to_keep =['essay','domain1_score']

##### saving extracted dataframe

In [0]:
# Take an explicit copy so the feature columns added later are written to an
# independent frame rather than to a slice of `data` (avoids pandas'
# SettingWithCopyWarning).
filter_data = data[col_to_keep].copy()

In [0]:
print(len(filter_data))
filter_data[:5]

12976


Unnamed: 0,essay,domain1_score
0,"Dear local newspaper, I think effects computer...",8
1,"Dear @CAPS1 @CAPS2, I believe that using compu...",9
2,"Dear, @CAPS1 @CAPS2 @CAPS3 More and more peopl...",7
3,"Dear Local Newspaper, @CAPS1 I have found that...",10
4,"Dear @LOCATION1, I know having computers has a...",8


## Feature engineering:

#### An essay can be graded on the basis of these factors:
- Grammatical errors in essay

- Number of words an essay has

- Number of sentences an essay has

- Sentiment of essay

- Average word length in the essay

- Number of misspelled words (the code is written, but we could not run it because our device has too little memory)

##### Here we add a new column 'word_length' that stores the number of words in each essay

In [0]:
def word_counting(x):
    """Return how many word tokens TextBlob finds in the text ``x``."""
    tokens = TextBlob(x).words
    return len(tokens)

# One word count per essay.
filter_data['word_length'] = filter_data['essay'].apply(word_counting)

##### We count the number of sentences in each essay and store it in the column 'no_of_sentence'.

In [0]:
def sentence_counting(x):
    """Return the number of sentences TextBlob detects in the text ``x``."""
    return len(TextBlob(x).sentences)

# One sentence count per essay.
filter_data['no_of_sentence'] = filter_data['essay'].apply(sentence_counting)

##### In this feature engineering step, we calculate the overall sentiment polarity of each essay and store it in the column 'sentiment_essay'.

In [0]:
def avg_sentence_sentiment(x):
    """Return TextBlob's sentiment polarity for the whole text ``x``."""
    return TextBlob(x).sentiment.polarity

# One polarity value per essay.
filter_data['sentiment_essay'] = filter_data['essay'].apply(avg_sentence_sentiment)

##### In academic grading, average word length generally also plays a role in the essay grade. So we calculate the average length of the words in each essay and store it in the column 'avg_word_len'.

In [0]:
def avg_length_of_words(x):
    """Return the mean number of characters per word in the text ``x``.

    Returns 0.0 when TextBlob finds no words (for example an empty essay)
    instead of raising ZeroDivisionError.
    """
    word_len = [len(word) for word in TextBlob(x).words]
    if not word_len:
        return 0.0
    return sum(word_len) / len(word_len)
filter_data['avg_word_len']  = filter_data['essay'].apply(avg_length_of_words)

###### Let's view our dataset so far.

In [0]:
filter_data.head(3)

Unnamed: 0,essay,domain1_score,word_length,no_of_sentence,sentiment_essay,avg_word_len
0,"Dear local newspaper, I think effects computer...",8,343,16,0.310471,4.358601
1,"Dear @CAPS1 @CAPS2, I believe that using compu...",9,422,20,0.274,4.331754
2,"Dear, @CAPS1 @CAPS2 @CAPS3 More and more peopl...",7,283,14,0.340393,4.35689


### Here we check the grammatical correctness of each essay and count the number of errors. The error count is stored in the column 'Grammar_check'.

In [0]:
# Build the LanguageTool checker once and reuse it; constructing a new one
# inside the function would repeat the same setup work for every essay.
grammar_tool = language_check.LanguageTool('en-US')


def grammar_check(x):
    """Return the number of issues LanguageTool (en-US) reports for the text ``x``."""
    return len(grammar_tool.check(x))
filter_data['Grammar_check'] = filter_data['essay'].apply(grammar_check)

### We are now interested in finding the number of misspelled words in each essay

In [None]:
# Count the words of each essay that are missing from NLTK's word list.
from nltk.corpus import words
word_list = words.words()
# Membership tests on a list scan every entry; a set makes each lookup constant
# time, which matters when every word of every essay is checked.
known_words = set(word_list)
def num_word_mis_spell(x):
    """Return how many words of the text ``x`` are absent from the word list.

    Each word is lower-cased before the lookup.
    """
    return sum(1 for wor in TextBlob(x).words if wor.lower() not in known_words)
filter_data['mis_spell_word'] = filter_data['essay'].apply(num_word_mis_spell)

In [None]:
# Save the engineered features to a tab-separated file so a later cell can reload them.
filter_data.to_csv('filter_data.csv', sep='\t')

In [0]:
# NOTE(review): this cell is not idempotent - every re-run divides the column by 5 again.
# The divided values also disappear once filter_data is reloaded from filter_data.csv
# further down (the reloaded preview shows the raw counts) - confirm which is intended.
filter_data['Grammar_check'] = filter_data['Grammar_check']/5 

### This is the final dataset on which we will train different machine learning models.

In [0]:
filter_data.head(5)

Unnamed: 0,essay,domain1_score,word_length,no_of_sentence,sentiment_essay,avg_word_len,Grammar_check
0,"Dear local newspaper, I think effects computer...",8,343,16,0.310471,4.358601,2.2
1,"Dear @CAPS1 @CAPS2, I believe that using compu...",9,422,20,0.274,4.331754,3.8
2,"Dear, @CAPS1 @CAPS2 @CAPS3 More and more peopl...",7,283,14,0.340393,4.35689,1.8
3,"Dear Local Newspaper, @CAPS1 I have found that...",10,527,27,0.266828,4.851992,7.0
4,"Dear @LOCATION1, I know having computers has a...",8,470,30,0.199684,4.378723,3.4


In [0]:
# The CSV was written together with its index, which comes back as an
# 'Unnamed: 0' column; drop it when present so only the essay, the score and
# the engineered features remain.
filter_data = pd.read_csv('filter_data.csv', sep='\t').drop(columns='Unnamed: 0', errors='ignore')

In [4]:
# filter_data.drop('Unnamed: 0',axis=1, inplace=True)
filter_data.head(3)

Unnamed: 0,essay,domain1_score,word_length,no_of_sentence,sentiment_essay,avg_word_len,Grammar_check
0,"Dear local newspaper, I think effects computer...",8,343,16,0.310471,4.358601,11
1,"Dear @CAPS1 @CAPS2, I believe that using compu...",9,422,20,0.274,4.331754,19
2,"Dear, @CAPS1 @CAPS2 @CAPS3 More and more peopl...",7,283,14,0.340393,4.35689,9


### Normalizing dataset

In [None]:
from sklearn import preprocessing

# Select the numeric columns by name so the scaler input always lines up with
# the output labels, even if filter_data carries extra columns or a different
# column order.
numeric_cols = ['domain1_score','word_length','no_of_sentence','sentiment_essay','avg_word_len','Grammar_check']

# Min-max scaling of the score and the engineered features.
min_max_scaler = preprocessing.MinMaxScaler()
np_scaled = min_max_scaler.fit_transform(filter_data[numeric_cols])
# 'domain1_score' is exposed as 'score' in the normalized frame.
df_normalized = pd.DataFrame(np_scaled, columns = ['score'] + numeric_cols[1:])
df_normalized.head(5)

In [0]:
x = df_normalized['score']

In [0]:
y = df_normalized.drop('score',axis=1)

In [8]:
y.head(3)

Unnamed: 0,word_length,no_of_sentence,sentiment_essay,avg_word_len,Grammar_check
0,0.31899,0.157895,0.655235,0.484731,0.085938
1,0.392891,0.2,0.637,0.475809,0.148438
2,0.262862,0.136842,0.670196,0.484163,0.070312


#### We fit different regression models from sklearn to our dataset.

In [0]:
from sklearn.linear_model import LogisticRegression

In [0]:
from sklearn.model_selection import train_test_split

In [0]:
# Note the argument order: `y` holds the feature columns and `x` the target score,
# so x_train/x_test are features and y_train/y_test are scores.
x_train, x_test, y_train, y_test = train_test_split(y, x, test_size=0.25, random_state=42)

In [0]:
import numpy as np
from sklearn import linear_model
from sklearn import svm
from sklearn.tree import DecisionTreeRegressor
from sklearn.ensemble import RandomForestRegressor

In [None]:
# Fit every candidate regressor on the training split and record its score on
# the held-out split as [estimator class name, score] pairs.
result = []
# Estimators whose fitting involves randomness get a fixed random_state so the
# comparison is reproducible from run to run.
classifiers = [
    svm.SVR(),
    linear_model.SGDRegressor(random_state=0),
    linear_model.BayesianRidge(),
    linear_model.LassoLars(),
    DecisionTreeRegressor(random_state = 0),
    RandomForestRegressor(random_state=0,n_estimators=100),
    linear_model.PassiveAggressiveRegressor(random_state=0),
    linear_model.TheilSenRegressor(random_state=0),
    linear_model.LinearRegression(),
    linear_model.Ridge(alpha=1.0),
    linear_model.ElasticNet(random_state=23)
]

for item in classifiers:
    item.fit(x_train, y_train)
    a = [item.__class__.__name__, item.score(x_test, y_test)]
    print(a[0], ':    ', a[1])
    result.append(a)

#### Scores obtained with the different regression models.

In [14]:
result

[['SVR', 0.5088554660112243],
 ['SGDRegressor', 0.4194873891624755],
 ['BayesianRidge', 0.47782025610749496],
 ['LassoLars', -0.00083156872616863],
 ['DecisionTreeRegressor', 0.19079996102015295],
 ['RandomForestRegressor', 0.6071590075261986],
 ['PassiveAggressiveRegressor', -22.852702249161165],
 ['TheilSenRegressor', 0.3734631557334913],
 ['LinearRegression', 0.47782613303407506],
 ['Ridge', 0.47775937057294576],
 ['ElasticNet', -0.00083156872616863]]

### As RandomForestRegressor gives the highest R² score (about 0.6), which we consider a good fit, our essay-grading model will use RandomForestRegressor.

In [None]:
# Refit the best-scoring estimator from the comparison above, with the same settings.
model = RandomForestRegressor(random_state=0,n_estimators=100)
model.fit(x_train, y_train)

In [None]:
from sklearn.model_selection import GridSearchCV
# Create the parameter grid based on the results of random search 

from operator import itemgetter

# Utility function to report best scores
def report(grid_scores, n_top):
    """Print the ``n_top`` best parameter settings of a grid search.

    Parameters
    ----------
    grid_scores : dict or sequence
        Either the ``cv_results_`` dict of a fitted GridSearchCV (keys
        'params', 'mean_test_score', 'std_test_score') or the legacy
        ``grid_scores_`` sequence of score tuples exposing ``parameters``,
        ``mean_validation_score`` and ``cv_validation_scores``.
    n_top : int
        Number of best-ranked settings to print.
    """
    if isinstance(grid_scores, dict):
        rows = zip(grid_scores['params'],
                   grid_scores['mean_test_score'],
                   grid_scores['std_test_score'])
    else:
        rows = ((score.parameters,
                 score.mean_validation_score,
                 np.std(score.cv_validation_scores)) for score in grid_scores)
    # Rank by mean validation score, best first.
    top_scores = sorted(rows, key=itemgetter(1), reverse=True)[:n_top]
    for i, (params, mean_score, std_score) in enumerate(top_scores):
        print("Model with rank: {0}".format(i + 1))
        print("Mean validation score: {0:.4f} (std: {1:.4f})".format(mean_score, std_score))
        print("Parameters: {0}".format(params))
        print("")


param_grid = {
    'bootstrap': [True],
    'max_depth': [80, 90, 100, 110],
    'max_features': [2, 3],
    'min_samples_leaf': [3, 4, 5],
    'min_samples_split': [8, 10, 12],
    'n_estimators': [100, 200, 300, 1000]
}
# Create a base model; the fixed seed keeps the search reproducible.
rf = RandomForestRegressor(random_state=0)
# Instantiate the grid search model
grid_search = GridSearchCV(estimator = rf, param_grid = param_grid,cv = 3, n_jobs = -1, verbose = 2)
grid_search.fit(x_train, y_train)
# Use cv_results_: current scikit-learn releases no longer provide grid_scores_.
report(grid_search.cv_results_, 4)