In [None]:
# Import libraries and data.
import numpy as np
import pandas as pd

essay_data = pd.read_csv("domain1.csv")
print(essay_data.columns)
print(len(essay_data))


In [None]:
#Preprocessing, extracting the features.
def somefunc(X,target_col):
    #feature_columns = ["essay","word_count","sentence_count","wrong_words","spelling_mistakes","no_of_domain_words","word_to_sent_ratio","num_of_characters",'NN','NNP','VBZ','NNPS','NNS','IN','PRP','VB','JJ','VBP','VBG',"target"]
    feature_columns = ["essay","word_count","long_word_count","avg_word_length_per_essay","wrong_words","no_of_domain_words","word_to_sent_ratio","num_of_characters","sentence_count","noun_count","verb_count","comma_count","punctuation_count","adjective_count","adverb_count","quotation_mark_count","spelling_mistakes","target"]
    feature_pd = pd.DataFrame(index = X.index, columns = feature_columns)
    feature_pd['essay'] = X['essay']
    feature_pd['target'] = X[target_col]
    return feature_pd
feature_data = somefunc(essay_data,'domain1_score')


In [None]:
from sklearn.cross_validation import train_test_split

feature_cols = list(feature_data.columns[:-1])
X_all = feature_data[feature_cols]
y_all = feature_data['target'] #domain1_score is equal to sum of rater_domain1 and rater_domain2, so that columns are not needed


In [None]:
#Preprocessing.
import nltk
import re
from nltk.corpus import wordnet as wn
from nltk.corpus import stopwords
from nltk.tokenize import wordpunct_tokenize
import grammar_check
from collections import Counter
import textmining
from time import time

def featureSet(X): #X would be X_train and X_test
    #WordCount
    for index,row in X.iterrows():
        #Add the sentence count
        text = unicode(row['essay'],errors='ignore') 
        text = " ".join(filter(lambda x:x[0]!='@', text.split())) #To remove proper nouns tagged in the data-set which may result into false positives during POS tagging.
        
        tokenized_essay = nltk.sent_tokenize(text)
        sent_count = len(tokenized_essay)
        row['sentence_count'] = sent_count
        
        #Add word count after removing the stop words.
        words = nltk.word_tokenize(text)
        stop_words = set(stopwords.words('english'))
        stop_words.update(['.', ',', '"', "'", '?', '!', ':', ';', '(', ')', '[', ']', '{', '}']) # remove it if you need punctuation 
        
        for word in words:
            if word in stop_words:
                words.remove(word)
        word_count = len(words)
        
        row['word_count'] = word_count
        row['word_to_sent_ratio'] = round(float(word_count/float(sent_count)),2)
        row['num_of_characters'] = nltk.FreqDist(text).N()

        tool = grammar_check.LanguageTool('en-US')
        matches = tool.check(text)
        row['spelling_mistakes'] = len(matches)
        
        #No_of_domain_words after removing the stop words and punctuations from the essay.
        cnt = 0
        wrong_word_count = 0
        for word in words:
            if wn.synsets(word):
                cnt += 1
            else:
                wrong_word_count += 1
        row['no_of_domain_words'] = cnt
        row['wrong_words'] = wrong_word_count
        
        
        #POS TAGS
        count= Counter([j for i,j in nltk.pos_tag(words)])
        pos_list = ['NN','NNP','VBZ','NNPS','NNS','IN','PRP','VB','JJ','VBP','VBG']
        
        for i in pos_list:
            row[i] = count[i]
        

'''start = time()        
featureSet(X_train)
end = time()
print ("Generated the features for training data in {:.4f} minutes".format((end - start)/60.0))

start = time()
featureSet(X_test)
end = time()
print ("Generated the features for testing data in {:.4f} minutes".format((end - start)/60.0))
'''


In [None]:
import nltk
import re
from nltk.corpus import wordnet as wn
from nltk.corpus import stopwords
from nltk.tokenize import wordpunct_tokenize
import grammar_check
from collections import Counter
import textmining
from time import time

def featureSet2(X): #X would be X_train and X_test
    #WordCount
    for index,row in X.iterrows():
        
        text = unicode(row['essay'],errors='ignore') 
        text = " ".join(filter(lambda x:x[0]!='@', text.split())) #To remove proper nouns tagged in the data-set which may result into false positives during POS tagging.
        
        punctuation = ['.','?', '!', ':', ';']
        #Comma count
        comma_count = text.count(',')
        row['comma_count'] = comma_count
        
        #Punctuation count
        punctuation_count = 0
        for punct in punctuation:
            punctuation_count += text.count(punct)
        row['punctuation_count'] = punctuation_count
        
        #Quotation marks count
        quotation_mark_count = text.count('"')
        quotation_mark_count += text.count("'")
        row['quotation_mark_count'] = quotation_mark_count
        
        #Add the sentence count
               
        tokenized_essay = nltk.sent_tokenize(text)
        sent_count = len(tokenized_essay)
        row['sentence_count'] = sent_count
        
        #Add word count after removing the stop words.
        words = nltk.word_tokenize(text)
        stop_words = set(stopwords.words('english'))
        stop_words.update(['.', ',', '"', "'", '?', '!', ':', ';', '(', ')', '[', ']', '{', '}']) # remove it if you need punctuation 
        
        for word in words:
            if word in stop_words:
                words.remove(word)
        word_count = len(words)
        
        row['word_count'] = word_count
        
        #Long word count
        long_word_count = 0
        total_word_length = 0
        for word in words:
            total_word_length += len(word)
            if len(word) > 6:
                long_word_count +=1
        row['long_word_count'] = long_word_count
        
        #Average word length per essay
        row['avg_word_length_per_essay'] = round((total_word_length/float(len(words))),2)
        
        
        tool = grammar_check.LanguageTool('en-US')
        matches = tool.check(text)
        row['spelling_mistakes'] = len(matches)
           
        #POS TAGS
        count= Counter([j for i,j in nltk.pos_tag(words)])
               
        row['noun_count'] = count['NN'] + count['NNS'] + count['NNPS'] + count['NNP']
        row['verb_count'] = count['VB'] + count['VBG'] + count['VBP'] + count['VBN'] + count['VBZ']
        row['adjective_count'] = count['JJ'] + count['JJR'] 
        row['adverb_count'] = count['RB'] + count['RBR'] + count['RBS']
        
        #No_of_domain_words and wrong words after removing the stop words and punctuations from the essay.
        cnt = 0
        wrong_word_count = 0
        for word in words:
            if wn.synsets(word):
                cnt += 1
            else:
                wrong_word_count += 1
        row['no_of_domain_words'] = cnt
        row['wrong_words'] = wrong_word_count        
        
        #Word to sentence ratio
        row['word_to_sent_ratio'] = round(float(word_count/float(sent_count)),2)
        
        #Number of characters
        row['num_of_characters'] = nltk.FreqDist(text).N()

In [None]:
#Generate the feature set.
def GenerateFeatures(X):
    start = time()
    featureSet2(X)
    end = time()
    print ("Generated the features for the entire data-set in {:.4f} minutes".format((end - start)/60.0))

GenerateFeatures(X_all)


In [None]:
X_all.to_csv('features_set_1.csv', sep='\t')
X_all

In [None]:
#Fitting, predicting and calculating error. 
#Using LinearRegression, 5 fold cross validation and quadratic kappa as an error metric.

from sklearn import cross_validation
from sklearn.linear_model import LinearRegression
from metrics import kappa

def Evaluate(feature_list):
    model = LinearRegression()

    #Simple K-Fold cross validation. 5 folds.
    cv = cross_validation.KFold(len(X_all), n_folds=5,shuffle=True)
    results = []
    
    for traincv, testcv in cv:
            X_test, X_train, y_test, y_train = X_all.ix[testcv], X_all.ix[traincv], y_all.ix[testcv], y_all.ix[traincv]

            #final_train_data = X_train.drop('essay',axis = 1)
            #final_test_data = X_test.drop('essay',axis = 1)
            
            final_train_data = X_train[feature_list]
            final_test_data = X_test[feature_list]
            
            model.fit(final_train_data,y_train)
            start = time()
            y_pred = model.predict(final_test_data)
            end = time()

            #print ("Made predictions in {:.4f} seconds.".format(end - start))
            result = kappa(y_test.values,y_pred,weights='quadratic')
            results.append(result)
            
            #probas = model.fit(train[traincv], target[traincv]).predict_proba(train[testcv])


    #print "Results: " + str( np.array(results).mean() )
    return str(np.array(results).mean())


In [None]:
feature_columns = ["word_count","long_word_count","avg_word_length_per_essay","wrong_words","no_of_domain_words","word_to_sent_ratio","num_of_characters","sentence_count","noun_count","verb_count","comma_count","punctuation_count","adjective_count","adverb_count","quotation_mark_count","spelling_mistakes"]
feature_dict = {}
for i in feature_columns:
    feature_dict[i] = 0


In [None]:
for f in feature_columns:
    score_ = Evaluate([f])
    feature_dict[f] = round(float(score)*100,4)


In [None]:
#Single feature Kappa
import operator
sorted_feature_list = sorted(feature_dict.items(), key=operator.itemgetter(1),reverse=True)
sorted_feature_list

In [None]:
#Forward feature selection.
sorted_f = [i[0] for i in sorted_feature_list]
for i in range(1,len(sorted_f)+1):
    forward_feature_list = sorted_f[:i]
    print forward_feature_list, Evaluate(forward_feature_list)

In [113]:
selected_indexes  = [0,1,2,3,5,8,10,11,13,14,15] #Do not delete
selected_features = [sorted_f[i] for i in selected_indexes]
print Evaluate(selected_features)
print Evaluate(sorted_f)
selected_features

0.680805858355
0.69033122005


['wrong_words',
 'verb_count',
 'punctuation_count',
 'adverb_count',
 'quotation_mark_count',
 'adjective_count',
 'spelling_mistakes',
 'sentence_count',
 'avg_word_length_per_essay',
 'no_of_domain_words',
 'word_to_sent_ratio']

In [120]:
final_train_data.describe()


Unnamed: 0,word_count,long_word_count,avg_word_length_per_essay,wrong_words,no_of_domain_words,word_to_sent_ratio,num_of_characters,sentence_count,noun_count,verb_count,comma_count,punctuation_count,adjective_count,adverb_count,quotation_mark_count,spelling_mistakes
count,1427,1427,1427.0,1427,1427,1427.0,1427,1427,1427,1427,1427,1427,1427,1427,1427,1427
unique,374,140,177.0,114,306,725.0,1040,49,160,97,48,53,59,47,26,26
top,249,59,4.86,46,199,14.0,239,23,81,52,2,24,24,19,0,2
freq,14,28,23.0,38,18,13.0,7,87,33,50,83,65,62,71,241,187


In [None]:
#Redundant code
'''
#Training testing using train_test_split

num_train = 1450

# Set the number of testing points
num_test = X_all.shape[0] - num_train


#Shuffle and split the dataset into the number of training and testing points above
X_train, X_test, y_train, y_test = train_test_split(X_all, y_all, train_size=num_train,random_state=42)

# Show the results of the split
print ("Training set has {} samples.".format(X_train.shape[0]))
print ("Testing set has {} samples.".format(X_test.shape[0]))
print type(X_test)
print type(y_test)
y_test.name = "scores"


def termdocumentmatrix():
    # Initialize class to create term-document matrix
    tdm = textmining.TermDocumentMatrix()
    # Add the documents
    for index,row in essay_data.iterrows():
        tdm.add_doc(row['essay'])
    # Write out the matrix to a csv file. Note that setting cutoff=2 means
    # that words which appear in 1 or more documents will be included in
    # the output (i.e. every word will appear in the output). The default
    # for cutoff is 2, since we usually aren't interested in words which
    # appear in a single document. For this example we want to see all
    # words however, hence cutoff=1.
    tdm.write_csv('matrix.csv', cutoff=1)
    # Instead of writing out the matrix you can also access its rows directly.
    # Let's print them to the screen.
    return tdm
    
def domainInformationContent(X,Y):
    #Get essay with maximum score.
    essay_index_with_max_score = Y.idxmax()
    essay_with_max_score = X.ix[essay_index_with_max_score]['essay']
    #print essay_with_max_score
    
    #Extract the noun words from this essay and feed it to WordNet.
    lines = 'lines is some string of words'
    # function to test if something is a noun
    is_noun = lambda pos: pos[:2] == 'NN'
    # do the nlp stuff
    tokenized_max = nltk.word_tokenize(essay_with_max_score)
    nouns_max = [word for (word, pos) in nltk.pos_tag(tokenized_max) if is_noun(pos)] 
    return nouns_max
      


#Some random plots
from matplotlib import pyplot as plt
from matplotlib import style
%matplotlib inline 

style.use('ggplot')

plt.plot(X_test['no_of_domain_words'],y_test,'.')
plt.plot(X_test['no_of_domain_words'],y_pred,'-')
plt.show()


#Feature wise training
def featurewisetraining(feature):
    _1dfeature = X_train[feature].reshape(-1,1)
    lr.fit(_1dfeature,y_train)
    X_T = X_test[feature].reshape(-1,1)
    print "Score for feature ",feature," is ",lr.score(X_T,y_test)
    plt.plot(X_test[feature],y_test,'.')
    plt.plot(X_test[feature],y_pred,'-')
    plt.show()
    
featurewisetraining('word_count')
featurewisetraining('sentence_count')
featurewisetraining('spelling_mistakes')
featurewisetraining('no_of_domain_words')
featurewisetraining('word_to_sent_ratio')
featurewisetraining('wrong_words')
featurewisetraining('num_of_characters')

#Hence we drop the word_to_sent ratio feature.
'''