In [1]:
from utlis import *
import numpy as np
import pandas as pd
import time
import nltk
from collections import Counter
from nltk import word_tokenize, pos_tag
import string
import enchant
from enchant.checker import SpellChecker
import re
from sklearn.model_selection import train_test_split

In [2]:

def _cleanText(t):
    '''
    t string, raw text input
    ret t string, a list of words
    '''
    t = t.lower()
    t = re.sub(r'[^\w\s]','',t)
    t = re.sub(r'\s*(\(\d)|(\))\s*', '', t)
    #t = t.split()
    return t

def _nltktag(text):
    """
    Using nltk.word_tokenize to tag words as 'NN', 'DT'
    for extracting noun, verb, adj
    """
    words = word_tokenize(text)
    tagged_words = pos_tag(words)
    return tagged_words

def _wordCount(text):
    """
    input: string 
    output: int -- Count of words
    """
    return sum(Counter(text.split()).values())

def _longWordCount(text):
    """
    input: string
    output: int -- Count of Long words
    
    """
    #Average word length without stop words is 5.6
    ##threshold = 6
    long_words = [word for word in text.split() if len(word)>6]
    return sum(Counter(long_words).values())

def _partOfSpeechCount(text):
    """
    input: string
    output: pos count
    
    """
    tagged_words = _nltktag(text)
    #Noun Count
    listnn = [w[0] for w in tagged_words if w[1] in ['NN', 'NNP', 'NNPS','NNS']]
    nnCount = sum(Counter(listnn).values())
    #Verb Count
    listvb = [w[0] for w in tagged_words if w[1] in ['VB', 'VBD', 'VBG', 'VBN', 'VBP', 'VBZ']]
    verbCount = sum(Counter(listvb).values())
    #Adjective Count
    listadj = [w[0] for w in tagged_words if w[1] in ['JJ', 'JJR', 'JJS']]
    adjCount = sum(Counter(listadj).values())
    #Adverb Count
    listadvb = [w[0] for w in tagged_words if w[1] in ['RR', 'RBR', 'RBS']]
    advbCount = sum(Counter(listadvb).values())
    return nnCount, verbCount, adjCount, advbCount

def _commaCount(text):
    return text.count(',')

def _punctuationCount(text):
    count = lambda l1,l2: sum([1 for x in l1 if x in l2])
    return count(text,set(string.punctuation)) 

def _sentenceCount(text):
    return len(nltk.sent_tokenize(text))

def _wordLengthAvg(text):
    l = text.split()
    return sum(map(len, l))/float(len(l))



def _spellingError(text):
    """
    return: Count of misspelled words
    """
    my_dict = enchant.Dict("en_US")
    my_checker = SpellChecker(my_dict)
    my_checker.set_text(text)
    return len([error.word for error in my_checker])

def _lexicalDiversity(t):
    """
    t input seq, String
    ---------
    return float ratio
    """
    return len(set(t)) / len(t)

def _quotationMark(t):
    '''
    t string, raw input
    ret li, ceil of pairs of quatation contained in input text
    '''
    li = re.findall('"',t)
    n = len(li)
    n = int(np.ceil(n/2))
    return n
    
def _exclamationMarks(text):
    return text.count('!')

def _featureExtraction(text):
    """
    input: essay as a long string
    
    output:feature vector
    elements in output: 
    1. word count 
    2. long word count
    3. noun word count
    4. verb count
    5. comma count
    6. punctuation count
    7. sentence count
    8. adjective count
    9. adverb count
    10. lexical diversity
    11. quatation mark
    12. word length
    13. spelling error
    14*.bracket count
    15*.exclamation count
    16*. Foreign words count
    """
    wordCount = _wordCount(text)
    longWordCount = _longWordCount(text)
    nounCount, verbCount, adjCount, advbCount = _partOfSpeechCount(text)
    commaCount = _commaCount(text)
    puncCount = _punctuationCount(text)
    sentCount = _sentenceCount(text)
    lexDiv = _lexicalDiversity(text)
    quatMarkCount = _quotationMark(text)
    avgWordLen = _wordLengthAvg(text)
    spelErrorCount = _spellingError(text)
    #brcktCount = _br
    exclamationCount = _exclamationMarks(text)
    
    
    f = [wordCount, longWordCount, nounCount, verbCount, commaCount, puncCount, sentCount, 
                 adjCount, advbCount, lexDiv, quatMarkCount, avgWordLen, spelErrorCount]
    
    return f#_res #feature vector


In [5]:
#read training data
training = pd.read_csv("./data/training_final_orig.csv", sep=',',header=0, index_col=0)
training.head()

Unnamed: 0_level_0,essay_set,essay,final_score,scaled_score
essay_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
1,1,"Dear local newspaper, I think effects computer...",8.0,0.6
2,1,"Dear @CAPS1 @CAPS2, I believe that using compu...",9.0,0.7
3,1,"Dear, @CAPS1 @CAPS2 @CAPS3 More and more peopl...",7.0,0.5
4,1,"Dear Local Newspaper, @CAPS1 I have found that...",10.0,0.8
5,1,"Dear @LOCATION1, I know having computers has a...",8.0,0.6


In [6]:
for i in range(1,9):
    print("{} essays in Topic {}.".format(training[training['essay_set']==i].shape[0], i))

1783 essays in Topic 1.
1800 essays in Topic 2.
1726 essays in Topic 3.
1771 essays in Topic 4.
1805 essays in Topic 5.
1800 essays in Topic 6.
1569 essays in Topic 7.
723 essays in Topic 8.


## Create Features

In [7]:
import nltk
nltk.download('punkt')
nltk.download('averaged_perceptron_tagger')

[nltk_data] Downloading package punkt to
[nltk_data]     /Users/liuzhaopeng/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package averaged_perceptron_tagger to
[nltk_data]     /Users/liuzhaopeng/nltk_data...
[nltk_data]   Package averaged_perceptron_tagger is already up-to-
[nltk_data]       date!


True

In [8]:
#generate feature vector for all essays
start = time.time()
training['f_vec'] = [_featureExtraction(essay) for essay in training['essay']]
print(time.time() - start)

617.2019567489624


In [9]:
training.shape

(12977, 5)

# Baseline Model
* Split data to train_val, test data
* Use Train_val data for cross-validation/forward-selection
* Use Test data to test.

## import pickle
with open('training_final_train_val.pk', 'wb') as handle:
    pickle.dump(train_val, handle)
with open('training_final_test.pk', 'wb') as handle:
    pickle.dump(test, handle)

In [64]:
with open('training_final_train_val.pk', 'rb') as handle:
    train_val = pickle.load(handle)

In [39]:
t1_data = training[training['essay_set']==1]
t2_data = training[training['essay_set']==2]
t3_data = training[training['essay_set']==3]
t4_data = training[training['essay_set']==4]
t5_data = training[training['essay_set']==5]
t6_data = training[training['essay_set']==6]
t7_data = training[training['essay_set']==7]
t8_data = training[training['essay_set']==8]

In [34]:
def split_test(X):
    nn = int(np.ceil(len(X)*0.9))

    X_train = X[0:nn]
    X_test = X[nn:]
    return X_train, X_test

In [40]:
t1_training, t1_test = split_test(t1_data)
t2_training, t2_test = split_test(t2_data)
t3_training, t3_test = split_test(t3_data)
t4_training, t4_test = split_test(t4_data)
t5_training, t5_test = split_test(t5_data)
t6_training, t6_test = split_test(t6_data)
t7_training, t7_test = split_test(t7_data)
t8_training, t8_test = split_test(t8_data)
test = pd.concat([t1_test,t2_test,t3_test,t4_test,t5_test,t6_test,t7_test,t8_test])

# 5-fold cross validation

In [50]:
from sklearn.model_selection import train_test_split
from sklearn import linear_model
#from sklearn.metrics import cohen_kappa_score
from metrics import kappa
from skll.metrics import kappa
from sklearn.model_selection import cross_val_score
from sklearn.pipeline import make_pipeline
from sklearn import preprocessing
from sklearn import svm
from sklearn.metrics.scorer import make_scorer
from sklearn.ensemble import RandomForestRegressor
from sklearn.ensemble import AdaBoostRegressor
from sklearn.neural_network import MLPRegressor

## Cross validation without Forward Selection
* train:val = 10 : 2

In [54]:
def fit_predict(data_x, data_y, model):
    # data is x
    X_train, X_test, y_train, y_test = train_test_split(list(data_x),list(data_y), test_size=0.2)
    
    # define score function
    scoring = make_scorer(kappa, weights='quadratic', allow_off_by_one=False)
    cv=5
    
    # linear model
    if model == 'lr':
        clf = make_pipeline(preprocessing.StandardScaler(), linear_model.LinearRegression())
        return cross_val_score(clf, X_train, y_train, cv=cv,scoring = scoring)

    if model == 'svm':
        clf = make_pipeline(preprocessing.StandardScaler(), svm.SVR(C=1))
        return cross_val_score(clf, X_train, y_train, cv=cv,scoring = scoring)
    
    if model == 'rf':
        clf = make_pipeline(preprocessing.StandardScaler(), RandomForestRegressor(max_depth=2, random_state=0))
        return cross_val_score(clf, X_train, y_train, cv=cv,scoring = scoring)
        
    if model == 'adaboost':
        clf = make_pipeline(preprocessing.StandardScaler(), AdaBoostRegressor())
        return cross_val_score(clf, X_train, y_train, cv=cv,scoring = scoring)
        
    if model == 'mlp':
        clf = make_pipeline(preprocessing.StandardScaler(), MLPRegressor())
        return cross_val_score(clf, X_train, y_train, cv=cv,scoring = scoring)

### QWK on each validation set

In [61]:
for i,dataset in enumerate([t1_training,t2_training,t3_training,t4_training,t5_training,t6_training,t7_training,t8_training]):
    print('Dataset: t{}_training'.format(i))
    for model in ['lr']:
        #print('Model: {}'.format(model))
        
        print('result: {}'.format(fit_predict(dataset.f_vec, dataset.final_score,model)))

Dataset: t0_training
result: [ 0.79342736  0.8088748   0.83077584  0.8296578   0.81370813]
Dataset: t1_training
result: [ 0.67707968  0.64069557  0.71833058  0.61397299  0.66174788]
Dataset: t2_training
result: [ 0.6959707   0.68638153  0.62209964  0.61577667  0.61796421]
Dataset: t3_training
result: [ 0.65321342  0.66315823  0.66866567  0.61992973  0.61585908]
Dataset: t4_training
result: [ 0.79366473  0.77425461  0.76303394  0.79156389  0.79546575]
Dataset: t5_training
result: [ 0.62693608  0.65890088  0.6327659   0.6480367   0.66104406]
Dataset: t6_training
result: [ 0.65480185  0.70018328  0.6945689   0.69114     0.69616808]
Dataset: t7_training
result: [ 0.71550213  0.61572929  0.67051217  0.59179927  0.55019081]


## Forward Selection

In [62]:

def forward_selection(dataset):
    # selected features
    selected = [] # from 0 to 12
    # max kappa value
    max_result = 0
    while(1):
        # features we will add 
        add_feature = None
        for i in range(13):
            temp_selected = selected.copy()
            
            # only select features not selected
            if i not in selected:
                temp_selected.append(i)
                print('searching range: ', temp_selected)
            
            # calculate kappa for current feature set
            data_x = dataset.f_vec.apply(lambda x: [x[i] for i in temp_selected])
            data_y = dataset.final_score
            # take mean of each fold qwk
            temp_result = np.mean(fit_predict(data_x, data_y, model = 'lr'))
            
            # get better result, update
            if temp_result>max_result:
                add_feature = i
                max_result = temp_result
        
        if add_feature != None:
            selected.append(add_feature)
        # 
        else:
            break
        print('temp_result: ', temp_result)
        print('add_feature: ',add_feature, max_result)
        print('*'*60)
    return selected, max_result

## Cross Validation with forward selection

In [63]:
result_dict = {}
kappa_dict = {}
for i,dataset in enumerate([t1_training,t2_training,t3_training,t4_training,t5_training,t6_training,t7_training,t8_training]):
    print('Dataset: t{}_training'.format(i))
    a,b = forward_selection(dataset)
    result_dict['Dataset: t{}_training'.format(i)] = a
    kappa_dict['Dataset: t{}_training'.format(i)] = b

Dataset: t0_training
searching range:  [0]
searching range:  [1]
searching range:  [2]
searching range:  [3]
searching range:  [4]
searching range:  [5]
searching range:  [6]
searching range:  [7]
searching range:  [8]
searching range:  [9]
searching range:  [10]
searching range:  [11]
searching range:  [12]
temp_result:  0.128180158205
add_feature:  0 0.75376865152
************************************************************
searching range:  [0, 1]
searching range:  [0, 2]
searching range:  [0, 3]
searching range:  [0, 4]
searching range:  [0, 5]
searching range:  [0, 6]
searching range:  [0, 7]
searching range:  [0, 8]
searching range:  [0, 9]
searching range:  [0, 10]
searching range:  [0, 11]
searching range:  [0, 12]
temp_result:  0.753977588462
add_feature:  11 0.799378663748
************************************************************
searching range:  [0, 11, 1]
searching range:  [0, 11, 2]
searching range:  [0, 11, 3]
searching range:  [0, 11, 4]
searching range:  [0, 11, 5]


searching range:  [0, 6]
searching range:  [0, 7]
searching range:  [0, 8]
searching range:  [0, 9]
searching range:  [0, 10]
searching range:  [0, 11]
searching range:  [0, 12]
temp_result:  0.740229323722
add_feature:  11 0.783721648791
************************************************************
searching range:  [0, 11, 1]
searching range:  [0, 11, 2]
searching range:  [0, 11, 3]
searching range:  [0, 11, 4]
searching range:  [0, 11, 5]
searching range:  [0, 11, 6]
searching range:  [0, 11, 7]
searching range:  [0, 11, 8]
searching range:  [0, 11, 9]
searching range:  [0, 11, 10]
searching range:  [0, 11, 12]
temp_result:  0.7691740755
add_feature:  6 0.791201654833
************************************************************
searching range:  [0, 11, 6, 1]
searching range:  [0, 11, 6, 2]
searching range:  [0, 11, 6, 3]
searching range:  [0, 11, 6, 4]
searching range:  [0, 11, 6, 5]
searching range:  [0, 11, 6, 7]
searching range:  [0, 11, 6, 8]
searching range:  [0, 11, 6, 9]
sear

searching range:  [1, 6, 2, 9, 4, 3, 10]
searching range:  [1, 6, 2, 9, 4, 3, 11]
searching range:  [1, 6, 2, 9, 4, 3, 12]
temp_result:  0.635840167358
add_feature:  5 0.657347919828
************************************************************
searching range:  [1, 6, 2, 9, 4, 3, 5, 0]
searching range:  [1, 6, 2, 9, 4, 3, 5, 7]
searching range:  [1, 6, 2, 9, 4, 3, 5, 8]
searching range:  [1, 6, 2, 9, 4, 3, 5, 10]
searching range:  [1, 6, 2, 9, 4, 3, 5, 11]
searching range:  [1, 6, 2, 9, 4, 3, 5, 12]
temp_result:  0.638659035081
add_feature:  5 0.675196802779
************************************************************
searching range:  [1, 6, 2, 9, 4, 3, 5, 5, 0]
searching range:  [1, 6, 2, 9, 4, 3, 5, 5, 7]
searching range:  [1, 6, 2, 9, 4, 3, 5, 5, 8]
searching range:  [1, 6, 2, 9, 4, 3, 5, 5, 10]
searching range:  [1, 6, 2, 9, 4, 3, 5, 5, 11]
searching range:  [1, 6, 2, 9, 4, 3, 5, 5, 12]
temp_result:  0.619836689789
add_feature:  2 0.680077020924
***********************************

In [64]:
result_dict

{'Dataset: t0_training': [0, 11, 9, 2, 12],
 'Dataset: t1_training': [1, 9, 12, 3],
 'Dataset: t2_training': [1, 0, 9],
 'Dataset: t3_training': [0, 1, 6],
 'Dataset: t4_training': [0, 11, 6],
 'Dataset: t5_training': [1, 7, 6, 12, 10],
 'Dataset: t6_training': [0, 6, 9, 1],
 'Dataset: t7_training': [1, 6, 2, 9, 4, 3, 5, 5, 2]}

In [65]:
kappa_dict

{'Dataset: t0_training': 0.82556606811417443,
 'Dataset: t1_training': 0.70307598532160664,
 'Dataset: t2_training': 0.6643251757640064,
 'Dataset: t3_training': 0.65441690345151082,
 'Dataset: t4_training': 0.7912016548334222,
 'Dataset: t5_training': 0.65812662736749716,
 'Dataset: t6_training': 0.68039186340976465,
 'Dataset: t7_training': 0.68007702092374622}

# Run Model on Test Set

In [66]:
test['predicted'] = None
for i,dataset in enumerate([t1_training,t2_training,t3_training,t4_training,t5_training,t6_training,t7_training,t8_training]):
    print('Dataset: t{}_testing'.format(i+1))
    features = result_dict['Dataset: t{}_training'.format(i)]
    clf = make_pipeline(preprocessing.StandardScaler(), linear_model.LinearRegression())
    data_x = dataset.f_vec.apply(lambda x: [x[i] for i in features])
    data_y = dataset.final_score
    clf.fit(list(data_x), list(data_y))
    
    # prediction
    test_x = test[test.essay_set==(i+1)].f_vec.apply(lambda x: [x[i] for i in features])
    true = test[test.essay_set==(i+1)].final_score
    pred = clf.predict(list(test_x))
    print(kappa(true, pred, weights='quadratic'))
    #test.loc[test.essay_set==(i+1),'predicted'] = pred

Dataset: t1_testing
0.818928699001
Dataset: t2_testing
0.64509394572
Dataset: t3_testing
0.615365764
Dataset: t4_testing
0.685709275051
Dataset: t5_testing
0.771739130435
Dataset: t6_testing
0.681596482792
Dataset: t7_testing
0.717383188306
Dataset: t8_testing
0.596317469537
