In [1]:
import pandas as pd
from scipy import spatial
import numpy as np
import re
import nltk
from sklearn.pipeline import make_pipeline
import sklearn.preprocessing as preprocess
from sklearn.decomposition import TruncatedSVD
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.neighbors import KNeighborsClassifier, NearestNeighbors
from sklearn.model_selection import train_test_split
from sklearn.model_selection import GridSearchCV

In [2]:
train = pd.read_excel('./Data/training_set_rel3_set1.xlsx')
test = pd.read_excel('./Data/valid_set_set1.xlsx')
train.set_index('ID')
test.set_index('essay_id')
y_train = train['Score']
y_test = test['Score']
X = pd.concat([train,test])

## Removing stopwords and stemming

In [3]:
def StemmingWordList(arrayList):
    ps = nltk.PorterStemmer()
    newList = [ps.stem(word) for word in arrayList]
    return ''.join(newList)

content = X['Essay Content']
content = content.apply(lambda x: re.sub('@.+?\s', '', x))
content = content.apply(lambda x: re.sub('\W|_|[0-9]', ' ', x))
content = content.apply(lambda x: StemmingWordList(x))

In [4]:
tfidf = TfidfVectorizer(max_df=0.9, stop_words=nltk.corpus.stopwords.words('english'))
features = tfidf.fit_transform(content)
scores = X['Score']

In [5]:
knn = KNeighborsClassifier()
svd = TruncatedSVD(n_iter=50)
pipeline = make_pipeline(svd, knn)
params = {
    'truncatedsvd__n_components': range(80, 400, 10),
    'kneighborsclassifier__n_neighbors': range(3,10,1),
    'kneighborsclassifier__weights': ['distance']
}

optimized_knn = GridSearchCV(pipeline, params, n_jobs=10)
optimized_knn.fit(features, scores)



GridSearchCV(estimator=Pipeline(steps=[('truncatedsvd',
                                        TruncatedSVD(n_iter=50)),
                                       ('kneighborsclassifier',
                                        KNeighborsClassifier())]),
             n_jobs=10,
             param_grid={'kneighborsclassifier__n_neighbors': range(3, 10),
                         'kneighborsclassifier__weights': ['distance'],
                         'truncatedsvd__n_components': range(80, 400, 10)})

In [6]:
optimized_knn.best_estimator_

Pipeline(steps=[('truncatedsvd', TruncatedSVD(n_components=80, n_iter=50)),
                ('kneighborsclassifier',
                 KNeighborsClassifier(n_neighbors=8, weights='distance'))])

#### knn = KNeighborsClassifier()
svd = TruncatedSVD(n_iter=50)
pipeline = make_pipeline(svd, knn)
params = {
    'truncatedsvd__n_components': range(50, 400, 5),
    'kneighborsclassifier__n_neighbors': range(3,10,1),
    'kneighborsclassifier__weights': ['distance']
}

optimized_knn = GridSearchCV(pipeline, params, n_jobs=10)
optimized_knn.fit(X, y)

In [7]:
optimized_knn.best_score_

0.32208927381745506

In [15]:
optimized_knn.best_params_

{'kneighborsclassifier__n_neighbors': 8,
 'kneighborsclassifier__weights': 'distance',
 'truncatedsvd__n_components': 80}

In [9]:
optimized_knn.cv_results_.keys()

dict_keys(['mean_fit_time', 'std_fit_time', 'mean_score_time', 'std_score_time', 'param_kneighborsclassifier__n_neighbors', 'param_kneighborsclassifier__weights', 'param_truncatedsvd__n_components', 'params', 'split0_test_score', 'split1_test_score', 'split2_test_score', 'split3_test_score', 'split4_test_score', 'mean_test_score', 'std_test_score', 'rank_test_score'])

In [10]:
result = zip(optimized_knn.cv_results_.get('mean_test_score'), optimized_knn.cv_results_.get('params'))

In [11]:
listResult = list(result)

In [12]:
listResult.sort(key = lambda t: t[0], reverse=True)

In [13]:
listResult[:5]

[(0.32208927381745506,
  {'kneighborsclassifier__n_neighbors': 8,
   'kneighborsclassifier__weights': 'distance',
   'truncatedsvd__n_components': 80}),
 (0.3203997335109927,
  {'kneighborsclassifier__n_neighbors': 9,
   'kneighborsclassifier__weights': 'distance',
   'truncatedsvd__n_components': 90}),
 (0.31871641128136796,
  {'kneighborsclassifier__n_neighbors': 9,
   'kneighborsclassifier__weights': 'distance',
   'truncatedsvd__n_components': 80}),
 (0.31028114590273154,
  {'kneighborsclassifier__n_neighbors': 8,
   'kneighborsclassifier__weights': 'distance',
   'truncatedsvd__n_components': 90}),
 (0.3098503220075505,
  {'kneighborsclassifier__n_neighbors': 9,
   'kneighborsclassifier__weights': 'distance',
   'truncatedsvd__n_components': 100})]

In [14]:
dataResult = []
for e in listResult:
    score, params = e
    metric = params['kneighborsclassifier__metric']
    neighbors = params['kneighborsclassifier__n_neighbors']
    components = params['truncatedsvd__n_components']
    weight = params['kneighborsclassifier__weights']
    dataResult.append((score, metric, neighbors, components, weight))

KeyError: 'kneighborsclassifier__metric'

In [None]:
dataFrame = pd.DataFrame(data=dataResult, columns=['score', 'metric', 'neighbors', 'components', 'weight'])

In [None]:
dataFrame.to_csv('score_metric_n_c_w_1.csv')

# After found the best estimator, try implement the "new median"

In [None]:
train = pd.read_excel('./Data/training_set_rel3_set1.xlsx')
test = pd.read_excel('./Data/valid_set_set1.xlsx')

y_train = train['Score']
y_test = test['Score']


### Build additional features: number of sentences and number words

In [None]:
train.set_index('ID')
test.set_index('ID')
x_numberOfSentences = train['Essay Content'].apply(lambda x: len(x.split('.')))
x_numberOfWords = train['Essay Content'].apply(lambda x: len(x.split()))

y_numberOfSentences = test['essay'].apply(lambda x: len(x.split('.')))
y_numberOfWords = test['essay'].apply(lambda x: len(x.split()))

Processing Data

In [None]:
content = train['Essay Content']
content = content.apply(lambda x: re.sub('@.+?\s', '', x))
content = content.apply(lambda x: re.sub('\W|_|[0-9]', ' ', x))
content = content.apply(lambda x: StemmingWordList(x))
x_train = tfidf.fit_transform(content)
x_train['#Sentences'] = x_numberOfSentences
x_train['#Words'] = x_numberOfWords

content = test['essay']
content = content.apply(lambda x: re.sub('@.+?\s', '', x))
content = content.apply(lambda x: re.sub('\W|_|[0-9]', ' ', x))
content = content.apply(lambda x: StemmingWordList(x))
x_test = tfidf.fit_transform(content)
x_test['#Sentences'] = y_numberOfSentences
x_test['#Words'] = y_numberOfWords

## Using Naive Bayes GaussianNB