In [1]:
import pandas as pd
from scipy import spatial
import numpy as np
import re
import nltk
from sklearn.pipeline import make_pipeline
import sklearn.preprocessing as preprocess
from sklearn.decomposition import TruncatedSVD
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.neighbors import KNeighborsClassifier, NearestNeighbors
from sklearn.model_selection import train_test_split
from sklearn.model_selection import GridSearchCV

In [2]:
nltk.download('stopwords')
nltk.download('punkt')

[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\buiqu\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package punkt to
[nltk_data]     C:\Users\buiqu\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt is already up-to-date!


True

In [3]:
data = pd.read_excel('./training_set_rel3.xlsx')
data = data[['essay_id', 'essay_set', 'essay', 'domain1_score']]

In [4]:
data = data.dropna()

## Removing stopwords and stemming

In [5]:
def StemmingWordList(arrayList):
    ps = nltk.PorterStemmer()
    newList = [ps.stem(word) for word in arrayList]
    return ''.join(newList)

content = data['essay']
content = content.apply(lambda x: re.sub('@.+?\s', '', x))
content = content.apply(lambda x: re.sub('\W|_|[0-9]', ' ', x))
content = content.apply(lambda x: StemmingWordList(x))

In [6]:
tfidf = TfidfVectorizer(max_df=0.9, stop_words=nltk.corpus.stopwords.words('english'))
X = tfidf.fit_transform(content)
y = data['domain1_score']
tfidf.get_feature_names()

['aa',
 'aamerica',
 'aamous',
 'aand',
 'aare',
 'aas',
 'aase',
 'aasked',
 'ab',
 'aback',
 'abad',
 'abadond',
 'abait',
 'abalt',
 'abanded',
 'abandon',
 'abandond',
 'abandone',
 'abandoned',
 'abandoness',
 'abandoning',
 'abandonment',
 'abandonned',
 'abandured',
 'abanodoned',
 'abanoned',
 'abat',
 'abbandon',
 'abbanond',
 'abbreviated',
 'abc',
 'abd',
 'abdomens',
 'abdominal',
 'abducted',
 'abduction',
 'abe',
 'abease',
 'abeast',
 'abel',
 'abenefit',
 'abese',
 'abesity',
 'abetter',
 'abhor',
 'abhors',
 'abide',
 'abided',
 'abig',
 'abiity',
 'abile',
 'abilit',
 'abilitie',
 'abilities',
 'ability',
 'abilitys',
 'abiliy',
 'abiliyt',
 'abillity',
 'abillty',
 'abilty',
 'abl',
 'ablaze',
 'able',
 'abled',
 'ables',
 'ablities',
 'ablitiy',
 'ablity',
 'ablle',
 'ablt',
 'ablut',
 'ably',
 'abnacious',
 'abnormal',
 'abnormality',
 'abnormally',
 'abnovius',
 'aboard',
 'aboat',
 'abody',
 'aboe',
 'abole',
 'abolish',
 'abolished',
 'abolishing',
 'abominate',

In [7]:
def cosine(x,y):
    return 1 - spatial.distance.cosine(x,y)

In [None]:
knn = KNeighborsClassifier()
svd = TruncatedSVD(n_iter=50)
pipeline = make_pipeline(svd, knn)
params = {
    'truncatedsvd__n_components': range(50, 2000, 5),
    'kneighborsclassifier__n_neighbors': range(5,30,1),
    'kneighborsclassifier__weights': ['distance'],
    'kneighborsclassifier__metric': ['manhattan', 'chebyshev', 'minkowski']
}

optimized_knn = GridSearchCV(pipeline, params, n_jobs=10)
optimized_knn.fit(X, y)



In [None]:
optimized_knn.best_estimator_

In [None]:
optimized_knn.best_score_

In [None]:
optimized_knn.cv_results_

In [None]:
optimized_knn.cv_results_.keys()

In [None]:
result = zip(optimized_knn.cv_results_.get('mean_test_score'), optimized_knn.cv_results_.get('params'))


In [None]:
listResult = list(result)

In [None]:
listResult.sort(key = lambda t: t[0], reverse=True)

In [None]:
listResult[:5]

In [None]:
dataResult = []
for e in listResult:
    score, params = e
    metric = params['kneighborsclassifier__metric']
    neighbors = params['kneighborsclassifier__n_neighbors']
    components = params['truncatedsvd__n_components']
    weight = params['kneighborsclassifier__weights']
    dataResult.append((score, metric, neighbors, components, weight))

In [None]:
dataFrame = pd.DataFrame(data=dataResult, columns=['score', 'metric', 'neighbors', 'components', 'weight'])

In [None]:
dataFrame.to_csv('score_metric_n_c_w_1.csv')

# After found the best estimator, try implement the "new median"

In [None]:
train = pd.read_excel('./Data/training_set_rel3_set1.xlsx')
test = pd.read_excel('./Data/valid_set_set1.xlsx')

y_train = train['Score']
y_test = test['Score']


### Build additional features: number of sentences and number words

In [None]:
train.set_index('ID')
test.set_index('ID')
x_numberOfSentences = train['Essay Content'].apply(lambda x: len(x.split('.')))
x_numberOfWords = train['Essay Content'].apply(lambda x: len(x.split()))

y_numberOfSentences = test['essay'].apply(lambda x: len(x.split('.')))
y_numberOfWords = test['essay'].apply(lambda x: len(x.split()))

Processing Data

In [None]:
content = train['Essay Content']
content = content.apply(lambda x: re.sub('@.+?\s', '', x))
content = content.apply(lambda x: re.sub('\W|_|[0-9]', ' ', x))
content = content.apply(lambda x: StemmingWordList(x))
x_train = tfidf.fit_transform(content)
x_train['#Sentences'] = x_numberOfSentences
x_train['#Words'] = x_numberOfWords

content = test['essay']
content = content.apply(lambda x: re.sub('@.+?\s', '', x))
content = content.apply(lambda x: re.sub('\W|_|[0-9]', ' ', x))
content = content.apply(lambda x: StemmingWordList(x))
x_test = tfidf.fit_transform(content)
x_test['#Sentences'] = y_numberOfSentences
x_test['#Words'] = y_numberOfWords

## Using Naive Bayes GaussianNB