Performed by Mikhail Smolin

In [38]:
import numpy as np
import pandas as pd
import sklearn as sk
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.cross_validation import train_test_split
from sklearn.ensemble.forest import RandomForestRegressor
from sklearn.model_selection import GridSearchCV
from sklearn.metrics import mean_squared_error
from xgboost import XGBRegressor 
from nltk.corpus import stopwords
from nltk.stem import SnowballStemmer
import re

In [2]:
reviews = []
with open("train_feedback.txt", encoding = "ISO-8859-1") as f:
    for line in f:
        review = eval(line)
        reviews.append(review)

In [3]:
feedbacks = []
with open('test_feedback.txt', encoding = "ISO-8859-1") as f:
    for line in f:
        feedback = eval(line)
        feedbacks.append(feedback)

In [None]:
with open('movie_descriptions.txt', encoding='utf-8') as f:   # python 3 syntax here
    descriptions=[]
    for line in f:
        description = eval(line)
        descriptions.append(description)

In [None]:
feedbacks = feedbacks.merge()

In [15]:
def text_to_wordlist(text, remove_stopwords=False, stem_words=True):
    text = text.lower().split()
    
    if remove_stopwords:
        stops = set(stopwords.words("english"))
        text = [w for w in text if not w in stops]
    
    text = " ".join(text)

    text = re.sub(r"[^A-Za-z0-9^,!.\/'+-=]", " ", text)
    text = re.sub(r"what's", "what is ", text)
    text = re.sub(r"\'s", " ", text)
    text = re.sub(r"\'ve", " have ", text)
    text = re.sub(r"can't", "cannot ", text)
    text = re.sub(r"n't", " not ", text)
    text = re.sub(r"i'm", "i am ", text)
    text = re.sub(r"\'re", " are ", text)
    text = re.sub(r"\'d", " would ", text)
    text = re.sub(r"\'ll", " will ", text)
    text = re.sub(r",", " ", text)
    text = re.sub(r"\.", " ", text)
    text = re.sub(r"!", " ! ", text)
    text = re.sub(r"\/", " ", text)
    text = re.sub(r"\^", " ^ ", text)
    text = re.sub(r"\+", " + ", text)
    text = re.sub(r"\-", " - ", text)
    text = re.sub(r"\=", " = ", text)
    text = re.sub(r"'", " ", text)
    text = re.sub(r"(\d+)(k)", r"\g<1>000", text)
    text = re.sub(r":", " : ", text)
    text = re.sub(r" e g ", " eg ", text)
    text = re.sub(r" b g ", " bg ", text)
    text = re.sub(r" u s ", " american ", text)
    text = re.sub(r"\0s", "0", text)
    text = re.sub(r" 9 11 ", "911", text)
    text = re.sub(r"e - mail", "email", text)
    text = re.sub(r"j k", "jk", text)
    text = re.sub(r"\s{2,}", " ", text)
    
    if stem_words:
        text = text.split()
        stemmer = SnowballStemmer('english')
        stemmed_words = [stemmer.stem(word) for word in text]
        text = " ".join(stemmed_words)
    
    # Return a list of words
    return(text)

In [25]:
texts = []
scores = np.zeros(len(reviews))
for rev in reviews:
    texts.append(text_to_wordlist(rev["text"]))

for i in range(len(reviews)):  
    scores[i] = reviews[i]["score"]

In [26]:
texts_test = []
ids = []
for fed in feedbacks:
    ids.append(fed["id"])
    texts_test.append(text_to_wordlist(fed["text"]))


The optimal min_df=100 value is obtained by manual selection.

In [27]:
vectorizer = TfidfVectorizer(min_df=100, ngram_range=(2, 2))
vectorizer.fit(texts + texts_test)

TfidfVectorizer(analyzer='word', binary=False, decode_error='strict',
        dtype=<class 'numpy.int64'>, encoding='utf-8', input='content',
        lowercase=True, max_df=1.0, max_features=None, min_df=100,
        ngram_range=(2, 2), norm='l2', preprocessor=None, smooth_idf=True,
        stop_words=None, strip_accents=None, sublinear_tf=False,
        token_pattern='(?u)\\b\\w\\w+\\b', tokenizer=None, use_idf=True,
        vocabulary=None)

In [28]:
tfids_texts = vectorizer.transform(texts)

In [29]:
tfids_test_texts = vectorizer.transform(texts_test)

In [30]:
X_train, X_test, y_train, y_test = train_test_split(tfids_texts, scores, test_size=0.3, random_state=42)

The optimal n_estimators=500 value is obtained by manual selection.

In [45]:
xgb = XGBRegressor(max_depth=100, gamma=1)

In [46]:
xgb.fit(tfids_texts, scores)

XGBRegressor(base_score=0.5, colsample_bylevel=1, colsample_bytree=1, gamma=1,
       learning_rate=0.1, max_delta_step=0, max_depth=100,
       min_child_weight=1, missing=None, n_estimators=100, nthread=-1,
       objective='reg:linear', reg_alpha=0, reg_lambda=1,
       scale_pos_weight=1, seed=0, silent=True, subsample=1)

In [47]:
mse = mean_squared_error(xgb.predict(X_test), y_test)
print(mse)

0.069134535027


In [48]:
Y_out = xgb.predict(tfids_test_texts)

In [51]:
print(Y_out[1325])

4.91391


In [52]:
ids = pd.DataFrame(feedbacks)['id']

In [53]:
content = [[ids[i],  round(Y_out[i])] for i, e in enumerate(Y_out)]

In [54]:
solution = pd.DataFrame( content, columns = ['ReviewId', 'Score'], dtype=int)
solution.head()

Unnamed: 0,ReviewId,Score
0,32551,5
1,80268,5
2,78875,5
3,47745,2
4,105443,5


In [55]:
solution.to_csv('solution.csv',index=False)