In [0]:
import pandas as pd

In [5]:
from google.colab import drive
drive.mount('/content/drive')
%cd drive/'My Drive'

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).
[Errno 2] No such file or directory: 'drive/My Drive'
/content/drive/My Drive


In [6]:
# Read the training split first, then the test split, from the current
# working directory.
df_train, df_test = (pd.read_csv(csv_name) for csv_name in ('train.csv', 'test.csv'))
# Cell output: (rows, columns) of the training frame.
df_train.shape

(48192, 3)


# Question 9

**How do model complexity and model training time depend on text tokenization method?**

При использовании tfidf_vectorizer/count_vectorizer сложность модели и время её обучения напрямую зависят от токенизации. Чем больше уникальных слов в корпусе, тем больше входных признаков получает модель (например, MLP).


# Exam

Develop a model for predicting review rating.  
**Multiclass classification into 5 classes**  
Score: **F1 with macro averaging**  
You are forbidden to use test dataset for any kind of training.  
Remember proper training pipeline.  
If you are not using default params in the models, you have to use some validation scheme to justify them. 

Use `random_state` or `seed` params - your experiment must be reproducible.


### 1 baseline = 0.51
### 2 baseline = 0.53


In [7]:
# Preview the first rows of the raw training frame.
df_train.head()

Unnamed: 0,review,title,target
0,"The staff was very friendly, the breakfast ver...",Walker Gem,5
1,Excellent service - very approachable and prof...,Excellent Service,4
2,Really a top notch place to spend a day at the...,"Good location, warm and friendly staff",5
3,"a little noisy, there was a false fire alarm a...","nice hotel,",4
4,Place had too many animals and I'm allergic to...,Experience,3


In [8]:
#review preprocessing
import numpy as np
import pandas as pd
import os
import collections
import re
import string
import nltk
from nltk import word_tokenize, WordNetLemmatizer
from nltk.corpus import stopwords
# Fetch the NLTK resources used below: the 'punkt' tokenizer data and the
# stop-word corpus.
nltk.download('punkt')
nltk.download('stopwords')
# English stop words; read by remove_stop_words().
stoplist = stopwords.words('english')

def remove_punct(text):
    """Strip every ASCII punctuation character from ``text``.

    Parameters
    ----------
    text : str
        Raw text.

    Returns
    -------
    str
        ``text`` with all characters of ``string.punctuation`` deleted.
        Characters are removed, not replaced by a space, so words separated
        only by punctuation are joined together.
    """
    # Escape the punctuation before embedding it in a character class so the
    # pattern does not depend on how ']', '\\', '^' and '-' happen to be
    # ordered inside string.punctuation.
    return re.sub('[' + re.escape(string.punctuation) + ']', '', text)

def lower_token(tokens):
    """Return a new list holding the lower-cased form of each token."""
    lowered = []
    for token in tokens:
        lowered.append(token.lower())
    return lowered

def remove_stop_words(tokens):
    """Return the tokens that are not in the module-level ``stoplist``.

    Parameters
    ----------
    tokens : iterable of str
        Tokens to filter; they are compared to the stop words as-is, so
        callers are expected to lower-case them first.

    Returns
    -------
    list of str
        The surviving tokens, in their original order.
    """
    # Build a set once per call: membership tests against the stop-word list
    # itself would rescan the whole list for every token.
    stop_words = set(stoplist)
    return [word for word in tokens if word not in stop_words]

def preproc_pipeline(data):
    """Clean and tokenize the ``review`` column of ``data``.

    Adds three columns to ``data`` (the frame is modified in place and also
    returned):

    * ``review_clean`` - review text with punctuation removed,
    * ``tokens``       - lower-cased word tokens without stop words,
    * ``review_final`` - the tokens joined back together with single spaces.

    Parameters
    ----------
    data : pandas.DataFrame
        Frame with a ``review`` column.

    Returns
    -------
    pandas.DataFrame
        The same object that was passed in.
    """
    # A missing review would reach the regex as a float NaN and raise a
    # TypeError; treat it as empty text instead.
    reviews = data['review'].fillna('').astype(str)
    data['review_clean'] = reviews.apply(remove_punct)

    # Tokenize, lower-case and filter one review at a time rather than
    # materialising three full-corpus intermediate lists.
    filtered_words = [
        remove_stop_words(lower_token(word_tokenize(sen)))
        for sen in data['review_clean']
    ]
    data['review_final'] = [' '.join(sen) for sen in filtered_words]
    data['tokens'] = filtered_words
    return data

# Run the same preprocessing on both splits; each frame gains the
# review_clean / review_final / tokens columns.
df_train = preproc_pipeline(df_train)
df_test = preproc_pipeline(df_test)

[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Unzipping tokenizers/punkt.zip.
[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Unzipping corpora/stopwords.zip.


In [9]:
# Preview the training frame again, now with the preprocessing columns added.
df_train.head()

Unnamed: 0,review,title,target,review_clean,review_final,tokens
0,"The staff was very friendly, the breakfast ver...",Walker Gem,5,The staff was very friendly the breakfast very...,staff friendly breakfast nice extremely comfor...,"[staff, friendly, breakfast, nice, extremely, ..."
1,Excellent service - very approachable and prof...,Excellent Service,4,Excellent service very approachable and profe...,excellent service approachable professional st...,"[excellent, service, approachable, professiona..."
2,Really a top notch place to spend a day at the...,"Good location, warm and friendly staff",5,Really a top notch place to spend a day at the...,really top notch place spend day beginning end...,"[really, top, notch, place, spend, day, beginn..."
3,"a little noisy, there was a false fire alarm a...","nice hotel,",4,a little noisy there was a false fire alarm at...,little noisy false fire alarm midnight reason ...,"[little, noisy, false, fire, alarm, midnight, ..."
4,Place had too many animals and I'm allergic to...,Experience,3,Place had too many animals and Im allergic to ...,place many animals im allergic petsalthough re...,"[place, many, animals, im, allergic, petsaltho..."


In [10]:
# Corpus statistics for the training split: per-review token counts, the flat
# token stream, and the sorted set of distinct tokens.
training_sentence_lengths = list(map(len, df_train["tokens"]))
all_training_words = [tok for doc in df_train["tokens"] for tok in doc]
TRAINING_VOCAB = sorted(set(all_training_words))
print("%s words total, with a vocabulary size of %s" % (len(all_training_words), len(TRAINING_VOCAB)))
print("Max sentence length is %s" % max(training_sentence_lengths))

1561308 words total, with a vocabulary size of 58614
Max sentence length is 1343


In [11]:
# Corpus statistics for the test split: per-review token counts, the flat
# token stream, and the sorted set of distinct tokens.
test_sentence_lengths = list(map(len, df_test["tokens"]))
all_test_words = [tok for doc in df_test["tokens"] for tok in doc]
TEST_VOCAB = sorted(set(all_test_words))
print("%s words total, with a vocabulary size of %s" % (len(all_test_words), len(TEST_VOCAB)))
print("Max sentence length is %s" % max(test_sentence_lengths))

175433 words total, with a vocabulary size of 15990
Max sentence length is 424


In [0]:
#embeddings based on word2vec
from gensim import models

# Pretrained vectors, read in binary word2vec format from the working directory.
#MODEL: https://drive.google.com/file/d/0B7XkCwpI5KDYNlNUTTlSS21pQmM/view?usp=sharing
word2vec_path = 'GoogleNews-vectors-negative300.bin.gz'
word2vec = models.KeyedVectors.load_word2vec_format(word2vec_path, binary=True)

def get_average_word2vec(tokens_list, vector, generate_missing=False, k=300):
    """Average the embeddings of the tokens of one document.

    Parameters
    ----------
    tokens_list : sequence of str
        Tokens of the document.
    vector : mapping
        Anything supporting ``token in vector`` and ``vector[token]``.
    generate_missing : bool
        If true, a token absent from ``vector`` contributes a fresh
        ``np.random.rand(k)`` vector; otherwise it contributes zeros.
    k : int
        Dimensionality used for the zero / random fallback vectors.

    Returns
    -------
    numpy.ndarray
        The element-wise mean over all tokens, or ``np.zeros(k)`` for an
        empty document.
    """
    if len(tokens_list) == 0:
        return np.zeros(k)

    # Pick the fallback generator once; it is called once per unknown token.
    fallback = np.random.rand if generate_missing else np.zeros

    word_vectors = []
    for token in tokens_list:
        if token in vector:
            word_vectors.append(vector[token])
        else:
            word_vectors.append(fallback(k))

    return np.divide(np.sum(word_vectors, axis=0), len(word_vectors))

def get_word2vec_embeddings(vectors, clean_comments, generate_missing=False):
    """Compute one averaged embedding per row of ``clean_comments``.

    Each entry of the ``tokens`` column is passed, in row order, to
    ``get_average_word2vec`` together with ``vectors``; the results are
    returned as a plain list.
    """
    return [
        get_average_word2vec(doc_tokens, vectors, generate_missing=generate_missing)
        for doc_tokens in clean_comments['tokens']
    ]

# With generate_missing=True every out-of-vocabulary token gets a vector drawn
# from NumPy's global RNG, so seed it first; otherwise the features (and every
# score computed from them) change from run to run.
np.random.seed(1234)
training_w2v_embeddings = get_word2vec_embeddings(word2vec, df_train, generate_missing=True)
test_w2v_embeddings = get_word2vec_embeddings(word2vec, df_test, generate_missing=True)

  'See the migration notes for details: %s' % _MIGRATION_NOTES_URL


In [0]:
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import GridSearchCV

# Random forest with a fixed seed; the grid search below only tunes the
# number of trees.
estimator = RandomForestClassifier(random_state = 1234)

param_grid = {'n_estimators': [60, 80]}

# 3-fold cross-validation scored by macro-averaged F1.
model = GridSearchCV(estimator, param_grid, 
                     cv=3, scoring='f1_macro', n_jobs=-1, verbose=1)

# Only the training split is used for the search.
model.fit(training_w2v_embeddings, df_train['target'])

print('Best parameters found by grid search are:', model.best_params_)

Fitting 3 folds for each of 2 candidates, totalling 6 fits


[Parallel(n_jobs=-1)]: Using backend LokyBackend with 2 concurrent workers.
[Parallel(n_jobs=-1)]: Done   6 out of   6 | elapsed:  3.6min finished


Best parameters found by grid search are: {'n_estimators': 80}


In [0]:
from sklearn import metrics

# Predict labels for both splits with the fitted grid-search model.
y_pred_train = model.predict(training_w2v_embeddings)
y_pred_test = model.predict(test_w2v_embeddings)

# Macro-averaged F1 on each split. In the saved run the training score is
# about 0.99 while the test score is about 0.36, i.e. the model overfits.
print('F1 score (based on training data):', metrics.f1_score(df_train['target'], y_pred_train, average='macro'))
print('F1 score (based on testing data):', metrics.f1_score(df_test['target'], y_pred_test, average='macro'))

F1 score (based on training data): 0.9882067331561967
F1 score (based on testing data): 0.3630106670336927


In [12]:
# some wrappers to work with word2vec
from gensim.models.word2vec import Word2Vec
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.base import TransformerMixin
from collections import defaultdict

   
class Word2VecWrapper(TransformerMixin):
    """Small fit/transform wrapper around a gensim ``Word2Vec`` model.

    Parameters
    ----------
    window, negative, size, iter :
        Forwarded unchanged to ``Word2Vec`` under the same keyword names.
    is_cbow : bool
        Train CBOW when true, skip-gram otherwise.
    random_state : int
        Forwarded to ``Word2Vec`` as ``seed``.
    workers : int
        Number of training threads. The default of 3 matches gensim's own
        default; gensim documents that a fully deterministic run additionally
        requires ``workers=1``.
    """

    def __init__(self, window=5,negative=5, size=100, iter=100, is_cbow=False, random_state=1234, workers=3):
        self.window_ = window
        self.negative_ = negative
        self.size_ = size
        self.iter_ = iter
        self.is_cbow_ = is_cbow
        self.w2v = None  # set by fit()
        self.random_state = random_state
        self.workers_ = workers

    def get_size(self):
        """Return the embedding dimensionality given at construction."""
        return self.size_

    def fit(self, X, y=None):
        """Train the word2vec model.

        X: iterable of strings; each string is split on whitespace into one
           training sentence.
        y: ignored.

        Returns ``self``.
        """
        sentences_list = [x.split() for x in X]
        # NOTE(review): `size` / `iter` are the gensim < 4 keyword names
        # (later renamed `vector_size` / `epochs`); kept because the rest of
        # the notebook targets that API - confirm the installed version.
        self.w2v = Word2Vec(sentences_list,
                            window=self.window_,
                            negative=self.negative_,
                            size=self.size_,
                            iter=self.iter_,
                            # gensim documents `sg` as an int flag, not a bool
                            sg=int(not self.is_cbow_),
                            seed=self.random_state,
                            workers=self.workers_)

        return self

    def _keyed_vectors(self):
        """Return the trained word vectors, raising if fit() was never called."""
        if self.w2v is None:
            raise Exception('model not fitted')
        # Look words up on `.wv`: indexing and `in` on the Word2Vec model
        # object itself are deprecated.
        return self.w2v.wv

    def has(self, word):
        """Return True if ``word`` has a trained vector."""
        return word in self._keyed_vectors()

    def transform(self, X):
        """Look up one vector per word.

        X: a list of words (a single document, not a list of documents).

        Returns an array with one row per word; a word without a trained
        vector is represented by a row of zeros.

        Raises ``Exception`` if the model has not been fitted.
        """
        vectors = self._keyed_vectors()
        return np.array([vectors[w] if w in vectors else np.zeros(self.size_) for w in X])

from sklearn.preprocessing import OneHotEncoder
from sklearn.preprocessing import OneHotEncoder
from sklearn.linear_model import LogisticRegression, LogisticRegressionCV
from sklearn.svm import LinearSVC
import scipy.sparse as sp

# Imported here so this cell does not rely on a bare `model_selection` name,
# which is never imported anywhere in the notebook.
from sklearn.model_selection import GridSearchCV

sentences_list = df_train['review_final']

# The embedding is trained on the training reviews only; the test split is
# never used for fitting.
w2v_cbow = Word2VecWrapper(window=5, negative=5, size=300, iter=300, is_cbow=True, random_state=1234)
w2v_cbow.fit(sentences_list)

embeding = w2v_cbow


def _average_embedding(tokens, embedding):
    """Return the mean word vector of one document (zeros if it has no tokens)."""
    if len(tokens) == 0:
        return np.zeros(embedding.get_size())
    # transform() expects the words of a single document and returns one row
    # per word; average the rows to get a fixed-length document vector.
    return embedding.transform(tokens).mean(axis=0)


# One dense row per review. The earlier version passed the whole `tokens`
# column to transform() (which looks up individual words, not token lists)
# and one-hot encoded the list-valued column, re-fitting that encoder on the
# test split; neither step can produce usable, consistent features, so the
# one-hot block is dropped.
X_train = np.vstack([_average_embedding(tokens, embeding) for tokens in df_train['tokens']])
X_test = np.vstack([_average_embedding(tokens, embeding) for tokens in df_test['tokens']])

# 3-fold cross-validated search over the regularisation strength C,
# scored by macro-averaged F1.
model = GridSearchCV(LinearSVC(penalty='l2', multi_class='ovr', random_state=1234),
                     {'C': np.logspace(-4, 0, 5)},
                     cv=3, scoring='f1_macro', n_jobs=-1, verbose=1)

KeyboardInterrupt: ignored

In [0]:
# Run the grid search on the training features only.
model.fit(X_train, df_train['target'])

# Leftover draft of the evaluation; `y_train` / `y_test` are not defined in
# the visible cells, and the next cell computes the same scores.
#print('train', metrics.f1_score(y_train, model.predict(X_train), average='macro'))
#print('test', metrics.f1_score(y_test, model.predict(X_test), average='macro'))

In [0]:
from sklearn import metrics

# Predict labels for both splits with the fitted grid-search model.
y_pred_train = model.predict(X_train)
y_pred_test = model.predict(X_test)

# Macro-averaged F1 on each split (the metric required by the exam statement).
print('F1 score (based on training data):', metrics.f1_score(df_train['target'], y_pred_train, average='macro'))
print('F1 score (based on testing data):', metrics.f1_score(df_test['target'], y_pred_test, average='macro'))