In [1]:
import string
import re
import pandas as pd
import time

from nltk.corpus import stopwords
from spacy.en import English

from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score
from subprocess import run
from fasttext import supervised, load_model

### Loading data

In [2]:
# Read both pipe-separated review files and stack them into a single frame
# with a fresh consecutive index.
data = pd.concat(
    [pd.read_csv(csv_name, sep='|')
     for csv_name in ('reviews_rt_all.csv', 'imdb_small.csv')],
    ignore_index=True,
)
print(data.shape)
data.head()

(152610, 2)


Unnamed: 0,label,text
0,1,"To an entire generation of filmgoers, it just ..."
1,1,Pixar classic is one of the best kids' movies ...
2,1,Apesar de representar um imenso avanço tecnoló...
3,1,"When Woody perks up in the opening scene, it's..."
4,1,Introduced not one but two indelible character...


### Processing data

* Words with n't|'re|'s|'ve|'ll|'d were fixed for lemmatization
* Actors can appear in both good and bad movies, so we attempted to remove their names (at least in the cases where the names are given in brackets)
* Digits, special signs and one-letter words were removed

In [3]:
# Punctuation that is blanked out by clean_data. '.' is excluded here because
# dots are handled by a dedicated regex inside clean_data.
redundant_signs = set(string.punctuation) - {'.'}
# Lowercase ASCII letters plus dot and space, as a list of single characters.
letters = list(string.ascii_lowercase + '. ')

# Contraction suffixes and what they are rewritten to, applied in this order.
_CONTRACTIONS = (
    ("n't", " not"),
    ("'re", " are"),
    ("'s", " s"),
    ("'ve", " have"),
    ("'ll", " will"),
    ("'d", " d"),
)
# Shortest span that starts with an opening bracket and ends with a closing one.
_BRACKETED = re.compile(r'([\(\[\{].+?[\)\]\}])')


def clean_data(inp_str):
    """Normalise one raw review into lowercase, space-separated words.

    Steps, in order:
      1. lowercase the text;
      2. split contractions (haven't -> have not, they're -> they are, ...);
      3. remove bracketed fragments such as "(tom hanks)";
      4. replace punctuation, digits and dots with spaces;
      5. remove one-letter ASCII words;
      6. collapse runs of spaces.

    Parameters
    ----------
    inp_str : str
        Raw review text. Any non-string value (for example the float NaN
        pandas uses for a missing cell) is treated as an empty review.

    Returns
    -------
    str
        The cleaned text; '' when nothing is left or the input was not a str.
    """
    # A missing review read by pandas is a float NaN, which has no .lower().
    if not isinstance(inp_str, str):
        return ''

    inp_str = inp_str.lower()

    # fix haven't|doesn't|shouldn't and similar cases
    for suffix, replacement in _CONTRACTIONS:
        inp_str = inp_str.replace(suffix, replacement)

    # Bracketed fragments may hold actor names, genre notes etc.; they are
    # considered redundant and dropped.
    for fragment in _BRACKETED.findall(inp_str):
        inp_str = inp_str.replace(fragment, "")

    # replace redundant_signs
    for sign in redundant_signs:
        inp_str = inp_str.replace(sign, ' ')

    # replace digits
    inp_str = re.sub(r'\d', ' ', inp_str)
    # replace every dot (single dots and runs of dots alike) with a space
    inp_str = re.sub(r'\.+', ' ', inp_str)
    # replace one-letter words or just letters
    inp_str = re.sub(r'\b[a-z]\b', ' ', inp_str)

    # Splitting on ' ' and dropping empty pieces collapses repeated spaces.
    return ' '.join(filter(None, inp_str.split(' ')))

In [4]:
# Store the normalised version of every review next to the raw text.
data['clean_text'] = data['text'].map(clean_data)
print('source data: ', data.shape)
data.head()

source data:  (152610, 3)


Unnamed: 0,label,text,clean_text
0,1,"To an entire generation of filmgoers, it just ...",to an entire generation of filmgoers it just m...
1,1,Pixar classic is one of the best kids' movies ...,pixar classic is one of the best kids movies o...
2,1,Apesar de representar um imenso avanço tecnoló...,apesar de representar um imenso avanço tecnoló...
3,1,"When Woody perks up in the opening scene, it's...",when woody perks up in the opening scene it no...
4,1,Introduced not one but two indelible character...,introduced not one but two indelible character...


### Finalizing data

* Lemmatization was done with the spaCy package
* The word 'movie' was removed
* Reviews with non ascii letters and empty reviews were removed
* Labels were replaced with `__label__1` or `__label__0` because fastText requires explicitly marked labels.

In [5]:
# spaCy English pipeline, created once and passed to lem() for lemmatization.
nlp = English()
def lem(line, nlp):
    """Return `line` with each token replaced by its lemma.

    `nlp` is called on `line` and must yield tokens exposing a `lemma_`
    attribute; the lemmas are joined with single spaces.
    """
    lemmas = []
    for token in nlp(line):
        lemmas.append(token.lemma_)
    return ' '.join(lemmas)

In [6]:
# Words removed from every lemmatized review by finalize_data.
stop_words = ['movie']

In [7]:
def finalize_data(df, nlp):
    """Build the two-column frame that is written out for fastText.

    Each review in ``df['clean_text']`` is lemmatized with ``lem``, the words
    listed in the module-level ``stop_words`` are dropped, and reviews that
    contain characters outside the module-level ``letters`` or that end up
    empty are discarded. Labels are turned into fastText label strings.

    Parameters
    ----------
    df : pandas.DataFrame
        Must contain the columns 'clean_text' and 'label'. It is not modified.
    nlp : callable
        Tokenizer/lemmatizer forwarded to ``lem``.

    Returns
    -------
    pandas.DataFrame
        Columns 'new_label' ('__label__1 ' when label == 1, otherwise
        '__label__0 ', both with a trailing space) and 'stemed_text', with a
        fresh consecutive index.
    """
    lemmatized = df['clean_text'].apply(lem, args=(nlp,))
    stemmed = lemmatized.apply(
        lambda text: ' '.join(tok for tok in text.split() if tok not in stop_words))

    # Build the allowed-character set once instead of scanning `letters` per row.
    allowed = set(letters)
    is_ascii = stemmed.apply(lambda text: set(text).issubset(allowed)).astype(bool)
    non_empty = stemmed.apply(len) > 0

    labels = df['label'].apply(lambda x: '__label__1 ' if x == 1 else '__label__0 ')

    # Plain column selection replaces DataFrame.ix, which no longer exists in
    # current pandas.
    result = pd.DataFrame({'new_label': labels, 'stemed_text': stemmed},
                          columns=['new_label', 'stemed_text'])
    return result[is_ascii & non_empty].reset_index(drop=True)

In [8]:
# Lemmatize, filter and relabel the reviews; `data` is rebound to the result.
data = finalize_data(df=data, nlp=nlp)
print('cleaned data: ', data.shape)
data.head()

cleaned data:  (145204, 2)


Unnamed: 0,new_label,stemed_text
0,__label__1,to an entire generation of filmgoers it just m...
1,__label__1,pixar classic be one of the good kid of all time
2,__label__1,when woody perk up in the opening scene it not...
3,__label__1,introduce not one but two indelible character ...
4,__label__1,it be easy to see how virtually everything tha...


### Model learning and accuracy evaluation

`train_test_split` was used to split the data into training and test sets for evaluating our model.

https://github.com/facebookresearch/fastText/blob/master/pretrained-vectors.md

The main task is to train a single classifier on both the RT and IMDb datasets. The main difference between the two is that IMDb reviews are long, whereas RT reviews are short.

In [9]:
# Stratified 80/20 split with a fixed seed so the split is reproducible.
X_train, X_test, y_train, y_test = train_test_split(data['stemed_text'],
                                                    data['new_label'],
                                                    test_size=0.2,
                                                    random_state=42,
                                                    stratify=data['new_label'])


def _write_fasttext_file(path, labels, texts):
    """Write one '<label><text>' line per sample to `path` (UTF-8).

    The label strings already end with a space, so plain concatenation gives
    '__label__1 some review text'. Writing the two columns through
    DataFrame.to_csv would put a comma between them and glue that comma onto
    the first word of every review.
    """
    with open(path, 'w', encoding='utf-8') as out:
        out.writelines(label + text + '\n' for label, text in zip(labels, texts))


_write_fasttext_file('train_imdb_rt_vec_wo_sw.txt', y_train, X_train)
_write_fasttext_file('test_imdb_rt_vec_wo_sw.txt', y_test, X_test)

In [None]:
# Train the supervised fastText model with the command-line tool.
# The command is given as an argument list: without shell=True, a single
# command string is taken as the program name on POSIX and cannot be found.
# check=True raises CalledProcessError if fasttext exits with a non-zero status.
run(["fasttext", "supervised",
     "-input", "train_imdb_rt_vec_wo_sw.txt",
     "-lr", "0.005",
     "-epoch", "15",
     "-minCount", "500",
     "-dim", "300",
     "-output", "model_vec_wo_sw",
     "-pretrainedVectors", "wiki.en.vec"],
    check=True)

In [10]:
def get_score(X_train, X_test, y_train, y_test, model_file):
    """Load a saved fastText model and print its train/test accuracy.

    X_train / X_test are iterables of review texts. y_train / y_test are
    pandas Series of label strings whose last non-blank character is the
    class digit (e.g. '__label__1 '). model_file is the path passed to
    load_model. Nothing is returned; the report is printed.
    """
    def class_digit(raw_label):
        # '__label__1 ' -> 1
        return int(raw_label.strip()[-1])

    y_train = y_train.apply(class_digit)
    y_test = y_test.apply(class_digit)

    t_begin = time.time()
    classifier = load_model(model_file, label_prefix='__label__')
    t_loaded = time.time()
    raw_train = classifier.predict_proba(list(X_train))
    raw_test = classifier.predict_proba(list(X_test))
    t_predicted = time.time()

    # Presumably each prediction is a list of (label, probability) pairs with
    # the best label first - TODO confirm against the fasttext version in use.
    train_predictions = [int(entry[0][0]) for entry in raw_train]
    test_predictions = [int(entry[0][0]) for entry in raw_test]

    acc_tr = accuracy_score(y_train, train_predictions)
    acc_te = accuracy_score(y_test, test_predictions)

    print("=" * 30)
    print('****Results****')
    # NOTE: the first figure is the time spent in load_model and the second
    # the time spent predicting on both sets; %d prints whole seconds only.
    print('Learning model: %d seconds' % round(t_loaded - t_begin, 2))
    print('Cross-validation time: %d seconds' % round(t_predicted - t_loaded, 2))
    print("Accuracy: train - {:.6}, test - {:.6}, diff - {:.6} \n".format(acc_tr, acc_te, acc_tr - acc_te))

In [19]:
# Score the trained model on both halves of the split.
get_score(X_train, X_test, y_train, y_test, model_file='model_vec_wo_sw.bin')

****Results****
Learning model: 7 seconds
Cross-validation time: 15 seconds
Accuracy: train - 0.832658, test - 0.817568, diff - 0.0150893 

