In [None]:
import string
import re
import pandas as pd
import time

from nltk.corpus import stopwords
from spacy.en import English
# pip install -U spacy
# python -m spacy.en.download all

from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score
from fasttext import supervised, load_model

The pre-trained model can be downloaded here:
https://mega.nz/#!658hyBjK!3Nx5M0IOq3uBpl0nRt555ycaW-aeZXhc1b6EjZeizPE

### Loading data

In [None]:
# Read both pipe-delimited review files and stack them into one frame
# with a fresh, continuous index.
review_files = ['reviews_rt_all.csv', 'imdb_small.csv']
data = pd.concat(
    [pd.read_csv(path, sep='|') for path in review_files],
    ignore_index=True,
)
print(data.shape)
data.head()

### Processing data

* Contraction suffixes (n't, 're, 's, 've, 'll, 'd) were split off or expanded so that the remaining words can be lemmatized
* Actors can appear in both good and bad movies, so we tried to remove their names (at least in the cases where the names are given in brackets)
* Digits, punctuation and one-letter words were removed

In [None]:
# Punctuation that clean_data() blanks out. The dot is excluded here because
# it is handled by its own regex inside clean_data().
redundant_signs = set(string.punctuation) - {'.'}
# Lowercase ASCII letters plus dot and space, as a list of single characters.
letters = list(string.ascii_lowercase + '. ')

# One-pass translation table: every redundant sign -> a single space.
_SIGNS_TO_SPACE = str.maketrans(dict.fromkeys(redundant_signs, ' '))

# Contraction suffixes and their replacements, applied in this order
# (covers haven't / doesn't / shouldn't / they're / we'll ... cases).
_CONTRACTIONS = (
    ("n't", " not"),
    ("'re", " are"),
    ("'s", " s"),
    ("'ve", " have"),
    ("'ll", " will"),
    ("'d", " d"),
)


def clean_data(inp_str):
    """Normalise one raw review into lowercase words separated by single spaces.

    Steps, in order:
      1. lowercase the text;
      2. rewrite contraction suffixes (see ``_CONTRACTIONS``);
      3. delete bracketed fragments such as ``(...)``, ``[...]`` or ``{...}``;
      4. replace punctuation, digits and dots with spaces;
      5. drop stand-alone one-letter words;
      6. collapse runs of spaces and strip leading/trailing spaces.

    Args:
        inp_str (str): raw review text.

    Returns:
        str: the cleaned text; may be an empty string.
    """
    inp_str = inp_str.lower()

    for suffix, replacement in _CONTRACTIONS:
        inp_str = inp_str.replace(suffix, replacement)

    # Bracketed fragments may hold actor names, categories etc.; they are
    # treated as redundant and every occurrence of each fragment is removed.
    for fragment in re.findall(r'[\(\[\{].+?[\)\]\}]', inp_str):
        inp_str = inp_str.replace(fragment, "")

    # Blank out all remaining punctuation (except dots) in a single pass.
    inp_str = inp_str.translate(_SIGNS_TO_SPACE)

    # Blank out digits.
    inp_str = re.sub(r'\d', ' ', inp_str)
    # Blank out dots. NOTE(review): an earlier comment said a single dot was
    # kept as a sentence separator, but this pattern also matches one dot, so
    # no dots survive. Behaviour is left as-is; confirm the intent before
    # changing the quantifier.
    inp_str = re.sub(r'\.{1,10}', ' ', inp_str)
    # Blank out one-letter words / stray single letters.
    inp_str = re.sub(r"\b[a-z]\b", ' ', inp_str)

    # Only the space character is treated as a separator here (tabs and
    # newlines are left inside tokens).
    return ' '.join(part for part in inp_str.split(' ') if part)

In [None]:
# Run clean_data() over every raw review and store the result in a new
# 'clean_text' column next to the original 'text'.
data['clean_text'] = data['text'].apply(clean_data)
print('source data: ', data.shape)
data.head()

### Finalizing data

* Lemmatization was done with the spaCy package
* The word 'movie' was removed
* Reviews containing non-ASCII characters and empty reviews were removed

In [None]:
# spaCy English pipeline, created once and passed to finalize_data() below.
nlp = English()
def lem(line, nlp):
    """Return `line` with every token replaced by its lemma.

    `nlp` is called on `line`; the `lemma_` of each item it yields is
    collected and the lemmas are joined with single spaces.
    """
    lemmas = (token.lemma_ for token in nlp(line))
    return ' '.join(lemmas)

In [None]:
# Tokens that finalize_data() removes from every lemmatized review.
stop_words = ['movie']

In [None]:
def finalize_data(df, nlp):
    """Lemmatize the cleaned reviews and keep only the usable ones.

    Each ``clean_text`` entry is lemmatized with ``lem``, tokens listed in
    ``stop_words`` are dropped, and a review is kept only when the result is
    non-empty and consists solely of characters from ``letters``.

    Args:
        df: frame with at least the ``label`` and ``clean_text`` columns.
            It is not modified; the helper columns are built on a copy.
        nlp: callable forwarded to ``lem`` for every review.

    Returns:
        A new frame with a fresh 0..n-1 index and exactly two columns,
        ``label`` and ``stemed_text``.
    """
    stemmed = df['clean_text'].apply(lem, args=(nlp,))
    stemmed = stemmed.apply(
        lambda text: ' '.join(tok for tok in text.split() if tok not in stop_words))

    allowed = set(letters)
    # Keep reviews that are non-empty and contain only allowed characters.
    # astype(bool) keeps the mask boolean even when the frame is empty.
    keep = stemmed.apply(
        lambda text: len(text) > 0 and set(text).issubset(allowed)).astype(bool)

    # assign() returns a copy, so the caller's frame stays untouched;
    # .loc replaces the .ix indexer that pandas has removed.
    result = df.assign(stemed_text=stemmed).loc[keep, ['label', 'stemed_text']]
    return result.reset_index(drop=True)

In [None]:
# Replace `data` with the lemmatized, filtered frame. After this cell `data`
# only has the columns returned by finalize_data(), so re-running the cell
# requires re-running the cells above first.
data = finalize_data(data, nlp)
print('cleaned data: ', data.shape)
data.head()

### Accuracy evaluation

In [None]:
def get_score(df, model_name):
    """Load a saved classifier and print its accuracy on `df`.

    Args:
        df: frame with the ``stemed_text`` (model input) and ``label``
            (expected class) columns.
        model_name: path of the model file handed to ``load_model``.

    Prints a small report; nothing is returned.
    """
    model = load_model(model_name, label_prefix='__label__')
    texts = list(df['stemed_text'])

    # The first element of the first entry of each prediction is read as the
    # predicted label and cast to int.
    # NOTE(review): assumes entries are ordered best-first - confirm against
    # the fasttext version in use.
    predicted = []
    for ranked in model.predict_proba(texts):
        predicted.append(int(ranked[0][0]))

    print("=" * 30)
    print('****Results****')
    expected = list(df['label'])
    accuracy = accuracy_score(expected, predicted)
    print("Accuracy: test - {:.6}\n".format(accuracy))

In [None]:
# Score the saved model on the prepared reviews; the model file is looked up
# relative to the current working directory.
get_score(data, 'model_vec_wo_sw.bin')