In [1]:
# import main libraries/packages
import warnings
warnings.filterwarnings('ignore') # to ignore annoying IPython warnings
import numpy as np
import pandas as pd
import gzip
import re

from sklearn.model_selection import train_test_split
from scipy.sparse import csr_matrix, hstack # to get memory-efficient representation of matrices (sparse format)
from textblob import TextBlob, Word

# preprocessing / feature extraction / feature transformation
from sklearn.preprocessing import FunctionTransformer
from sklearn.feature_extraction.text import TfidfTransformer, TfidfVectorizer, CountVectorizer
from sklearn.decomposition import PCA, SparsePCA

# models
from sklearn.ensemble import ExtraTreesClassifier
from sklearn.pipeline import Pipeline, FeatureUnion

# metrics/validation
from sklearn.model_selection import cross_val_score, train_test_split, StratifiedKFold 
from sklearn.model_selection import GridSearchCV, RandomizedSearchCV
from sklearn.metrics import accuracy_score, f1_score, roc_auc_score, confusion_matrix
from sklearn.metrics import classification_report
# model serialization/deserialization
import dill

### Unpacking from .json

In [3]:
def parse(path):
    g = gzip.open(path, 'rb')
    for l in g:
        yield eval(l)
        
def getDF(path):
    i = 0
    df = {}
    for d in parse(path):
        df[i] = d
        i += 1
    return pd.DataFrame.from_dict(df, orient='index')

'''
df = getDF('reviews_Movies_and_TV_5.json.gz')
train, test = train_test_split(df.asin.unique(), test_size=0.1, random_state=42)
df_train = df[df.asin.isin(train) & (df.overall != 3)].copy()
df_test = df[df.asin.isin(test) & (df.overall != 3)].copy()
df_train['overall'] = df_train['overall'].apply(lambda x: 1 if x > 3 else 0)
df_test['overall'] = df_test['overall'].apply(lambda x: 1 if x > 3 else 0)
df_train[['overall', 'reviewText', 'asin']].to_csv('csv/train.csv', index=False)
df_test[['overall', 'reviewText']].to_csv('csv/test.csv', index=False)
'''

pass

# loading datasets

In [7]:
# load train dataset
train_df_names = ['csv/train.csv']

df = pd.concat((pd.read_csv(name, engine='c', sep=',', 
                 usecols=['overall', 'reviewText']) for name in train_df_names), ignore_index=True)

df = df.rename(columns={'overall' : 'label', 'reviewText' : 'text'}) # rename columns

# load test dataset
df_test = pd.read_csv('csv/test.csv', sep=",", engine='c', usecols=['overall', 'reviewText'])
df_test = df_test.rename(columns={'overall' : 'label', 'reviewText' : 'text'})

# other test datasets
test_RT = pd.read_csv("csv/reviews_rt_all.csv", sep="|", engine='c', usecols=['label', 'text'])
test_imdb = pd.read_csv("csv/imdb_small.csv", sep="|", engine='c', usecols=['label', 'text'])
test_polarity_RT = pd.read_csv("csv/test_reviews.csv", sep="|", engine='c', usecols=['label', 'text'])

# to avoid some errors in 'object'-typed column
df.text = df.text.apply(str)
df_test.text = df_test.text.apply(str)
test_RT.text = test_RT.text.apply(str)
test_imdb.text = test_imdb.text.apply(str)
test_polarity_RT.text = test_polarity_RT.text.apply(str)

print('review count: {}'.format(len(df)))

review count: 1343971


In [8]:
df.head()

Unnamed: 0,label,text
0,1,This is a charming version of the classic Dick...
1,1,Henry Winkler is very good in this twist on th...
2,1,This is one of the best Scrooge movies out. H...
3,1,This has been a favorite movie of mine for a l...
4,1,This is the American adaptation of the Charles...


In [9]:
%%time
# check for class balance
print('class balance Train set:', '\n', df.label.value_counts())
print('\n','class balance Test set:', '\n', df_test.label.value_counts())
print('\n','class balance Test RT set:', '\n', test_RT.label.value_counts())
print('\n','class balance Test imdb set:', '\n', test_imdb.label.value_counts())
print('\n','class balance Test polarity RT set:', '\n', test_polarity_RT.label.value_counts())

class balance Train set: 
 1    1158561
0     185410
Name: label, dtype: int64

 class balance Test set: 
 1    131041
0     21219
Name: label, dtype: int64

 class balance Test RT set: 
 1    64658
0    37952
Name: label, dtype: int64

 class balance Test imdb set: 
 1    25000
0    25000
Name: label, dtype: int64

 class balance Test polarity RT set: 
 1    5330
0    5330
Name: label, dtype: int64
Wall time: 13 ms


In [10]:
%%time
# imbalanced data
#df_small = df.sample(500000, random_state=50)

# balanced data
df_small = pd.concat([df[df.label == 1].sample(814590, random_state=50),  df[df.label == 0].sample(185410, random_state=50)])
print('class balance:', '\n', df_small.label.value_counts())

class balance: 
 1    814590
0    185410
Name: label, dtype: int64
Wall time: 307 ms


In [11]:
df_small.head()

Unnamed: 0,label,text
103191,1,This is a great and classic movie that I basic...
806026,1,Everyone loves Vince Vaughn but where are the ...
395165,1,"The live action movie may be out, but those of..."
1000240,1,This is a nice compilation of Olivia episodes....
143654,1,I didn't get out to see this one when it came ...


# Preprocessing

In [12]:
%%time
def get_rate(s):
    candidates = re.findall(r'(\d{1,3}[\\|/]{1}\d{1,2})', s)
    rates = []
    for c in candidates:
        try:
            rates.append(eval(c))
        except SyntaxError:
            pass
        except ZeroDivisionError:
            return 0
    return np.median(rates)

# regular expression to split review on sentences
sentence_splitter = re.compile('(?<!\w\.\w.)(?<![A-Z][a-z]\.)(?<![A-Z]\.)(?<=\!|\?|\.)\s')

# lists of positive/negative smiles
positive_smiles = set([
":‑)",":)",":-]",":]",":-3",":3",":->",":>","8-)","8)",":-}",":}",":o)",":c)",":^)","=]","=)",":‑D",":D","8‑D","8D",
"x‑D","xD","X‑D","XD","=D","=3","B^D",":-))",";‑)",";)","*-)","*)",";‑]",";]",";^)",":‑,",";D",":‑P",":P","X‑P","XP",
"x‑p","xp",":‑p",":p",":‑Þ",":Þ",":‑þ",":þ",":‑b",":b","d:","=p",">:P", ":'‑)", ":')",  ":-*", ":*", ":×"
])
negative_smiles = set([
":‑(",":(",":‑c",":c",":‑<",":<",":‑[",":[",":-||",">:[",":{",":@",">:(","D‑':","D:<","D:","D8","D;","D=","DX",":‑/",
":/",":‑.",'>:\\', ">:/", ":\\", "=/" ,"=\\", ":L", "=L",":S",":‑|",":|","|‑O","<:‑|"
])

# pattern to catch SUCH WORDS and ignore SuCH :)
uppercase_pattern = re.compile(r'(\b[0-9]*[A-Z]+[0-9]*[A-Z]{1,}[0-9]*\b)')

# contrast conjugations
contrast_conj = set([
'alternatively','anyway','but','by contrast','differ from','elsewhere','even so','however','in contrast','in fact',
'in other respects','in spite of','in that respect','instead','nevertheless','on the contrary','on the other hand',
'rather','though','whereas','yet'])

# to get review "purity" ~ same sentiment over review (~1) or not (~0)
def purity(sentences):
    polarities = np.array([TextBlob(x).sentiment.polarity for x in sentences])
    return polarities.sum() / np.abs(polarities).sum()

# feature engineering ^-^
def get_custom_features(text):
    # assume text = pd.Series with review text
    print('extracting custom features...')
    tdf = pd.DataFrame()
    tdf['text'] = text 
    
    # feature 4 - totally uppercase words (like HOLY JESUS!)
    tdf['upper_word_cnt'] = tdf.text.apply(lambda s: len(re.findall(uppercase_pattern, s)))
    
    # try to extract rating :) like "great film. 9/10" will yield 0.9
    tdf['rating'] = tdf['text'].apply(get_rate).fillna(-1) # feature 5 - rating (if found in review)

    # try to extract smiles and count positive/negative smiles per review (features 6,7)
    tdf['positive_smiles'] = tdf.text.apply(lambda s: len([x for x in s.split() if x in positive_smiles]))
    tdf['negative_smiles'] = tdf.text.apply(lambda s: len([x for x in s.split() if x in negative_smiles]))
     
    return csr_matrix(tdf[tdf.columns[2:]].values) # to get sparse format

Wall time: 0 ns


# model 

In [15]:
# EXTRACTOR
extraction_list = []

# 1. custom features
extraction_list.append(['custom_features', 
                             FunctionTransformer(func=get_custom_features,
                                                 validate=False,
                                                 accept_sparse=True
                                                )
                            ])
# 2. simple bag-of-words (tf-idf)
extraction_list.append(['tfidf', 
                             TfidfVectorizer(decode_error='ignore',
                                             max_df=0.75, 
                                             min_df=3,
                                             ngram_range=(1, 3),
                                             max_features=None,
                                             stop_words='english'
                                            )
                            ])

extractor = FeatureUnion(extraction_list)


# CLASSIFIER
clf = ExtraTreesClassifier(n_estimators=50, 
                             max_leaf_nodes=None, 
                             verbose=2,
                             min_samples_leaf=3, 
                             random_state=1,
                             n_jobs=-1,
                             class_weight='balanced',
                             criterion='entropy'
                            )

# create pipeline, combining steps together                                                                                                                       

model = Pipeline(
    [
        ('feature_extraction', extractor),
        ('clf', clf)
    ])

# training model

In [16]:
%%time
# train/test split
X_train, X_test, y_train, y_test = train_test_split(
                                                    df_small.text, 
                                                    df_small.label, 
                                                    test_size=0.1, 
                                                    random_state=50, 
                                                    stratify=df_small.label
                                                   )



# fit model
model.fit(X_train, y_train)
print('finally fitted :)')

#check results on validation
print('Accuracy on validation: {}'.format(accuracy_score(model.predict(X_test), y_test)))
print(classification_report(y_test, model.predict(X_test)))

extracting custom features...


[Parallel(n_jobs=-1)]: Done  34 tasks      | elapsed: 18.0min
[Parallel(n_jobs=-1)]: Done  50 out of  50 | elapsed: 23.3min finished


finally fitted :)
extracting custom features...


[Parallel(n_jobs=8)]: Done  34 tasks      | elapsed:    8.2s
[Parallel(n_jobs=8)]: Done  50 out of  50 | elapsed:   10.6s finished


Accuracy on validation: 0.88306
extracting custom features...


[Parallel(n_jobs=8)]: Done  34 tasks      | elapsed:    8.2s


             precision    recall  f1-score   support

          0       0.64      0.86      0.73     18541
          1       0.96      0.89      0.93     81459

avg / total       0.90      0.88      0.89    100000

Wall time: 40min 36s


[Parallel(n_jobs=8)]: Done  50 out of  50 | elapsed:   10.6s finished


# testing model

In [17]:
%%time
# test predict RT
X, y = test_RT.text, test_RT.label
y_pred_RT = model.predict(X)
print(classification_report(y, y_pred_RT))
print('accuracy: {}'.format(accuracy_score(y, y_pred_RT)))


extracting custom features...


[Parallel(n_jobs=8)]: Done  34 tasks      | elapsed:   10.9s


             precision    recall  f1-score   support

          0       0.73      0.41      0.52     37952
          1       0.72      0.91      0.81     64658

avg / total       0.73      0.73      0.70    102610

accuracy: 0.725163239450346
Wall time: 21.2 s


[Parallel(n_jobs=8)]: Done  50 out of  50 | elapsed:   14.1s finished


In [18]:
# test predict imdb
X, y = test_imdb.text, test_imdb.label
y_pred_imdb = model.predict(X)
print(classification_report(y, y_pred_imdb ))
print('accuracy: {}'.format(accuracy_score(y, y_pred_imdb)))

extracting custom features...


[Parallel(n_jobs=8)]: Done  34 tasks      | elapsed:    4.4s


             precision    recall  f1-score   support

          0       0.82      0.92      0.87     25000
          1       0.90      0.80      0.85     25000

avg / total       0.86      0.86      0.86     50000

accuracy: 0.8572


[Parallel(n_jobs=8)]: Done  50 out of  50 | elapsed:    5.8s finished


In [19]:
# test predict polarity_RT
X, y = test_polarity_RT.text, test_polarity_RT.label
y_pred_pol = model.predict(X)
print(classification_report(y, y_pred_pol))
print('accuracy: {}'.format(accuracy_score(y, y_pred_pol)))

extracting custom features...


[Parallel(n_jobs=8)]: Done  34 tasks      | elapsed:    1.6s


             precision    recall  f1-score   support

          0       0.82      0.42      0.56      5330
          1       0.61      0.91      0.73      5330

avg / total       0.72      0.66      0.64     10660

accuracy: 0.6647279549718574


[Parallel(n_jobs=8)]: Done  50 out of  50 | elapsed:    2.0s finished


In [20]:
%%time
#test predict
X, y = df_test.text, df_test.label
y_pred = model.predict(X)
print(classification_report(y, y_pred)) # смотреть на значение f1-score в строке для класса 0
print('accuracy: {}'.format(accuracy_score(y, y_pred)))

extracting custom features...


[Parallel(n_jobs=8)]: Done  34 tasks      | elapsed:   11.8s


             precision    recall  f1-score   support

          0       0.55      0.86      0.67     21219
          1       0.97      0.89      0.93    131041

avg / total       0.92      0.88      0.89    152260

accuracy: 0.8831144095625904
Wall time: 1min 11s


[Parallel(n_jobs=8)]: Done  50 out of  50 | elapsed:   14.9s finished
