In [1]:
# import main libraries/packages
import warnings
warnings.filterwarnings('ignore') # to ignore annoying IPython warnings
import numpy as np
import re # for some regexp magic ^-^
import pandas as pd
import gzip
from sklearn.model_selection import train_test_split

# for the future use

from scipy.sparse import csr_matrix, hstack # to get memory-efficient representation of matrices (sparse format)
from textblob import TextBlob, Word # pip install textblob / conda install textblob

# preprocessing / feature extraction / feature transformation
from sklearn.preprocessing import FunctionTransformer
from sklearn.feature_extraction.text import TfidfTransformer, TfidfVectorizer, CountVectorizer
from sklearn.decomposition import PCA, SparsePCA

# models
from sklearn.ensemble import RandomForestClassifier, GradientBoostingClassifier, ExtraTreesClassifier, VotingClassifier
#from sklearn.svm import LinearSVC, SVC # for the future blending/stacking, also - as baselines to beat
#from sklearn.linear_model import LogisticRegression # for the future blending/stacking, also - as baselines to beat
#from xgboost import XGBClassifier  # uncomment if you have it installed
# how to install xgboost on windows - 
# https://www.ibm.com/developerworks/community/blogs/jfp/entry/Installing_XGBoost_For_Anaconda_on_Windows

# model/feature aggregation in Pipelines
from sklearn.pipeline import Pipeline, FeatureUnion

# metrics/validation
from sklearn.model_selection import cross_val_score, train_test_split, StratifiedKFold 
from sklearn.model_selection import GridSearchCV, RandomizedSearchCV
from sklearn.metrics import accuracy_score, f1_score, roc_auc_score, confusion_matrix

# model serialization/deserialization
import dill

In [None]:
%%time
import pandas as pd
import gzip
from sklearn.model_selection import train_test_split

path = 'D:/4_Учебное/DataAnalysis/Kaggle/data/reviews_Movies_and_TV_5.json.gz'

def parse(path):
    g = gzip.open(path, 'rb')
    for l in g:
        yield eval(l)

def getDF(path):
    i = 0
    df = {}
    for d in parse(path):
        df[i] = d
        i += 1
    return pd.DataFrame.from_dict(df, orient='index')

df = getDF(path)  
train, test = train_test_split(df.asin.unique(),test_size=0.1, random_state=42)
df_train = df[df.asin.isin(train) & (df.overall != 3)].copy()
df_test = df[df.asin.isin(test) & (df.overall != 3)].copy()
df_train['overall'] = df_train['overall'].apply(lambda x: 1 if x > 3 else 0)
df_test['overall'] = df_test['overall'].apply(lambda x: 1 if x > 3 else 0)
df_train[['overall','reviewText']].to_csv('train.csv', index=False)
df_test[['overall','reviewText']].to_csv('test.csv', index=False)

In [2]:
# load train dataset
train_df_names = ['train.csv']

df = pd.concat((pd.read_csv(name, engine='c', sep=',', 
                 usecols=['overall', 'reviewText']) for name in train_df_names), ignore_index=True)
print('review count: {}'.format(len(df)))

review count: 1343971


In [8]:
df.head(10)

Unnamed: 0,overall,reviewText
0,1,This is a charming version of the classic Dick...
1,1,Henry Winkler is very good in this twist on th...
2,1,This is one of the best Scrooge movies out. H...
3,1,This has been a favorite movie of mine for a l...
4,1,This is the American adaptation of the Charles...
5,1,Glad that this american classic came out on dv...
6,1,A good Christmas carol dhenry winkler one duri...
7,1,How a bitter old man comes to know the true me...
8,1,"The small historic Canadian town of Elora, wit..."
9,1,Even though i don't care for Henry Winklers a...


In [9]:
%%time
df.reviewText = df.reviewText.apply(str)
# check for class balance
print('class balance:', '\n', df.overall.value_counts())

review count, no duplicates: 1343036
class balance: 
 1    1157699
0     185337
Name: overall, dtype: int64
non-english reviews: 0/1343036
review count, english only: 1343036
Wall time: 14min 31s


In [10]:
%%time
def get_rate(s):
    candidates = re.findall(r'(\d{1,3}[\\|/]{1}\d{1,2})', s)
    rates = []
    for c in candidates:
        try:
            rates.append(eval(c))
        except SyntaxError:
            pass
        except ZeroDivisionError:
            return 0
    return np.median(rates)

# regular expression to split review on sentences
sentence_splitter = re.compile('(?<!\w\.\w.)(?<![A-Z][a-z]\.)(?<![A-Z]\.)(?<=\!|\?|\.)\s')

# lists of positive/negative smiles
positive_smiles = set([
":‑)",":)",":-]",":]",":-3",":3",":->",":>","8-)","8)",":-}",":}",":o)",":c)",":^)","=]","=)",":‑D",":D","8‑D","8D",
"x‑D","xD","X‑D","XD","=D","=3","B^D",":-))",";‑)",";)","*-)","*)",";‑]",";]",";^)",":‑,",";D",":‑P",":P","X‑P","XP",
"x‑p","xp",":‑p",":p",":‑Þ",":Þ",":‑þ",":þ",":‑b",":b","d:","=p",">:P", ":'‑)", ":')",  ":-*", ":*", ":×"
])
negative_smiles = set([
":‑(",":(",":‑c",":c",":‑<",":<",":‑[",":[",":-||",">:[",":{",":@",">:(","D‑':","D:<","D:","D8","D;","D=","DX",":‑/",
":/",":‑.",'>:\\', ">:/", ":\\", "=/" ,"=\\", ":L", "=L",":S",":‑|",":|","|‑O","<:‑|"
])

# pattern to catch SUCH WORDS and ignore SuCH :)
uppercase_pattern = re.compile(r'(\b[0-9]*[A-Z]+[0-9]*[A-Z]{1,}[0-9]*\b)')

# contrast conjugations
contrast_conj = set([
'alternatively','anyway','but','by contrast','differ from','elsewhere','even so','however','in contrast','in fact',
'in other respects','in spite of','in that respect','instead','nevertheless','on the contrary','on the other hand',
'rather','though','whereas','yet'])

# to get review "purity" ~ same sentiment over review (~1) or not (~0)
def purity(sentences):
    polarities = np.array([TextBlob(x).sentiment.polarity for x in sentences])
    return polarities.sum() / np.abs(polarities).sum()

# feature engineering ^-^
def get_custom_features(reviewText):
    # assume text = pd.Series with review text
    print('extracting custom features...')
    tdf = pd.DataFrame()
    tdf['reviewText'] = reviewText 
    tdf['sentences'] = tdf.reviewText.apply(lambda s: re.split(sentence_splitter, s)) # split to sentences
    
    # feature 4 - totally uppercase words (like HOLY JESUS!)
    tdf['upper_word_cnt'] = tdf.reviewText.apply(lambda s: len(re.findall(uppercase_pattern, s)))
    
    # try to extract rating :) like "great film. 9/10" will yield 0.9
    tdf['rating'] = tdf['reviewText'].apply(get_rate).fillna(-1) # feature 5 - rating (if found in review)

    # try to extract smiles and count positive/negative smiles per review (features 6,7)
    tdf['positive_smiles'] = tdf.reviewText.apply(lambda s: len([x for x in s.split() if x in positive_smiles]))
    tdf['negative_smiles'] = tdf.reviewText.apply(lambda s: len([x for x in s.split() if x in negative_smiles]))
     
    return csr_matrix(tdf[tdf.columns[2:]].values) # to get sparse format

Wall time: 0 ns


In [20]:
extraction_list = []

# 1. custom features
extraction_list.append(['custom_features', 
                             FunctionTransformer(func=get_custom_features,
                                                 validate=False,
                                                 accept_sparse=True
                                                )
                            ])
# 2. simple bag-of-words (tf-idf)
extraction_list.append(['tfidf', 
                             TfidfVectorizer(decode_error='ignore',
                                             max_df=0.75, 
                                             min_df=3,
                                             ngram_range=(1, 3),
                                             max_features=None,
                                             stop_words='english'
                                            )
                            ])

extractor = FeatureUnion(extraction_list)

In [21]:
clf = ExtraTreesClassifier(n_estimators=50, 
                             max_leaf_nodes=None, 
                             verbose=1,
                             min_samples_leaf=3, 
                             random_state=1,
                             n_jobs=-1,
                             class_weight='balanced',
                             criterion='entropy'
                            )

In [22]:
# create pipeline, combining steps together                                                                                                                       

model = Pipeline(
    [
        ('feature_extraction', extractor),
        ('clf', clf)
    ])

In [14]:
%%time
#df_small = df.sample(100000)
df_small = pd.concat([df[df.overall == 1].sample(50000, random_state=1),  df[df.overall == 0].sample(50000, random_state=1)])
print('class balance:', '\n', df_small.overall.value_counts())

Wall time: 826 ms


In [23]:
df_small.head()

Unnamed: 0,overall,reviewText
598061,1,Sam Rockwell is amazingly authentic in his por...
400726,1,best movie ever! i watch and rewatch. i have i...
35765,1,Obviously that title refers to my review and n...
308069,1,i saw this movie on tv years ago and almost fo...
180692,1,The Parallax View (1974) came out before Holly...


In [24]:
%%time
# train/test split
X_train, X_test, y_train, y_test = train_test_split(
                                                    df_small.reviewText, 
                                                    df_small.overall, 
                                                    test_size=0.1, 
                                                    random_state=42, 
                                                    stratify=df_small.overall
                                                   )



# fit model
model.fit(X_train, y_train)
print('finally fitted :)')

#check results on validation
print('Accuracy on validation: {}'.format(accuracy_score(model.predict(X_test), y_test)))

extracting custom features...


[Parallel(n_jobs=-1)]: Done  42 tasks      | elapsed:  2.2min
[Parallel(n_jobs=-1)]: Done  50 out of  50 | elapsed:  2.5min finished


finally fitted :)
extracting custom features...


[Parallel(n_jobs=4)]: Done  42 tasks      | elapsed:    2.7s
[Parallel(n_jobs=4)]: Done  50 out of  50 | elapsed:    3.1s finished


Accuracy on validation: 0.8487
Wall time: 12min 5s


In [25]:
%%time
df_test_data = pd.read_csv('test.csv', sep=",", engine='c', usecols=['overall', 'reviewText'])
X, y = df_small.reviewText, df_small.overall  # use binary labels = {0-negative,1-positive}
print('accuracy: {}'.format(accuracy_score(y, model.predict(X))))

extracting custom features...


[Parallel(n_jobs=4)]: Done  42 tasks      | elapsed:   26.4s


accuracy: 0.92706
Wall time: 6min 45s


[Parallel(n_jobs=4)]: Done  50 out of  50 | elapsed:   30.3s finished


In [None]:
import os

filename = 'ExtraTrees_MovTv_5-model.pkl'
try:
    with open(filename, 'wb') as f:
        print('saving model...')
        dill.dump(model, f)
        print('model saved in file {}'.format(os.getcwd() + os.sep + filename))
except:
    print('Errors in model dump...')