In [1]:
# import main libraries/packages
import warnings
warnings.filterwarnings('ignore') # to ignore annoying IPython warnings

import pandas as pd
import numpy as np
import re # for some regexp magic ^-^

# for the future use

from scipy.sparse import csr_matrix, hstack # to get memory-efficient representation of matrices (sparse format)
from textblob import TextBlob, Word # pip install textblob / conda install textblob

# preprocessing / feature extraction / feature transformation
from sklearn.preprocessing import FunctionTransformer
from sklearn.feature_extraction.text import TfidfTransformer, TfidfVectorizer, CountVectorizer
from sklearn.decomposition import PCA, SparsePCA

# models
from sklearn.ensemble import RandomForestClassifier, GradientBoostingClassifier, ExtraTreesClassifier, VotingClassifier
from sklearn.svm import LinearSVC, SVC # for the future blending/stacking, also - as baselines to beat
from sklearn.linear_model import LogisticRegression # for the future blending/stacking, also - as baselines to beat
#from xgboost import XGBClassifier  # uncomment if you have it installed
# how to install xgboost on windows - 
# https://www.ibm.com/developerworks/community/blogs/jfp/entry/Installing_XGBoost_For_Anaconda_on_Windows

# model/feature aggregation in Pipelines
from sklearn.pipeline import Pipeline, FeatureUnion

# metrics/validation
from sklearn.model_selection import cross_val_score, train_test_split, StratifiedKFold 
from sklearn.model_selection import GridSearchCV, RandomizedSearchCV
from sklearn.metrics import accuracy_score, f1_score, roc_auc_score, confusion_matrix

# model serialization/deserialization
import dill

In [93]:
%%time
# import review dataset for train (csv)

# train datasets (RottenTomatoes + IMDB_small)
train_df_names = ['imdb_small.csv', 'reviews_rt_all.csv']

df = pd.concat((pd.read_csv(name, engine='c', sep='|', 
                 usecols=['label', 'text']) for name in train_df_names), ignore_index=True)
print('review count: {}'.format(len(df)))

from bs4 import BeautifulSoup
df.text = df.text.apply(lambda s: BeautifulSoup(s, 'lxml').text) #

# drop duplicates, if any
df.drop_duplicates(subset=['text'], inplace=True)
print('review count, no duplicates: {}'.format(len(df)))

# check for class balance
print('class balance:', '\n', df.label.value_counts())

# check for language that differs from English (rough enough, we'll cut several "almost english" reviews)
def is_english(s):
    words = s.split()
    non_english = 0
    for w in words:
        try:
            w.encode('ascii')
        except UnicodeEncodeError:
            non_english += 1
    return True if non_english*1.0/len(words) <= 0.05 else False

df_nonenglish = df[~df['text'].apply(is_english)]
print('non-english reviews: {}/{}'.format(len(df[~df['text'].apply(is_english)]), len(df)))

# let's get rid of them
df = df[df['text'].apply(is_english)]
print('review count, english only: {}'.format(len(df)))

review count: 152610
review count, no duplicates: 151871
class balance: 
 1    89349
0    62522
Name: label, dtype: int64
non-english reviews: 1716/151871
review count, english only: 150155
Wall time: 1min 33s


In [94]:
%%time
def get_rate(s):
    candidates = re.findall(r'(\d{1,3}[\\|/]{1}\d{1,2})', s)
    rates = []
    for c in candidates:
        try:
            rates.append(eval(c))
        except SyntaxError:
            pass
        except ZeroDivisionError:
            return 0
    return np.median(rates)

# regular expression to split review on sentences
sentence_splitter = re.compile('(?<!\w\.\w.)(?<![A-Z][a-z]\.)(?<![A-Z]\.)(?<=\!|\?|\.)\s')

# lists of positive/negative smiles
positive_smiles = set([
":‑)",":)",":-]",":]",":-3",":3",":->",":>","8-)","8)",":-}",":}",":o)",":c)",":^)","=]","=)",":‑D",":D","8‑D","8D",
"x‑D","xD","X‑D","XD","=D","=3","B^D",":-))",";‑)",";)","*-)","*)",";‑]",";]",";^)",":‑,",";D",":‑P",":P","X‑P","XP",
"x‑p","xp",":‑p",":p",":‑Þ",":Þ",":‑þ",":þ",":‑b",":b","d:","=p",">:P", ":'‑)", ":')",  ":-*", ":*", ":×"
])
negative_smiles = set([
":‑(",":(",":‑c",":c",":‑<",":<",":‑[",":[",":-||",">:[",":{",":@",">:(","D‑':","D:<","D:","D8","D;","D=","DX",":‑/",
":/",":‑.",'>:\\', ">:/", ":\\", "=/" ,"=\\", ":L", "=L",":S",":‑|",":|","|‑O","<:‑|"
])

# pattern to catch SUCH WORDS and ignore SuCH :)
uppercase_pattern = re.compile(r'(\b[0-9]*[A-Z]+[0-9]*[A-Z]{1,}[0-9]*\b)')

# contrast conjugations
contrast_conj = set([
'alternatively','anyway','but','by contrast','differ from','elsewhere','even so','however','in contrast','in fact',
'in other respects','in spite of','in that respect','instead','nevertheless','on the contrary','on the other hand',
'rather','though','whereas','yet'])

# to get review "purity" ~ same sentiment over review (~1) or not (~0)
def purity(sentences):
    polarities = np.array([TextBlob(x).sentiment.polarity for x in sentences])
    return polarities.sum() / np.abs(polarities).sum()

# feature engineering ^-^
def get_custom_features(text):
    # assume text = pd.Series with review text
    print('extracting custom features...')
    tdf = pd.DataFrame()
    tdf['text'] = text 
    tdf['sentences'] = tdf.text.apply(lambda s: re.split(sentence_splitter, s)) # split to sentences
    
    #tdf['sentence_cnt'] = tdf['sentences'].apply(len) # feature 1 - (sentence count)
    #tdf['exclamation_cnt'] = tdf.text.str.count('\!') # feature 2 - (exclamation mark count)
    #tdf['question_cnt'] = tdf.text.str.count('\?') # feature 3 - (question mark count)
    
    # feature 4 - totally uppercase words (like HOLY JESUS!)
    tdf['upper_word_cnt'] = tdf.text.apply(lambda s: len(re.findall(uppercase_pattern, s)))
    
    # try to extract rating :) like "great film. 9/10" will yield 0.9
    tdf['rating'] = tdf['text'].apply(get_rate).fillna(-1) # feature 5 - rating (if found in review)

    # try to extract smiles and count positive/negative smiles per review (features 6,7)
    tdf['positive_smiles'] = tdf.text.apply(lambda s: len([x for x in s.split() if x in positive_smiles]))
    tdf['negative_smiles'] = tdf.text.apply(lambda s: len([x for x in s.split() if x in negative_smiles]))
    
    # not so informative, but still
    #tdf['contrast_conj_cnt'] = tdf.text.apply(lambda s: len([c for c in contrast_conj if c in s]))
    
    # feature 8 (polarity of 1st sentence)
    tdf['polarity_1st_sent'] = tdf.sentences.apply(lambda s: TextBlob(s[0]).sentiment.polarity)
    # feature 9 (subjectivity of 1st sentence)
    #tdf['subjectivity_1st_sent'] = tdf.sentences.apply(lambda s: TextBlob(s[0]).sentiment.subjectivity)
    # feature 10 (polarity of last sentence)
    tdf['polarity_last_sent'] = tdf.sentences.apply(lambda s: TextBlob(s[-1]).sentiment.polarity)
    # feature 11 (subjectivity of last sentence)
    #tdf['subjectivity_last_sent'] = tdf.sentences.apply(lambda s: TextBlob(s[-1]).sentiment.subjectivity)
    # feature 12 (subjectivity of review itself)
    #tdf['polarity'] = tdf.text.apply(lambda s: TextBlob(s[-1]).sentiment.polarity)
    # feature 13 ("purity" of review, |sum(sentence polarity) / sum(|sentence polarity|))|, ~ 1 is better, ~ 0 -> mixed
    #tdf['purity'] = tdf.sentences.apply(purity)
    #tdf['purity'].fillna(0, inplace=True)
    
    return csr_matrix(tdf[tdf.columns[2:]].values) # to get sparse format

Wall time: 0 ns


In [117]:
# create a list of data extractors/transformers (format = [('ft1_name', ft1_object), ('ft2_name', ft2_object), ...])

extraction_list = []

# 1. custom features
extraction_list.append(['custom_features', 
                             FunctionTransformer(func=get_custom_features,
                                                 validate=False,
                                                 accept_sparse=True
                                                )
                            ])
# 2. simple bag-of-words (tf-idf)
extraction_list.append(['tfidf', 
                             TfidfVectorizer(decode_error='ignore',
                                             max_df=0.3, 
                                             min_df=3,
                                             ngram_range=(1, 3),
                                             max_features=None,
                                             stop_words='english'
                                            )
                            ])

extractor = FeatureUnion(extraction_list)

In [118]:
clf = ExtraTreesClassifier(n_estimators=50, 
                             max_leaf_nodes=3,
                             #max_depth=7,
                             verbose=1,
                             min_samples_leaf=3, 
                             random_state=1,
                             n_jobs=-1,
                             class_weight='balanced',
                             criterion='entropy'
                            )

In [119]:
# create pipeline, combining steps together                                                                                                                       

model = Pipeline(
    [
        ('feature_extraction', extractor),
        ('clf', clf)
    ])

In [120]:
%%time
# train/test split
X_train, X_test, y_train, y_test = train_test_split(
                                                    df.text, 
                                                    df.label, 
                                                    test_size=0.2, 
                                                    random_state=42, 
                                                    stratify=df.label
                                                   )



# fit model
model.fit(X_train, y_train)
print('finally fitted :)')

#check results on validation
print('Accuracy on validation: {}'.format(accuracy_score(model.predict(X_test), y_test)))

extracting custom features...


[Parallel(n_jobs=-1)]: Done  42 tasks      | elapsed: 47.4min
[Parallel(n_jobs=-1)]: Done  50 out of  50 | elapsed: 53.6min finished


finally fitted :)
extracting custom features...


[Parallel(n_jobs=4)]: Done  42 tasks      | elapsed:  1.3min
[Parallel(n_jobs=4)]: Done  50 out of  50 | elapsed:  1.3min finished


Accuracy on validation: 0.7997735673137758
Wall time: 1h 4min 11s


In [121]:
%%time
df_test = pd.read_csv('test.csv', sep="|", engine='c', usecols=['label', 'text'])
X, y = df_test.text, df_test.label  # use binary labels = {0-negative,1-positive}
print('accuracy: {}'.format(accuracy_score(y, model.predict(X))))

extracting custom features...


[Parallel(n_jobs=4)]: Done  42 tasks      | elapsed:    6.0s
[Parallel(n_jobs=4)]: Done  50 out of  50 | elapsed:    6.4s finished


accuracy: 0.7859287054409005
Wall time: 23.1 s


In [31]:
import os

filename = 'ExtraTrees-model.pkl'
try:
    with open(filename, 'wb') as f:
        print('saving model...')
        dill.dump(model, f)
        print('model saved in file {}'.format(os.getcwd() + os.sep + filename))
except:
    print('Errors in model dump...')

saving model...
model saved in file C:\Users\Анастасия\ExtraTrees-model.pkl
