In [1]:
# import main libraries/packages
import warnings
warnings.filterwarnings('ignore') # to ignore annoying IPython warnings
import numpy as np
import pandas as pd
import gzip
import re

from sklearn.model_selection import train_test_split
from scipy.sparse import csr_matrix, hstack # to get memory-efficient representation of matrices (sparse format)
from textblob import TextBlob, Word

# preprocessing / feature extraction / feature transformation
from sklearn.preprocessing import FunctionTransformer
from sklearn.feature_extraction.text import TfidfTransformer, TfidfVectorizer, CountVectorizer
from sklearn.decomposition import PCA, SparsePCA

# models
from sklearn.ensemble import ExtraTreesClassifier
from sklearn.pipeline import Pipeline, FeatureUnion

# metrics/validation
from sklearn.model_selection import cross_val_score, train_test_split, StratifiedKFold 
from sklearn.model_selection import GridSearchCV, RandomizedSearchCV
from sklearn.metrics import accuracy_score, f1_score, roc_auc_score, confusion_matrix
from sklearn.metrics import classification_report
# model serialization/deserialization
import dill

In [2]:
# load train dataset
train_df_names = ['train.csv']

df = pd.concat((pd.read_csv(name, engine='c', sep=',', 
                 usecols=['overall', 'reviewText']) for name in train_df_names), ignore_index=True)

df = df.rename(columns={'overall' : 'label', 'reviewText' : 'text'}) #rename var

# load test dataset

test_DM = pd.read_csv("Digital_Music.csv", sep=",", engine='c', usecols=['label', 'text'])
test_VG = pd.read_csv("Video_Games.csv", sep=",", engine='c', usecols=['label', 'text'])
test_OP = pd.read_csv("Office_Products.csv", sep=",", engine='c', usecols=['label', 'text'])

df.text = df.text.apply(str)
test_DM.text = test_DM.text.apply(str)
test_VG.text = test_VG.text.apply(str)
test_OP.text = test_OP.text.apply(str)

In [3]:
print('class balance:', '\n', df.label.value_counts())

class balance: 
 1    1158561
0     185410
Name: label, dtype: int64


In [4]:
%%time
# imbalanced data
#df_small = df.sample(200000, random_state=20)

# balanced data
df_small = pd.concat([df[df.label == 1].sample(100000, random_state=48),  df[df.label == 0].sample(100000, random_state=48)])
print('class balance:', '\n', df_small.label.value_counts())

class balance: 
 1    185410
0    185410
Name: label, dtype: int64
Wall time: 417 ms


In [5]:
%%time
def get_rate(s):
    candidates = re.findall(r'(\d{1,3}[\\|/]{1}\d{1,2})', s)
    rates = []
    for c in candidates:
        try:
            rates.append(eval(c))
        except SyntaxError:
            pass
        except ZeroDivisionError:
            return 0
    return np.median(rates)

# regular expression to split review on sentences
sentence_splitter = re.compile('(?<!\w\.\w.)(?<![A-Z][a-z]\.)(?<![A-Z]\.)(?<=\!|\?|\.)\s')

# lists of positive/negative smiles
positive_smiles = set([
":‑)",":)",":-]",":]",":-3",":3",":->",":>","8-)","8)",":-}",":}",":o)",":c)",":^)","=]","=)",":‑D",":D","8‑D","8D",
"x‑D","xD","X‑D","XD","=D","=3","B^D",":-))",";‑)",";)","*-)","*)",";‑]",";]",";^)",":‑,",";D",":‑P",":P","X‑P","XP",
"x‑p","xp",":‑p",":p",":‑Þ",":Þ",":‑þ",":þ",":‑b",":b","d:","=p",">:P", ":'‑)", ":')",  ":-*", ":*", ":×"
])
negative_smiles = set([
":‑(",":(",":‑c",":c",":‑<",":<",":‑[",":[",":-||",">:[",":{",":@",">:(","D‑':","D:<","D:","D8","D;","D=","DX",":‑/",
":/",":‑.",'>:\\', ">:/", ":\\", "=/" ,"=\\", ":L", "=L",":S",":‑|",":|","|‑O","<:‑|"
])

# pattern to catch SUCH WORDS and ignore SuCH :)
uppercase_pattern = re.compile(r'(\b[0-9]*[A-Z]+[0-9]*[A-Z]{1,}[0-9]*\b)')

# contrast conjugations
contrast_conj = set([
'alternatively','anyway','but','by contrast','differ from','elsewhere','even so','however','in contrast','in fact',
'in other respects','in spite of','in that respect','instead','nevertheless','on the contrary','on the other hand',
'rather','though','whereas','yet'])

# to get review "purity" ~ same sentiment over review (~1) or not (~0)
def purity(sentences):
    polarities = np.array([TextBlob(x).sentiment.polarity for x in sentences])
    return polarities.sum() / np.abs(polarities).sum()

# feature engineering ^-^
def get_custom_features(text):
    # assume text = pd.Series with review text
    print('extracting custom features...')
    tdf = pd.DataFrame()
    tdf['text'] = text 
    
    # feature 4 - totally uppercase words (like HOLY JESUS!)
    tdf['upper_word_cnt'] = tdf.text.apply(lambda s: len(re.findall(uppercase_pattern, s)))
    
    # try to extract rating :) like "great film. 9/10" will yield 0.9
    tdf['rating'] = tdf['text'].apply(get_rate).fillna(-1) # feature 5 - rating (if found in review)

    # try to extract smiles and count positive/negative smiles per review (features 6,7)
    tdf['positive_smiles'] = tdf.text.apply(lambda s: len([x for x in s.split() if x in positive_smiles]))
    tdf['negative_smiles'] = tdf.text.apply(lambda s: len([x for x in s.split() if x in negative_smiles]))
     
    return csr_matrix(tdf[tdf.columns[2:]].values) # to get sparse format

Wall time: 0 ns


In [6]:
extraction_list = []

# 1. custom features
extraction_list.append(['custom_features', 
                             FunctionTransformer(func=get_custom_features,
                                                 validate=False,
                                                 accept_sparse=True
                                                )
                            ])
# 2. simple bag-of-words (tf-idf)
extraction_list.append(['tfidf', 
                             TfidfVectorizer(decode_error='ignore',
                                             max_df=0.75, 
                                             min_df=3,
                                             ngram_range=(1, 3),
                                             max_features=None,
                                             stop_words='english'
                                            )
                            ])

extractor = FeatureUnion(extraction_list)

In [7]:
clf = ExtraTreesClassifier(n_estimators=50, 
                             max_leaf_nodes=None, 
                             verbose=1,
                             min_samples_leaf=3, 
                             random_state=1,
                             n_jobs=-1,
                             class_weight='balanced',
                             criterion='entropy'
                            )

In [8]:
# create pipeline, combining steps together                                                                                                                       

model = Pipeline(
    [
        ('feature_extraction', extractor),
        ('clf', clf)
    ])

In [None]:
%%time
# train/test split
X_train, X_test, y_train, y_test = train_test_split(
                                                    df_small.text, 
                                                    df_small.label, 
                                                    test_size=0.1, 
                                                    random_state=43, 
                                                    stratify=df_small.label
                                                   )



# fit model
model.fit(X_train, y_train)
print('finally fitted :)')

#check results on validation
print('Accuracy on validation: {}'.format(accuracy_score(model.predict(X_test), y_test)))
print(classification_report(y_test, model.predict(X_test)))

extracting custom features...


In [9]:
%%time
#test predict DM
X, y = test_DM.text, test_DM.label
y_pred = model.predict(X)
print(classification_report(y, y_pred)) # смотреть на значение f1-score в строке для класса 0
print('accuracy: {}'.format(accuracy_score(y, y_pred)))

extracting custom features...


[Parallel(n_jobs=4)]: Done  42 tasks      | elapsed:    7.4s


             precision    recall  f1-score   support

          0       0.35      0.82      0.49      5801
          1       0.98      0.83      0.90     52116

avg / total       0.91      0.83      0.86     57917

accuracy: 0.8287722085052748
Wall time: 1min 22s


[Parallel(n_jobs=4)]: Done  50 out of  50 | elapsed:    8.5s finished


In [10]:
%%time
#test predict VG
X, y = test_VG.text, test_VG.label
y_pred = model.predict(X)
print(classification_report(y, y_pred)) # смотреть на значение f1-score в строке для класса 0
print('accuracy: {}'.format(accuracy_score(y, y_pred)))

extracting custom features...


[Parallel(n_jobs=4)]: Done  42 tasks      | elapsed:   28.5s
[Parallel(n_jobs=4)]: Done  50 out of  50 | elapsed:   32.0s finished


             precision    recall  f1-score   support

          0       0.33      0.87      0.47     28516
          1       0.97      0.71      0.82    174989

avg / total       0.88      0.73      0.77    203505

accuracy: 0.7309255300852558
Wall time: 6min 11s


In [11]:
%%time
#test predict OP
X, y = test_OP.text, test_OP.label
y_pred = model.predict(X)
print(classification_report(y, y_pred)) # смотреть на значение f1-score в строке для класса 0
print('accuracy: {}'.format(accuracy_score(y, y_pred)))

extracting custom features...


[Parallel(n_jobs=4)]: Done  42 tasks      | elapsed:    6.3s


             precision    recall  f1-score   support

          0       0.13      0.83      0.23      2856
          1       0.98      0.67      0.79     45342

avg / total       0.93      0.68      0.76     48198

accuracy: 0.6754014689406199
Wall time: 1min 23s


[Parallel(n_jobs=4)]: Done  50 out of  50 | elapsed:    7.3s finished
