In [24]:
import re
import time

import nltk
import numpy as np
import pandas as pd
from langdetect import detect
from nltk.corpus import stopwords
from nltk.corpus import wordnet
from nltk.stem.porter import PorterStemmer
from scipy.sparse import csr_matrix
from sklearn import metrics
from sklearn.externals import joblib
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.linear_model import LogisticRegression
from sklearn.linear_model import LogisticRegressionCV
from sklearn.linear_model import PassiveAggressiveClassifier
from sklearn.model_selection import GridSearchCV
from sklearn.model_selection import train_test_split
from sklearn.pipeline import Pipeline, FeatureUnion
from sklearn.preprocessing import FunctionTransformer

In [4]:
# Load the RT and IMDB review tables; both CSV files use '|' as the field separator.
rt, imdb = (
    pd.read_csv(csv_path, sep='|')
    for csv_path in ('reviews_rt_all.csv', 'imdb_small.csv')
)

In [5]:
# Split the RT and IMDB datasets separately (80/20, stratified on the label,
# fixed random_state) so both sources are represented in train and in test.
split_options = dict(test_size=0.2, random_state=42)
X_train_rt, X_test_rt, y_train_rt, y_test_rt = train_test_split(
    rt['text'], rt['label'], stratify=rt['label'], **split_options
)
X_train_imdb, X_test_imdb, y_train_imdb, y_test_imdb = train_test_split(
    imdb['text'], imdb['label'], stratify=imdb['label'], **split_options
)

# Then stack the per-source pieces, RT rows first.
X_train, X_test, y_train, y_test = (
    pd.concat(pieces)
    for pieces in (
        [X_train_rt, X_train_imdb],
        [X_test_rt, X_test_imdb],
        [y_train_rt, y_train_imdb],
        [y_test_rt, y_test_imdb],
    )
)

In [4]:
# Rebuild the training data from the *complete* RT and IMDB tables (no hold-out).
# NOTE(review): this rebinds X_train / y_train. If the train/test split cell was
# run before this one, the rows in X_test / y_test are now also part of the
# training data -- confirm that is intended before scoring on X_test.
X_train, y_train = (
    pd.concat([rt[column], imdb[column]]) for column in ('text', 'label')
)
# Disabled experiments: keep only the trailing words of each review.
# X_train = X_train.str.split().apply(lambda x:  ' '.join(x for x in x[-22:]))
# X_train = X_train.map(lambda t: take_n_words(t, naive=False, n=40))

0    To an entire generation of filmgoers, it just ...
1    Pixar classic is one of the best kids' movies ...
2    Apesar de representar um imenso avanço tecnoló...
3    When Woody perks up in the opening scene, it's...
4    Introduced not one but two indelible character...
Name: text, dtype: object


In [19]:
# Stop-word list. Currently empty; it is not passed to the CountVectorizer
# built in the visible cells.
# Previously tried: ['by','does', 'was', 'were', 'the', 'of', 'end', 'and', 'is']
STOPWORDS = list()

## FEATURE ENGINEERING

In [64]:
import re

def get_rate(s):
    """Extract a numeric rating such as "9/10" from a review.

    Every occurrence of ``<1-3 digits><sep><1-2 digits>`` is treated as a
    fraction, where ``<sep>`` is ``/``, ``\\`` or ``|`` (the separators the
    original pattern accepted). The fractions are computed with plain integer
    parsing instead of ``eval``: ``eval`` interpreted ``9|10`` as a bitwise OR
    (11), rejected ``9\\10`` and zero-padded numbers like ``08/10`` with a
    SyntaxError, and executes text taken from the input.

    Parameters
    ----------
    s : str
        Review text.

    Returns
    -------
    float or int
        Median of all fractions found (e.g. 0.9 for "9/10");
        ``0`` as soon as a candidate with a zero denominator is met;
        ``numpy.nan`` when the text contains no candidate at all.
    """
    candidates = re.findall(r'(\d{1,3})[\\|/](\d{1,2})', s)
    rates = []
    for numerator, denominator in candidates:
        if int(denominator) == 0:
            # Same outcome the original produced on ZeroDivisionError.
            return 0
        rates.append(int(numerator) / int(denominator))
    if not rates:
        # Explicit NaN: np.median([]) also yields NaN but emits a RuntimeWarning.
        return np.nan
    return np.median(rates)

# Regular expression used to split a review into sentences: it matches a single
# whitespace character that follows '.', '!' or '?', unless the text just before
# it looks like a dotted abbreviation ("e.g."), a capitalised two-letter
# abbreviation ("Mr.") or a single capital initial ("J.").
# Written as a raw string so the regex escapes (\w, \., \s, ...) are not parsed
# as (invalid) Python string escapes; the compiled pattern is unchanged.
sentence_splitter = re.compile(r'(?<!\w\.\w.)(?<![A-Z][a-z]\.)(?<![A-Z]\.)(?<=\!|\?|\.)\s')

# Sets of positive / negative smiles (emoticons).
# Many entries below are spelled with U+2011 (non-breaking hyphen) rather than
# the ASCII '-', so emoticons typed on a normal keyboard such as ":-)" or ":-("
# were never matched. Each set therefore also receives an ASCII-hyphen copy of
# every such entry; all original entries are kept.
_NON_BREAKING_HYPHEN = '\u2011'


def _with_ascii_hyphen_variants(smiles):
    """Return `smiles` as a set plus a copy of each entry with U+2011 replaced by '-'."""
    return set(smiles) | {s.replace(_NON_BREAKING_HYPHEN, '-') for s in smiles}


positive_smiles = _with_ascii_hyphen_variants([
":‑)",":)",":-]",":]",":-3",":3",":->",":>","8-)","8)",":-}",":}",":o)",":c)",":^)","=]","=)",":‑D",":D","8‑D","8D",
"x‑D","xD","X‑D","XD","=D","=3","B^D",":-))",";‑)",";)","*-)","*)",";‑]",";]",";^)",":‑,",";D",":‑P",":P","X‑P","XP",
"x‑p","xp",":‑p",":p",":‑Þ",":Þ",":‑þ",":þ",":‑b",":b","d:","=p",">:P", ":'‑)", ":')",  ":-*", ":*", ":×"
])
negative_smiles = _with_ascii_hyphen_variants([
":‑(",":(",":‑c",":c",":‑<",":<",":‑[",":[",":-||",">:[",":{",":@",">:(","D‑':","D:<","D:","D8","D;","D=","DX",":‑/",
":/",":‑.",'>:\\', ">:/", ":\\", "=/" ,"=\\", ":L", "=L",":S",":‑|",":|","|‑O","<:‑|"
])

# Matches whole tokens written in capitals: at least two capital letters, with
# optional digits before, between and after them. Catches SUCH WORDS, ignores SuCH.
uppercase_pattern = re.compile(
    r'(\b[0-9]*[A-Z]+[0-9]*[A-Z]{1,}[0-9]*\b)'
)

# Contrast conjunctions / connectives (lower-case; several are multi-word phrases).
contrast_conj = {
    'alternatively',
    'anyway',
    'but',
    'by contrast',
    'differ from',
    'elsewhere',
    'even so',
    'however',
    'in contrast',
    'in fact',
    'in other respects',
    'in spite of',
    'in that respect',
    'instead',
    'nevertheless',
    'on the contrary',
    'on the other hand',
    'rather',
    'though',
    'whereas',
    'yet',
}

# Review "purity": how consistently the sentences of a review share one sentiment.
def purity(sentences):
    """Return sum(polarity) / sum(|polarity|) over the sentences of a review.

    Each sentence is scored with ``TextBlob(x).sentiment.polarity``.
    The ratio is *signed*: its magnitude is 1 when every non-neutral sentence
    has the same polarity sign and approaches 0 when positive and negative
    sentences cancel out; the sign is that of the summed polarity. Wrap the
    result in ``abs()`` if only the magnitude is wanted.

    When there are no sentences, or every sentence has polarity 0, the
    denominator is 0; ``0.0`` is returned in that case instead of computing
    0/0 (which produced NaN together with a RuntimeWarning).

    NOTE(review): ``TextBlob`` is not imported in the visible cells -- it must
    be imported (``from textblob import TextBlob``) before this is called with
    a non-empty list.

    Parameters
    ----------
    sentences : iterable of str
        Sentences of one review.

    Returns
    -------
    float
    """
    polarities = np.array([TextBlob(x).sentiment.polarity for x in sentences])
    total_magnitude = np.abs(polarities).sum()
    if total_magnitude == 0:
        return 0.0
    return polarities.sum() / total_magnitude

# feature engineering ^-^
def get_custom_features(text):
    """Build the hand-crafted feature matrix for a collection of reviews.

    Parameters
    ----------
    text : pandas.Series of str (assumed)
        Raw review texts; every item must support ``str.split`` and regex
        splitting.

    Returns
    -------
    scipy.sparse.csr_matrix
        One row per review and one column per active feature, in this order:
        ``sentence_cnt`` (number of sentences found by ``sentence_splitter``),
        ``positive_smiles`` and ``negative_smiles`` (number of
        whitespace-separated tokens found in the respective smile set).
    """
    def count_tokens_from(vocabulary):
        # Counter of whitespace-separated tokens that belong to `vocabulary`.
        return lambda review: sum(1 for token in review.split() if token in vocabulary)

    frame = pd.DataFrame()
    frame['text'] = text
    # Helper column: the review split into sentences.
    frame['sentences'] = frame['text'].apply(lambda review: sentence_splitter.split(review))

    # feature 1 - sentence count
    frame['sentence_cnt'] = frame['sentences'].apply(len)
    # features 6, 7 - positive / negative smiles per review
    frame['positive_smiles'] = frame['text'].apply(count_tokens_from(positive_smiles))
    frame['negative_smiles'] = frame['text'].apply(count_tokens_from(negative_smiles))

    # Disabled features, kept for reference (original column names and numbering):
    #   feature 2  - tdf['exclamation_cnt'] = tdf.text.str.count('\!')
    #   feature 3  - tdf['question_cnt'] = tdf.text.str.count('\?')
    #   feature 4  - tdf['upper_word_cnt'] = tdf.text.apply(lambda s: len(re.findall(uppercase_pattern, s)))
    #   feature 5  - tdf['rating'] = tdf['text'].apply(get_rate).fillna(-1)   # "great film. 9/10" -> 0.9
    #   (no number) tdf['contrast_conj_cnt'] = tdf.text.apply(lambda s: len([c for c in contrast_conj if c in s]))
    #   feature 8  - tdf['polarity_1st_sent'] = tdf.sentences.apply(lambda s: TextBlob(s[0]).sentiment.polarity)
    #   feature 9  - tdf['subjectivity_1st_sent'] = tdf.sentences.apply(lambda s: TextBlob(s[0]).sentiment.subjectivity)
    #   feature 10 - tdf['polarity_last_sent'] = tdf.sentences.apply(lambda s: TextBlob(s[-1]).sentiment.polarity)
    #   feature 11 - tdf['subjectivity_last_sent'] = tdf.sentences.apply(lambda s: TextBlob(s[-1]).sentiment.subjectivity)
    #   feature 12 - tdf['polarity'] = tdf.text.apply(lambda s: TextBlob(s[-1]).sentiment.polarity)
    #                NOTE(review): here s is the whole text, so s[-1] is its last character -- probably meant TextBlob(s).
    #   feature 13 - tdf['purity'] = tdf.sentences.apply(purity); tdf['purity'].fillna(0, inplace=True)

    # The first two columns ('text', 'sentences') are helpers, everything after
    # them is a feature; return the features in sparse format.
    feature_columns = frame.columns[2:]
    return csr_matrix(frame[feature_columns].values)

In [65]:
# Feature-extraction steps whose outputs FeatureUnion stacks side by side.
# Each step is a (name, transformer) tuple, the form FeatureUnion documents.
extraction_list = [
    # 1. custom hand-crafted features computed by get_custom_features
    ('cf', FunctionTransformer(func=get_custom_features,
                               validate=False,
                               accept_sparse=True)),
    # 2. simple bag-of-words: raw token counts (CountVectorizer does no tf-idf weighting)
    ('cv', CountVectorizer()),
]

extractor = FeatureUnion(extraction_list)

In [76]:
# Hyper-parameter grid for GridSearchCV. It is empty, so the search evaluates a
# single candidate: the pipeline with its default settings.
params = dict()

In [77]:
# Fit the feature-extraction + logistic-regression pipeline with a 5-fold
# cross-validated grid search over `params`, printing wall-clock start/end times.
# (`time` and `csr_matrix` are imported in the notebook's top import cell.)
localtime = time.asctime()  # no argument -> current local time
print("Start time :", localtime)
classifier = LogisticRegression(solver='liblinear')
pipeline = Pipeline([('extr', extractor), ('clsf', classifier)])
grid_search = GridSearchCV(pipeline, params, cv=5, verbose=2, n_jobs=-1)
grid_search.fit(X_train, y_train)
localtime = time.asctime()
print("End time :", localtime)

# Candidates ordered by mean cross-validation score (ascending, best last).
# `cv_results_` replaces the deprecated `grid_scores_` attribute.
cv_summary = pd.DataFrame(grid_search.cv_results_)
cv_summary.sort_values('mean_test_score')[['params', 'mean_test_score', 'std_test_score']]


Local current time : Fri Mar 10 19:45:33 2017
Fitting 5 folds for each of 1 candidates, totalling 5 fits
[CV]  ................................................................


  **self._backend_args)


[CV]  ................................................................


  **self._backend_args)


[CV]  ................................................................


  **self._backend_args)


[CV]  ................................................................


  **self._backend_args)


[CV]  ................................................................


  **self._backend_args)
Process ForkPoolWorker-150:
Process ForkPoolWorker-151:
Process ForkPoolWorker-152:
Traceback (most recent call last):
Traceback (most recent call last):
Traceback (most recent call last):
  File "/Users/alex/anaconda3/envs/testenv/lib/python3.5/multiprocessing/process.py", line 249, in _bootstrap
    self.run()
  File "/Users/alex/anaconda3/envs/testenv/lib/python3.5/multiprocessing/process.py", line 249, in _bootstrap
    self.run()
  File "/Users/alex/anaconda3/envs/testenv/lib/python3.5/multiprocessing/process.py", line 249, in _bootstrap
    self.run()
  File "/Users/alex/anaconda3/envs/testenv/lib/python3.5/multiprocessing/process.py", line 93, in run
    self._target(*self._args, **self._kwargs)
  File "/Users/alex/anaconda3/envs/testenv/lib/python3.5/multiprocessing/process.py", line 93, in run
    self._target(*self._args, **self._kwargs)
  File "/Users/alex/anaconda3/envs/testenv/lib/python3.5/multiprocessing/process.py", line 93, in run
    self._targ

KeyboardInterrupt: 