### Minimal library import

In [5]:
# import main libraries/packages
import warnings
warnings.filterwarnings('ignore') # to ignore annoying IPython warnings

import pandas as pd
import numpy as np
import re

from textblob import TextBlob, Word # pip install textblob / conda install textblob

from scipy.sparse import csr_matrix
from sklearn.metrics import accuracy_score

import dill

In [6]:
# The serialized model is not shipped with the notebook. Download the .rar archive from
# https://drive.google.com/open?id=0B816yAqrVM5vejJNNWtDMWFSLUE
# and unpack it so that 'XGB-model.pkl' can be opened via the relative path below.
#
# SECURITY NOTE: dill (like pickle) can execute arbitrary code while deserializing.
# Only load this file if you trust where it came from.

with open('XGB-model.pkl', 'rb') as f:
    model = dill.load(f) # deserialize the model object; raises if the file is missing or unreadable

### Supportive functions/objects for feature extraction

In [9]:
# The helpers below exist for the model's internal use (feature extraction); you can skip the details.

def get_rate(s):
    """Return the median of the "x/y"-style numeric ratings found in a text.

    Every substring made of 1-3 digits, one separator character and 1-2 digits
    is treated as a candidate rating, evaluated as a Python expression, and the
    median of all successfully evaluated candidates is returned.

    Parameters
    ----------
    s : str
        Text to scan, e.g. ``"it is total crap. 2/10"``.

    Returns
    -------
    numpy.float64 or int
        * the median of the evaluated candidates (``0.2`` for the example above);
        * NaN when the text contains no candidate that could be evaluated;
        * the integer ``0`` as soon as any candidate has a zero denominator
          (the remaining candidates are then ignored).

    Notes
    -----
    The separator class accepts a slash, a pipe and a backslash. Only the slash
    is a division: a pipe is evaluated as bitwise OR (``"8|10"`` gives 10), and
    a backslash candidate is not a valid expression, so it is skipped. These
    quirks are kept as they are because the values are consumed as model
    features.
    """
    candidates = re.findall(r'(\d{1,3}[\\|/]{1}\d{1,2})', s)
    rates = []
    for c in candidates:
        try:
            # NOTE(review): eval() on text is normally unsafe; here the regex
            # above limits every candidate to digits plus a single separator
            # character, so nothing else can reach eval().
            rates.append(eval(c))
        except SyntaxError:
            # Candidate is not a valid Python expression: ignore it.
            continue
        except ZeroDivisionError:
            return 0
    if not rates:
        # np.median([]) would also give NaN, but only via a RuntimeWarning that
        # this notebook silences globally; return the same value explicitly.
        return np.float64(np.nan)
    return np.median(rates)

# Regular expression used to split a review into sentences. It matches a single
# whitespace character that directly follows '!', '?' or '.', unless the text
# just before it looks like an abbreviation:
#   (?<!\w\.\w.)        word char, dot, word char, any char  (e.g. "e.g.")
#   (?<![A-Z][a-z]\.)   capital, lowercase letter, dot       (e.g. "Mr.")
#   (?<![A-Z]\.)        capital, dot                         (e.g. an initial)
# The pattern is a raw string so the backslash escapes reach the regex engine
# unchanged and do not trigger invalid-escape warnings in the Python parser.
sentence_splitter = re.compile(r'(?<!\w\.\w.)(?<![A-Z][a-z]\.)(?<![A-Z]\.)(?<=\!|\?|\.)\s')

# Emoticon lexicons: one set of positive and one set of negative text smileys.
# NOTE(review): some entries appear to contain a non-ASCII hyphen look-alike
# rather than '-'; keep every literal exactly as written.
positive_smiles = {
":‑)",":)",":-]",":]",":-3",":3",":->",":>","8-)","8)",":-}",":}",":o)",":c)",":^)","=]","=)",":‑D",":D","8‑D","8D",
"x‑D","xD","X‑D","XD","=D","=3","B^D",":-))",";‑)",";)","*-)","*)",";‑]",";]",";^)",":‑,",";D",":‑P",":P","X‑P","XP",
"x‑p","xp",":‑p",":p",":‑Þ",":Þ",":‑þ",":þ",":‑b",":b","d:","=p",">:P", ":'‑)", ":')",  ":-*", ":*", ":×"
}
negative_smiles = {
":‑(",":(",":‑c",":c",":‑<",":<",":‑[",":[",":-||",">:[",":{",":@",">:(","D‑':","D:<","D:","D8","D;","D=","DX",":‑/",
":/",":‑.",'>:\\', ">:/", ":\\", "=/" ,"=\\", ":L", "=L",":S",":‑|",":|","|‑O","<:‑|"
}

# Matches whole tokens (delimited by \b on both sides) that contain at least two
# uppercase ASCII letters: optional digits, one or more capitals, optional
# digits, one or more capitals, optional digits. For example "NASA" and "A1B"
# match, while a token with a single capital such as "I" or "3D" does not.
uppercase_pattern = re.compile(r'(\b[0-9]*[A-Z]+[0-9]*[A-Z]{1,}[0-9]*\b)')

# Set of contrast markers: conjunctions and connective phrases that signal
# that the following text contrasts with what came before.
contrast_conj = {
    'alternatively', 'anyway', 'but', 'by contrast', 'differ from',
    'elsewhere', 'even so', 'however', 'in contrast', 'in fact',
    'in other respects', 'in spite of', 'in that respect', 'instead',
    'nevertheless', 'on the contrary', 'on the other hand',
    'rather', 'though', 'whereas', 'yet',
}

# to calculate "purity" of the review
def purity(sentences):
    """Return how one-sided the sentiment of a review is.

    The TextBlob polarity of every sentence is computed, and the signed sum of
    the polarities is divided by the sum of their absolute values.

    Parameters
    ----------
    sentences : iterable of str
        The sentences of one review.

    Returns
    -------
    numpy.float64
        The ratio described above: +1 when every non-zero polarity is positive,
        -1 when every non-zero polarity is negative, in between when signs are
        mixed. NaN when the absolute polarities sum to zero (no sentences, or
        only zero-polarity sentences).
    """
    polarities = np.array([TextBlob(x).sentiment.polarity for x in sentences])
    total_magnitude = np.abs(polarities).sum()
    if total_magnitude == 0:
        # 0/0 would also produce NaN, but only through a RuntimeWarning that
        # this notebook silences globally; return the same value explicitly.
        return np.float64(np.nan)
    return polarities.sum() / total_magnitude

### Example of usage

In [12]:
# Smoke test: four hand-written reviews paired with their expected labels.
X = pd.Series([
    'nice film!',
    'it is total crap. 2/10',
    '8/10!',
    "that's why i don't like vanilla films",
])

# Expected label for each review above, in the same order.
y = pd.Series([1, 0, 1, 0])

# Compare the model's predictions for X with the expected labels.
print('accuracy: {}'.format(
    accuracy_score(y, model.predict(X))
))

accuracy: 1.0


## Test your dataset here

In [None]:
# df = pd.read_csv('your_dataset.csv', sep="your_separator", engine='c', usecols=['label', 'text'])
# X, y = df.text, df.label  # labels must be binary: 0 = negative, 1 = positive
# print('accuracy: {}'.format(accuracy_score(y, model.predict(X))))