## read file and convert to dataframe

In [32]:
from bs4 import BeautifulSoup
import re

def strip_html(text):
    soup = BeautifulSoup(text, "html.parser")
    return soup.get_text()

def remove_between_square_brackets(text):
    return re.sub('\[[^]]*\]', '', text)

def denoise_text(text):
    text = strip_html(text)
    text = remove_between_square_brackets(text)
    return text
def spit_sample(text):
    text = denoise_text(text)
    text = re.split(r'\n', text)
    return [re.split(r'\t', item) for item in text]

In [35]:
html = open("data/labeledTrainData.tsv").read()
data = spit_sample(html)

In [38]:
columns = data[0]

In [39]:
columns

['id', 'sentiment', 'review']

In [77]:
import pandas as pd
df = pd.DataFrame(data[1:], columns =columns)

In [78]:
df.head()

Unnamed: 0,id,sentiment,review
0,"""5814_8""",1,"""With all this stuff going down at the moment ..."
1,"""2381_9""",1,"""\""The Classic War of the Worlds\"" by Timothy ..."
2,"""7759_3""",0,"""The film starts with a manager (Nicholas Bell..."
3,"""3630_4""",0,"""It must be assumed that those who praised thi..."
4,"""9495_8""",1,"""Superbly trashy and wondrously unpretentious ..."


## preprocessing phase

In [47]:
import nltk
nltk.download('stopwords')

[nltk_data] Downloading package stopwords to
[nltk_data]     /Users/vinhtrongtruong/nltk_data...
[nltk_data]   Unzipping corpora/stopwords.zip.


True

In [76]:
import string
from nltk.corpus import stopwords 
from nltk.tokenize import word_tokenize 

stop_words = set(stopwords.words('english'))

def convert_to_lowercase(sent):
    return sent.lower()

def remove_number(sent):
    return re.sub(r'\d+', '', sent)

def remove_punctuation(sent):
    return sent.translate(str.maketrans('', '', string.punctuation))

def tokenizer(sent):
    return word_tokenize(sent)

def remove_stopwords(sent):
    word_tokens = tokenizer(sent)
    return [w for w in word_tokens if not w in stop_words]

def nomarlize(sent):
    sent = convert_to_lowercase(sent)
    sent = remove_number(sent)
    sent = remove_punctuation(sent)
    sent = remove_stopwords(sent)
    return ' '.join(str(e) for e in sent)

In [79]:
df = df.dropna()

In [80]:
df['review'] = df['review'].apply(nomarlize)

In [81]:
df.head()

Unnamed: 0,id,sentiment,review
0,"""5814_8""",1,stuff going moment mj ive started listening mu...
1,"""2381_9""",1,classic war worlds timothy hines entertaining ...
2,"""7759_3""",0,film starts manager nicholas bell giving welco...
3,"""3630_4""",0,must assumed praised film greatest filmed oper...
4,"""9495_8""",1,superbly trashy wondrously unpretentious explo...


## modeling phase

In [82]:
from sklearn import metrics
from sklearn.pipeline import Pipeline
from sklearn.feature_extraction.text import CountVectorizer,TfidfTransformer
from sklearn.model_selection import train_test_split
from sklearn.model_selection import cross_val_score
from sklearn.svm import LinearSVC

In [83]:
classifier = LinearSVC(fit_intercept = True,multi_class='crammer_singer', C=1)

steps = []
steps.append(('CountVectorizer', CountVectorizer(ngram_range=(1,5),max_df=0.5, min_df=5)))
steps.append(('tfidf', TfidfTransformer(use_idf=False, sublinear_tf = True,norm='l2',smooth_idf=True)))
steps.append(('classifier', classifier))
clf = Pipeline(steps)

In [85]:
X = df.review
y = df.sentiment

In [86]:
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3,random_state=42)

In [87]:
clf.fit(X_train, y_train)

Pipeline(memory=None,
     steps=[('CountVectorizer', CountVectorizer(analyzer='word', binary=False, decode_error='strict',
        dtype=<class 'numpy.int64'>, encoding='utf-8', input='content',
        lowercase=True, max_df=0.5, max_features=None, min_df=5,
        ngram_range=(1, 5), preprocessor=None, stop_words=None,
  ...0,
     multi_class='crammer_singer', penalty='l2', random_state=None,
     tol=0.0001, verbose=0))])

In [88]:
y_pred = clf.predict(X_test)
report = metrics.classification_report(y_test, y_pred, labels=[1,0], digits=3)

  mask &= (ar1 != a)


In [96]:
print(report)

              precision    recall  f1-score   support

           1      0.882     0.890     0.886      3758
           0      0.889     0.881     0.885      3741

   micro avg      0.885     0.885     0.885      7499
   macro avg      0.885     0.885     0.885      7499
weighted avg      0.885     0.885     0.885      7499



In [97]:
from sklearn.externals import joblib
joblib.dump(clf, 'model.pkl') 

['model.pkl']

## predict phase

In [98]:
pre_train_model = joblib.load('model.pkl') 

In [120]:
def predict(sent):
    sent = nomarlize(sent)
    return (sent, 'sentiment: {}'.format('positive' if pre_train_model.predict([sent])[0] == '1' else 'negative'))

In [121]:
predict('The new design is awful!')

('new design awful', 'sentiment: negative')

In [122]:
predict('I really like the new design of your website!')

('really like new design website', 'sentiment: positive')