In [6]:
import numpy as np
import pandas as pd
from sklearn.svm import SVC


In [7]:
data = pd.read_csv('reviews_rt_all.csv', sep = '|')

In [8]:
data.head()

Unnamed: 0,label,text
0,1,"To an entire generation of filmgoers, it just ..."
1,1,Pixar classic is one of the best kids' movies ...
2,1,Apesar de representar um imenso avanço tecnoló...
3,1,"When Woody perks up in the opening scene, it's..."
4,1,Introduced not one but two indelible character...


In [9]:
data.groupby('label').count()

Unnamed: 0_level_0,text
label,Unnamed: 1_level_1
0,37952
1,64658


In [10]:
data.text.str.lower()

0         to an entire generation of filmgoers, it just ...
1         pixar classic is one of the best kids' movies ...
2         apesar de representar um imenso avanço tecnoló...
3         when woody perks up in the opening scene, it's...
4         introduced not one but two indelible character...
5         it is easy to see how virtually everything tha...
6         though some of the animation seems dated compa...
7         perhaps the film is meant as a pre-emptive sop...
8         time has been kind to it, and future years i i...
9         i think i speak for all adults and kids when i...
10        so ingenious in concept, design and execution ...
11        turns out the real magic is nothing to do with...
12        its lightness of touch has not diminished, nor...
13        the 3-d viewing doesn't make much difference.....
14        it doesn't enhance the experience, because the...
15        as such toy story in 3d is never overwhelming....
16        the fresh look serves the stor

In [11]:
from sklearn.feature_extraction.text import CountVectorizer
cvect = CountVectorizer()
counts = cvect.fit_transform(data['text'])

In [12]:
print(counts.toarray().transpose())

[[0 0 0 ..., 0 0 0]
 [0 0 0 ..., 0 0 0]
 [0 0 0 ..., 0 0 0]
 ..., 
 [0 0 0 ..., 0 0 0]
 [0 0 0 ..., 0 0 0]
 [0 0 0 ..., 0 0 0]]


In [13]:
from sklearn.model_selection import train_test_split
X_train, X_test, y_train, y_test  = train_test_split(
        data.text, 
        data.label,
        train_size=0.85, 
        random_state=1234)

In [14]:
#from nltk.corpus import stopwords
#stopset = set(stopwords.words('english'))
STOPWORDS = ['a','an','by','did','does', 'was', 'were', 'i']

In [15]:
# Snowball stemmers could be used as a dependency
# for cleaning our text
'''
import nltk
from nltk.stem import PorterStemmer
import string
stemmer = PorterStemmer()
def tokenize_and_stem(text):
    tokens = nltk.tokenize.word_tokenize(text)
    # strip out punctuation and make lowercase
    tokens = [token.lower().strip(string.punctuation)
              for token in tokens if token.isalnum()]

    # now stem the tokens
    tokens = [stemmer.stem(token) for token in tokens]

    return tokens
'''

'\nimport nltk\nfrom nltk.stem import PorterStemmer\nimport string\nstemmer = PorterStemmer()\ndef tokenize_and_stem(text):\n    tokens = nltk.tokenize.word_tokenize(text)\n    # strip out punctuation and make lowercase\n    tokens = [token.lower().strip(string.punctuation)\n              for token in tokens if token.isalnum()]\n\n    # now stem the tokens\n    tokens = [stemmer.stem(token) for token in tokens]\n\n    return tokens\n'

In [16]:
from sklearn.pipeline import Pipeline
#from sklearn.LogisticRegression import LogisticRegression
from sklearn.linear_model import LogisticRegression
#tokenizer=tokenize_and_stem
pipeline = Pipeline([('vectorizer', CountVectorizer(ngram_range=(1, 2),stop_words=STOPWORDS)), ('classifier', LogisticRegression())])
model = pipeline.fit(X=X_train, y=y_train)

In [17]:
from sklearn.metrics import accuracy_score
from sklearn.metrics import classification_report
y_pred = model.predict(X_test)
print (accuracy_score(y_test, y_pred))
print(classification_report(y_test, y_pred))

0.805873180873
             precision    recall  f1-score   support

          0       0.77      0.68      0.72      5750
          1       0.82      0.88      0.85      9642

avg / total       0.80      0.81      0.80     15392



In [18]:
from sklearn.externals import joblib

joblib.dump(pipeline, 'output.pkl')

['output.pkl']