In [1]:
import numpy as np
from sklearn.feature_extraction.text import CountVectorizer, TfidfTransformer

In [2]:
count = CountVectorizer()

In [3]:
docs = np.array([
    'The sun is shining',
    'The weather is sweet',
    'the sun is shining, the weather is sweet, and one and one is two'
    ])

In [4]:
bag = count.fit_transform(docs)

In [5]:
bag.toarray()

array([[0, 1, 0, 1, 1, 0, 1, 0, 0],
       [0, 1, 0, 0, 0, 1, 1, 0, 1],
       [2, 3, 2, 1, 1, 1, 2, 1, 1]], dtype=int64)

In [6]:
count.vocabulary_

{'and': 0,
 'is': 1,
 'one': 2,
 'shining': 3,
 'sun': 4,
 'sweet': 5,
 'the': 6,
 'two': 7,
 'weather': 8}

In [7]:
tfidf = TfidfTransformer(use_idf=True, norm='l2', smooth_idf=True)

In [8]:
np.set_printoptions(precision=2)

In [9]:
tfidf.fit_transform(count.fit_transform(docs)).toarray()

array([[0.  , 0.43, 0.  , 0.56, 0.56, 0.  , 0.43, 0.  , 0.  ],
       [0.  , 0.43, 0.  , 0.  , 0.  , 0.56, 0.43, 0.  , 0.56],
       [0.5 , 0.45, 0.5 , 0.19, 0.19, 0.19, 0.3 , 0.25, 0.19]])

In [10]:
import pandas as pd
import os

In [11]:
df = pd.read_csv('D:/LocalData/N196003/Desktop/movie_data.csv')

In [12]:
df.head(5)

Unnamed: 0,review,sentiment
0,"In 1974, the teenager Martha Moxley (Maggie Gr...",1
1,OK... so... I really like Kris Kristofferson a...,0
2,"***SPOILER*** Do not read this, if you think a...",0
3,hi for all the people who have seen this wonde...,1
4,"I recently bought the DVD, forgetting just how...",0


In [13]:
df.loc[0, 'review'][-50:]

'is seven.<br /><br />Title (Brazil): Not Available'

In [14]:
import re
def preprocessor(text):
    text = re.sub('<[^>]*>', '', text)
    emoticons = re.findall('(?::|;|=)(?:\)|\(|D|P)', text)
    text = (re.sub('[\W]+', ' ', text.lower()) + ' '.join(emoticons).replace('-', ''))
    return text

In [15]:
preprocessor(df.loc[0, 'review'][-50:])

'is seven title brazil not available'

In [16]:
preprocessor("</a>This is :) is :( a test :-)!")

'this is is a test :):('

In [17]:
df['review'] = df['review'].apply(preprocessor)

In [18]:
def tokenizer(text):
    return text.split()

In [19]:
tokenizer('runners like running and thus they run')

['runners', 'like', 'running', 'and', 'thus', 'they', 'run']

In [20]:
from nltk.stem.porter import PorterStemmer

In [21]:
porter = PorterStemmer()

In [22]:
def tokenizer_porter(text):
    return [porter.stem(word) for word in text.split()]

In [23]:
tokenizer_porter('runners like running and thus they run')

['runner', 'like', 'run', 'and', 'thu', 'they', 'run']

In [24]:
text = 'I have a pen like a pencil'

In [25]:
text.split()

['I', 'have', 'a', 'pen', 'like', 'a', 'pencil']

In [26]:
import nltk

In [27]:
nltk.download('stopwords')

[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\N196003\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


True

In [28]:
from nltk.corpus import stopwords

In [29]:
stop = stopwords.words('english')

In [30]:
[w for w in tokenizer_porter('runners like running and run a lot')[-10:] if w not in stop]

['runner', 'like', 'run', 'run', 'lot']

In [31]:
X_train = df.loc[:25000, 'review'].values
y_train = df.loc[:25000, 'sentiment'].values
X_test = df.loc[25000:, 'review'].values
y_test = df.loc[25000:, 'sentiment'].values

In [32]:
from sklearn.model_selection import GridSearchCV
from sklearn.pipeline import Pipeline
from sklearn.linear_model import LogisticRegression
from sklearn.feature_extraction.text import TfidfVectorizer

In [33]:
tfidf = TfidfVectorizer(strip_accents=None, lowercase=False, preprocessor=None)

In [34]:
param_grid = [
    {'vect__ngram_range': [(1, 1)],
     'vect__stop_words': [stop, None],
     'vect__tokenizer': [tokenizer, tokenizer_porter],
     'clf__penalty': ['l1', 'l2'],
     'clf__C': [1.0, 10.0, 100.0]},
    
    {'vect__ngram_range': [(1, 1)],
     'vect__stop_words': [stop, None],
     'vect__tokenizer': [tokenizer, tokenizer_porter],
     'vect__use_idf': [False],
     'vect__norm': [None],
     'clf__penalty': ['l1', 'l2'],
     'clf__C': [1.0, 10.0, 100.0]}
]

In [35]:
lr_tfidf = Pipeline([('vect', tfidf),
                     ('clf', LogisticRegression(random_state=0))])

In [37]:
gs_lr_tfidf = GridSearchCV(lr_tfidf, param_grid, scoring='accuracy', cv=5, verbose=1)

In [38]:
lr_tfidf.get_params().keys()

dict_keys(['memory', 'steps', 'vect', 'clf', 'vect__analyzer', 'vect__binary', 'vect__decode_error', 'vect__dtype', 'vect__encoding', 'vect__input', 'vect__lowercase', 'vect__max_df', 'vect__max_features', 'vect__min_df', 'vect__ngram_range', 'vect__norm', 'vect__preprocessor', 'vect__smooth_idf', 'vect__stop_words', 'vect__strip_accents', 'vect__sublinear_tf', 'vect__token_pattern', 'vect__tokenizer', 'vect__use_idf', 'vect__vocabulary', 'clf__C', 'clf__class_weight', 'clf__dual', 'clf__fit_intercept', 'clf__intercept_scaling', 'clf__max_iter', 'clf__multi_class', 'clf__n_jobs', 'clf__penalty', 'clf__random_state', 'clf__solver', 'clf__tol', 'clf__verbose', 'clf__warm_start'])

In [39]:
gs_lr_tfidf.fit(X_train, y_train)

Fitting 5 folds for each of 48 candidates, totalling 240 fits


[Parallel(n_jobs=1)]: Done 240 out of 240 | elapsed: 681.5min finished


GridSearchCV(cv=5, error_score='raise',
       estimator=Pipeline(memory=None,
     steps=[('vect', TfidfVectorizer(analyzer='word', binary=False, decode_error='strict',
        dtype=<class 'numpy.int64'>, encoding='utf-8', input='content',
        lowercase=False, max_df=1.0, max_features=None, min_df=1,
        ngram_range=(1, 1), norm='l2', preprocessor=None, smooth_idf=True,
 ...nalty='l2', random_state=0, solver='liblinear', tol=0.0001,
          verbose=0, warm_start=False))]),
       fit_params=None, iid=True, n_jobs=1,
       param_grid=[{'vect__ngram_range': [(1, 1)], 'vect__stop_words': [['i', 'me', 'my', 'myself', 'we', 'our', 'ours', 'ourselves', 'you', "you're", "you've", "you'll", "you'd", 'your', 'yours', 'yourself', 'yourselves', 'he', 'him', 'his', 'himself', 'she', "she's", 'her', 'hers', 'herself', 'it', "it's...se_idf': [False], 'vect__norm': [None], 'clf__penalty': ['l1', 'l2'], 'clf__C': [1.0, 10.0, 100.0]}],
       pre_dispatch='2*n_jobs', refit=True, return_tra

In [42]:
params = gs_lr_tfidf.best_params_

In [43]:
params

{'clf__C': 10.0,
 'clf__penalty': 'l2',
 'vect__ngram_range': (1, 1),
 'vect__stop_words': None,
 'vect__tokenizer': <function __main__.tokenizer>}

In [44]:
print('CV　accuracy: %.3f' % gs_lr_tfidf.best_score_)

CV　accuracy: 0.897


In [45]:
clf = gs_lr_tfidf.best_estimator_

In [46]:
print('Test accuracy: %.3f' % clf.score(X_test, y_test))

Test accuracy: 0.898
