In [1]:
import numpy as np
import pandas as pd 
from sklearn.utils import shuffle
import nltk

Using TensorFlow backend.


In [2]:
from nltk.stem import WordNetLemmatizer
from sklearn.ensemble import RandomForestClassifier
from bs4 import BeautifulSoup

In [3]:
wordnet_lemmatizer = WordNetLemmatizer()

In [7]:
from nltk.corpus import stopwords
stopwords = stopwords.words('english')

In [8]:
positive_reviews = BeautifulSoup(open('electronics/positive.review').read())
positive_reviews = positive_reviews.findAll('review_text')
negative_reviews = BeautifulSoup(open('electronics/negative.review').read())
negative_reviews = negative_reviews.findAll('review_text')

In [17]:
print(positive_reviews[3].text)


Cheaper than thick CD cases and less prone to breakage, these "slim" cases by Memorex protect your CDs. Most buyers will get them  for use in CD or DVD burning, and these are an affordable choice from a quality company for that purpose.

You may want to consider larger cases depending on your tastes, but otherwise these will probably satisfy your needs



In [13]:
def my_tokenizer(s):
    s = s.lower()
    tokens = nltk.tokenize.word_tokenize(s)
    tokens = [t for t in tokens if len(t) > 2]
    tokens = [wordnet_lemmatizer.lemmatize(t) for t in tokens]
    tokens = [t for t in tokens if t not in stopwords]
    return tokens

In [18]:
word2idx = {}
current_index = 0
positive_tokenized = []
negative_tokenized = []
orig_reviews = []

for review in positive_reviews:
    orig_reviews.append(review.text)
    tokens = my_tokenizer(review.text)
    positive_tokenized.append(tokens)
    for token in tokens:
        if token not in word2idx:
            word2idx[token] = current_index
            current_index += 1 

for review in negative_reviews:
    orig_reviews.append(review.text)
    tokens = my_tokenizer(review.text)
    negative_tokenized.append(tokens)
    for token in tokens:
        if token not in word2idx:
            word2idx[token] = current_index
            current_index += 1 

In [19]:
print(positive_tokenized[0:1])

[['purchased', 'unit', 'due', 'frequent', 'blackout', 'area', 'power', 'supply', 'going', 'bad', 'run', 'cable', 'modem', 'router', 'lcd', 'monitor', 'minute', 'enough', 'time', 'save', 'work', 'shut', 'equally', 'important', 'know', 'electronics', 'receiving', 'clean', 'power', 'feel', 'investment', 'minor', 'compared', 'loss', 'valuable', 'data', 'failure', 'equipment', 'due', 'power', 'spike', 'irregular', 'power', 'supply', 'always', 'amazon', 'business', 'day']]


In [20]:
def tokens_to_vector(tokens, label):
    x = np.zeros(len(word2idx) + 1)
    for t in tokens:
        i = word2idx[t]
        x[i] += 1 
    x = x / x.sum()
    x[-1]= label
    return x 

In [21]:
N = len(positive_tokenized) + len(negative_tokenized)
data = np.zeros((N, len(word2idx) + 1))
i = 0 
for tokens in positive_tokenized:
    xy = tokens_to_vector(tokens, 1)
    data[i,:] += xy 
    i += 1 

for tokens in negative_tokenized:
    xy = tokens_to_vector(tokens, 0)
    data[i,:] = xy
    i += 1 

In [22]:
orig_reviews, data = shuffle(orig_reviews, data)

In [32]:
from sklearn.model_selection import train_test_split
X = data[:, :-1]
Y = data[:, -1]
Xtrain, Xtest, Ytrain, Ytest = train_test_split(X, Y, stratify = Y, test_size = 0.2)

In [36]:
from sklearn.pipeline import Pipeline
from sklearn.feature_extraction.text import TfidfVectorizer

In [47]:
grid_params = {
    "n_estimators": [250, 300, 350],
    "max_depth": [17,18],
    "min_samples_leaf": [3]
}

In [48]:
from sklearn.model_selection import GridSearchCV

In [49]:
gs = GridSearchCV(RandomForestClassifier(), grid_params, cv = 3)
gs.fit(Xtrain, Ytrain)

GridSearchCV(cv=3, error_score='raise-deprecating',
       estimator=RandomForestClassifier(bootstrap=True, class_weight=None, criterion='gini',
            max_depth=None, max_features='auto', max_leaf_nodes=None,
            min_impurity_decrease=0.0, min_impurity_split=None,
            min_samples_leaf=1, min_samples_split=2,
            min_weight_fraction_leaf=0.0, n_estimators='warn', n_jobs=None,
            oob_score=False, random_state=None, verbose=0,
            warm_start=False),
       fit_params=None, iid='warn', n_jobs=None,
       param_grid={'n_estimators': [250, 300, 350], 'max_depth': [17, 18], 'min_samples_leaf': [3]},
       pre_dispatch='2*n_jobs', refit=True, return_train_score='warn',
       scoring=None, verbose=0)

In [50]:
gs.best_estimator_

RandomForestClassifier(bootstrap=True, class_weight=None, criterion='gini',
            max_depth=18, max_features='auto', max_leaf_nodes=None,
            min_impurity_decrease=0.0, min_impurity_split=None,
            min_samples_leaf=3, min_samples_split=2,
            min_weight_fraction_leaf=0.0, n_estimators=250, n_jobs=None,
            oob_score=False, random_state=None, verbose=0,
            warm_start=False)

In [51]:
gs.best_score_

0.795