In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt

import nltk as nlp
from nltk.probability import FreqDist
from nltk.corpus import stopwords
from nltk import word_tokenize
from nltk import TweetTokenizer

In [2]:
tokenizer = TweetTokenizer(reduce_len=True)

In [3]:
df = pd.read_csv("amazon_yelp_twitter2.csv")

In [4]:
df.head()

Unnamed: 0,sentiment,text
0,1,nearly perfect wheat-free bread mix
1,0,be aware: speakers not as advertised on amazon
2,0,results comedic at best
3,1,going to enjoy the sunshine while its here
4,1,i feel better now.


# Clean the Data and Check for Size

In [5]:
df = df.iloc[:,:2].drop_duplicates()

In [6]:
df2 = df[pd.notnull(df['text'])]

In [7]:
df2.shape

(4486498, 2)

In [8]:
 df_pos, df_neg = [x for _, x in df.groupby(df['sentiment'] < 1)]

In [9]:
pos_list = df_pos["text"].values
neg_list = df_neg["text"].values

In [10]:
pos_list_str= ''.join(map(str, pos_list))
neg_list_str= ''.join(map(str, neg_list))

In [11]:
print(len(pos_list_str),len(neg_list_str))

107792435 104612634


# Splitting the data

In [12]:
from sklearn.model_selection import train_test_split
from sklearn.datasets import make_moons

X, y = make_moons(n_samples=1000, noise=0.50, random_state=42)
X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=42)

In [13]:
pd.crosstab(index=y, columns='text')

col_0,text
row_0,Unnamed: 1_level_1
0,500
1,500


# Creating Models and Performaing Ensemble Learning 

In [14]:
from sklearn.ensemble import RandomForestClassifier
from sklearn.ensemble import VotingClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.svm import SVC

log_clf = LogisticRegression(solver="liblinear", random_state=42)
rnd_clf = RandomForestClassifier(n_estimators=100, random_state=42)
svm_clf = SVC(gamma="auto", random_state=42)

voting_clf = VotingClassifier(
    estimators=[('lr', log_clf), ('rf', rnd_clf), ('svc', svm_clf)],
    voting='hard')

In [15]:
voting_clf.fit(X_train, y_train)

VotingClassifier(estimators=[('lr', LogisticRegression(C=1.0, class_weight=None, dual=False, fit_intercept=True,
          intercept_scaling=1, max_iter=100, multi_class='ovr', n_jobs=1,
          penalty='l2', random_state=42, solver='liblinear', tol=0.0001,
          verbose=0, warm_start=False)), ('rf', RandomFor...f',
  max_iter=-1, probability=False, random_state=42, shrinking=True,
  tol=0.001, verbose=False))],
         flatten_transform=None, n_jobs=1, voting='hard', weights=None)

In [16]:
from sklearn.metrics import accuracy_score

for clf in (log_clf, rnd_clf, svm_clf, voting_clf):
    clf.fit(X_train, y_train)
    y_pred = clf.predict(X_test)
    print("train:",clf.__class__.__name__, accuracy_score(y_train, clf.predict(X_train)))
    print("test:",clf.__class__.__name__, accuracy_score(y_test, y_pred))
    print("==================================================================")
    print("==================================================================")
    print("==================================================================")

train: LogisticRegression 0.8026666666666666
test: LogisticRegression 0.812
train: RandomForestClassifier 1.0
test: RandomForestClassifier 0.804
train: SVC 0.8213333333333334
test: SVC 0.828
train: VotingClassifier 0.8466666666666667
test: VotingClassifier 0.828


  if diff:
  if diff:


In [17]:
from sklearn.metrics import confusion_matrix

pd.DataFrame (
    confusion_matrix( y_test, y_pred ),
    columns = [ 'Predicted Success', 'Predicted Failure' ],
    index = [ 'True Success', 'True Failure' ]
)

Unnamed: 0,Predicted Success,Predicted Failure
True Success,106,23
True Failure,20,101


# Bagging Ensembles (Reducing Variance) - Decision Trees

In [18]:
from sklearn.ensemble import BaggingClassifier
from sklearn.tree import DecisionTreeClassifier
bag_clf = BaggingClassifier(
    DecisionTreeClassifier(random_state=42), n_estimators=500,
    max_samples=700, bootstrap=True, n_jobs=-1, random_state=42)
bag_clf.fit(X_train, y_train)
y_pred = bag_clf.predict(X_test)

In [19]:
from sklearn.metrics import accuracy_score
print(accuracy_score(y_test, y_pred))

0.8


In [20]:
tree_clf = DecisionTreeClassifier(random_state=50)
tree_clf.fit(X_train, y_train)
y_pred_tree = tree_clf.predict(X_test)
print(accuracy_score(y_test, y_pred_tree))

0.752


# Random Forrest Reggresion Model

In [21]:
bag_clf = BaggingClassifier(
    DecisionTreeClassifier(splitter="random", max_leaf_nodes=200, random_state=50),
    n_estimators=500, max_samples=1.0, bootstrap=True, n_jobs=-1, random_state=50)

In [22]:
bag_clf.fit(X_train, y_train)
y_pred = bag_clf.predict(X_test)

In [23]:
from sklearn.metrics import accuracy_score
print(accuracy_score(y_test, y_pred))

0.8


In [24]:
from sklearn.ensemble import RandomForestClassifier

rnd_clf = RandomForestClassifier(n_estimators=500, max_leaf_nodes=1000, n_jobs=-1, random_state=50)
rnd_clf.fit(X_train, y_train)

y_pred_rf = rnd_clf.predict(X_test)

In [25]:
from sklearn.metrics import accuracy_score
print(accuracy_score(y_test, y_pred_rf))

0.808


In [26]:
rnd_clf.feature_importances_

array([0.47748162, 0.52251838])

In [27]:
from sklearn.metrics import confusion_matrix

pd.DataFrame (
    confusion_matrix( y_test, y_pred ),
    columns = [ 'Predicted Success', 'Predicted Failure' ],
    index = [ 'True Success', 'True Failure' ]
)

Unnamed: 0,Predicted Success,Predicted Failure
True Success,105,24
True Failure,26,95
