In [None]:
# Mount Google Drive inside the Colab runtime so files under it can be read
from google.colab import drive

DRIVE_MOUNT_POINT = '/content/drive/'
drive.mount(DRIVE_MOUNT_POINT)

Mounted at /content/drive/


In [None]:
# Install the Natural Language Toolkit (NLTK) if it is not already available:
# pip install nltk

In [None]:
import nltk

# NLTK resources fetched up front so the notebook runs on a fresh kernel:
#   punkt / punkt_tab - sentence/word tokenizer models used by nltk.word_tokenize
#                       (recent NLTK releases load 'punkt_tab' instead of 'punkt';
#                       both are requested so either version works)
#   stopwords         - English stop-word list
#   wordnet, omw-1.4  - WordNet data for the lemmatizer imported below
for resource in ('punkt', 'punkt_tab', 'stopwords', 'wordnet', 'omw-1.4'):
    nltk.download(resource)

from collections import defaultdict
import math

from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer
from nltk.tokenize import RegexpTokenizer
import pandas as pd

[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Unzipping tokenizers/punkt.zip.
[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Unzipping corpora/stopwords.zip.
[nltk_data] Downloading package wordnet to /root/nltk_data...
[nltk_data] Downloading package omw-1.4 to /root/nltk_data...


In [None]:
# Load the manually labeled review data (columns: Text, sentiment) from Drive
LABELED_DATA_PATH = "/content/drive/MyDrive/Grad School/MSBA/Big Data Tech/test_labeled_data.csv"
df = pd.read_csv(LABELED_DATA_PATH)
print(df)

                                                   Text sentiment
0        the peanuts were actually small sized unsalted  negative
1     Not sure if this was an error or if the vendor...  negative
2     If you are looking for the secret ingredient i...  negative
3                          The flavor is very medicinal  negative
4     My only complaint is there was a bit too much ...  negative
...                                                 ...       ...
2092  I absolutely LOVE the pumpkin spice latte they...  positive
2093  While there is no nutmeg on top this stuff sti...  positive
2094  I can just heat up a small pan of milk on the ...  positive
2095         Also it is cheaper than going to Starbucks  positive
2096  Their grande size latte is about $4 whereas on...  positive

[2097 rows x 2 columns]


In [None]:
# Features: the raw review text
X = df['Text']

# Target: encode the sentiment labels as integers (negative -> 0, positive -> 1)
df['sentiment'] = df['sentiment'].replace({"negative": 0, "positive": 1})
y = df['sentiment']

print(X)
print(y)

0          the peanuts were actually small sized unsalted
1       Not sure if this was an error or if the vendor...
2       If you are looking for the secret ingredient i...
3                            The flavor is very medicinal
4       My only complaint is there was a bit too much ...
                              ...                        
2092    I absolutely LOVE the pumpkin spice latte they...
2093    While there is no nutmeg on top this stuff sti...
2094    I can just heat up a small pan of milk on the ...
2095           Also it is cheaper than going to Starbucks
2096    Their grande size latte is about $4 whereas on...
Name: Text, Length: 2097, dtype: object
0       0
1       0
2       0
3       0
4       0
       ..
2092    1
2093    1
2094    1
2095    1
2096    1
Name: sentiment, Length: 2097, dtype: int64


In [None]:
# Tokenization splits each review into smaller units (word-level tokens).
# nltk.word_tokenize is applied to every review and the resulting token
# lists are stored in a new 'Tokens' column.
df['Tokens'] = df['Text'].map(nltk.word_tokenize)
print(df['Tokens'])

0       [the, peanuts, were, actually, small, sized, u...
1       [Not, sure, if, this, was, an, error, or, if, ...
2       [If, you, are, looking, for, the, secret, ingr...
3                      [The, flavor, is, very, medicinal]
4       [My, only, complaint, is, there, was, a, bit, ...
                              ...                        
2092    [I, absolutely, LOVE, the, pumpkin, spice, lat...
2093    [While, there, is, no, nutmeg, on, top, this, ...
2094    [I, can, just, heat, up, a, small, pan, of, mi...
2095    [Also, it, is, cheaper, than, going, to, Starb...
2096    [Their, grande, size, latte, is, about, $, 4, ...
Name: Tokens, Length: 2097, dtype: object


In [None]:
# Normalize every tokenized review: lowercase, strip punctuation, drop stop
# words, stem, and join the surviving tokens back into one string per review.
import re
import string

from nltk.stem import PorterStemmer

STOP_WORDS = stopwords.words('english')
ps = PorterStemmer()

# Built once up front rather than once per token.
_PUNCTUATION_TABLE = str.maketrans('', '', string.punctuation)
_STOP_WORD_SET = frozenset(STOP_WORDS)   # set membership instead of scanning a list


def clean_tokens(tokens):
    """Return the cleaned, stemmed text for one tokenized review.

    Each token is lowercased and stripped of ASCII punctuation. Tokens that
    become empty (e.g. a bare "." or "$") or that are English stop words are
    dropped; the rest are Porter-stemmed and joined with single spaces.

    The input list is not modified, so the 'Tokens' column keeps the raw
    tokens and this cell can be re-run safely.

    Parameters
    ----------
    tokens : iterable of str
        Word-level tokens of a single review.

    Returns
    -------
    str
        Space-separated stems with no leading or trailing whitespace
        ('' when every token is filtered out).
    """
    stems = []
    for token in tokens:
        word = token.lower().translate(_PUNCTUATION_TABLE)
        # Skip tokens that were pure punctuation as well as stop words
        if not word or word in _STOP_WORD_SET:
            continue
        stems.append(ps.stem(word))
    return " ".join(stems)


lower_case_list = [clean_tokens(tokens) for tokens in df['Tokens']]

print(lower_case_list)

['peanut actual small size unsalt', 'sure error vendor intend repres product jumbo ', 'look secret ingredi robitussin believ found', 'flavor medicin', 'complaint bit much redblack licoriceflavor piec particular favorit ', 'realiz simpli could nt find anywher citi bum', 'higher food sit go stale', 'tri new food first put bowl bowl sit full kitti touch food', 'unfortun need find new food cat eat', 'candi red flavor', 'plain chewi', 'would never buy', 'never huge coffe fan', 'escap fact howev even best instant oatmeal nowher near good even store brand oatmeal requir stovetop prepar', 'one thing like though mccann use thicken', 'oat plu water plu heat make creami tasti oatmeal without need guar gum', 'mayb guar gum sit bowl instant mccann becom thick gluey', 'instant oatmeal becom soggi minut water hit bowl', 'mccann regular oat meal excel may take bit longer prepar time morn', 'still oatmeal', 'appl cinnamon though tend littl liquidi may want experi amount water add', 'bad thing consid ba

In [None]:
# Wrap the cleaned review strings in a single-column ('Text') DataFrame
df_cleaned_text = pd.DataFrame({'Text': lower_case_list})
print(df_cleaned_text)

                                                   Text
0                       peanut actual small size unsalt
1        sure error vendor intend repres product jumbo 
2           look secret ingredi robitussin believ found
3                                        flavor medicin
4     complaint bit much redblack licoriceflavor pie...
...                                                 ...
2092      absolut love pumpkin spice latt serv starbuck
2093   nutmeg top stuff still hit spot also add nutmeg 
2094  heat small pan milk stove add via readi brew m...
2095                           also cheaper go starbuck
2096  grand size latt 4 wherea one packet via readi ...

[2097 rows x 1 columns]


In [None]:
# Attach the encoded sentiment labels to the processed text.
# assign() returns a new DataFrame, so df_cleaned_text is left untouched
# (a plain `df_clean = df_cleaned_text` would only alias the same object).
# The labels are aligned to the rows by index.
df_clean = df_cleaned_text.assign(sentiment=y)
assert df_clean['sentiment'].notna().all(), "some reviews have no sentiment label after alignment"
print(df_clean)

                                                   Text  sentiment
0                       peanut actual small size unsalt          0
1        sure error vendor intend repres product jumbo           0
2           look secret ingredi robitussin believ found          0
3                                        flavor medicin          0
4     complaint bit much redblack licoriceflavor pie...          0
...                                                 ...        ...
2092      absolut love pumpkin spice latt serv starbuck          1
2093   nutmeg top stuff still hit spot also add nutmeg           1
2094  heat small pan milk stove add via readi brew m...          1
2095                           also cheaper go starbuck          1
2096  grand size latt 4 wherea one packet via readi ...          1

[2097 rows x 2 columns]


In [None]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.naive_bayes import GaussianNB
from sklearn import metrics
from sklearn.metrics import accuracy_score, f1_score, recall_score, precision_score, classification_report, confusion_matrix
# code taken from https://github.com/jradishness/sentiment-analysis/blob/master/final_code_GaussianNB.py

# DATA SPLITING
print("Splitting Data into subsets...")
# 80/20 train/test partition of the cleaned text and its labels, fixed seed
X_train, X_test, y_train, y_test = train_test_split(
    df_clean['Text'],
    df_clean['sentiment'],
    train_size=0.8,
    test_size=0.2,
    random_state=14,
)
print("Training shape is: ", X_train.shape)

# FEATURE EXTRACTION
print("Extracting features...")
# Bag-of-words counts; the vocabulary is learned from the training split only.
# min_df=1 is the minimum document frequency. A unigram-to-trigram variant,
# ngram_range=(1,3), was left disabled.
vect = CountVectorizer(min_df=1)
vect.fit(X_train)
X_train_vectorized = vect.transform(X_train)
X_test_vectorized = vect.transform(X_test)
num_features = len(vect.get_feature_names_out())
print(num_features, "features found.")     # Feature check

# Numpy Dense Array Transformation
print("Converting to Numpy Dense Array...")
X_train_dense = X_train_vectorized.toarray()   # sparse matrix -> dense numpy array
X_test_dense = X_test_vectorized.toarray()

Splitting Data into subsets...
Training shape is:  (1677,)
Extracting features...
2289 features found.
Converting to Numpy Dense Array...


In [None]:
from sklearn.metrics import confusion_matrix, accuracy_score, recall_score,\
    precision_score, f1_score, roc_auc_score

In [None]:
# Gaussian Naive Bayes model
print("Training/Evaluating Gaussian Naive Bayes model...")
gnb_model = GaussianNB()
gnb_model.fit(X_train_dense, y_train)

# Class-probability estimates from the fitted Naive Bayes model, per split
y_pred_train_gnb = gnb_model.predict_proba(X_train_dense)
y_pred_test_gnb = gnb_model.predict_proba(X_test_dense)

# AUC-ROC is scored on the second probability column (index 1)
gnb_auc_train = roc_auc_score(y_train, y_pred_train_gnb[:, 1])
gnb_auc_test = roc_auc_score(y_test, y_pred_test_gnb[:, 1])

print('Train AUC-ROC: {:.4f}'.format(gnb_auc_train))
print('Test AUC-ROC: {:.4f}'.format(gnb_auc_test))

Training/Evaluating Gaussian Naive Bayes model...
Train AUC-ROC: 0.8457
Test AUC-ROC: 0.6364


In [None]:
from sklearn.neighbors import KNeighborsClassifier
# K-nearest-neighbours classifier on the dense bag-of-words features
knn = KNeighborsClassifier(n_neighbors=50)                        # K = 50
knn.fit(X_train_dense, y_train)                                   # fit the model
knn_pred_train = knn.predict(X_train_dense)                       # hard class labels on the training split
knn_score_train = knn.predict_proba(X_train_dense)                # class probabilities on the training split

# Class probabilities used for scoring. The training-split probabilities were
# already computed above, so they are reused rather than repeating the
# neighbour search over the whole training set.
y_pred_train_knn = knn_score_train
y_pred_test_knn = knn.predict_proba(X_test_dense)

# AUC-ROC, scored on the second probability column (index 1)
knn_auc_train = roc_auc_score(y_train, y_pred_train_knn[:, 1])
print('Train AUC-ROC: {:.4f}'.format(knn_auc_train))

knn_auc_test = roc_auc_score(y_test, y_pred_test_knn[:, 1])
print('Test AUC-ROC: {:.4f}'.format(knn_auc_test))

Train AUC-ROC: 0.7274
Test AUC-ROC: 0.6149


In [None]:
from sklearn.linear_model import LogisticRegression

# Logistic regression on the dense bag-of-words features. The model must be
# fitted in this cell because the coefficient table and the sample predictions
# printed below read `log_clf` and `y_pred_test_log`.
log_clf = LogisticRegression()
log_clf.fit(X_train_dense, y_train)                                    # fit the model
log_clf_pred_train = log_clf.predict(X_train_dense)                    # hard class labels on the training split
log_clf_score_train = log_clf.predict_proba(X_train_dense)             # class probabilities on the training split

# Class probabilities from the logistic regression for both splits
# (training probabilities are reused from above rather than recomputed)
y_pred_train_log = log_clf_score_train
y_pred_test_log = log_clf.predict_proba(X_test_dense)

# AUC-ROC, scored on the second probability column (index 1)
log_auc_train = roc_auc_score(y_train, y_pred_train_log[:, 1])
print('Train AUC-ROC: {:.4f}'.format(log_auc_train))

log_auc_test = roc_auc_score(y_test, y_pred_test_log[:, 1])
print('Test AUC-ROC: {:.4f}'.format(log_auc_test))

# Display coefficients: one row per vocabulary term, sorted ascending so the
# most negative weights come first
log_clf_coef = pd.DataFrame({
    'Feature Name': vect.get_feature_names_out(),
    'Coefficient': log_clf.coef_[0]
})
print(log_clf_coef.sort_values(by=['Coefficient'],ascending=True))

# First five test reviews next to their predicted class probabilities
print(X_test[0:5])
print(y_pred_test_log[0:5])

NameError: ignored

In [None]:
from sklearn.tree import DecisionTreeClassifier, plot_tree
from sklearn.metrics import confusion_matrix, accuracy_score, recall_score,\
    precision_score, f1_score, roc_auc_score

# Depth-limited decision tree on the dense bag-of-words features.
# random_state is fixed (same seed as the train/test split) so repeated runs
# build the same tree; without it, ties between equally good splits are broken
# randomly and the reported AUC can change from run to run.
dt_clf = DecisionTreeClassifier(max_depth=10, random_state=14)
dt_clf.fit(X_train_dense, y_train)                                # fit the model
dt_clf_pred_train = dt_clf.predict(X_train_dense)                 # hard class labels on the training split
dt_clf_score_train = dt_clf.predict_proba(X_train_dense)          # class probabilities on the training split

# Class probabilities from the decision tree for both splits
# (training probabilities are reused from above rather than recomputed)
y_pred_train_dt = dt_clf_score_train
y_pred_test_dt = dt_clf.predict_proba(X_test_dense)

# AUC-ROC, scored on the second probability column (index 1)
dt_auc_train = roc_auc_score(y_train, y_pred_train_dt[:, 1])
print('Train AUC-ROC: {:.4f}'.format(dt_auc_train))

dt_auc_test = roc_auc_score(y_test, y_pred_test_dt[:, 1])
print('Test AUC-ROC: {:.4f}'.format(dt_auc_test))

In [None]:
# NOTE: the random forest experiment below is wrapped in a triple-quoted
# string, so it is a plain string literal and none of it is executed: no
# `rf` model or `rf_auc_*` values are defined by this cell.
# To run it, remove the surrounding ''' markers; it relies on X_train_dense,
# X_test_dense, y_train, y_test and roc_auc_score from earlier cells.
'''from sklearn.ensemble import RandomForestClassifier
rf = RandomForestClassifier(max_depth = 5)
rf.fit(X_train_dense, y_train)

# get the prediction from a random forest
y_pred_train_rf = rf.predict_proba(X_train_dense)
y_pred_test_rf = rf.predict_proba(X_test_dense)

# get the score from a random forest
rf_auc_train = roc_auc_score(y_train, y_pred_train_rf[:, 1])
print('Train AUC-ROC: {:.4f}'.format(rf_auc_train))

rf_auc_test = roc_auc_score(y_test, y_pred_test_rf[:, 1])
print('Test AUC-ROC: {:.4f}'.format(rf_auc_test))
'''

In [None]:
# NOTE: the gradient boosting experiment below is wrapped in a triple-quoted
# string, so it is a plain string literal and none of it is executed: no
# `clf` model or `clf_auc_*` values are defined by this cell.
# The "random forest" remarks inside the string are leftovers; the code in it
# builds a GradientBoostingClassifier.
# To run it, remove the surrounding ''' markers; it relies on X_train_dense,
# X_test_dense, y_train, y_test and roc_auc_score from earlier cells.
'''from sklearn.ensemble import GradientBoostingClassifier
clf = GradientBoostingClassifier(n_estimators=10,
learning_rate=0.01,
max_depth=7
).fit(X_train_dense, y_train)

# get the prediction from a random forest
y_pred_train_clf = clf.predict_proba(X_train_dense)
y_pred_test_clf = clf.predict_proba(X_test_dense)

# get the score from a random forest
# auc-roc
clf_auc_train = roc_auc_score(y_train, y_pred_train_clf[:, 1])
print('Train AUC-ROC: {:.4f}'.format(clf_auc_train))

clf_auc_test = roc_auc_score(y_test, y_pred_test_clf[:, 1])
print('Test AUC-ROC: {:.4f}'.format(clf_auc_test))
'''