In [1]:
import pandas as pd
import numpy as np
import regex as re
import nltk
import matplotlib.pyplot as plt
%matplotlib inline
import seaborn as sns
from nltk.stem import PorterStemmer
from nltk.tokenize import word_tokenize
from nltk.corpus import stopwords

In [6]:
df_train = pd.read_csv("train.csv", encoding='unicode_escape')
df_train["content"] = df_train["text"]
df_test = pd.read_csv("FINAL_merged_formatted_Amazon_Bestsellers_ALL_Reviews.csv")
df_train['content'] = df_train['content'].astype(str)
df_test['content'] = df_test['content'].astype(str)

In [3]:
print("Train set length:", len(df_train))
print("Web set length:", len(df_test))

Train set length: 27481
Web set length: 33468


### Partitioning

In [17]:
def partition(x):
    if x == "negative":
        return 0
    elif x == "neutral":
        return 2
    else:
        return 1

actualScore = df_train['sentiment']
class_ = actualScore.map(partition)
df_train['ratings_class'] = class_
df_train.ratings_class.value_counts()

ratings_class
2    11118
1     8582
0     7781
Name: count, dtype: int64

In [7]:
def partition(x):
    if x < 3:
        return 2
    elif x == 3:
        return 0
    else:
        return 1

# actualScore = df_test['rating']
# class_ = actualScore.map(partition)
# df_test['ratings_class'] = class_
# df_test.ratings_class.value_counts()

### Stemming

In [8]:
nltk.download('punkt')
nltk.download('stopwords')

[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Unzipping tokenizers/punkt.zip.
[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Unzipping corpora/stopwords.zip.


True

In [9]:
def stemming(x):
    stemmer = PorterStemmer()
    x = word_tokenize(x)
    output = ''

    for i in x:
        output += stemmer.stem(i) + ' '

    return output

In [10]:
df_train['content'] = df_train['content'].apply(stemming)
df_test['content'] = df_test['content'].apply(stemming)

### Lemmatization

In [11]:
nltk.download('wordnet')

[nltk_data] Downloading package wordnet to /root/nltk_data...


True

In [12]:
from nltk.stem import WordNetLemmatizer
def get_lemmatized_text(corpus):
    lemmatizer = WordNetLemmatizer()
    return [' '.join([lemmatizer.lemmatize(word) for word in review.split()]) for review in corpus]

In [13]:
df_train['content'] = get_lemmatized_text(df_train['content'])
df_test['content'] = get_lemmatized_text(df_test['content'])

### Removal of Stopwords

In [14]:
stop = stopwords.words('english')
additional_stopwords = ["'s","...","'ve","``","''","'m",'--',"'ll","'d"]
stop = set(stop + additional_stopwords)
def remove_stop(x):
    x = word_tokenize(x)
    store = ''

    for i in x:
        if i not in stop:
            store += i + ' '

    return store

In [15]:
df_train['content'] = df_train['content'].apply(remove_stop)
df_test['content'] = df_test['content'].apply(remove_stop)

### Wordcount

In [None]:
wordcount = df_train['content'].apply(lambda x: len(x.split())).sum()
print("There are {} words in the corpus.".format(wordcount))

There are 274428 words in the corpus.


In [None]:
df_train.content.str.split(expand=True).stack().value_counts()

!                       15296
.                       13910
`                       11614
,                        8461
*                        4953
                        ...  
kaila                       1
//tinyurl.com/64ozr7        1
weeaboo                     1
boo-hoo                     1
atg                         1
Name: count, Length: 23257, dtype: int64

## Naive Bayes Classification

In [20]:
from sklearn.model_selection import train_test_split

import time

x_train, x_test, y_train = df_train[["content"]], df_test[["content"]], \
        df_train["ratings_class"]

start_time = time.time()
from sklearn.feature_extraction.text import TfidfVectorizer
vectorizer = TfidfVectorizer(use_idf=True, sublinear_tf=True)
comment_matrix = vectorizer.fit_transform(x_train['content'])

from sklearn.naive_bayes import MultinomialNB
comment_classifier = MultinomialNB().fit(comment_matrix, y_train)
print("--- Training time: %s seconds ---" % (time.time() - start_time))

from sklearn import metrics
test_vector = vectorizer.transform(x_test['content'])

start_time = time.time()
result = comment_classifier.predict(test_vector)
print("--- Evaluating time: %s seconds ---" % (time.time() - start_time))
result


--- Training time: 0.6495673656463623 seconds ---
--- Evaluating time: 0.009547948837280273 seconds ---


array([2, 0, 2, ..., 2, 2, 2])

In [30]:
df_test["model_prediction"] = result
df_test["model_prediction"] = df_test["model_prediction"].replace({1: 'positive', 0: 'negative', 2: 'neutral'})

In [35]:
df_test.to_csv("NB_model_prediction.csv")