In [4]:
import numpy
import pandas as pd
import csv
import time
import random
import string
import pickle
import nltk
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
from nltk import NaiveBayesClassifier
from nltk import classify

In [None]:
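# One-time setup: uncomment and run once to download the NLTK resources
# (stop word list, 'punkt' tokenizer models) that the cells below rely on.
#nltk.download('stopwords')
#nltk.download('punkt')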

In [None]:
# directory that holds the review text files
data_path = './data/'

# column names assigned to every file (each file is read with header=None)
Custom_Header = ["Sentence", "Sentiment"]


def _read_reviews(file_name):
    """Read one tab-separated review file located under data_path.

    Quote characters are treated as ordinary text (csv.QUOTE_NONE) and the
    columns are named after Custom_Header.
    """
    return pd.read_csv(data_path + file_name, sep='\t', quoting=csv.QUOTE_NONE,
                       header=None, names=Custom_Header)


# one DataFrame per review source
data_amazon = _read_reviews('amazon.txt')
data_imdb = _read_reviews('imdb.txt')
data_yelp = _read_reviews('yelp.txt')

# print(data_amazon.head())
# print(data_imdb.head())
# print(data_yelp.head())

In [None]:
# Stop words are very common words that add little meaning to a sentence and
# can be removed. They are kept both as a list (stop_words_array) and as a
# set (stop_words) built from that same list.
stop_words_array = stopwords.words('english')
stop_words = set(stop_words_array)

In [None]:
# Build the vocabulary of each data set: the unique lower-cased tokens of its
# sentences, with stop words and punctuation tokens removed (stored as sets).

def _is_punctuation(token):
    """Return True when token is empty or consists only of punctuation characters.

    A plain `token in string.punctuation` is a substring test, so any
    multi-character punctuation token (e.g. '...') is not recognised by it;
    checking character by character catches those as well.
    """
    return all(char in string.punctuation for char in token)


def extract_vocabulary(sentences):
    """Collect the unique content words found in an iterable of sentences.

    Args:
        sentences: iterable of strings (e.g. the 'Sentence' column of a DataFrame).

    Returns:
        set of lower-cased tokens that are neither English stop words nor
        made up solely of punctuation characters.
    """
    vocabulary = set()
    for sentence in sentences:
        for token in word_tokenize(sentence):
            token = token.lower()  # lower-case once per token
            # stop_words is the set version of the stop word list -> constant-time lookup
            if token in stop_words or _is_punctuation(token):
                continue
            vocabulary.add(token)
    return vocabulary


amazon_tokens = extract_vocabulary(data_amazon['Sentence'])

imdb_tokens = extract_vocabulary(data_imdb['Sentence'])

yelp_tokens = extract_vocabulary(data_yelp['Sentence'])


In [None]:
# data_tokens holds every unique word that occurs in any of the three data sets
data_tokens = amazon_tokens.union(imdb_tokens, yelp_tokens)

In [None]:
def build_feature_sets(frame, vocabulary):
    """Turn every row of a review DataFrame into a (features, label) pair.

    Args:
        frame: DataFrame with a 'Sentence' column (text) and a 'Sentiment'
            column (label).
        vocabulary: iterable of lower-cased words; each one becomes a feature.

    Returns:
        list of (features, label) tuples, one per row, where features maps
        every vocabulary word to True/False depending on whether the word
        occurs among the tokens of the lower-cased sentence.
    """
    feature_sets = []
    # Iterate the label column's underlying array so each label keeps the same
    # scalar type that the per-row lookup frame['Sentiment'][i] produced.
    for sentence, sentiment in zip(frame['Sentence'], frame['Sentiment'].to_numpy()):
        # Tokenize each sentence exactly once (instead of once per vocabulary
        # word) and keep the tokens in a set for constant-time membership tests.
        present = set(word_tokenize(sentence.lower()))
        features = {word: word in present for word in vocabulary}
        feature_sets.append((features, sentiment))
    return feature_sets


# The vocabularies already exclude stop words, so no further stop-word check
# is needed while building the features.
# NOTE(review): each source is still encoded with its own vocabulary, exactly
# as before; the merged data_tokens vocabulary is not used here - confirm
# whether a shared feature space was intended.
amazon_train = build_feature_sets(data_amazon, amazon_tokens)
imdb_train = build_feature_sets(data_imdb, imdb_tokens)
yelp_train = build_feature_sets(data_yelp, yelp_tokens)

In [None]:
# merge the three per-source training lists into one bigger data set
data_train = [*amazon_train, *imdb_train, *yelp_train]

In [None]:
# FUNCTION to save the data_train to a pickle file
def save_data_train(data_train, file_name):
    """Pickle data_train into file_name using the highest available protocol.

    The file is opened in binary write mode, so an existing file with the
    same name is overwritten.
    """
    with open(file_name, 'wb') as out_file:
        pickler = pickle.Pickler(out_file, protocol=pickle.HIGHEST_PROTOCOL)
        pickler.dump(data_train)


In [None]:
# save data_train to a pickle file (it is read back with load_data_train below)
save_data_train(data_train,'data_train.pickle')

In [None]:
# FUNCTION to load the data_train from the pickle file
def load_data_train(file_name):
    """Return the object stored in the pickle file file_name.

    Only use this on pickle files you trust: unpickling can execute
    arbitrary code.
    """
    with open(file_name, 'rb') as in_file:
        return pickle.Unpickler(in_file).load()

In [None]:
# read the previously saved training data back from its pickle file
data_train = load_data_train(file_name='data_train.pickle')

In [None]:
# Shuffle data_train in place so the train/test split below does not follow
# the order in which the sources were merged (amazon, imdb, yelp).
# A dedicated generator with a fixed seed makes the shuffle - and therefore
# the split and the reported accuracy - identical on every fresh run.
# Re-running only this cell shuffles the already shuffled list again.
SHUFFLE_SEED = 42
random.Random(SHUFFLE_SEED).shuffle(data_train)

In [None]:
# split data_train: the first 80% is used for training, the rest for testing
split_index = int(len(data_train)*0.8)
data_train_x, data_test_x = data_train[:split_index], data_train[split_index:]

In [None]:
# create model using NaiveBayesClassifier
# Only the training slice (data_train_x) is used here; data_test_x is kept
# aside for the accuracy check.
model = NaiveBayesClassifier.train(data_train_x)
# print the features the trained classifier reports as most informative
model.show_most_informative_features()

In [None]:
# compare the accuracy of the model to the test data
# (data_test_x is the slice of data_train that was not used for training)
acc = classify.accuracy(model, data_test_x)
print("Accuracy:", acc)

In [None]:
# FUNCTION to save the model
def save_model(model, filename):
    """Pickle model into filename (default pickle protocol).

    The file is opened in binary write mode, so an existing file with the
    same name is overwritten.
    """
    with open(filename, 'wb') as model_file:
        pickle.Pickler(model_file).dump(model)

In [None]:
# persist the trained model to disk
save_model(model, filename='model.pickle')


In [2]:
# FUNCTION to load the model from disk
def load_model(filename):
    """Return the object stored in the pickle file filename.

    Only use this on pickle files you trust: unpickling can execute
    arbitrary code.
    """
    with open(filename, 'rb') as model_file:
        loaded = pickle.Unpickler(model_file).load()
    return loaded

In [5]:
# restore the saved model from its pickle file
newmodel = load_model(filename='model.pickle')


In [6]:
# FUNCTION to predict the sentiment of a sentence
def predict(sentence):
    """Classify sentence with the loaded model (newmodel) and return its label.

    The sentence is lower-cased and tokenized; every token is passed to the
    classifier as a feature set to True.
    """
    tokens = word_tokenize(sentence.lower())
    features = {token: True for token in tokens}
    return newmodel.classify(features)

In [7]:
# try the model on one positive and one negative example sentence
test_sentence = "I love this movie"
test_sentence2 = "I hate this movie"

for sample_sentence in (test_sentence, test_sentence2):
    print(predict(sample_sentence))


1
0
