In [4]:
import pandas as pd
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.naive_bayes import MultinomialNB
from sklearn.model_selection import train_test_split
from sklearn.model_selection import StratifiedShuffleSplit
import re
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
from nltk.stem import PorterStemmer
from sklearn.metrics import accuracy_score, classification_report
import pickle

In [6]:
# train.csv is read without a header row, so the three columns are labelled explicitly.
# NOTE(review): judging by the preview below, 'Product' seems to hold the review
# title rather than a product name — confirm before relying on the column name.
df = pd.read_csv('train.csv', header=None).set_axis(
    ['Polarity', 'Product', 'Review'], axis=1
)

In [None]:
# Preview the raw training frame (the rendered output is truncated by pandas).
df

Unnamed: 0,Polarity,Product,Review
0,2,Stuning even for the non-gamer,This sound track was beautiful! It paints the ...
1,2,The best soundtrack ever to anything.,I'm reading a lot of reviews saying that this ...
2,2,Amazing!,This soundtrack is my favorite music of all ti...
3,2,Excellent Soundtrack,I truly like this soundtrack and I enjoy video...
4,2,"Remember, Pull Your Jaw Off The Floor After He...","If you've played the game, you know how divine..."
...,...,...,...
3599995,1,Don't do it!!,The high chair looks great when it first comes...
3599996,1,"Looks nice, low functionality",I have used this highchair for 2 kids now and ...
3599997,1,"compact, but hard to clean","We have a small house, and really wanted two o..."
3599998,1,what is it saying?,not sure what this book is supposed to be. It ...


In [7]:
sample_proportion = 0.03  # Fraction of the training rows to keep (3%)
sample_size = int(sample_proportion * len(df))  # Absolute row count, must be an int

# One stratified split whose "test" side is used as the sample, so the
# Polarity class balance of the full frame is preserved in the sample.
sss = StratifiedShuffleSplit(n_splits=1, test_size=sample_size, random_state=42)

# n_splits=1, so the generator yields exactly one (train, test) pair of
# positional indices; take it directly instead of looping.
train_index, test_index = next(sss.split(df, df["Polarity"]))

# .copy() makes the sample an independent frame, so later column assignments
# (see pre_processor) neither trigger SettingWithCopyWarning nor depend on df.
sampled_data = df.iloc[test_index].copy()

sampled_data

Unnamed: 0,Polarity,Product,Review
3505797,1,Definitely NOT for anyone over age 17,I thought and I read reviews that said that th...
1266595,1,This one should be pulled from the shelves!,I can't believe this version of this album sti...
347938,2,"some will enjoy, others may dose off",I watched this with my fiance and a group of f...
3558103,1,IT BURNED MY HAIR OFF,I have the silver version of this exact blowdr...
1735684,1,difficult to use,the microfiber mop head drags across the wood ...
...,...,...,...
1223910,2,Lion's Lady,For any women who love a good romance novel. L...
1399421,1,No happy with this salt server,I put this on my wish list and luckily someone...
2765017,1,Did not get the right product,I did not receive the right product and had to...
2190214,2,You will buy this cd...you will buy this cd..I...,A super effort by Mr. Suhler and Monkey Beat. ...


In [8]:
def polarity_optimisation(num):
    """Map a raw polarity label onto the binary target used by the model.

    The raw label 2 becomes 0; every other value becomes 1.
    """
    return 0 if num == 2 else 1

In [9]:
def pre_processor(corpus):
    """Return a cleaned copy of ``corpus`` ready for vectorisation.

    Parameters
    ----------
    corpus : pandas.DataFrame
        Must contain a 'Polarity' column (raw labels) and a 'Review' column
        (raw review text). Any other columns are passed through unchanged.

    Returns
    -------
    pandas.DataFrame
        A new frame in which 'Polarity' has been remapped with
        ``polarity_optimisation`` and 'Review' has been stripped, lower-cased,
        reduced to letters and whitespace, tokenised, filtered of English
        stop words and Porter-stemmed. The input frame is not modified.
    """
    # Work on a copy: the caller's frame stays intact, and no write ever
    # lands on a slice of another frame (no SettingWithCopyWarning).
    cleaned = corpus.copy()

    # Remap the raw labels (2 -> 0, everything else -> 1).
    cleaned['Polarity'] = cleaned['Polarity'].apply(polarity_optimisation)

    # Built once and shared by every row.
    non_letters = re.compile(r'[^a-zA-Z\s]')
    stop_words = set(stopwords.words('english'))
    stemmer = PorterStemmer()

    def _clean_review(text):
        # A missing review arrives as NaN (a float); treat it as empty text
        # instead of failing on .strip().
        if not isinstance(text, str):
            text = '' if pd.isna(text) else str(text)

        # Strip, lower-case, then drop everything except letters/whitespace.
        text = non_letters.sub('', text.strip().lower())

        # Tokenise once; stop-word removal and stemming share the same pass.
        return ' '.join(
            stemmer.stem(token)
            for token in word_tokenize(text)
            if token not in stop_words
        )

    cleaned['Review'] = cleaned['Review'].apply(_clean_review)

    return cleaned

In [10]:
# Clean the stratified sample; the resulting frame is what the model is trained on.
processed_dataset = pre_processor(sampled_data)
processed_dataset

Unnamed: 0,Polarity,Product,Review
3505797,1,Definitely NOT for anyone over age 17,thought read review said moistur matur woman d...
1266595,1,This one should be pulled from the shelves!,cant believ version album still exist capit re...
347938,0,"some will enjoy, others may dose off",watch fianc group friend film gener quit laugh...
3558103,1,IT BURNED MY HAIR OFF,silver version exact blowdryer year alway use ...
1735684,1,difficult to use,microfib mop head drag across wood floor oppos...
...,...,...,...
1223910,0,Lion's Lady,women love good romanc novel lion ladi one bes...
1399421,1,No happy with this salt server,put wish list luckili someon purchas dont want...
2765017,1,Did not get the right product,receiv right product return happi wrong item o...
2190214,0,You will buy this cd...you will buy this cd..I...,super effort mr suhler monkey beat eclect cd s...


In [11]:
# Targets: the remapped Polarity column of the cleaned sample
y_train = processed_dataset['Polarity']

# TF-IDF over unigrams and bigrams, with the vocabulary limited to 5000 features;
# the vocabulary is learned from the cleaned training reviews only
vectorizer = TfidfVectorizer(ngram_range=(1, 2), max_features=5000)
X_train = vectorizer.fit_transform(processed_dataset['Review'])

# Fit a multinomial Naive Bayes classifier on the TF-IDF matrix
model = MultinomialNB()
model.fit(X_train, y_train)


In [12]:
# Load the first 40000 rows of the header-less test file, label its columns
# with the same names as the training frame, and keep only the label and the
# review text under a fresh 0..n-1 index.
test_corpus = (
    pd.read_csv('test.csv', header=None, nrows=40000)
    .set_axis(['Polarity', 'Product', 'Review'], axis=1)
    .loc[:, ['Polarity', 'Review']]
    .reset_index(drop=True)
)

In [13]:
# Apply the same cleaning used for the training sample to the test reviews.
# pre_processor already remaps Polarity via polarity_optimisation, so no
# separate remapping step is needed here.
# NOTE(review): the result is assigned back to test_corpus, so running this
# cell a second time would remap the labels again (2 -> 0 on the first run,
# then 0 -> 1 on the second) — reload test_corpus before re-running.
test_corpus = pre_processor(test_corpus)

In [14]:
# Spot-check the first ten cleaned test rows (remapped label + stemmed text).
test_corpus.head(10)

Unnamed: 0,Polarity,Review
0,0,love pat one great voic gener listen cd year s...
1,0,despit fact play small portion game music hear...
2,1,bought charger jul work ok design nice conveni...
3,0,check maha energi websit powerex mhcf charger ...
4,0,review quit bit combo player hesit due unfavor...
5,1,also began incorrect disc problem ive read vcr...
6,1,love style coupl year dvd give problem doesnt ...
7,1,scroll dvd menu set vertic triangl key select ...
8,0,exot tale orient dr shen fu weird tale magazin...
9,1,firstlyi enjoy format tone book author address...


In [15]:
# Polarity == 0 marks the reviews whose raw label was 2 (see polarity_optimisation).
positive_test_data = test_corpus.loc[test_corpus["Polarity"].eq(0)]
positive_test_data.head(10)

Unnamed: 0,Polarity,Review
0,0,love pat one great voic gener listen cd year s...
1,0,despit fact play small portion game music hear...
3,0,check maha energi websit powerex mhcf charger ...
4,0,review quit bit combo player hesit due unfavor...
8,0,exot tale orient dr shen fu weird tale magazin...
10,0,current live europ book recommend visitor cove...
13,0,use product coupl year start use hair gotten d...
16,0,awesom game almost everyon know tictacto easi ...
17,0,price pc game well worth great graphic color l...
18,0,est libro est esplndido lo disfruta lo pued us...


In [16]:
# Polarity == 1 marks every review whose raw label was not 2 (see polarity_optimisation).
negative_test_data = test_corpus.loc[test_corpus["Polarity"].eq(1)]
negative_test_data.head(10)

Unnamed: 0,Polarity,Review
2,1,bought charger jul work ok design nice conveni...
5,1,also began incorrect disc problem ive read vcr...
6,1,love style coupl year dvd give problem doesnt ...
7,1,scroll dvd menu set vertic triangl key select ...
9,1,firstlyi enjoy format tone book author address...
11,1,want listen el duke better access showerthi gi...
12,1,game requir quicktim workif better version qui...
14,1,hope drive would run bu power requir adapt act...
15,1,first compani took money sent email tell produ...
20,1,ive read book much expect bore book


In [17]:
# Project the cleaned test reviews onto the already-fitted vectorizer
# (transform only — the vectorizer is not refitted on test data).
X_test = vectorizer.transform(test_corpus['Review'])

In [18]:
# Ground-truth labels and the model's predictions for the same test rows.
y_test = test_corpus['Polarity']
y_pred = model.predict(X_test)

In [19]:
# Overall accuracy followed by per-class precision/recall/F1.
# In this notebook's encoding class 0 is the positive subset and class 1 the negative one.
print("Accuracy:", accuracy_score(y_test, y_pred))
print("\nClassification Report:\n", classification_report(y_test, y_pred))

Accuracy: 0.828325

Classification Report:
               precision    recall  f1-score   support

           0       0.82      0.85      0.83     20359
           1       0.84      0.81      0.82     19641

    accuracy                           0.83     40000
   macro avg       0.83      0.83      0.83     40000
weighted avg       0.83      0.83      0.83     40000



In [21]:
def sentiment_analysis(text):
  """Print the predicted sentiment ("Positive" or "Negative") for one review.

  Parameters
  ----------
  text : str
      Raw review text. It is cleaned with ``pre_processor`` before being
      vectorised, so it goes through the same steps as the training reviews.

  Returns
  -------
  None
      The label is printed, not returned.
  """
  # pre_processor expects a frame with both columns; the Polarity value is
  # only a placeholder here and is never read back.
  single_review = pd.DataFrame({'Polarity': [2], 'Review': [text]})
  cleaned_text = pre_processor(single_review)['Review'].iloc[0]

  new_features = vectorizer.transform([cleaned_text])
  prediction = model.predict(new_features)[0]

  # The model is trained on the remapped integer labels: 0 is the positive
  # class (raw label 2), 1 is the negative class.
  if prediction == 0:
    print("Positive")
  else:
    print("Negative")

In [22]:
# Manual check with a short, clearly negative phrase.
sentiment_analysis("terrible product")

Negative


In [23]:
# Manual check with an informal phrase that contains punctuation and a digit.
sentiment_analysis("cool, i bough it for 3 times!")

Negative


In [None]:
# Persist the fitted vectorizer and classifier so they can be loaded together
# later. The `with` blocks close each file even if pickling raises.
# The ../resource/output/ directory must already exist.
with open("../resource/output/vectorizer.pkl", "wb") as pickle_out:
    pickle.dump(vectorizer, pickle_out)

with open("../resource/output/bayesian.pkl", "wb") as pickle_out:
    pickle.dump(model, pickle_out)