In [1]:
import pandas as pd
import glob
import os
import random

# Read pickle source data file and convert to dataframe

In [4]:
# Specify the file path for the pickle file (a bare filename, so it is resolved
# relative to the kernel's current working directory)
pickle_file_path = 'merged_df.pkl'

# Read the pickle file and convert it to a DataFrame
# NOTE(review): unpickling can execute arbitrary code — only load files from a trusted source
merged_df = pd.read_pickle(pickle_file_path)

# Train and Test split

In [5]:
from sklearn.model_selection import train_test_split

# Hold out 30% of the rows for testing; the fixed seed makes the split repeatable
train_df, test_df = train_test_split(
    merged_df,
    test_size=0.3,
    random_state=50,
)

# Report the dimensions of each partition
for split_name, split_frame in (("Train", train_df), ("Test", test_df)):
    print(f"{split_name} set shape: {split_frame.shape}")

Train set shape: (188804, 8)
Test set shape: (80916, 8)


# Naive Bayes Classifier for Sentiment Analysis

In [6]:
import numpy as np
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.naive_bayes import MultinomialNB
from sklearn.metrics import accuracy_score

# Target labels for the training rows
y_train = train_df['rating_sentiment'].values

# Learn the vocabulary from the training reviews and turn them into a
# token-count matrix in one step
vectorizer = CountVectorizer()
X_train = vectorizer.fit_transform(train_df['review'].values)

# Fit a multinomial Naive Bayes model on the count features
# (fit returns the estimator itself, so it can be chained onto construction)
nb_classifier = MultinomialNB().fit(X_train, y_train)

# Score the model on the same data it was trained on
train_predictions = nb_classifier.predict(X_train)
train_accuracy = accuracy_score(y_train, train_predictions)
print(f"Train Accuracy: {train_accuracy}")


Train Accuracy: 0.8053007351539162


# Test Accuracy

In [7]:
# Encode the held-out reviews with the vocabulary already learned from the
# training set (transform only — the vectorizer is not refit here)
X_test = vectorizer.transform(test_df['review'].values)
y_test = test_df['rating_sentiment'].values

# Predict on the held-out rows and measure accuracy against their labels
test_predictions = nb_classifier.predict(X_test)
test_accuracy = accuracy_score(y_test, test_predictions)
print(f"Test Accuracy: {test_accuracy}")


Test Accuracy: 0.7865935043749073


# Testing a sample review

In [8]:
# Sample review fed to the classifier below.
review = "21 was such an exciting movie to just sit down and watch. The whole entire movie was very intense. Especially the last 10 minutes of this, it really keeps you on the edge of your seat. This movie had some great twists that you do not see coming at all. Especially one of the twists at the end that you will be shocked from. I loved the main actors role in this movie. I thought that Jim Sturgess did such an amazing job playing the main role. He had a whole lot of stuff going through his mind in this movie the whole time. I thought that if they tried they would not have of found a better actor to play his role. Kevin Spacey also handed in a great performance as well. Kate Bosworth was looking very hot, and also did a great job. I loved how this movie was really based on a true story. I always find movies that are based on true stories good. This one was especially good though. It sort of teaches you a little bit about black jack as the movie goes on. This movie is well worth your money and time. It's good this movie was 2 hours long it made all the better. The thing this movie is trying to show you is that you get a second chance. It really shows the main actor in it that you don't know how good you have it until you loose it all. The people in this movie get way ahead of themselves and want to do to much. Overall this was a great movie. It was well worth my time and I would love to see it plenty of more times. So just give this movie a chance and go see it. You will most likely enjoy it. I think that any ages above 13 would enjoy this. You will especially like this movie if you like to gamble or play cards."
# Alternative sample review, kept commented out: swap it with the assignment above to try it.
# (The whole literal stays on one commented line so that no part of it is parsed as code.)
# review = "Before making a movie about blackjack and card counting it would have been a good idea to read some of hundreds of books on this subject available in any major bookstore. That would have prevented the creators of this movie to look like a bunch of ignorant fools totally lacking even general knowledge of the game of blackjack, card counting and casino's countermeasures. Here is why:<br/><br/>1. Nowadays blackjack is played with 4-6-8 deck shoes cut in the middle (or 2/3 at best), which makes the player's edge (if any) so small, that making any profit is mathematically possible only in the long run. It means that no matter how favourable the count is, you chances of winning a particular hand are increased by such a small percentage, that before making any profit you may be losing many hands and even suffer substantial financial losses. Only if you are ready for losing streaks in the process and patient, and if you don't make mistakes with count and basic strategy, you may be winning in the long run. Coming to Vegas for a weekend and making fortune by winning all the time is an absolute nonsense.<br/><br/>2. To prevent card counters from making money casino's security personnel do not abduct them in the middle of a crowded casino, torture them in the back office and take away their winnings - casinos simply ban card counters. Casinos in the US are legally private clubs: they don't charge admission fee and it's up to them to decide who will be allowed to enter the club. Since card counting is not illegal (it's just a skill), nor casino security, nor law enforcement can arrest anybody for it without serious consequences like losing casino license and possibly serving jail time. Not all casinos ban card counters, but those which don't changed blackjack rules to the degree that card counting would not overcome house edge.<br/><br/>3. Besides blackjack related issues a person who keeps more than 300 grand cash in his dorm room can only be seen as a complete idiot. Obviously depositing this money in a bank account in the US could have caused problems with IRS, money laundering regulations, etc., but renting a bank safe deposit box could have solved the problem altogether. The image of a brilliant MIT student acting like a retard doesn't make any sense and makes me question the intellectual level of the screenplay writers.<br/><br/>Conclusion: stay away from this movie - don't degrade yourself by watching it."
# Vectorize the raw review text with the already-fitted vectorizer.
# transform() takes an iterable of documents, hence the one-element list.
vectorized_review = vectorizer.transform([review])

# Make sentiment prediction; predict() returns an array with one label per input document
sentiment_prediction = nb_classifier.predict(vectorized_review)

# Print the predicted sentiment.
# Index the single predicted label rather than taking the truth value of the
# whole array: `array == 1` is only a valid condition for exactly one element.
# NOTE(review): assumes label 1 encodes positive sentiment — confirm against the
# 'rating_sentiment' column.
if sentiment_prediction[0] == 1:
    print("Positive sentiment")
else:
    print("Negative sentiment")


Positive sentiment
