<div style="background-color: lightseagreen; color: black; padding: 4px; text-align : center">
    <h3>Model Code
</h3> </div>

**This notebook contains only the model code with no explanations or markdowns. The last cell can be run to print the classification report to see the model performance if needed.** 

In [1]:
#Importing necessary libraries

import numpy as np #for numerical computing
import pandas as pd #for data handling and manipulation
import matplotlib.pyplot as plt #for data visualization
import seaborn as sns #for data visualization
import nltk #natural language tool kit
from nltk.corpus import stopwords #stopwords
from nltk.tokenize import word_tokenize #tokenizer
import string #convert to lowercase
from nltk.stem import WordNetLemmatizer #lemmatizer
from nltk.corpus import wordnet
from nltk import pos_tag #to assign POS tags to words
from sklearn.feature_extraction.text import TfidfVectorizer #to convert text to matrix of features
from sklearn.model_selection import train_test_split #to split the data into training and testing sets for model evaluation 
from sklearn.linear_model import LogisticRegression #to built the model
from sklearn.metrics import classification_report, confusion_matrix, ConfusionMatrixDisplay #model evaluation and visualization
nltk.download("punkt_tab") #for tokenization
nltk.download("stopwords") #download the stop words list
nltk.download("averaged_perceptron_tagger_eng") #for pos tagging
nltk.download("wordnet") #for lexical database
nltk.download("omw-1.4") #extended support for lemmatization

#Dataset loading
df = pd.read_csv('amazon_alexa.tsv', sep='\t')

#Lowercasing
df["lowercase_reviews"] = [str(review).lower() for review in df["verified_reviews"]]

#Tokenization
df["tokens"] = [word_tokenize(review) for review in df["lowercase_reviews"]]
df["tokens_no_punct"] = [[word for word in tokens if word not in string.punctuation] for tokens in df["tokens"]]

#Stop word removal
stop_words=set(stopwords.words("english"))
df["no_stopwords_token"] = [[word for word in tokens if word not in stop_words] for tokens in df["tokens_no_punct"]]

#Lemmatization
lemmatizer = WordNetLemmatizer()
def get_wordnet_pos(word):
    tag = pos_tag([word])[0][1][0].upper()
    tag_dict ={"J": wordnet.ADJ, "N": wordnet.NOUN, "V": wordnet.VERB, "R": wordnet.ADV}
    return tag_dict.get(tag, wordnet.NOUN)
df["final_lemmatized_clean_tokens"] = [[lemmatizer.lemmatize(word, get_wordnet_pos(word)) for word in tokens] for tokens in df["no_stopwords_token"]]

#Sentiment Labeling
df["sentiment"] =["Positive" if rating >=3 else "Negative" for rating in df["rating"]]

#Creating a string from tokens
df["final_clean_text"] = [" ".join(tokens) for tokens in df ["final_lemmatized_clean_tokens"]]

#Converting text to numerical values 
tfidf = TfidfVectorizer()
X = tfidf.fit_transform(df["final_clean_text"])

#Assigning y to the sentiment column
y = df["sentiment"]#assigning y to the sentiment column

#Splitting dataset to train and test set, with test set being 20%
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size = 0.2, random_state = 35)

#Logistic Regression model
model = LogisticRegression(class_weight= {'Negative': 10, 'Positive': 1})
model.fit(X_train, y_train)#fitting the model

#Predicting on test data
y_pred = model.predict(X_test)

[nltk_data] Downloading package punkt_tab to
[nltk_data]     C:\Users\suyas\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt_tab is already up-to-date!
[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\suyas\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package averaged_perceptron_tagger_eng to
[nltk_data]     C:\Users\suyas\AppData\Roaming\nltk_data...
[nltk_data]   Package averaged_perceptron_tagger_eng is already up-to-
[nltk_data]       date!
[nltk_data] Downloading package wordnet to
[nltk_data]     C:\Users\suyas\AppData\Roaming\nltk_data...
[nltk_data]   Package wordnet is already up-to-date!
[nltk_data] Downloading package omw-1.4 to
[nltk_data]     C:\Users\suyas\AppData\Roaming\nltk_data...
[nltk_data]   Package omw-1.4 is already up-to-date!


In [2]:
print(classification_report(y_test,y_pred))#printing evaluation metrics

              precision    recall  f1-score   support

    Negative       0.43      0.80      0.56        50
    Positive       0.98      0.91      0.94       580

    accuracy                           0.90       630
   macro avg       0.71      0.86      0.75       630
weighted avg       0.94      0.90      0.91       630



<div style="background-color: lightseagreen; color: black; padding: 4px; text-align: center">
    <h3>Thank you!
</h3> </div>