In [1]:
# Load the Kedro IPython extension. The next cell uses a `catalog` object that is
# never defined in this notebook -- presumably it is injected by this extension (confirm).
%load_ext kedro

In [2]:
# Load the 'feature_data' dataset through the catalog and print its column/dtype summary.
df = catalog.load('feature_data')
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 25000 entries, 0 to 24999
Data columns (total 6 columns):
 #   Column             Non-Null Count  Dtype 
---  ------             --------------  ----- 
 0   rating             25000 non-null  int64 
 1   clean_review       25000 non-null  object
 2   sentences          25000 non-null  object
 3   words              25000 non-null  object
 4   lemmatized_review  25000 non-null  object
 5   pos_senti          25000 non-null  int64 
dtypes: int64(2), object(4)
memory usage: 1.1+ MB


In [3]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.model_selection import train_test_split
import nltk
from nltk.corpus import stopwords
from multiprocessing import Pool, cpu_count
from sklearn.feature_extraction.text import CountVectorizer,TfidfVectorizer
from sklearn.linear_model import LogisticRegression
from scipy.sparse import hstack
from sklearn.preprocessing import StandardScaler
from sklearn.metrics import accuracy_score, recall_score, precision_score, roc_auc_score, f1_score, confusion_matrix

In [4]:
# Target is the binary `pos_senti` column; every other column stays in the feature frame.
# NOTE(review): `rating` remains in X and is used as a model input further down. If
# `pos_senti` is derived from `rating`, that is label leakage -- confirm upstream.
y = df['pos_senti']
X = df.drop(columns='pos_senti')

# 80/20 hold-out split with a fixed seed so the partition is reproducible.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=42,
)

In [5]:
# Fetch the NLTK 'stopwords' corpus that stopwords.words('english') reads in the next cell.
nltk.download('stopwords')

[nltk_data] Downloading package stopwords to /home/zishan/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


True

In [6]:
# Negation tokens are taken out of the stopword set, so remove_stopwords() leaves
# them in the review text.
negations = {'no', 'not', 'nor', "n't"}
stop_words = set(stopwords.words('english')).difference(negations)

In [7]:
def remove_stopwords(words, stopword_set=None):
    """Tokenize a review string on whitespace and drop stopwords.

    Parameters
    ----------
    words : str
        The review text (despite the name, this is a single string, not a list).
    stopword_set : collection of str, optional
        Tokens to discard. Defaults to the notebook-level ``stop_words`` set, so
        existing one-argument calls (e.g. ``Pool.map``) behave as before.

    Returns
    -------
    list of str
        The remaining tokens, in their original order.
    """
    if stopword_set is None:
        stopword_set = stop_words
    # split() with no separator splits on any run of whitespace, so repeated
    # spaces no longer produce empty-string tokens and a stopword separated from
    # its neighbour by a tab or newline is still recognised and removed.
    return [token for token in words.split() if token not in stopword_set]

In [10]:
# Strip stopwords from every training review in parallel, one worker per CPU core.
# Pool.map returns results in input order, so the list lines up row-for-row with X_train.
with Pool(processes=cpu_count()) as pool:
    train_tokens = pool.map(remove_stopwords, X_train['lemmatized_review'])

X_train['main_words'] = train_tokens

In [11]:
# Re-join each token list into one space-separated string, the input form the vectorizer expects.
main_words = X_train['main_words'].map(" ".join)

# Bag-of-words counts, vocabulary fitted on the training reviews only.
cnt_vect = CountVectorizer(
    max_features=5000,
    min_df=2,
    max_df=0.95,
)
cnt_mat = cnt_vect.fit_transform(main_words)

In [10]:
# TF-IDF features over the same training text, with the same vocabulary limits
# as the CountVectorizer cell.
tfidf_vect = TfidfVectorizer(max_features=5000, min_df=2, max_df=0.95)
tfidf_mat = tfidf_vect.fit_transform(main_words)
# Backward-compatible alias: this matrix was originally bound to the misspelled
# name `tfdif_mat`; keep it so any cell still using the old spelling keeps working.
tfdif_mat = tfidf_mat

In [11]:
# Sparse DataFrame view of the count matrix, one column per vocabulary term.
feature_names = cnt_vect.get_feature_names_out()
cnt_df = pd.DataFrame.sparse.from_spmatrix(cnt_mat, columns=feature_names)

# Standardize the rating as a single-column 2-D array; the scaler is fitted on
# the training split here.
scaler = StandardScaler()
rating_column = X_train['rating'].values.reshape(-1, 1)
rating_scaled = scaler.fit_transform(rating_column)

# Final training matrix: word counts with the scaled rating appended as the last column.
X_train_combined = hstack([cnt_mat, rating_scaled])

In [12]:
# Logistic regression on the combined matrix (word counts + standardized rating).
# max_iter is set to 1000; every other hyperparameter is left at the library default.
model = LogisticRegression(max_iter=1000)
model.fit(X_train_combined, y_train)

0,1,2
,penalty,'l2'
,dual,False
,tol,0.0001
,C,1.0
,fit_intercept,True
,intercept_scaling,1
,class_weight,
,random_state,
,solver,'lbfgs'
,max_iter,1000


In [15]:
# Build the held-out feature matrix using only transformers fitted on the training split.
with Pool(cpu_count()) as p:
    X_test['main_words'] = p.map(remove_stopwords, X_test['lemmatized_review'])
main_words_test = X_test['main_words'].apply(lambda x: " ".join(x))

# transform (not fit_transform): reuse the vocabulary learned from the training reviews.
cnt_mat_test = cnt_vect.transform(main_words_test)

# Reuse the StandardScaler that was fitted on the training ratings. Fitting a new
# scaler on the test ratings would standardize them with the test split's own
# mean/variance, so the rating column would be on a different scale from the one
# the model was trained on, and test-set statistics would leak into preprocessing.
rating_scaled_test = scaler.transform(X_test['rating'].values.reshape(-1, 1))

# Same column layout as X_train_combined: word counts followed by the scaled rating.
X_test_combined = hstack([cnt_mat_test, rating_scaled_test])


In [16]:
# Predict labels for the held-out matrix (word counts + rating).
# NOTE(review): no metric is computed from these predictions; a later cell rebinds
# `y_pred` with the "without rating" results.
y_pred = model.predict(X_test_combined)

In [8]:
# Bag-of-words baseline that ignores the rating column.
# This cell rebinds `cnt_vect`, `model`, `main_words_test`, `cnt_mat_test` and
# `y_pred`, replacing the objects created by the with-rating cells above.

# --- Training: stopword removal -> count features -> logistic regression ---
with Pool(processes=cpu_count()) as pool:
    train_main_words_list = pool.map(remove_stopwords, X_train['lemmatized_review'])
main_words_train = list(map(" ".join, train_main_words_list))

cnt_vect = CountVectorizer(
    max_features=5000,
    min_df=2,
    max_df=0.95,
)
cnt_mat_train = cnt_vect.fit_transform(main_words_train)

model = LogisticRegression(max_iter=1000)
model.fit(cnt_mat_train, y_train)

# --- Testing: same preprocessing, vocabulary reused from the training fit ---
with Pool(processes=cpu_count()) as pool:
    test_main_words_list = pool.map(remove_stopwords, X_test['lemmatized_review'])
main_words_test = list(map(" ".join, test_main_words_list))
cnt_mat_test = cnt_vect.transform(main_words_test)

y_pred = model.predict(cnt_mat_test)
bow_accuracy = accuracy_score(y_test, y_pred)
print(f"Accuracy without rating: {bow_accuracy:.4f}")

Accuracy without rating: 0.8438


In [9]:

# TF-IDF variant of the text-only model (the rating column is not used).
# This cell rebinds `tfidf_vect`, `model`, `main_words_train`, `main_words_test`
# and `y_pred`.

# --- Training: stopword removal -> TF-IDF features -> logistic regression ---
with Pool(processes=cpu_count()) as pool:
    train_main_words_list = pool.map(remove_stopwords, X_train['lemmatized_review'])
main_words_train = list(map(" ".join, train_main_words_list))

tfidf_vect = TfidfVectorizer(
    max_features=5000,
    min_df=2,
    max_df=0.95,
)
tfidf_mat_train = tfidf_vect.fit_transform(main_words_train)

model = LogisticRegression(max_iter=1000)
model.fit(tfidf_mat_train, y_train)

# --- Testing: without rating, using the TF-IDF vocabulary/weights fitted above ---
with Pool(processes=cpu_count()) as pool:
    test_main_words_list = pool.map(remove_stopwords, X_test['lemmatized_review'])
main_words_test = list(map(" ".join, test_main_words_list))
tfidf_mat_test = tfidf_vect.transform(main_words_test)

y_pred = model.predict(tfidf_mat_test)
tfidf_accuracy = accuracy_score(y_test, y_pred)
print(f"TF-IDF Accuracy without rating: {tfidf_accuracy:.4f}")

TF-IDF Accuracy without rating: 0.8798
