In [5]:
import torchtext
import torch
import torch.nn as nn
from torch.autograd import Variable
import matplotlib.pyplot as plt
import numpy as np
from torchtext.vocab import Vectors
from tqdm import tqdm_notebook
import pandas as pd

In [6]:
train_df = pd.read_csv("/kaggle/input/sst-2-dataset/train.csv")
validation_df = pd.read_csv("/kaggle/input/sst-2-dataset/validation.csv")

train_df.drop(columns = "idx")
validation_df.drop(columns = "idx")


test_df = pd.read_csv("/kaggle/input/imdb-dataset-of-50k-movie-reviews/IMDB Dataset.csv")
test_df.rename(columns={'review': 'sentence', 'sentiment': 'label'}, inplace=True)
test_df['label'] = test_df['label'].map({'negative': 0, 'positive': 1})
output_path = "/kaggle/working/modified_imdb_dataset.csv"
test_df.to_csv(output_path, index=False)

In [7]:
train_df["sentence"] = train_df["sentence"].str.lower()
validation_df["sentence"] = validation_df["sentence"].str.lower()
test_df["sentence"] = test_df["sentence"].str.lower()

In [8]:
import string
PUNCT_TO_REMOVE = string.punctuation
def remove_punctuation(text):
    """custom function to remove the punctuation"""
    return text.translate(str.maketrans('', '', PUNCT_TO_REMOVE))

train_df["text_wo_punct"] = train_df["sentence"].apply(lambda text: remove_punctuation(text))
validation_df["text_wo_punct"] = validation_df["sentence"].apply(lambda text: remove_punctuation(text))
test_df["text_wo_punct"] = test_df["sentence"].apply(lambda text: remove_punctuation(text))

In [9]:
import nltk

# Ensure you have downloaded the stopwords dataset
nltk.download('stopwords')


[nltk_data] Downloading package stopwords to /usr/share/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


True

In [10]:
from nltk.corpus import stopwords

", ".join(stopwords.words('english'))


"i, me, my, myself, we, our, ours, ourselves, you, you're, you've, you'll, you'd, your, yours, yourself, yourselves, he, him, his, himself, she, she's, her, hers, herself, it, it's, its, itself, they, them, their, theirs, themselves, what, which, who, whom, this, that, that'll, these, those, am, is, are, was, were, be, been, being, have, has, had, having, do, does, did, doing, a, an, the, and, but, if, or, because, as, until, while, of, at, by, for, with, about, against, between, into, through, during, before, after, above, below, to, from, up, down, in, out, on, off, over, under, again, further, then, once, here, there, when, where, why, how, all, any, both, each, few, more, most, other, some, such, no, nor, not, only, own, same, so, than, too, very, s, t, can, will, just, don, don't, should, should've, now, d, ll, m, o, re, ve, y, ain, aren, aren't, couldn, couldn't, didn, didn't, doesn, doesn't, hadn, hadn't, hasn, hasn't, haven, haven't, isn, isn't, ma, mightn, mightn't, mustn, mus

In [11]:
STOPWORDS = set(stopwords.words('english'))
def remove_stopwords(text):
    """custom function to remove the stopwords"""
    return " ".join([word for word in str(text).split() if word not in STOPWORDS])

train_df["text_wo_stop"] = train_df["text_wo_punct"].apply(lambda text: remove_stopwords(text))
validation_df["text_wo_stop"] = validation_df["text_wo_punct"].apply(lambda text: remove_stopwords(text))
test_df["text_wo_stop"] = test_df["text_wo_punct"].apply(lambda text: remove_stopwords(text))

In [12]:
from nltk.stem.porter import PorterStemmer

stemmer = PorterStemmer()
def stem_words(text):
    return " ".join([stemmer.stem(word) for word in text.split()])

train_df["text_stemmed"] = train_df["text_wo_stop"].apply(lambda text: stem_words(text))
validation_df["text_stemmed"] = validation_df["text_wo_stop"].apply(lambda text: stem_words(text))
test_df["text_stemmed"] = test_df["text_wo_stop"].apply(lambda text: stem_words(text))

In [13]:
import pandas as pd
import nltk
from nltk.tokenize import word_tokenize


# Tokenize sentences
nltk.download('punkt')
train_df['tokens'] = train_df['text_stemmed'].apply(lambda x: nltk.word_tokenize(x))

# Create a word index (manual embedding)
word_set = set()
for tokens in train_df['tokens']:
    word_set.update(tokens)

word_index = {word: i+1 for i, word in enumerate(word_set)}  # Start indexing from 1

# Convert tokens to integers
train_df['indexed_tokens'] = train_df['tokens'].apply(lambda x: [word_index[token] for token in x])

# Optional: Pad sequences to ensure uniform length
from tensorflow.keras.preprocessing.sequence import pad_sequences
max_length = 100  # Define maximum length of sequences
train_df['padded_tokens'] = pad_sequences(train_df['indexed_tokens'], maxlen=max_length, padding='post').tolist()

[nltk_data] Downloading package punkt to /usr/share/nltk_data...
[nltk_data]   Package punkt is already up-to-date!


2024-05-29 14:18:58.736334: E external/local_xla/xla/stream_executor/cuda/cuda_dnn.cc:9261] Unable to register cuDNN factory: Attempting to register factory for plugin cuDNN when one has already been registered
2024-05-29 14:18:58.736479: E external/local_xla/xla/stream_executor/cuda/cuda_fft.cc:607] Unable to register cuFFT factory: Attempting to register factory for plugin cuFFT when one has already been registered
2024-05-29 14:18:58.842182: E external/local_xla/xla/stream_executor/cuda/cuda_blas.cc:1515] Unable to register cuBLAS factory: Attempting to register factory for plugin cuBLAS when one has already been registered


In [14]:
train_df

Unnamed: 0,idx,sentence,label,text_wo_punct,text_wo_stop,text_stemmed,tokens,indexed_tokens,padded_tokens
0,0,hide new secretions from the parental units,0,hide new secretions from the parental units,hide new secretions parental units,hide new secret parent unit,"[hide, new, secret, parent, unit]","[207, 9862, 8630, 10313, 1226]","[207, 9862, 8630, 10313, 1226, 0, 0, 0, 0, 0, ..."
1,1,"contains no wit , only labored gags",0,contains no wit only labored gags,contains wit labored gags,contain wit labor gag,"[contain, wit, labor, gag]","[1464, 1941, 4996, 6687]","[1464, 1941, 4996, 6687, 0, 0, 0, 0, 0, 0, 0, ..."
2,2,that loves its characters and communicates som...,1,that loves its characters and communicates som...,loves characters communicates something rather...,love charact commun someth rather beauti human...,"[love, charact, commun, someth, rather, beauti...","[4128, 9344, 5360, 2005, 1571, 2297, 8545, 3003]","[4128, 9344, 5360, 2005, 1571, 2297, 8545, 300..."
3,3,remains utterly satisfied to remain the same t...,0,remains utterly satisfied to remain the same t...,remains utterly satisfied remain throughout,remain utterli satisfi remain throughout,"[remain, utterli, satisfi, remain, throughout]","[8262, 1677, 511, 8262, 1510]","[8262, 1677, 511, 8262, 1510, 0, 0, 0, 0, 0, 0..."
4,4,on the worst revenge-of-the-nerds clichés the ...,0,on the worst revengeofthenerds clichés the fil...,worst revengeofthenerds clichés filmmakers cou...,worst revengeofthenerd cliché filmmak could dredg,"[worst, revengeofthenerd, cliché, filmmak, cou...","[1387, 1603, 5483, 430, 1331, 10572]","[1387, 1603, 5483, 430, 1331, 10572, 0, 0, 0, ..."
...,...,...,...,...,...,...,...,...,...
67344,67344,a delightful comedy,1,a delightful comedy,delightful comedy,delight comedi,"[delight, comedi]","[2188, 4538]","[2188, 4538, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ..."
67345,67345,"anguish , anger and frustration",0,anguish anger and frustration,anguish anger frustration,anguish anger frustrat,"[anguish, anger, frustrat]","[8715, 1034, 6378]","[8715, 1034, 6378, 0, 0, 0, 0, 0, 0, 0, 0, 0, ..."
67346,67346,"at achieving the modest , crowd-pleasing goals...",1,at achieving the modest crowdpleasing goals i...,achieving modest crowdpleasing goals sets,achiev modest crowdpleas goal set,"[achiev, modest, crowdpleas, goal, set]","[6911, 2243, 1084, 9587, 2301]","[6911, 2243, 1084, 9587, 2301, 0, 0, 0, 0, 0, ..."
67347,67347,a patient viewer,1,a patient viewer,patient viewer,patient viewer,"[patient, viewer]","[3378, 9735]","[3378, 9735, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ..."


In [15]:
import pandas as pd
import nltk
from nltk.tokenize import word_tokenize


# Tokenize sentences
nltk.download('punkt')
validation_df['tokens'] = validation_df['text_stemmed'].apply(lambda x: nltk.word_tokenize(x))

# Create a word index (manual embedding)
word_set = set()
for tokens in validation_df['tokens']:
    word_set.update(tokens)

word_index = {word: i+1 for i, word in enumerate(word_set)}  # Start indexing from 1

# Convert tokens to integers
validation_df['indexed_tokens'] = validation_df['tokens'].apply(lambda x: [word_index[token] for token in x])

# Optional: Pad sequences to ensure uniform length
from tensorflow.keras.preprocessing.sequence import pad_sequences
max_length = 100  # Define maximum length of sequences
validation_df['padded_tokens'] = pad_sequences(validation_df['indexed_tokens'], maxlen=max_length, padding='post').tolist()

[nltk_data] Downloading package punkt to /usr/share/nltk_data...
[nltk_data]   Package punkt is already up-to-date!


In [16]:
validation_df

Unnamed: 0,idx,sentence,label,text_wo_punct,text_wo_stop,text_stemmed,tokens,indexed_tokens,padded_tokens
0,0,it 's a charming and often affecting journey .,1,it s a charming and often affecting journey,charming often affecting journey,charm often affect journey,"[charm, often, affect, journey]","[2504, 3044, 1529, 2675]","[2504, 3044, 1529, 2675, 0, 0, 0, 0, 0, 0, 0, ..."
1,1,unflinchingly bleak and desperate,0,unflinchingly bleak and desperate,unflinchingly bleak desperate,unflinchingli bleak desper,"[unflinchingli, bleak, desper]","[1901, 3159, 380]","[1901, 3159, 380, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0..."
2,2,allows us to hope that nolan is poised to emba...,1,allows us to hope that nolan is poised to emba...,allows us hope nolan poised embark major caree...,allow us hope nolan pois embark major career c...,"[allow, us, hope, nolan, pois, embark, major, ...","[2930, 3240, 941, 420, 1142, 2064, 1468, 632, ...","[2930, 3240, 941, 420, 1142, 2064, 1468, 632, ..."
3,3,"the acting , costumes , music , cinematography...",1,the acting costumes music cinematography an...,acting costumes music cinematography sound ast...,act costum music cinematographi sound astound ...,"[act, costum, music, cinematographi, sound, as...","[217, 2249, 1, 2988, 1288, 1542, 2832, 1378, 3...","[217, 2249, 1, 2988, 1288, 1542, 2832, 1378, 3..."
4,4,"it 's slow -- very , very slow .",0,it s slow very very slow,slow slow,slow slow,"[slow, slow]","[2670, 2670]","[2670, 2670, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ..."
...,...,...,...,...,...,...,...,...,...
867,867,has all the depth of a wading pool .,0,has all the depth of a wading pool,depth wading pool,depth wade pool,"[depth, wade, pool]","[1598, 519, 142]","[1598, 519, 142, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,..."
868,868,a movie with a real anarchic flair .,1,a movie with a real anarchic flair,movie real anarchic flair,movi real anarch flair,"[movi, real, anarch, flair]","[2093, 801, 1359, 1196]","[2093, 801, 1359, 1196, 0, 0, 0, 0, 0, 0, 0, 0..."
869,869,a subject like this should inspire reaction in...,0,a subject like this should inspire reaction in...,subject like inspire reaction audience pianist,subject like inspir reaction audienc pianist,"[subject, like, inspir, reaction, audienc, pia...","[356, 1960, 1557, 3164, 1277, 864]","[356, 1960, 1557, 3164, 1277, 864, 0, 0, 0, 0,..."
870,870,... is an arthritic attempt at directing by ca...,0,is an arthritic attempt at directing by calli...,arthritic attempt directing callie khouri,arthrit attempt direct calli khouri,"[arthrit, attempt, direct, calli, khouri]","[1394, 2709, 2886, 1360, 500]","[1394, 2709, 2886, 1360, 500, 0, 0, 0, 0, 0, 0..."


In [17]:
import pandas as pd
import nltk
from nltk.tokenize import word_tokenize


# Tokenize sentences
nltk.download('punkt')

test_df['tokens'] = test_df['text_stemmed'].apply(lambda x: nltk.word_tokenize(x))

# Create a word index (manual embedding)
word_set = set()
for tokens in test_df['tokens']:
    word_set.update(tokens)

word_index = {word: i+1 for i, word in enumerate(word_set)}  # Start indexing from 1

# Convert tokens to integers
test_df['indexed_tokens'] = test_df['tokens'].apply(lambda x: [word_index[token] for token in x])

# Optional: Pad sequences to ensure uniform length
from tensorflow.keras.preprocessing.sequence import pad_sequences
max_length = 100  # Define maximum length of sequences
test_df['padded_tokens'] = pad_sequences(test_df['indexed_tokens'], maxlen=max_length, padding='post').tolist()

[nltk_data] Downloading package punkt to /usr/share/nltk_data...
[nltk_data]   Package punkt is already up-to-date!


In [18]:
test_df.head()

Unnamed: 0,sentence,label,text_wo_punct,text_wo_stop,text_stemmed,tokens,indexed_tokens,padded_tokens
0,one of the other reviewers has mentioned that ...,1,one of the other reviewers has mentioned that ...,one reviewers mentioned watching 1 oz episode ...,one review mention watch 1 oz episod youll hoo...,"[one, review, mention, watch, 1, oz, episod, y...","[55067, 307, 25113, 76216, 41654, 123322, 1203...","[39721, 112081, 134212, 107514, 138075, 85643,..."
1,a wonderful little production. <br /><br />the...,1,a wonderful little production br br the filmin...,wonderful little production br br filming tech...,wonder littl product br br film techniqu unass...,"[wonder, littl, product, br, br, film, techniq...","[93290, 7912, 73642, 73040, 73040, 22688, 1290...","[93290, 7912, 73642, 73040, 73040, 22688, 1290..."
2,i thought this was a wonderful way to spend ti...,1,i thought this was a wonderful way to spend ti...,thought wonderful way spend time hot summer we...,thought wonder way spend time hot summer weeke...,"[thought, wonder, way, spend, time, hot, summe...","[103368, 93290, 59886, 37342, 63068, 100217, 3...","[103368, 93290, 59886, 37342, 63068, 100217, 3..."
3,basically there's a family where a little boy ...,0,basically theres a family where a little boy j...,basically theres family little boy jake thinks...,basic there famili littl boy jake think there ...,"[basic, there, famili, littl, boy, jake, think...","[38570, 129306, 38241, 7912, 100661, 113839, 3...","[38570, 129306, 38241, 7912, 100661, 113839, 3..."
4,"petter mattei's ""love in the time of money"" is...",1,petter matteis love in the time of money is a ...,petter matteis love time money visually stunni...,petter mattei love time money visual stun film...,"[petter, mattei, love, time, money, visual, st...","[44136, 137388, 24985, 63068, 105649, 45366, 1...","[108597, 29025, 30587, 51336, 10837, 47955, 10..."


In [20]:
import pandas as pd
from datasets import load_dataset
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.naive_bayes import MultinomialNB
from sklearn.metrics import classification_report, accuracy_score
from sklearn.model_selection import GridSearchCV
from sklearn.pipeline import Pipeline


param_grid = {
    'tfidf__max_df': [0.7, 0.8, 0.9, 1.0],  # Maximum document frequency
    'tfidf__ngram_range': [(1, 1), (1, 2), (2, 2)],  # N-grams
    'tfidf__norm': ['l1', 'l2'],  #
    'nb__alpha': [0.1, 0.5, 1.0, 1.5, 2.0],  # Smoothing parameter for Naive Bayes
    'nb__fit_prior': [True, False]  # Whether to learn class prior probabilities
}


X_train = train_df['text_wo_stop']
y_train = train_df['label']
X_valid = validation_df['text_wo_stop']
y_valid = validation_df['label']


# Create the pipeline
pipeline = Pipeline([
    ('tfidf', TfidfVectorizer()),
    ('nb', MultinomialNB())
])

# Perform grid search with cross-validation
grid_search = GridSearchCV(pipeline, param_grid, cv=2, scoring='accuracy', n_jobs=-1, verbose=2)
grid_search.fit(X_train, y_train)

# Best parameters and estimator
print("Best parameters found: ", grid_search.best_params_)
best_model = grid_search.best_estimator_

# Evaluate on validation set
y_valid_pred = best_model.predict(X_valid)
valid_accuracy = accuracy_score(y_valid, y_valid_pred)
valid_report = classification_report(y_valid, y_valid_pred)

print(f"Validation Accuracy: {valid_accuracy}")
print("Validation Classification Report:")
print(valid_report)


Fitting 2 folds for each of 240 candidates, totalling 480 fits
Best parameters found:  {'nb__alpha': 0.1, 'nb__fit_prior': False, 'tfidf__max_df': 0.7, 'tfidf__ngram_range': (1, 2), 'tfidf__norm': 'l1'}
Validation Accuracy: 0.8073394495412844
Validation Classification Report:
              precision    recall  f1-score   support

           0       0.83      0.76      0.79       428
           1       0.79      0.85      0.82       444

    accuracy                           0.81       872
   macro avg       0.81      0.81      0.81       872
weighted avg       0.81      0.81      0.81       872



In [22]:
X_test = test_df['text_wo_stop']
y_test = test_df['label']

y_test_pred = best_model.predict(X_test)
test_accuracy = accuracy_score(y_test, y_test_pred)
test_report = classification_report(y_test, y_test_pred)

print(f"Test Accuracy: {test_accuracy}")
print("Test Classification Report:")
print(test_report)

Test Accuracy: 0.81402
Test Classification Report:
              precision    recall  f1-score   support

           0       0.80      0.83      0.82     25000
           1       0.82      0.80      0.81     25000

    accuracy                           0.81     50000
   macro avg       0.81      0.81      0.81     50000
weighted avg       0.81      0.81      0.81     50000

