In [None]:
import torchtext
import torch
import torch.nn as nn
from torch.autograd import Variable
import matplotlib.pyplot as plt
import numpy as np
from torchtext.vocab import Vectors
from tqdm import tqdm_notebook
import pandas as pd



In [None]:
train_df = pd.read_csv("/content/train.csv")
validation_df = pd.read_csv("/content/validation.csv")

train_df.drop(columns = "idx")
validation_df.drop(columns = "idx")


test_df = pd.read_csv("/content/IMDB Dataset.csv")
test_df.rename(columns={'review': 'sentence', 'sentiment': 'label'}, inplace=True)
test_df['label'] = test_df['label'].map({'negative': 0, 'positive': 1})
output_path = "/content/modified_imdb_dataset.csv"
test_df.to_csv(output_path, index=False)

In [None]:
train_df["sentence"] = train_df["sentence"].str.lower()
validation_df["sentence"] = validation_df["sentence"].str.lower()
test_df["sentence"] = test_df["sentence"].str.lower()

In [None]:
import string
PUNCT_TO_REMOVE = string.punctuation
def remove_punctuation(text):
    """custom function to remove the punctuation"""
    return text.translate(str.maketrans('', '', PUNCT_TO_REMOVE))

train_df["text_wo_punct"] = train_df["sentence"].apply(lambda text: remove_punctuation(text))
validation_df["text_wo_punct"] = validation_df["sentence"].apply(lambda text: remove_punctuation(text))
test_df["text_wo_punct"] = test_df["sentence"].apply(lambda text: remove_punctuation(text))

In [None]:
import nltk

# Ensure you have downloaded the stopwords dataset
nltk.download('stopwords')


[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Unzipping corpora/stopwords.zip.


True

In [None]:
from nltk.corpus import stopwords

", ".join(stopwords.words('english'))


"i, me, my, myself, we, our, ours, ourselves, you, you're, you've, you'll, you'd, your, yours, yourself, yourselves, he, him, his, himself, she, she's, her, hers, herself, it, it's, its, itself, they, them, their, theirs, themselves, what, which, who, whom, this, that, that'll, these, those, am, is, are, was, were, be, been, being, have, has, had, having, do, does, did, doing, a, an, the, and, but, if, or, because, as, until, while, of, at, by, for, with, about, against, between, into, through, during, before, after, above, below, to, from, up, down, in, out, on, off, over, under, again, further, then, once, here, there, when, where, why, how, all, any, both, each, few, more, most, other, some, such, no, nor, not, only, own, same, so, than, too, very, s, t, can, will, just, don, don't, should, should've, now, d, ll, m, o, re, ve, y, ain, aren, aren't, couldn, couldn't, didn, didn't, doesn, doesn't, hadn, hadn't, hasn, hasn't, haven, haven't, isn, isn't, ma, mightn, mightn't, mustn, mus

In [None]:
STOPWORDS = set(stopwords.words('english'))
def remove_stopwords(text):
    """custom function to remove the stopwords"""
    return " ".join([word for word in str(text).split() if word not in STOPWORDS])

train_df["text_wo_stop"] = train_df["text_wo_punct"].apply(lambda text: remove_stopwords(text))
validation_df["text_wo_stop"] = validation_df["text_wo_punct"].apply(lambda text: remove_stopwords(text))
test_df["text_wo_stop"] = test_df["text_wo_punct"].apply(lambda text: remove_stopwords(text))

In [None]:
from nltk.stem.porter import PorterStemmer

stemmer = PorterStemmer()
def stem_words(text):
    return " ".join([stemmer.stem(word) for word in text.split()])

train_df["text_stemmed"] = train_df["text_wo_stop"].apply(lambda text: stem_words(text))
validation_df["text_stemmed"] = validation_df["text_wo_stop"].apply(lambda text: stem_words(text))
test_df["text_stemmed"] = test_df["text_wo_stop"].apply(lambda text: stem_words(text))

In [None]:
import pandas as pd
import nltk
from nltk.tokenize import word_tokenize


# Tokenize sentences
nltk.download('punkt')
train_df['tokens'] = train_df['text_stemmed'].apply(lambda x: nltk.word_tokenize(x))

# Create a word index (manual embedding)
word_set = set()
for tokens in train_df['tokens']:
    word_set.update(tokens)

word_index = {word: i+1 for i, word in enumerate(word_set)}  # Start indexing from 1

# Convert tokens to integers
train_df['indexed_tokens'] = train_df['tokens'].apply(lambda x: [word_index[token] for token in x])

# Optional: Pad sequences to ensure uniform length
from tensorflow.keras.preprocessing.sequence import pad_sequences
max_length = 100  # Define maximum length of sequences
train_df['padded_tokens'] = pad_sequences(train_df['indexed_tokens'], maxlen=max_length, padding='post').tolist()

[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Unzipping tokenizers/punkt.zip.


In [None]:
train_df

Unnamed: 0,idx,sentence,label,text_wo_punct,text_wo_stop,text_stemmed,tokens,indexed_tokens,padded_tokens
0,0,hide new secretions from the parental units,0,hide new secretions from the parental units,hide new secretions parental units,hide new secret parent unit,"[hide, new, secret, parent, unit]","[8953, 2239, 6328, 5226, 9193]","[8953, 2239, 6328, 5226, 9193, 0, 0, 0, 0, 0, ..."
1,1,"contains no wit , only labored gags",0,contains no wit only labored gags,contains wit labored gags,contain wit labor gag,"[contain, wit, labor, gag]","[5627, 1040, 3629, 2784]","[5627, 1040, 3629, 2784, 0, 0, 0, 0, 0, 0, 0, ..."
2,2,that loves its characters and communicates som...,1,that loves its characters and communicates som...,loves characters communicates something rather...,love charact commun someth rather beauti human...,"[love, charact, commun, someth, rather, beauti...","[2060, 8809, 5942, 2080, 7219, 9710, 5677, 3371]","[2060, 8809, 5942, 2080, 7219, 9710, 5677, 337..."
3,3,remains utterly satisfied to remain the same t...,0,remains utterly satisfied to remain the same t...,remains utterly satisfied remain throughout,remain utterli satisfi remain throughout,"[remain, utterli, satisfi, remain, throughout]","[10068, 2196, 1199, 10068, 7402]","[10068, 2196, 1199, 10068, 7402, 0, 0, 0, 0, 0..."
4,4,on the worst revenge-of-the-nerds clichés the ...,0,on the worst revengeofthenerds clichés the fil...,worst revengeofthenerds clichés filmmakers cou...,worst revengeofthenerd cliché filmmak could dredg,"[worst, revengeofthenerd, cliché, filmmak, cou...","[5187, 2798, 5070, 4995, 3660, 971]","[5187, 2798, 5070, 4995, 3660, 971, 0, 0, 0, 0..."
...,...,...,...,...,...,...,...,...,...
67344,67344,a delightful comedy,1,a delightful comedy,delightful comedy,delight comedi,"[delight, comedi]","[8876, 3761]","[8876, 3761, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ..."
67345,67345,"anguish , anger and frustration",0,anguish anger and frustration,anguish anger frustration,anguish anger frustrat,"[anguish, anger, frustrat]","[10668, 8397, 2340]","[10668, 8397, 2340, 0, 0, 0, 0, 0, 0, 0, 0, 0,..."
67346,67346,"at achieving the modest , crowd-pleasing goals...",1,at achieving the modest crowdpleasing goals i...,achieving modest crowdpleasing goals sets,achiev modest crowdpleas goal set,"[achiev, modest, crowdpleas, goal, set]","[5385, 8359, 1913, 5711, 10478]","[5385, 8359, 1913, 5711, 10478, 0, 0, 0, 0, 0,..."
67347,67347,a patient viewer,1,a patient viewer,patient viewer,patient viewer,"[patient, viewer]","[1885, 1686]","[1885, 1686, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ..."


In [None]:
import pandas as pd
import nltk
from nltk.tokenize import word_tokenize


# Tokenize sentences
nltk.download('punkt')
validation_df['tokens'] = validation_df['text_stemmed'].apply(lambda x: nltk.word_tokenize(x))

# Create a word index (manual embedding)
word_set = set()
for tokens in validation_df['tokens']:
    word_set.update(tokens)

word_index = {word: i+1 for i, word in enumerate(word_set)}  # Start indexing from 1

# Convert tokens to integers
validation_df['indexed_tokens'] = validation_df['tokens'].apply(lambda x: [word_index[token] for token in x])

# Optional: Pad sequences to ensure uniform length
from tensorflow.keras.preprocessing.sequence import pad_sequences
max_length = 100  # Define maximum length of sequences
validation_df['padded_tokens'] = pad_sequences(validation_df['indexed_tokens'], maxlen=max_length, padding='post').tolist()

[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Package punkt is already up-to-date!


In [None]:
validation_df

Unnamed: 0,idx,sentence,label,text_wo_punct,text_wo_stop,text_stemmed,tokens,indexed_tokens,padded_tokens
0,0,it 's a charming and often affecting journey .,1,it s a charming and often affecting journey,charming often affecting journey,charm often affect journey,"[charm, often, affect, journey]","[493, 1592, 2486, 63]","[493, 1592, 2486, 63, 0, 0, 0, 0, 0, 0, 0, 0, ..."
1,1,unflinchingly bleak and desperate,0,unflinchingly bleak and desperate,unflinchingly bleak desperate,unflinchingli bleak desper,"[unflinchingli, bleak, desper]","[3102, 1777, 1443]","[3102, 1777, 1443, 0, 0, 0, 0, 0, 0, 0, 0, 0, ..."
2,2,allows us to hope that nolan is poised to emba...,1,allows us to hope that nolan is poised to emba...,allows us hope nolan poised embark major caree...,allow us hope nolan pois embark major career c...,"[allow, us, hope, nolan, pois, embark, major, ...","[2359, 464, 3345, 2783, 839, 3098, 1511, 668, ...","[2359, 464, 3345, 2783, 839, 3098, 1511, 668, ..."
3,3,"the acting , costumes , music , cinematography...",1,the acting costumes music cinematography an...,acting costumes music cinematography sound ast...,act costum music cinematographi sound astound ...,"[act, costum, music, cinematographi, sound, as...","[1854, 2192, 2217, 2336, 1223, 2145, 2521, 48,...","[1854, 2192, 2217, 2336, 1223, 2145, 2521, 48,..."
4,4,"it 's slow -- very , very slow .",0,it s slow very very slow,slow slow,slow slow,"[slow, slow]","[297, 297]","[297, 297, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,..."
...,...,...,...,...,...,...,...,...,...
867,867,has all the depth of a wading pool .,0,has all the depth of a wading pool,depth wading pool,depth wade pool,"[depth, wade, pool]","[829, 2895, 457]","[829, 2895, 457, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,..."
868,868,a movie with a real anarchic flair .,1,a movie with a real anarchic flair,movie real anarchic flair,movi real anarch flair,"[movi, real, anarch, flair]","[660, 443, 584, 3015]","[660, 443, 584, 3015, 0, 0, 0, 0, 0, 0, 0, 0, ..."
869,869,a subject like this should inspire reaction in...,0,a subject like this should inspire reaction in...,subject like inspire reaction audience pianist,subject like inspir reaction audienc pianist,"[subject, like, inspir, reaction, audienc, pia...","[2894, 770, 1186, 1956, 3075, 473]","[2894, 770, 1186, 1956, 3075, 473, 0, 0, 0, 0,..."
870,870,... is an arthritic attempt at directing by ca...,0,is an arthritic attempt at directing by calli...,arthritic attempt directing callie khouri,arthrit attempt direct calli khouri,"[arthrit, attempt, direct, calli, khouri]","[1202, 1227, 1135, 3235, 44]","[1202, 1227, 1135, 3235, 44, 0, 0, 0, 0, 0, 0,..."


In [None]:
import pandas as pd
import nltk
from nltk.tokenize import word_tokenize


# Tokenize sentences
nltk.download('punkt')

test_df['tokens'] = test_df['text_stemmed'].apply(lambda x: nltk.word_tokenize(x))

# Create a word index (manual embedding)
word_set = set()
for tokens in test_df['tokens']:
    word_set.update(tokens)

word_index = {word: i+1 for i, word in enumerate(word_set)}  # Start indexing from 1

# Convert tokens to integers
test_df['indexed_tokens'] = test_df['tokens'].apply(lambda x: [word_index[token] for token in x])

# Optional: Pad sequences to ensure uniform length
from tensorflow.keras.preprocessing.sequence import pad_sequences
max_length = 100  # Define maximum length of sequences
test_df['padded_tokens'] = pad_sequences(test_df['indexed_tokens'], maxlen=max_length, padding='post').tolist()

[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Package punkt is already up-to-date!


In [None]:
import pandas as pd
from sklearn.pipeline import Pipeline
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.svm import SVC
from sklearn.model_selection import GridSearchCV
from sklearn.metrics import accuracy_score, classification_report

param_grid = {
    'tfidf__ngram_range': [(1, 1)],  # Unigrams or bigrams
    'svm__C': [0.1, 1],  # Regularization parameter
    'svm__kernel': ['rbf']  # Kernel type
}


X_train = train_df['text_wo_stop']
y_train = train_df['label']
X_valid = validation_df['text_wo_stop']
y_valid = validation_df['label']


# Create the pipeline
pipeline = Pipeline([
    ('tfidf', TfidfVectorizer()),
    ('svm', SVC())
])

# Perform grid search with cross-validation
grid_search = GridSearchCV(pipeline, param_grid, cv=2, scoring='accuracy', n_jobs=-1, verbose=2)
grid_search.fit(X_train, y_train)

# Best parameters and estimator
print("Best parameters found: ", grid_search.best_params_)
best_model = grid_search.best_estimator_

# Evaluate on validation set
y_valid_pred = best_model.predict(X_valid)
valid_accuracy = accuracy_score(y_valid, y_valid_pred)
valid_report = classification_report(y_valid, y_valid_pred)

print(f"Validation Accuracy: {valid_accuracy}")
print("Validation Classification Report:")
print(valid_report)


Fitting 2 folds for each of 2 candidates, totalling 4 fits


  pid = os.fork()
  pid = os.fork()


Best parameters found:  {'svm__C': 1, 'svm__kernel': 'rbf', 'tfidf__ngram_range': (1, 1)}
Validation Accuracy: 0.8279816513761468
Validation Classification Report:
              precision    recall  f1-score   support

           0       0.86      0.77      0.81       428
           1       0.80      0.88      0.84       444

    accuracy                           0.83       872
   macro avg       0.83      0.83      0.83       872
weighted avg       0.83      0.83      0.83       872



In [18]:
X_test = test_df['text_wo_stop']
y_test = test_df['label']

y_test_pred = best_model.predict(X_test)
test_accuracy = accuracy_score(y_test, y_test_pred)
test_report = classification_report(y_test, y_test_pred)

print(f"Test Accuracy: {test_accuracy}")
print("Test Classification Report:")
print(test_report)

Test Accuracy: 0.81414
Test Classification Report:
              precision    recall  f1-score   support

           0       0.79      0.85      0.82     25000
           1       0.84      0.78      0.81     25000

    accuracy                           0.81     50000
   macro avg       0.82      0.81      0.81     50000
weighted avg       0.82      0.81      0.81     50000

