In [None]:

import requests

url = "https://media.geeksforgeeks.org/wp-content/uploads/20240514105101/IMDB-Dataset.csv"
# requests applies no timeout unless one is passed; bound the call so a
# stalled connection cannot hang the kernel indefinitely.
response = requests.get(url, timeout=60)

if response.status_code == 200:
    # Persist the raw bytes so later cells read the CSV from disk.
    with open('IMDB-Dataset.csv', 'wb') as f:
        f.write(response.content)
    print("File downloaded successfully.")
else:
    print(f"Failed to download file. Status code: {response.status_code}")



File downloaded successfully.


In [None]:
import pandas as pd
# Read the downloaded reviews file into a DataFrame.
df = pd.read_csv('IMDB-Dataset.csv')


In [None]:
# Show the column labels of the loaded frame.
df.columns

Index(['review', 'sentiment'], dtype='object')

In [None]:
# Display the frame; the notebook renders a truncated preview.
df

Unnamed: 0,review,sentiment
0,One of the other reviewers has mentioned that ...,positive
1,A wonderful little production. <br /><br />The...,positive
2,I thought this was a wonderful way to spend ti...,positive
3,Basically there's a family where a little boy ...,negative
4,"Petter Mattei's ""Love in the Time of Money"" is...",positive
...,...,...
49995,I thought this movie did a down right good job...,positive
49996,"Bad plot, bad dialogue, bad acting, idiotic di...",negative
49997,I am a Catholic taught in parochial elementary...,negative
49998,I'm going to have to disagree with the previou...,negative


In [None]:
# (number of rows, number of columns)
df.shape

(50000, 2)

In [None]:
# Count missing values per column (this only reports them; nothing is dropped or filled).
df.isna().sum()

Unnamed: 0,0
review,0
sentiment,0


In [None]:
# Number of reviews per sentiment label.
df['sentiment'].value_counts()

Unnamed: 0_level_0,count
sentiment,Unnamed: 1_level_1
positive,25000
negative,25000


In [None]:
# Keep the first 2000 rows of each sentiment class.
df_positive = df.loc[df['sentiment'].eq('positive')].head(2000)
df_negative = df.loc[df['sentiment'].eq('negative')].head(2000)


In [None]:
# Stack the two class subsets row-wise; the original row labels are kept.
df2 = pd.concat([df_positive, df_negative])

In [None]:
# (rows, columns) of the combined subset.
df2.shape

(4000, 2)

In [None]:
# Preview the first rows of the combined subset.
df2.head()

Unnamed: 0,review,sentiment
0,One of the other reviewers has mentioned that ...,positive
1,A wonderful little production. <br /><br />The...,positive
2,I thought this was a wonderful way to spend ti...,positive
4,"Petter Mattei's ""Love in the Time of Money"" is...",positive
5,"Probably my all-time favorite movie, a story o...",positive


# Text preprocessing

In [None]:
import spacy

# The preprocessing cells only read token lemmas and the lexical
# stop-word / punctuation flags, so the dependency parser and the entity
# recognizer are skipped to cut per-review processing time.
# NOTE(review): re-enable them if another cell needs parses or entities.
nlp = spacy.load("en_core_web_sm", disable=["parser", "ner"])


In [None]:
def lemmatization(text):
  """Return `text` with each token replaced by its lemma, joined by single spaces."""
  return ' '.join(token.lemma_ for token in nlp(text))

In [None]:
# Same output as applying `lemmatization` row by row, but nlp.pipe streams
# the reviews through spaCy in batches instead of one call per row.
df2['lemma'] = [
    ' '.join(token.lemma_ for token in doc)
    for doc in nlp.pipe(df2['review'], batch_size=64)
]

KeyboardInterrupt: 

In [None]:
# Preview the frame after the 'lemma' column has been added.
df2.head()

In [None]:
def remove_stopwords(text):
  """Return `text` without stop words and punctuation tokens, joined by single spaces."""
  kept = []
  for token in nlp(text):
    if token.is_stop or token.is_punct:
      continue
    kept.append(token.text)
  return ' '.join(kept)


In [None]:
# Same filtering as `remove_stopwords` (drop stop words and punctuation),
# but batched through nlp.pipe instead of one spaCy call per row.
df2['final'] = [
    ' '.join(
        token.text for token in doc
        if not token.is_stop and not token.is_punct
    )
    for doc in nlp.pipe(df2['lemma'], batch_size=64)
]

In [None]:
# Rebind instead of mutating in place and tolerate already-removed columns,
# so re-running this cell does not raise KeyError.
df2 = df2.drop(columns=['lemma', 'review'], errors='ignore')

In [None]:
# Preview the frame after the intermediate columns were dropped.
df2.head()

In [None]:
# Encode the labels as integers (positive -> 1, negative -> 0), matching the
# 0/1 labels the modelling cells convert to float tensors. Values that are
# already encoded pass through unchanged.
LABEL_TO_ID = {'positive': 1, 'negative': 0}
df_export = df2.assign(
    sentiment=df2['sentiment'].map(lambda label: LABEL_TO_ID.get(label, label))
)
# The path carries the '.csv' suffix because that is the name the modelling
# section reads back. to_csv returns None when given a path, so the result
# is not bound to a variable.
df_export.to_csv('data_after_lemmatizaion_and_preprocessing.csv', index=False)

# Creating the model

In [None]:
import pandas as pd
import spacy
import torch

In [None]:
# Reload the preprocessed reviews written by the preprocessing section.
df_after = pd.read_csv('data_after_lemmatizaion_and_preprocessing.csv')

In [None]:
# Show the first and last rows together; a tuple is displayed as plain text.
df_after.head(),df_after.tail()

(   sentiment                                              final
 0          1  reviewer mention watch 1 Oz episode hook right...
 1          1  wonderful little production < br /><br />the f...
 2          1  think wonderful way spend time hot summer week...
 3          1  Petter Mattei love Time money visually stunnin...
 4          1  probably time favorite movie story selflessnes...,
       sentiment                                              final
 3995          0  let advantage watch Piranha Piranha Venezuela ...
 3996          0  thing produce movie feel later movie internati...
 3997          0  contain spoiler movie plot summarize sentence ...
 3998          0  high school Track Field athelete Laura Remstea...
 3999          0  summary pretty sum near good original script c...)

In [None]:
# Download the medium English spaCy pipeline that is loaded in the next cell.
!python -m spacy download en_core_web_md


Collecting en-core-web-md==3.8.0
  Downloading https://github.com/explosion/spacy-models/releases/download/en_core_web_md-3.8.0/en_core_web_md-3.8.0-py3-none-any.whl (33.5 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m33.5/33.5 MB[0m [31m51.5 MB/s[0m eta [36m0:00:00[0m
[?25h[38;5;2m✔ Download and installation successful[0m
You can now load the package via spacy.load('en_core_web_md')
[38;5;3m⚠ Restart to reload dependencies[0m
If you are in a Jupyter or Colab notebook, you may need to restart Python in
order to load all the package's dependencies. You can do this by selecting the
'Restart kernel' or 'Restart runtime' option.


In [None]:
import spacy
# Rebinds `nlp`: from here on the medium pipeline is used (vectorize reads token vectors from it).
nlp = spacy.load("en_core_web_md")


In [None]:
from sklearn.model_selection import train_test_split

# Hold out 20% of the rows for testing; the fixed seed makes the split repeatable.
X_train, X_test, y_train, y_test = train_test_split(
    df_after['final'],
    df_after['sentiment'],
    test_size=0.2,
    random_state=42,
)


In [None]:
def vectorize(text, max_tokens=100):
    """Turn a text into a 2-D tensor of spaCy word vectors, one row per token.

    Args:
        text: The string to embed. Non-string values (for example the NaN
            pandas produces for an empty CSV cell) are treated as "".
        max_tokens: Maximum number of vector-bearing tokens to keep.

    Returns:
        A tensor with one row per kept token. When no token has a vector, a
        single row holding the vector of the word "placeholder" is returned
        so the result is never empty.
    """
    from itertools import islice

    import numpy as np

    if not isinstance(text, str):
        text = ""

    # Tokenize only: this function reads nothing but per-token vectors, so the
    # tagger/parser/NER passes that nlp(text) would run are skipped.
    # NOTE(review): assumes the loaded pipeline ships static word vectors
    # (as en_core_web_md does), so vectors do not depend on those components.
    doc = nlp.make_doc(text)

    # Stop iterating once max_tokens vectors have been collected.
    vectors = list(
        islice((token.vector for token in doc if token.has_vector), max_tokens)
    )
    if not vectors:
        vectors = [nlp.make_doc("placeholder")[0].vector]

    # Stack into one ndarray first: torch.tensor on a list of ndarrays takes
    # a slow element-wise path.
    return torch.tensor(np.stack(vectors))




In [None]:
from torch.utils.data import Dataset

class SentimentDataset(Dataset):
    """Dataset pairing each text with its label; texts are embedded on access.

    `texts` and `labels` are read positionally through `.iloc` in
    `__getitem__`.
    """

    def __init__(self, texts, labels):
        self.texts, self.labels = texts, labels

    def __len__(self):
        """Number of samples, taken from the length of `texts`."""
        return len(self.texts)

    def __getitem__(self, idx):
        """Return (token-vector tensor, float label tensor) for position `idx`."""
        embedded = vectorize(self.texts.iloc[idx])
        target = torch.tensor(self.labels.iloc[idx], dtype=torch.float)
        return embedded, target


In [None]:
from torch.nn.utils.rnn import pad_sequence

def collate_fn(batch):
    """Merge (sequence, label) samples into one zero-padded batch.

    Returns the sequences padded to the longest one in the batch (batch
    dimension first) and the labels as a 1-D float tensor.
    """
    sequences, targets = zip(*batch)
    batch_inputs = pad_sequence(sequences, batch_first=True)
    return batch_inputs, torch.tensor(targets, dtype=torch.float)


In [None]:
# Only Dataset was imported earlier; DataLoader must be imported too or this
# cell raises NameError on a fresh kernel.
from torch.utils.data import DataLoader

train_data = SentimentDataset(X_train, y_train)
test_data = SentimentDataset(X_test, y_test)

# Training batches are reshuffled each epoch; the test loader keeps dataset order.
train_loader = DataLoader(train_data, batch_size=4, collate_fn=collate_fn, shuffle=True)
test_loader = DataLoader(test_data, batch_size=4, collate_fn=collate_fn)

In [None]:
class SentimentClassifier(torch.nn.Module):
    """Single-layer GRU followed by a linear layer and a sigmoid.

    Args:
        hidden_dim: Size of the GRU hidden state.
        input_dim: Size of each input token vector (default 300).

    `forward` takes a batch-first tensor of token vectors and returns one
    value in (0, 1) per sequence, shaped [batch, 1].
    """

    # torch.nn is referenced through the `torch` module because the notebook
    # imports `torch` but never binds the name `nn`.
    def __init__(self, hidden_dim=128, input_dim=300):
        super().__init__()
        self.gru = torch.nn.GRU(input_size=input_dim, hidden_size=hidden_dim, batch_first=True)
        self.fc = torch.nn.Linear(hidden_dim, 1)
        self.sigmoid = torch.nn.Sigmoid()

    def forward(self, x):
        _, h_n = self.gru(x)              # h_n: [1, batch, hidden_dim]
        out = self.fc(h_n.squeeze(0))     # [batch, 1]
        return self.sigmoid(out)


In [None]:
# Use the GPU when one is available, otherwise fall back to the CPU.
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
model = SentimentClassifier().to(device)
optimizer = torch.optim.Adam(model.parameters(), lr=0.01)
# The model already applies a sigmoid, so plain binary cross-entropy is used.
# Referenced as torch.nn because the name `nn` is never imported in this notebook.
criterion = torch.nn.BCELoss()


In [None]:
import time

# Fetch one batch first so the timer covers only transfer, forward and backward.
inputs, labels = next(iter(train_loader))
start = time.time()
inputs = inputs.to(device)
labels = labels.to(device)
preds = model(inputs).squeeze()
loss = criterion(preds, labels)
loss.backward()
elapsed = time.time() - start
print(f"⏱️ One batch time: {elapsed:.2f}s")


⏱️ One batch time: 0.00s


In [None]:
# Report the shapes of the batch fetched in the timing cell.
print("Input shape:", inputs.shape)
print("Label shape:", labels.shape)


Input shape: torch.Size([4, 100, 300])
Label shape: torch.Size([4])


In [None]:
# Train for 5 epochs and report the mean batch loss after each one.
for epoch in range(5):
    model.train()
    total_loss = 0
    for inputs, labels in train_loader:
        inputs, labels = inputs.to(device), labels.to(device)
        # squeeze(1) removes only the output-feature axis. A bare squeeze()
        # would also remove a batch axis of size 1 (a ragged final batch),
        # leaving a scalar that no longer matches the [1]-shaped labels.
        preds = model(inputs).squeeze(1)
        loss = criterion(preds, labels)
        optimizer.zero_grad()
        loss.backward()
        optimizer.step()
        total_loss += loss.item()
    print(f"Epoch {epoch+1}, Loss: {total_loss/len(train_loader):.4f}")


KeyboardInterrupt: 

In [None]:
# Measure accuracy on the held-out loader with gradients disabled.
model.eval()
correct = 0
total = 0
with torch.no_grad():
    for inputs, labels in test_loader:
        inputs = inputs.to(device)
        labels = labels.to(device)
        # Threshold the sigmoid outputs at 0.5 to get hard class decisions.
        preds = model(inputs).squeeze() > 0.5
        correct += preds.eq(labels).sum().item()
        total += labels.shape[0]

accuracy = correct / total
print(f"Test Accuracy: {accuracy:.2%}")

Test Accuracy: 49.50%
