In [27]:
import pandas as pd
imdb = pd.read_csv("/kaggle/input/imdb-dataset-of-50k-movie-reviews/IMDB Dataset.csv")

imdb.head()

Unnamed: 0,review,sentiment
0,One of the other reviewers has mentioned that ...,positive
1,A wonderful little production. <br /><br />The...,positive
2,I thought this was a wonderful way to spend ti...,positive
3,Basically there's a family where a little boy ...,negative
4,"Petter Mattei's ""Love in the Time of Money"" is...",positive


In [28]:
import string
for i, text in enumerate(imdb["review"]):
    review = " ".join(text.split("\n"))
    for p in string.punctuation:
        review = review.replace(p, "")
    imdb.loc[i, "review"] = review
imdb.head()

Unnamed: 0,review,sentiment
0,One of the other reviewers has mentioned that ...,positive
1,A wonderful little production br br The filmin...,positive
2,I thought this was a wonderful way to spend ti...,positive
3,Basically theres a family where a little boy J...,negative
4,Petter Matteis Love in the Time of Money is a ...,positive


In [29]:
import nltk
nltk.download("stopwords")
Stopwords = set(nltk.corpus.stopwords.words("english"))
sorted(Stopwords)[:10]

[nltk_data] Downloading package stopwords to /usr/share/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


['a', 'about', 'above', 'after', 'again', 'against', 'ain', 'all', 'am', 'an']

In [30]:
for i, text in enumerate(imdb["review"]):
    words = text.split()
    result = []
    for word in words:
        if word not in Stopwords:
            result.append(word)
    imdb.loc[i, "review"] = " ".join(result)
imdb.head()

Unnamed: 0,review,sentiment
0,One reviewers mentioned watching 1 Oz episode ...,positive
1,A wonderful little production br br The filmin...,positive
2,I thought wonderful way spend time hot summer ...,positive
3,Basically theres family little boy Jake thinks...,negative
4,Petter Matteis Love Time Money visually stunni...,positive


In [31]:
from nltk.stem import PorterStemmer

stemmer = PorterStemmer()

In [32]:
stem_map = {}
all_words = set()
for i, text in enumerate(imdb["review"]):
    words = text.split()
    for word in words:
        all_words.add(word)
for word in all_words:
    stem_map[word] = stemmer.stem(word)

In [33]:
import numpy as np
from collections import Counter
counter = Counter()
for i, text in enumerate(imdb["review"]):
    words = text.split()
    for word in words:
        counter[stem_map[word]] += 1
words = np.array(list(counter.keys()), dtype=object)
freqs = list(counter.values())
top_k = len(words)
topk_words = set(words[np.argsort(freqs)][:-1][:top_k])
for i, text in enumerate(imdb["review"]):
    words = text.split()
    result = []
    for word in words:
        if word in topk_words:
            result.append(word)
    imdb.loc[i, "review"] = " ".join(result)
topk_words = sorted(topk_words)
token_mapping = {}
i = 1
for word in topk_words:
    if word in token_mapping:
        continue
    token_mapping[word] = i; i+=1
VOCAB_SIZE = len(token_mapping)
VOCAB_SIZE

143865

In [34]:
sentences = []
targets = []
for i, sentence in enumerate(imdb["review"]):
    if sentence.strip() != "":
        sentence = " ".join([stem_map[word] for word in sentence.split() if stem_map[word] in token_mapping])
        if len(sentence) > 0:
            sentences.append(sentence)
            targets.append(imdb.loc[i, "sentiment"])

print(len(sentences), len(targets) ,imdb.shape[0])

49998 49998 50000


In [35]:
import random

test_proportion = 0.2
test_size = int(len(sentences)*test_proportion)
train_size = len(sentences) - test_size
sentence_targets = list(zip(sentences, targets))
random.shuffle(sentence_targets)
train, test = sentence_targets[:train_size], sentence_targets[train_size:]
X_train, y_train = map(list, zip(*train))
X_test, y_test = map(list, zip(*test))

In [36]:
import torch
from torch.utils.data import DataLoader
class cdataset:
    def __init__(self, data:list, targets:list):
        self.data = data
        self.targets = targets
    def __len__(self):
        return len(self.targets)
    def __getitem__(self, idx):
        text = self.data[idx]
        label = self.targets[idx]
        word_sequence = text.split()
        token_sequence = torch.tensor([token_mapping[word] for word in word_sequence], dtype=torch.long)
        target = torch.tensor(1 if label == "positive" else 0)
        return token_sequence, target
def collate_fn(batch):
    sequences, targets = zip(*batch)
    sequences = torch.nn.utils.rnn.pad_sequence(sequences, batch_first=True, padding_value=0)
    targets = torch.tensor(targets, dtype=torch.long)
    return sequences, targets

dataloader = DataLoader(dataset=cdataset(X_train, y_train), batch_size=32, collate_fn=collate_fn)
testloader = DataLoader(dataset=cdataset(X_test, y_test), batch_size=32, collate_fn=collate_fn)

In [37]:
from torch import nn

class RecurrentModel(nn.Module):
    def __init__(self):
        super().__init__()
        self.embedding = nn.Embedding(VOCAB_SIZE+1, 100)
        self.rnn = nn.GRU(100, 64, batch_first=True)
        self.fc1 = nn.Linear(64, 2)
        self.softmax = nn.Softmax(dim=-1)
    def forward(self, x):
        x = self.embedding(x)
        x,_ = self.rnn(x) # x : (batch, seq, hidden_size)
        x = x[:, -1, :] # x : (batch, hidden_size) :- last hidden state since this is a sentence classification task
        x = self.fc1(x)
        return self.softmax(x)


In [39]:
# training loop
model = RecurrentModel()
DEVICE = "cuda" if torch.cuda.is_available() else "cpu"
model.to(DEVICE)
EPOCHS = 25
loss = nn.CrossEntropyLoss()
optimizer = torch.optim.Adam(params=model.parameters(), lr=0.0001)
for epoch in range(EPOCHS):
    model.train()
    for x,y in dataloader:
        optimizer.zero_grad()
        x,y = x.to(DEVICE), y.to(DEVICE)
        y_pred = model(x)
        l = loss(y_pred, y)
        l.backward()
        optimizer.step()
    with torch.no_grad():
        model.eval()
        l = 0
        correct = 0
        total = 0
        print(f"EPOCH: {epoch}:")
        for x,y in dataloader:
            x,y = x.to(DEVICE), y.to(DEVICE)
            y_pred = model(x)
            l += loss(y_pred, y).item()/len(dataloader)
            total += len(y)
            correct += (y_pred.argmax(dim=1) == y).sum()
        print("    loss:", l, f"accuracy: {correct}/{total}")
        l = 0
        correct = 0
        total = 0
        for x,y in testloader:
            x,y = x.to(DEVICE), y.to(DEVICE)
            y_pred = model(x)
            l += loss(y_pred, y).item()/len(testloader)
            total += len(y)
            correct += (y_pred.argmax(dim=1) == y).sum()
        print("    loss:", l, f"accuracy: {correct}/{total}")

EPOCH: 0:
    loss: 0.6932683818817144 accuracy: 20152/39999
    loss: 0.6941070907032152 accuracy: 4945/9999
EPOCH: 1:
    loss: 0.6927169933319088 accuracy: 20208/39999
    loss: 0.693727687334482 accuracy: 4951/9999
EPOCH: 2:
    loss: 0.6799362753868115 accuracy: 23723/39999
    loss: 0.6821654577986501 accuracy: 5849/9999
EPOCH: 3:
    loss: 0.5742700328588484 accuracy: 29123/39999
    loss: 0.5790973810342177 accuracy: 7242/9999
EPOCH: 4:
    loss: 0.5401390970706946 accuracy: 30574/39999
    loss: 0.5480506545819414 accuracy: 7512/9999
EPOCH: 5:
    loss: 0.5183329966068264 accuracy: 31488/39999
    loss: 0.5291357068969803 accuracy: 7727/9999
EPOCH: 6:
    loss: 0.49998410193920123 accuracy: 32273/39999
    loss: 0.5152160394877291 accuracy: 7897/9999
EPOCH: 7:
    loss: 0.4863401866435999 accuracy: 32853/39999
    loss: 0.5055118231727673 accuracy: 7964/9999
EPOCH: 8:
    loss: 0.481465296673775 accuracy: 33028/39999
    loss: 0.5036402891238277 accuracy: 8027/9999
EPOCH: 9:
 

In [47]:
model.to("cpu")
def preprocessor(text: str):
    for p in string.punctuation:
        text = text.replace(p, "")
    words = text.strip().split()
    word_sequence = []
    for word in words:
        if word not in Stopwords:
            word_sequence.append(word)
    word_sequence = [stemmer.stem(word) for word in word_sequence]
    token_sequence = torch.tensor([token_mapping[word] for word in word_sequence if word in token_mapping], dtype=torch.long).unsqueeze(0) # add a pseudo batch dimension
    return token_sequence

def predict(text: str):
    tensor = preprocessor(text)
    prediction = model.forward(tensor).argmax(dim=1).item()
    if prediction:
        return "positive"
    return "negative"

In [56]:
review1 = """
Pushpa 2, while a highly anticipated movie with stellar performances and a gripping narrative, may raise concerns about the message it conveys to society. The film's primary focus on smuggling activities and the glorification of a smuggler's life could potentially have a negative impact on impressionable audiences, particularly the youth.

The protagonist, portrayed as a larger-than-life figure, is depicted as a hero despite being deeply involved in illegal activities. This portrayal risks normalizing and even romanticizing criminal behavior. Young viewers, who are often drawn to such charismatic characters, might misinterpret these actions as acceptable or even aspirational, rather than understanding the legal and moral consequences of such choices.

Additionally, the film's intense focus on the underworld and smuggling might overshadow the importance of ethical values and hard work. While entertainment often involves dramatizing certain aspects of life, filmmakers carry a responsibility to balance storytelling with messages that do not inadvertently glorify harmful behaviors.

Society thrives on positive role models and stories that inspire constructive change. While "Pushpa 2" excels in its cinematic elements, its narrative might benefit from a more nuanced approach that highlights the repercussions of the protagonist's choices, emphasizing the need for lawful and ethical conduct.

Entertainment is a powerful medium, and movies that reach a wide audience must tread carefully, ensuring that their influence aligns with societal well-being rather than inadvertently promoting a culture of lawlessness or misplaced heroism.
"""

In [57]:
predict(review1)

'negative'

In [58]:
review2 = """
Pushpa 2 The Rule is nothing short of a cinematic marvel that redefines the action-drama genre. Sukumar, the mastermind director, has once again delivered a masterpiece that keeps the audience glued to their seats from start to finish. With larger-than-life visuals, a gripping storyline, and impeccable performances, this film raises the bar for Indian cinema.

Allu Arjun, reprising his iconic role as Pushpa Raj, delivers a performance that is both electrifying and deeply emotional. His raw intensity and magnetic screen presence make it impossible to look away. The character of Pushpa evolves brilliantly, showcasing his relentless pursuit of power and justice. Rashmika Mandanna shines in her role, bringing depth and emotion that balance the high-octane action.

The screenplay is a rollercoaster of emotions, blending drama, action, and intrigue seamlessly. The dialogues are sharp and impactful, with memorable lines that resonate long after the credits roll. Devi Sri Prasad's music is another highlight, with a soundtrack that amplifies the film's energy and perfectly complements the narrative.

The cinematography is breathtaking, capturing the rugged beauty of the forest landscapes and the gritty underbelly of the smuggling world. Each frame is meticulously crafted, making the visuals as compelling as the story itself. The action sequences are choreographed to perfection, delivering a spectacle that is both thrilling and realistic.

Pushpa 2 is more than just a movie, it's an experience that celebrates the indomitable spirit of a man who refuses to back down. It’s a tribute to resilience, ambition, and the raw power of determination. This film is a must-watch for fans of Indian cinema and anyone who appreciates storytelling at its finest.

In every sense, Pushpa 2 The Rule is a blockbuster that will leave an indelible mark on the audience's hearts. Sukumar and his team have created a cinematic gem that will be remembered for years to come.
"""

In [59]:
predict(review2)

'positive'