In [1]:
import pandas as pd
import numpy as np
import nltk
from nltk.corpus import stopwords
import re
import random
from nltk.tokenize import word_tokenize
import tensorflow as tf

In [2]:
df = pd.read_csv("/kaggle/input/imdb-dataset-of-50k-movie-reviews/IMDB Dataset.csv")

In [3]:
df

Unnamed: 0,review,sentiment
0,One of the other reviewers has mentioned that ...,positive
1,A wonderful little production. <br /><br />The...,positive
2,I thought this was a wonderful way to spend ti...,positive
3,Basically there's a family where a little boy ...,negative
4,"Petter Mattei's ""Love in the Time of Money"" is...",positive
...,...,...
49995,I thought this movie did a down right good job...,positive
49996,"Bad plot, bad dialogue, bad acting, idiotic di...",negative
49997,I am a Catholic taught in parochial elementary...,negative
49998,I'm going to have to disagree with the previou...,negative


In [4]:
df['sentiment'].unique()

array(['positive', 'negative'], dtype=object)

In [5]:
df.dtypes

review       object
sentiment    object
dtype: object

In [6]:
df.isnull().sum()

review       0
sentiment    0
dtype: int64

In [7]:
df['sentiment'] = df['sentiment'].map({"positive":1, "negative":0})

In [8]:
df['sentiment'].unique()

array([1, 0])

In [9]:
df

Unnamed: 0,review,sentiment
0,One of the other reviewers has mentioned that ...,1
1,A wonderful little production. <br /><br />The...,1
2,I thought this was a wonderful way to spend ti...,1
3,Basically there's a family where a little boy ...,0
4,"Petter Mattei's ""Love in the Time of Money"" is...",1
...,...,...
49995,I thought this movie did a down right good job...,1
49996,"Bad plot, bad dialogue, bad acting, idiotic di...",0
49997,I am a Catholic taught in parochial elementary...,0
49998,I'm going to have to disagree with the previou...,0


In [10]:
stop_words = set(stopwords.words('english'))

def preprocess_text(text):
    text = text.lower()  
    text = re.sub(r'https\S+', '', text)  
    text = re.sub(r'#\w+', '', text)  
    text = re.sub(r'@\w+', '', text)  
    text = re.sub(r'[^a-z ]', '', text)  
    text = text.strip()  

    
    word_tokens = word_tokenize(text)  
    filtered_text = ' '.join([word for word in word_tokens if word not in stop_words])  

    return filtered_text

df['review'] = df['review'].apply(preprocess_text)


In [11]:
df

Unnamed: 0,review,sentiment
0,one reviewers mentioned watching oz episode yo...,1
1,wonderful little production br br filming tech...,1
2,thought wonderful way spend time hot summer we...,1
3,basically theres family little boy jake thinks...,0
4,petter matteis love time money visually stunni...,1
...,...,...
49995,thought movie right good job wasnt creative or...,1
49996,bad plot bad dialogue bad acting idiotic direc...,0
49997,catholic taught parochial elementary schools n...,0
49998,im going disagree previous comment side maltin...,0


In [12]:
from transformers import BertTokenizer, TFBertForSequenceClassification

In [13]:
model = TFBertForSequenceClassification.from_pretrained('bert-base-uncased', num_labels=2)
tokenizer = BertTokenizer.from_pretrained('bert-base-uncased')

config.json:   0%|          | 0.00/570 [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/440M [00:00<?, ?B/s]

All PyTorch model weights were used when initializing TFBertForSequenceClassification.

Some weights or buffers of the TF 2.0 model TFBertForSequenceClassification were not initialized from the PyTorch model and are newly initialized: ['classifier.weight', 'classifier.bias']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


tokenizer_config.json:   0%|          | 0.00/48.0 [00:00<?, ?B/s]

vocab.txt:   0%|          | 0.00/232k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/466k [00:00<?, ?B/s]

In [14]:
from sklearn.model_selection import train_test_split

In [15]:
df_train, df_val = train_test_split(df, test_size=0.2, random_state=42)

In [17]:
def tokenize_data(text, tokenizer, max_length =128):
    return tokenizer(
        text.tolist(),
        truncation = True,
        padding = True,
        max_length = max_length,
        return_tensors = 'tf'
    )

train_tokenized = tokenize_data(df_train['review'],tokenizer)
val_tokenized   = tokenize_data(df_val['review'],tokenizer)

train_labels = df_train['sentiment'].values
val_labels = df_val['sentiment'].values


In [18]:
print(val_tokenized)

{'input_ids': <tf.Tensor: shape=(10000, 128), dtype=int32, numpy=
array([[  101,  2428,  4669, ...,  2165, 25307,   102],
       [  101,  2116,  2547, ...,  2855,  2921,   102],
       [  101,  2143,  2855, ...,     0,     0,     0],
       ...,
       [  101,  2748,  2512, ...,  2412,  2580,   102],
       [  101,  2521,  3152, ...,     0,     0,     0],
       [  101,  2387, 13336, ...,     0,     0,     0]], dtype=int32)>, 'token_type_ids': <tf.Tensor: shape=(10000, 128), dtype=int32, numpy=
array([[0, 0, 0, ..., 0, 0, 0],
       [0, 0, 0, ..., 0, 0, 0],
       [0, 0, 0, ..., 0, 0, 0],
       ...,
       [0, 0, 0, ..., 0, 0, 0],
       [0, 0, 0, ..., 0, 0, 0],
       [0, 0, 0, ..., 0, 0, 0]], dtype=int32)>, 'attention_mask': <tf.Tensor: shape=(10000, 128), dtype=int32, numpy=
array([[1, 1, 1, ..., 1, 1, 1],
       [1, 1, 1, ..., 1, 1, 1],
       [1, 1, 1, ..., 0, 0, 0],
       ...,
       [1, 1, 1, ..., 1, 1, 1],
       [1, 1, 1, ..., 0, 0, 0],
       [1, 1, 1, ..., 0, 0, 0]], dtype

In [19]:
train_dataset = tf.data.Dataset.from_tensor_slices((dict(train_tokenized), train_labels)).batch(32)
val_dataset = tf.data.Dataset.from_tensor_slices((dict(val_tokenized), val_labels)).batch(32)

In [20]:
model.compile(
    optimizer=tf.keras.optimizers.Adam(learning_rate=2e-5),
    loss=tf.keras.losses.SparseCategoricalCrossentropy(from_logits=True),
    metrics=['accuracy']
)

In [21]:
history = model.fit(
    train_dataset,
    epochs=3,  
    validation_data=val_dataset
)

Epoch 1/3
Epoch 2/3
Epoch 3/3


In [22]:
model.evaluate(val_dataset)



[0.334349662065506, 0.8949000239372253]

In [24]:
from sklearn.metrics import classification_report

predictions = model.predict(dict(val_tokenized))
pred_labels = tf.argmax(predictions.logits, axis=-1).numpy()

print(classification_report(val_labels, pred_labels))


              precision    recall  f1-score   support

           0       0.90      0.89      0.89      4961
           1       0.89      0.90      0.90      5039

    accuracy                           0.89     10000
   macro avg       0.89      0.89      0.89     10000
weighted avg       0.89      0.89      0.89     10000



In [25]:
import os

model_save_path = "/kaggle/working/sentiment_model"

os.makedirs(model_save_path, exist_ok=True)

model.save_pretrained(model_save_path)
tokenizer.save_pretrained(model_save_path)

print(f"Model saved at: {model_save_path}")


Model saved at: /kaggle/working/sentiment_model


In [26]:
import shutil

# Compress the directory into a zip file
shutil.make_archive("/kaggle/working/sentiment_model", 'zip', "/kaggle/working/sentiment_model")
print("Model zipped successfully.")

Model zipped successfully.
