# IMDB movie review analysis

### Overview

## Import necessary libraries

In [35]:
import json
import pandas as pd
import re
import nltk
import string
from sklearn.feature_extraction.text import CountVectorizer
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer, PorterStemmer
import torch
from transformers import AutoTokenizer, AutoModelForCausalLM , AutoModelForSequenceClassification

## Download punktuations and stopwords for preprocessing 

In [36]:
# nltk.download('punkt')
# nltk.download('stopwords')

## Load data

In [37]:
train_data = []
test_data = []
unlabeled_data = []

def read_json(data, json_file):
    with open(json_file, 'r') as f:
        for line in f:
            line = line.strip()  # remove leading/trailing white spaces
            if line:  # ensure the line is not empty
                try:
                    data.append(json.loads(line))
                except json.JSONDecodeError as e:
                    print(f"Error decoding JSON: {e}")
                    
read_json(train_data, 'train_imdb.jsonl')
read_json(test_data, 'test_imdb.jsonl')
read_json(unlabeled_data, 'aug_imdb_unlabeled.jsonl')



In [38]:
train_data = pd.DataFrame(train_data)
test_data = pd.DataFrame(test_data)
unlabeled_data = pd.DataFrame(unlabeled_data)


In [5]:
print(f"Unlabeled Data: {len(unlabeled_data)}")
print(f"Training Data: {len(train_data)}")
print(f"Test Data: {len(test_data)}")

Unlabeled Data: 1014
Training Data: 150
Test Data: 150


In [6]:
train_data.head(5)

Unnamed: 0,text,label,embedding
0,fairly good romantic comedy in which i don't t...,1,"[-0.0167805497, -0.0395836979, 0.1233159453, -..."
1,"""dressed to kill"", is one of the best thriller...",1,"[-0.1252697259, 0.1014768854, 0.1718291789, -0..."
2,i'm glad that users (as of this date) who like...,1,"[0.1312361956, 0.0294876788, 0.2328549027, -0...."
3,needed an excuse to get out of the house while...,0,"[0.1387384981, 0.0460377187, 0.3447172046, -0...."
4,john candy's performance in once upon a crime ...,1,"[0.1606466323, -0.1768193543, 0.3563380837, -0..."


In [7]:
test_data.head()

Unnamed: 0,text,label,embedding
0,the 60s (1999) d: mark piznarski. josh hamilto...,0,"[-0.2179879397, -0.1741176099, 0.0884851664, -..."
1,hello. this movie is.......well.......okay. ju...,1,"[-0.0783471093, -0.279764235, 0.6189775467, 0...."
2,eyeliner was worn nearly 6000 years ago in egy...,1,"[0.03139963, -0.1652034372, 0.1265712678, -0.0..."
3,"this has to be, by far, the absolute worst mov...",0,"[-0.0552324504, -0.1593759954, 0.0467776954, -..."
4,"i like silent films, but this was a little too...",0,"[0.0934860557, 0.0262434836, 0.0843501985, -0...."


In [8]:
unlabeled_data.head()

Unnamed: 0,text,embedding
0,there is no relation at all between fortier an...,"[-0.097577557, -0.1536363065, 0.311417222, 0.0..."
1,in the process of trying to establish the audi...,"[-0.0003366936, 0.0877778083, -0.0071643554, 0..."
2,i give this movie 7 out of 10 because the vill...,"[-0.275570631, -0.3291363716, 0.079317905, 0.0..."
3,this is the best sci-fi that i have seen in my...,"[0.1461943835, -0.2785910368, 0.4456491172, -0..."
4,what an appalling piece of rubbish!!! who are ...,"[0.1696606129, 0.354041934, 0.4451519549, -0.0..."


In [9]:
def check_list_sizes(df, column_name):
    list_lengths = df[column_name].apply(len)
    return list_lengths.nunique() == 1

# Usage
result = check_list_sizes(unlabeled_data, 'embedding')
print("All lists have the same size:", result)

All lists have the same size: True


## Preproccesing

The steps of cleaning the dataset:
- tokenize the texts 
- make all the words in lowercase
- remove punctuations and stopwords
- stemmize words

In [10]:
def clean_text(text):
    text = re.sub(r"^[^-]*-\s*", "", text)
    text = re.sub(r"([^\w\s])", "", text)
    
    tokens = nltk.word_tokenize(text)
    remove_punct = str.maketrans('', '', string.punctuation)
    tokens = [token.lower().translate(remove_punct) for token in tokens]

    stop_words = set(stopwords.words('english'))
    tokens = [token for token in tokens if token not in stop_words]

    lemmatizer = WordNetLemmatizer()
    tokens = [lemmatizer.lemmatize(token) for token in tokens]

    # stemmer = PorterStemmer()
    # tokens = [stemmer.stem(token) for token in tokens]

    return tokens


In [11]:
train_data['cleaned_text'] = train_data['text'].apply(clean_text)
test_data['cleaded_text'] = test_data['text'].apply(clean_text)
unlabeled_data['cleaned_text'] = unlabeled_data['text'].apply(clean_text)

In [12]:
train_data.head()

Unnamed: 0,text,label,embedding,cleaned_text
0,fairly good romantic comedy in which i don't t...,1,"[-0.0167805497, -0.0395836979, 0.1233159453, -...","[fairly, good, romantic, comedy, dont, think, ..."
1,"""dressed to kill"", is one of the best thriller...",1,"[-0.1252697259, 0.1014768854, 0.1718291789, -0...","[dressed, kill, one, best, thriller, ever, mad..."
2,i'm glad that users (as of this date) who like...,1,"[0.1312361956, 0.0294876788, 0.2328549027, -0....","[seems, like, expecting, serious, treatment, c..."
3,needed an excuse to get out of the house while...,0,"[0.1387384981, 0.0460377187, 0.3447172046, -0....","[left, movie, hour, return, watch, paint, dryb..."
4,john candy's performance in once upon a crime ...,1,"[0.1606466323, -0.1768193543, 0.3563380837, -0...","[john, candy, performance, upon, crime, possib..."


In [13]:
test_data.head()

Unnamed: 0,text,label,embedding,cleaded_text
0,the 60s (1999) d: mark piznarski. josh hamilto...,0,"[-0.2179879397, -0.1741176099, 0.0884851664, -...","[series, later, released, videodvd, full, leng..."
1,hello. this movie is.......well.......okay. ju...,1,"[-0.0783471093, -0.279764235, 0.6189775467, 0....","[hello, movie, iswellokay, kidding, awesome, b..."
2,eyeliner was worn nearly 6000 years ago in egy...,1,"[0.03139963, -0.1652034372, 0.1265712678, -0.0...","[informed, dont, watch, show, waste, space, bo..."
3,"this has to be, by far, the absolute worst mov...",0,"[-0.0552324504, -0.1593759954, 0.0467776954, -...","[far, absolute, worst, movie, seen, last, 20, ..."
4,"i like silent films, but this was a little too...",0,"[0.0934860557, 0.0262434836, 0.0843501985, -0....","[like, silent, film, little, moronic, much, wi..."


In [14]:
unlabeled_data.head()

Unnamed: 0,text,embedding,cleaned_text
0,there is no relation at all between fortier an...,"[-0.097577557, -0.1536363065, 0.311417222, 0.0...","[relation, fortier, profiler, fact, police, se..."
1,in the process of trying to establish the audi...,"[-0.0003366936, 0.0877778083, -0.0071643554, 0...","[process, trying, establish, audience, empathy..."
2,i give this movie 7 out of 10 because the vill...,"[-0.275570631, -0.3291363716, 0.079317905, 0.0...","[give, movie, 7, 10, villain, interesting, rol..."
3,this is the best sci-fi that i have seen in my...,"[0.1461943835, -0.2785910368, 0.4456491172, -0...","[fi, seen, 29, year, watching, scifi, also, be..."
4,what an appalling piece of rubbish!!! who are ...,"[0.1696606129, 0.354041934, 0.4451519549, -0.0...","[appalling, piece, rubbish, people, blubber, g..."


## Semi-supervised learning

### Using Traditional Methods (Label Propagation)

#### Label Propagation for Unlabeled Data

We'll use the `LabelSpreading` technique from `scikit-learn` to propagate labels to the unlabeled data.

In [15]:
import pandas as pd
import numpy as np
from sklearn.semi_supervised import LabelSpreading
import tensorflow as tf
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Dense
from tensorflow.keras.callbacks import EarlyStopping




In [16]:
# Prepare the data for LabelSpreading
X_train = np.vstack(train_data['embedding'].values)
y_train = train_data['label'].values

X_unlabeled = np.vstack(unlabeled_data['embedding'].values)
y_unlabeled = -1 * np.ones(X_unlabeled.shape[0])

X_combined = np.vstack((X_train, X_unlabeled))
y_combined = np.concatenate((y_train, y_unlabeled))

# Label propagation
label_spread = LabelSpreading(kernel='knn', n_neighbors=3)
label_spread.fit(X_combined, y_combined)

# Get the propagated labels
propagated_labels = label_spread.transduction_[-len(unlabeled_data):]

# Create a DataFrame with the propagated labels
propagated_labels_data = unlabeled_data.copy()
propagated_labels_data['label'] = propagated_labels

#### Combine Propagated Labels and Train Data

In [17]:
combined_data = pd.concat([train_data, propagated_labels_data], ignore_index=True)


X_train_combined = np.vstack(combined_data['embedding'].values)
y_train_combined = combined_data['label'].values

#### Build and Train a Neural Network with Early Stopping

In [18]:
model = Sequential([
    Dense(64, activation='relu', input_shape=(X_train_combined.shape[1],)),
    Dense(64, activation='relu'),
    Dense(1, activation='sigmoid') 
])




In [19]:
model.compile(optimizer='adam', loss='binary_crossentropy', metrics=['accuracy'])

early_stopping = EarlyStopping(monitor='val_loss', patience=5, restore_best_weights=True)

label_propagation_history = model.fit(X_train_combined, y_train_combined, 
                    validation_split=0.2, epochs=100, 
                    callbacks=[early_stopping])


Epoch 1/100


Epoch 2/100
Epoch 3/100
Epoch 4/100
Epoch 5/100
Epoch 6/100
Epoch 7/100
Epoch 8/100
Epoch 9/100
Epoch 10/100
Epoch 11/100
Epoch 12/100


### Evaluate model

In [20]:
X_test = np.vstack(test_data['embedding'].values)
y_test = test_data['label'].values

label_propagation_test_loss, label_propagation_test_accuracy = model.evaluate(X_test, y_test)
print(f'Test Accuracy: {label_propagation_test_accuracy:.2f}')

Test Accuracy: 0.78


### Using LLMs

In [21]:
MODEL_ARGS = {
    'Name': 'microsoft/Phi-3-mini-128k-instruct',
    'DType': 'bfloat16'
}

def load_model(model_args):
    device = "cuda" if torch.cuda.is_available() else "cpu"

    model = AutoModelForCausalLM.from_pretrained(
        model_args['Name'],
        trust_remote_code=True,
        torch_dtype=getattr(torch, model_args['DType']),
        low_cpu_mem_usage=True,
        device_map={"": device},
    )
    tokenizer = AutoTokenizer.from_pretrained(
        model_args['Name'],
        trust_remote_code=True,
    )

    return model, tokenizer

model, tokenizer = load_model(MODEL_ARGS)
# model.save_pretrained("PATH")
# tokenizer.save_pretrained("PATH")

`flash-attention` package not found, consider installing for better performance: No module named 'flash_attn'.
Current `flash-attenton` does not support `window_size`. Either upgrade or use `attn_implementation='eager'`.


Loading checkpoint shards:   0%|          | 0/2 [00:00<?, ?it/s]

Special tokens have been added in the vocabulary, make sure the associated word embeddings are fine-tuned or trained.


In [47]:
from transformers import AutoModelForSequenceClassification, AutoTokenizer
import torch

model_name = "bert-base-uncased"
tokenizer = AutoTokenizer.from_pretrained(model_name)
model = AutoModelForSequenceClassification.from_pretrained(model_name).to("cuda")

X_train = train_data['text'].values.tolist() 
y_train = train_data['label'].values

X_unlabeled = unlabeled_data['text'].values.tolist() 

max_length = 64  
train_encodings = tokenizer(X_train, truncation=True, padding=True, max_length=max_length)
unlabeled_encodings = tokenizer(X_unlabeled, truncation=True, padding=True, max_length=max_length)

def classify_tokens(model, encodings):
    with torch.no_grad():
        input_ids = torch.tensor(encodings['input_ids']).to("cuda")
        attention_mask = torch.tensor(encodings['attention_mask']).to("cuda")
        outputs = model(input_ids, attention_mask=attention_mask)
        logits = outputs.logits
        predictions = torch.argmax(logits, dim=-1).cpu().numpy()
    return predictions

unlabeled_predictions = classify_tokens(model, unlabeled_encodings)

y_combined[len(y_train):] = unlabeled_predictions

print("Unlabeled Data:", len(unlabeled_data))
print("Training Data:", len(train_data))
print("Test Data:", len(test_data))

Some weights of BertForSequenceClassification were not initialized from the model checkpoint at bert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Unlabeled Data: 1014
Training Data: 150
Test Data: 150


In [69]:
unlabeled_predictions_df = pd.DataFrame({'label': unlabeled_predictions})

unlabeled_data_with_labels = unlabeled_data.copy()  
unlabeled_data_with_labels['label'] = unlabeled_predictions_df['label']

Storing labeled data results in JSON format for future use:

In [70]:
unlabeled_data_with_labels.drop(columns=['cleaned_text'], inplace=True)
unlabeled_data_with_labels = unlabeled_data_with_labels[['text', 'label', 'embedding']]

output_file = "unlabeled_data_with_labels.jsonl"
unlabeled_data_with_labels.to_json(output_file, orient='records', lines=True)

In [67]:
unlabeled_data_with_labels

Unnamed: 0,text,label,embedding
0,there is no relation at all between fortier an...,1,"[-0.097577557, -0.1536363065, 0.311417222, 0.0..."
1,in the process of trying to establish the audi...,1,"[-0.0003366936, 0.0877778083, -0.0071643554, 0..."
2,i give this movie 7 out of 10 because the vill...,0,"[-0.275570631, -0.3291363716, 0.079317905, 0.0..."
3,this is the best sci-fi that i have seen in my...,1,"[0.1461943835, -0.2785910368, 0.4456491172, -0..."
4,what an appalling piece of rubbish!!! who are ...,1,"[0.1696606129, 0.354041934, 0.4451519549, -0.0..."
...,...,...,...
1009,unbelievable!<br /><br />this film gets a 7 ou...,1,"[-0.0955021083, 0.0211753864, 0.3570575416, -0..."
1010,sweet romantic drama/comedy about stewart and ...,1,"[0.017505046, -0.0501609854, 0.4082049727, -0...."
1011,"personally, i disdain the jerry springer show,...",0,"[-0.196471706, -0.0579777397, 0.1792553961, -0..."
1012,this film looked promising but it was actually...,1,"[-0.0007334474, -0.1367768645, 0.1660933644, 0..."


In [None]:
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score


X_train, X_test, y_train, y_test = train_test_split(unlabeled_data_with_labels['text'], unlabeled_data_with_labels['label'], test_size=0.2)

vectorizer = TfidfVectorizer(max_features=5000)

X_train_tfidf = vectorizer.fit_transform(X_train)

X_test_tfidf = vectorizer.transform(X_test)

logistic_model = LogisticRegression(max_iter=1000)
logistic_model.fit(X_train_tfidf, y_train)

predictions = logistic_model.predict(X_test_tfidf)

accuracy = accuracy_score(y_test, predictions)
print("Accuracy:", accuracy)


### Evaluating on test data : 

In [39]:
unlabeled_data_with_labels = []
read_json(unlabeled_data_with_labels, 'unlabeled_data_with_labels.jsonl')
unlabeled_data_with_labels = pd.DataFrame(unlabeled_data_with_labels)

In [40]:
train_data = pd.concat([train_data[['text', 'label']],unlabeled_data_with_labels[['text', 'label']]], ignore_index=True)
train_data

Unnamed: 0,text,label
0,fairly good romantic comedy in which i don't t...,1
1,"""dressed to kill"", is one of the best thriller...",1
2,i'm glad that users (as of this date) who like...,1
3,needed an excuse to get out of the house while...,0
4,john candy's performance in once upon a crime ...,1
...,...,...
1159,unbelievable!<br /><br />this film gets a 7 ou...,1
1160,sweet romantic drama/comedy about stewart and ...,1
1161,"personally, i disdain the jerry springer show,...",0
1162,this film looked promising but it was actually...,1


In [44]:
device = "cuda" if torch.cuda.is_available() else "cpu"

In [46]:
import torch
from torch.utils.data import Dataset, DataLoader
from transformers import AdamW, get_linear_schedule_with_warmup
from sklearn.metrics import accuracy_score

class CustomDataset(Dataset):
    def __init__(self, texts, labels, tokenizer, max_length):
        self.texts = texts
        self.labels = labels
        self.tokenizer = tokenizer
        self.max_length = max_length

    def __len__(self):
        return len(self.texts)

    def __getitem__(self, idx):
        text = str(self.texts[idx])
        label = self.labels[idx]
        encoding = self.tokenizer(text, truncation=True, padding='max_length', max_length=self.max_length, return_tensors='pt')
        return {
            'input_ids': encoding['input_ids'].flatten(),
            'attention_mask': encoding['attention_mask'].flatten(),
            'labels': torch.tensor(label, dtype=torch.long)
        }

MAX_LENGTH = 64
BATCH_SIZE = 32
LEARNING_RATE = 2e-5
EPOCHS = 3

train_dataset = CustomDataset(train_data['text'], train_data['label'], tokenizer, MAX_LENGTH)
train_loader = DataLoader(train_dataset, batch_size=BATCH_SIZE, shuffle=True)

test_dataset = CustomDataset(test_data['text'], test_data['label'], tokenizer, MAX_LENGTH)
test_loader = DataLoader(test_dataset, batch_size=BATCH_SIZE, shuffle=False)


In [53]:
optimizer = AdamW(model.parameters(), lr=LEARNING_RATE)
total_steps = len(train_loader) * EPOCHS
scheduler = get_linear_schedule_with_warmup(optimizer, num_warmup_steps=0, num_training_steps=total_steps)

for epoch in range(EPOCHS):
    model.train()
    for batch in train_loader:
        input_ids = batch['input_ids'].to(device)
        attention_mask = batch['attention_mask'].to(device)
        labels = batch['labels'].to(device)

        outputs = model(input_ids=input_ids, attention_mask=attention_mask, labels=labels)
        loss = outputs.loss

        optimizer.zero_grad()
        loss.backward()
        optimizer.step()
        scheduler.step()

model.eval()
predictions = []
true_labels = []

with torch.no_grad():
    for batch in test_loader:
        input_ids = batch['input_ids'].to(device)
        attention_mask = batch['attention_mask'].to(device)
        labels = batch['labels'].numpy()

        outputs = model(input_ids=input_ids, attention_mask=attention_mask)
        logits = outputs.logits
        predicted_labels = logits.argmax(axis=1).cpu().numpy()

        predictions.extend(predicted_labels)
        true_labels.extend(labels)

accuracy = accuracy_score(true_labels, predictions)
print("Accuracy:", accuracy)



OutOfMemoryError: CUDA out of memory. Tried to allocate 128.00 MiB. GPU 