Mounting Google Drive

In [None]:
# Mount Google Drive at /content/drive; the script folders used later in this
# notebook are read from paths under this mount point.
from google.colab import drive
drive.mount('/content/drive')

ValueError: mount failed

Import necessary libraries

In [None]:
import os
import pandas as pd
import nltk
import re
import torch
from torch.utils.data import DataLoader, Dataset
from transformers import BertTokenizer, BertForSequenceClassification, AdamW, get_scheduler
from sklearn.model_selection import train_test_split
from tqdm import tqdm
import requests
import matplotlib.pyplot as plt

Stopwords

In [None]:
#Stopwords
# Make sure the NLTK stopword corpus is available locally.
nltk.download('stopwords')
from nltk.corpus import stopwords as nltk_stopwords
# Rebind the name from the imported corpus object to a plain set of English
# stopwords; this set is what is passed to load_scripts() further down.
nltk_stopwords = set(nltk_stopwords.words('english'))

[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


Text Preprocessing & Script Labels

In [None]:
#Text preprocessing
def preprocess_text(text, stopwords):
  """Normalise raw script text before tokenization.

  The text is lower-cased, every character that is not a-z or whitespace is
  removed, and runs of whitespace are collapsed to single spaces.

  Args:
    text: Raw text to clean.
    stopwords: Kept for interface compatibility. Stopword filtering is
      currently disabled, so this argument is not used.

  Returns:
    The cleaned text as one space-separated string.
  """
  letters_only = re.sub(r'[^a-z\s]', '', text.lower())
  return ' '.join(letters_only.split())

In [None]:
#Load scripts and predefined labels
def load_scripts(folder_path, label, stopwords):
    """Read and preprocess every ``.txt`` script in a folder.

    Args:
        folder_path: Directory containing the script files.
        label: Label assigned to every script loaded from this folder.
        stopwords: Passed through to ``preprocess_text``.

    Returns:
        A ``(scripts, labels)`` tuple of equal-length lists: the preprocessed
        text of each readable file, and ``label`` repeated once per file.
        Files are processed in sorted filename order.
    """
    scripts = []
    labels = []
    # os.listdir() returns entries in arbitrary order; sorting keeps the row
    # order (and therefore any seeded split built on it) reproducible.
    for filename in sorted(os.listdir(folder_path)):
        if not filename.endswith(".txt"):
            continue
        file_path = os.path.join(folder_path, filename)
        try:
            with open(file_path, 'r', encoding='utf-8') as file:
                text = file.read()
        except UnicodeDecodeError:
            # Best effort: files that are not valid UTF-8 are reported and skipped.
            print(f"Skipping file: {filename} (encoding issue)")
            continue
        scripts.append(preprocess_text(text, stopwords))
        labels.append(label)
    return scripts, labels

Define Paths

In [None]:
#Defining paths
# Folders on the mounted Google Drive; each is passed to load_scripts() with
# its class label (acclaimed -> 1, panned -> 0) in the "Load Data" cell.
acclaimed_path = '/content/drive/MyDrive/Acclaimed Movies'
panned_path = '/content/drive/MyDrive/Panned Movies'

Load Data

In [None]:
#Load data
# Label 1 = acclaimed, label 0 = panned. Each call returns the preprocessed
# script texts together with a parallel list of labels.
acclaimed_scripts, acclaimed_labels = load_scripts(acclaimed_path, 1, nltk_stopwords)
panned_scripts, panned_labels = load_scripts(panned_path, 0, nltk_stopwords)

Tokenizer Initialization

In [None]:
# Tokenizer for the same 'bert-base-uncased' checkpoint that the classifier is
# loaded from in the model-setup cell.
tokenizer = BertTokenizer.from_pretrained('bert-base-uncased')

The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


tokenizer_config.json:   0%|          | 0.00/48.0 [00:00<?, ?B/s]

vocab.txt:   0%|          | 0.00/232k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/466k [00:00<?, ?B/s]

config.json:   0%|          | 0.00/570 [00:00<?, ?B/s]

Combine into a single Dataframe

In [None]:
#Combine into a single Dataframe
# One row per script: acclaimed scripts first, then panned ones.
scripts = acclaimed_scripts + panned_scripts
labels = acclaimed_labels + panned_labels
df = pd.DataFrame({'text': scripts, 'label': labels})

#Split data by scripts to prevent leakage
# 70% train; the remaining 30% is halved into validation and test.
# Both splits are stratified on the label and use a fixed seed.
train_scripts, holdout_scripts = train_test_split(
    df,
    test_size=0.3,
    random_state=42,
    stratify=df['label'],
)
val_scripts, test_scripts = train_test_split(
    holdout_scripts,
    test_size=0.5,
    random_state=42,
    stratify=holdout_scripts['label'],
)

Preview

In [None]:
# Show the first rows of each split as a quick sanity check.
print(train_scripts.head())
print(val_scripts.head())
print(test_scripts.head())

                                                 text  label
36  kill bill courtesy of for educational purposes...      1
4   argo written by chris terrio based on the may ...      1
2   years a slave written by john ridley card fade...      1
15  int welton academy hallway day a young boy dre...      1
26  green book written by nick vallelonga brian cu...      1
                                                 text  label
93  scriptscom land of the lost by chris henchy pa...      0
25  hacksaw ridge by robert schenkkan revisions by...      1
5   annie hall written by woody allen marshall bri...      1
64  scriptscom independence day resurgence by nico...      0
72  scriptscom the lone ranger by justin haythe pa...      0
                                                 text  label
85  scriptscom howard the duck by steve gerber pag...      0
99  scriptscom the karate kid part iii by robert m...      0
78  scriptscom a madea christmas by tyler perry pa...      0
3   a clockwork orange s

Tokenization

In [None]:
def tokenize_texts(texts, max_length=512):
    """Tokenize a collection of texts into one batch of PyTorch tensors.

    Args:
        texts: Iterable of strings to tokenize.
        max_length: Maximum sequence length passed to the tokenizer; longer
            inputs are truncated to this length.

    Returns:
        The tokenizer output for the whole batch, with ``return_tensors="pt"``.
    """
    # Truncate to max_length and pad to the longest sequence in the batch.
    # Without padding, inputs of different lengths cannot be stacked into a
    # single tensor and the tokenizer raises.
    return tokenizer(list(texts), padding=True, truncation=True, max_length=max_length, return_tensors="pt")

# Encodes each whole script, truncated to the first max_length tokens.
# NOTE: train_encodings / val_encodings / test_encodings are reassigned from
# the chunked data in a later cell, so these values are not the ones that
# reach the datasets.
train_encodings = tokenize_texts(train_scripts['text'])
val_encodings = tokenize_texts(val_scripts['text'])
test_encodings = tokenize_texts(test_scripts['text'])

In [None]:
def split_text_into_chunks(script_df, tokenizer, max_length=512):
    """Split every script into consecutive chunks that fit the model input size.

    Each script is tokenized once without special tokens, its token ids are cut
    into consecutive slices, and every slice is decoded back to text.

    Args:
        script_df: DataFrame with ``text`` and ``label`` columns, one row per
            script. The row index is used as the script identifier.
        tokenizer: Tokenizer used to encode the scripts and decode the chunks.
        max_length: Maximum length of a chunk once special tokens are added.

    Returns:
        DataFrame with one row per chunk and the columns ``text``, ``label``
        (copied from the source script) and ``script_id`` (index of the source
        row in ``script_df``).

    Raises:
        ValueError: If ``max_length`` leaves no room for any content tokens.
    """
    # Reserve room for the special tokens (e.g. [CLS]/[SEP]) that are added when
    # the chunk text is tokenized again for the model. Without this, every full
    # chunk exceeds max_length and loses its tail to truncation.
    chunk_size = max_length - tokenizer.num_special_tokens_to_add(pair=False)
    if chunk_size <= 0:
        raise ValueError(
            f"max_length={max_length} leaves no room for content after special tokens"
        )

    chunked_texts = []
    chunked_labels = []
    chunked_ids = []
    for script_id, row in script_df.iterrows():
        text, label = row['text'], row['label']
        # The whole script is tokenized at once, so the tokenizer may warn that
        # the sequence is longer than the model maximum; the ids are only
        # sliced here and never fed to the model in one piece.
        tokens = tokenizer(text, truncation=False, add_special_tokens=False)['input_ids']
        chunks = [tokens[i:i + chunk_size] for i in range(0, len(tokens), chunk_size)]
        # decode() merges sub-word pieces back into words, so tokenizing the
        # chunk text again yields (close to) the same ids instead of inflating
        # the length with literal "##" fragments.
        # NOTE(review): a chunk that starts in the middle of a word may still
        # re-tokenize to slightly different pieces - confirm this is acceptable.
        chunked_texts.extend(tokenizer.decode(chunk) for chunk in chunks)
        chunked_labels.extend([label] * len(chunks))
        chunked_ids.extend([script_id] * len(chunks))
    return pd.DataFrame({'text': chunked_texts, 'label': chunked_labels, 'script_id': chunked_ids})

In [None]:
#Combining into a single Dataframe
# NOTE: this rebuilds the same DataFrame and the same seeded, stratified
# 70/15/15 split as the earlier "Combine into a single Dataframe" cell.
scripts = acclaimed_scripts + panned_scripts
labels = acclaimed_labels + panned_labels
df = pd.DataFrame({'text': scripts, 'label': labels})

#Splitting data by scripts to prevent leakage
# The split happens before chunking, so all chunks of one script end up in
# the same partition.
train_scripts, test_scripts = train_test_split(df, test_size=0.3, random_state=42, stratify=df['label'])
val_scripts, test_scripts = train_test_split(test_scripts, test_size=0.5, random_state=42, stratify=test_scripts['label'])

#Splitting and chunking dataset
# One row per chunk, carrying the label and script_id of its source script.
train_chunked = split_text_into_chunks(train_scripts, tokenizer)
val_chunked = split_text_into_chunks(val_scripts, tokenizer)
test_chunked = split_text_into_chunks(test_scripts, tokenizer)

def tokenize_texts(texts, max_length=512):
    """Tokenize a collection of texts into one padded batch of PyTorch tensors.

    Args:
        texts: Iterable of strings to tokenize.
        max_length: Maximum sequence length passed to the tokenizer; longer
            inputs are truncated. Defaults to 512, the value this function
            previously hard-coded.

    Returns:
        The tokenizer output for the whole batch, padded to the longest
        sequence, with ``return_tensors="pt"``.
    """
    return tokenizer(list(texts), padding=True, truncation=True, max_length=max_length, return_tensors="pt")

# Encode the chunk texts (one sequence per chunk); these encodings are the
# ones wrapped by MovieDataset below.
train_encodings = tokenize_texts(train_chunked['text'])
val_encodings = tokenize_texts(val_chunked['text'])
test_encodings = tokenize_texts(test_chunked['text'])

Token indices sequence length is longer than the specified maximum sequence length for this model (50090 > 512). Running this sequence through the model will result in indexing errors


Split scripts into chunks

Split and chunk datasets

In [None]:
# NOTE: repeats the chunking already done in the previous code cell with the
# same inputs, so the *_chunked frames are recomputed rather than changed.
train_chunked = split_text_into_chunks(train_scripts, tokenizer)
val_chunked = split_text_into_chunks(val_scripts, tokenizer)
test_chunked = split_text_into_chunks(test_scripts, tokenizer)

PyTorch Dataset Class

In [None]:
#PyTorch Dataset class
class MovieDataset(Dataset):
    """Map-style dataset pairing tokenizer encodings with their labels."""

    def __init__(self, encodings, labels):
        """Keep the encodings and a list copy of the labels.

        Args:
            encodings: Mapping from field name to a per-example indexable
                collection (as produced by ``tokenize_texts``).
            labels: Iterable of labels, one per example.
        """
        self.encodings = encodings
        self.labels = list(labels)

    def __len__(self):
        """Return the number of examples (the number of labels)."""
        return len(self.labels)

    def __getitem__(self, idx):
        """Return example ``idx``: every encoding field plus a ``labels`` tensor."""
        sample = {}
        for field, values in self.encodings.items():
            sample[field] = values[idx]
        sample['labels'] = torch.tensor(self.labels[idx])
        return sample

# One dataset per split; encodings and labels both come from the chunked
# frames, so every example is one chunk.
train_dataset = MovieDataset(train_encodings, train_chunked['label'])
val_dataset = MovieDataset(val_encodings, val_chunked['label'])
test_dataset = MovieDataset(test_encodings, test_chunked['label'])

Dataloaders

In [None]:
# Batches of 16 chunks. Only the training loader shuffles; validation and
# test keep the dataset order.
train_loader = DataLoader(train_dataset, batch_size=16, shuffle=True)
val_loader = DataLoader(val_dataset, batch_size=16)
test_loader = DataLoader(test_dataset, batch_size=16)

Model setup

In [None]:
#Model setup
# Prefer the GPU when one is available, otherwise run on the CPU.
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

# Pretrained BERT with a two-class classification head, moved to the device.
model = BertForSequenceClassification.from_pretrained('bert-base-uncased', num_labels=2)
model = model.to(device)

optimizer = AdamW(model.parameters(), lr=2e-5)

# The scheduler is stepped once per training batch, so the total step count is
# batches per epoch times 5 epochs. Keep the 5 in sync with `epochs` in the
# training loop.
num_training_steps = 5 * len(train_loader)
scheduler = get_scheduler(
    "linear",
    optimizer=optimizer,
    num_warmup_steps=0,
    num_training_steps=num_training_steps,
)

model.safetensors:   0%|          | 0.00/440M [00:00<?, ?B/s]

Some weights of BertForSequenceClassification were not initialized from the model checkpoint at bert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


TRAINING LOOP

1. Training loop: the model is trained on the training dataset for the specified number of epochs and evaluated on the validation set after each epoch.

In [None]:
# Number of passes over the training data. num_training_steps (set in the
# model-setup cell) is computed with the same value of 5.
epochs = 5
# One progress-bar tick per optimizer step across all epochs.
progress_bar = tqdm(range(num_training_steps))

for epoch in range(epochs):
    # Training phase.
    model.train()
    for batch in train_loader:
        # Move every tensor of the batch to the device the model was placed on.
        batch = {key: val.to(device) for key, val in batch.items()}
        # Each batch carries a 'labels' entry (added by MovieDataset); the loss
        # is read from the model output.
        outputs = model(**batch)
        loss = outputs.loss
        optimizer.zero_grad()
        loss.backward()
        optimizer.step()
        # The learning-rate schedule advances once per batch.
        scheduler.step()
        progress_bar.update(1)

    # Validation: evaluation mode and no gradient tracking. Loss is summed per
    # batch and accuracy is counted per chunk.
    model.eval()
    val_loss = 0
    val_correct = 0
    val_total = 0
    with torch.no_grad():
        for batch in val_loader:
            batch = {key: val.to(device) for key, val in batch.items()}
            outputs = model(**batch)
            val_loss += outputs.loss.item()
            predictions = torch.argmax(outputs.logits, dim=-1)
            val_correct += (predictions == batch['labels']).sum().item()
            val_total += batch['labels'].size(0)

    # Reported loss is the mean over validation batches; accuracy is chunk-level.
    print(f"Epoch {epoch + 1}: Validation Loss = {val_loss / len(val_loader):.4f}, Accuracy = {val_correct / val_total:.4f}")

  0%|          | 0/860 [00:00<?, ?it/s]

2. Script-Level Accuracy Function

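After the training loop has run, the `script_level_accuracy` function evaluates the model at the script level: each script's chunks are classified individually and the script's prediction is the majority vote over its chunks.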



In [None]:
def script_level_accuracy(chunked_dataset, model, tokenizer):
    """Compute accuracy per script by majority vote over its chunk predictions.

    Every row (chunk) is classified on its own. The predictions of all chunks
    sharing a ``script_id`` are then reduced to a single label by majority
    vote and compared with the script's label.

    Args:
        chunked_dataset: DataFrame with ``text``, ``label`` and ``script_id``
            columns, one row per chunk.
        model: Sequence-classification model whose output exposes ``logits``.
        tokenizer: Tokenizer used to encode each chunk text.

    Returns:
        Fraction of scripts whose majority-voted prediction equals their label,
        or ``0.0`` when ``chunked_dataset`` contains no rows.
    """
    script_predictions = {}
    script_labels = {}

    # Predict in evaluation mode so that dropout cannot perturb the result,
    # then restore whatever mode the caller had set.
    was_training = model.training
    model.eval()
    try:
        for _, row in chunked_dataset.iterrows():
            text, label, script_id = row['text'], row['label'], row['script_id']
            inputs = tokenizer(text, return_tensors="pt", truncation=True, max_length=512, padding=True)
            inputs = {key: val.to(device) for key, val in inputs.items()}
            with torch.no_grad():
                outputs = model(**inputs)
            pred = torch.argmax(outputs.logits, dim=-1).item()
            if script_id not in script_predictions:
                script_predictions[script_id] = []
                # All chunks of a script share its label; keep the first one seen.
                script_labels[script_id] = label
            script_predictions[script_id].append(pred)
    finally:
        model.train(was_training)

    # No scripts to score: avoid dividing by zero.
    if not script_predictions:
        return 0.0

    correct = 0
    for script_id, predictions in script_predictions.items():
        # Most frequent predicted class among the script's chunks.
        majority_vote = max(set(predictions), key=predictions.count)
        if majority_vote == script_labels[script_id]:
            correct += 1
    return correct / len(script_predictions)

# Score the test chunks at script level (one majority-voted prediction per script).
print(f"Script-Level Accuracy: {script_level_accuracy(test_chunked, model, tokenizer):.4f}")

Script-Level Accuracy: 0.9333


Run model on test data

In [None]:
# Same script-level evaluation of the test chunks as above, this time kept in
# a variable.
test_accuracy = script_level_accuracy(test_chunked, model, tokenizer)
print(f"Test Accuracy: {test_accuracy:.4f}")

Test Accuracy: 0.9333


Precision, recall, and F1-score on the test set (chunk-level predictions)

In [None]:
from sklearn.metrics import precision_score, recall_score, f1_score

In [None]:
def get_predictions(chunked_dataset, model, tokenizer):
    """Classify every chunk and return the predictions alongside the labels.

    Args:
        chunked_dataset: DataFrame with ``text`` and ``label`` columns, one row
            per chunk.
        model: Sequence-classification model whose output exposes ``logits``.
        tokenizer: Tokenizer used to encode each chunk text.

    Returns:
        ``(all_predictions, all_labels)``: two lists with one entry per row of
        ``chunked_dataset``, in row order.
    """
    all_predictions = []
    all_labels = []

    # Predict in evaluation mode so that dropout cannot perturb the result,
    # then restore whatever mode the caller had set.
    was_training = model.training
    model.eval()
    try:
        for _, row in chunked_dataset.iterrows():
            text, label = row['text'], row['label']
            inputs = tokenizer(text, return_tensors="pt", truncation=True, max_length=512, padding=True)
            inputs = {key: val.to(device) for key, val in inputs.items()}
            with torch.no_grad():
                outputs = model(**inputs)
            pred = torch.argmax(outputs.logits, dim=-1).item()

            all_predictions.append(pred)
            all_labels.append(label)
    finally:
        model.train(was_training)

    return all_predictions, all_labels

# Chunk-level predictions and labels for the test set (one entry per chunk row).
test_predictions, test_labels = get_predictions(test_chunked, model, tokenizer)

In [None]:
# Metrics over the per-chunk test predictions, using scikit-learn's default
# settings for precision_score / recall_score / f1_score.
precision = precision_score(test_labels, test_predictions)
recall = recall_score(test_labels, test_predictions)
f1 = f1_score(test_labels, test_predictions)

print(f"Precision: {precision:.4f}")
print(f"Recall: {recall:.4f}")
print(f"F1-score: {f1:.4f}")

Precision: 0.8834
Recall: 0.9799
F1-score: 0.9292


In [None]:
from sklearn.metrics import precision_score, recall_score, f1_score

# Evaluation mode for inference on the validation set.
model.eval()

#Storing true labels and predictions
val_labels = []
val_predictions = []

with torch.no_grad():
    for batch in val_loader:
        # Move the batch to the model's device, run it, and take the class
        # with the highest logit for every chunk.
        batch = {name: tensor.to(device) for name, tensor in batch.items()}
        outputs = model(**batch)
        predictions = outputs.logits.argmax(dim=-1)

        val_labels.extend(batch['labels'].cpu().numpy())
        val_predictions.extend(predictions.cpu().numpy())

#Calculating P, R, F1
# Chunk-level metrics over the whole validation set.
val_precision = precision_score(val_labels, val_predictions)
val_recall = recall_score(val_labels, val_predictions)
val_f1 = f1_score(val_labels, val_predictions)

print(f"Validation Precision: {val_precision:.4f}")
print(f"Validation Recall: {val_recall:.4f}")
print(f"Validation F1-score: {val_f1:.4f}")

Validation Precision: 0.9951
Validation Recall: 0.9760
Validation F1-score: 0.9854
