In [None]:
# Colab-only: mount Google Drive so files under /content/drive are reachable.
# The corpus and the saved model directory used later both live under this mount point.
from google.colab import drive
drive.mount('/content/drive')

import torch

# Pick the compute device: CUDA when torch reports one, otherwise fall back to the CPU
device = torch.device("cuda") if torch.cuda.is_available() else torch.device("cpu")
print(f"Using device: {device}")

Mounted at /content/drive
Using device: cuda


In [None]:
import os
import re

# Locations of the Cornell Movie Dialogs Corpus files on the mounted Drive
data_dir = '/content/drive/MyDrive/AAI520-NLP/Final_Project/data'
lines_file, conversations_file = (
    os.path.join(data_dir, corpus_name)
    for corpus_name in ('movie_lines.txt', 'movie_conversations.txt')
)

# Function to load the dataset and check the number of lines/size
def analyze_file(file_path, delimiter=' +++$+++ '):
    """Print and return basic size statistics for a delimited text file.

    The file is streamed line by line, so only the first line is kept in
    memory (it is used as the structural sample).

    Args:
        file_path: Path of the text file to inspect (read as ISO-8859-1).
        delimiter: Field separator used to split the first line into columns.

    Returns:
        A tuple ``(num_rows, num_variables, file_size)`` where ``num_rows`` is
        the number of lines, ``num_variables`` is the number of fields in the
        first line, and ``file_size`` is the on-disk size in bytes.
        An empty file yields ``(0, 0, 0)`` instead of raising ``IndexError``.

    Raises:
        OSError: If the file cannot be opened or its size cannot be read.
    """
    sample_line = []   # stays empty when the file has no lines at all
    num_rows = 0

    with open(file_path, 'r', encoding='iso-8859-1') as f:
        # enumerate(start=1) leaves num_rows equal to the line count after the loop
        for num_rows, raw_line in enumerate(f, start=1):
            if num_rows == 1:
                # First line (for checking the structure)
                sample_line = raw_line.strip().split(delimiter)

    # Number of columns (variables) as seen in the first line
    num_variables = len(sample_line)

    # Size of the dataset (in bytes)
    file_size = os.path.getsize(file_path)

    print(f"File: {os.path.basename(file_path)}")
    print(f"Number of Rows (data points): {num_rows}")
    print(f"Number of Variables (columns): {num_variables}")
    print(f"Size of Dataset: {file_size / (1024 * 1024):.2f} MB")
    print(f"Sample Line: {sample_line}\n")

    return num_rows, num_variables, file_size

# Analyze the 'movie_lines.txt' file (its summary is printed; the returned tuple is discarded)
analyze_file(lines_file)

# Analyze the 'movie_conversations.txt' file.
# This call is the last expression of the cell, so its returned
# (num_rows, num_variables, file_size) tuple is also shown as the cell output.
analyze_file(conversations_file)

File: movie_lines.txt
Number of Rows (data points): 304713
Number of Variables (columns): 5
Size of Dataset: 33.04 MB
Sample Line: ['L1045', 'u0', 'm0', 'BIANCA', 'They do not!']

File: movie_conversations.txt
Number of Rows (data points): 83097
Number of Variables (columns): 4
Size of Dataset: 6.45 MB
Sample Line: ['u0', 'u2', 'm0', "['L194', 'L195', 'L196', 'L197']"]



(83097, 4, 6760930)

In [None]:
# Load movie lines and conversations
def load_lines(file_path):
    """Build a ``{line_id: dialogue_text}`` mapping from movie_lines.txt.

    Each row is split on the `` +++$+++ `` separator. Only rows with exactly
    five fields are kept: the first field is the line ID and the fifth is the
    dialogue text, stored with surrounding whitespace stripped. When a line ID
    occurs more than once, the last occurrence wins.
    """
    field_sep = " +++$+++ "
    with open(file_path, 'r', encoding='iso-8859-1') as file:
        records = (raw.split(field_sep) for raw in file)
        return {
            fields[0]: fields[4].strip()
            for fields in records
            if len(fields) == 5
        }

def load_conversations(file_path, lines):
    """Load conversations from movie_conversations.txt as lists of utterance texts.

    Args:
        file_path: Path to movie_conversations.txt (read as ISO-8859-1).
        lines: Mapping of line ID -> dialogue text, as produced by ``load_lines``.

    Returns:
        A list with one entry per well-formed row (exactly four
        `` +++$+++ ``-separated fields). Each entry is the list of dialogue
        texts for that row's line IDs, in order; IDs missing from ``lines``
        are silently dropped.

    Raises:
        ValueError, SyntaxError: If the fourth field is not a plain Python
            literal (e.g. it contains a function call).
    """
    import ast  # function-scope import so this definition does not depend on another cell

    conversations = []
    with open(file_path, 'r', encoding='iso-8859-1') as file:
        for line in file:  # stream the file instead of materialising it with readlines()
            parts = line.split(" +++$+++ ")
            if len(parts) != 4:
                continue
            # SECURITY: the field comes from a data file, so it is parsed with
            # ast.literal_eval (literals only) instead of eval(), which would
            # execute arbitrary expressions embedded in the corpus.
            line_ids = ast.literal_eval(parts[3].strip())
            conversations.append(
                [lines[line_id] for line_id in line_ids if line_id in lines]
            )
    return conversations

# Load the data.
# `lines` maps each line ID to its dialogue text; `conversations` is a list of
# conversations, each a list of dialogue texts. Order matters: the conversations
# loader needs the `lines` mapping to resolve IDs.
lines = load_lines(lines_file)
conversations = load_conversations(conversations_file, lines)

# Create input-output pairs from conversations
def create_conversation_pairs(conversations, context_size=2):
    """Slide a fixed-size window over every conversation to build training pairs.

    For each position in a conversation, the ``context_size`` consecutive
    utterances starting there are joined with single spaces to form the input,
    and the utterance immediately after the window is the target.
    Conversations with no utterance left after the window contribute nothing.

    Returns:
        ``(input_texts, target_texts)`` -- two parallel lists of strings.
    """
    pairs = [
        (" ".join(turns[start:start + context_size]), turns[start + context_size])
        for turns in conversations
        for start in range(len(turns) - context_size)
    ]
    input_texts = [context for context, _ in pairs]
    target_texts = [reply for _, reply in pairs]
    return input_texts, target_texts

# Generate input and target pairs: two consecutive utterances joined by a space
# form each input, and the utterance that follows them is the target.
input_texts, target_texts = create_conversation_pairs(conversations, context_size=2)

# Print a sample input-output pair (the first one) as a quick sanity check
print("Input:", input_texts[0])
print("Target:", target_texts[0])

Input: Can we make this quick?  Roxanne Korrine and Andrew Barrett are having an incredibly horrendous public break- up on the quad.  Again. Well, I thought we'd start with pronunciation, if that's okay with you.
Target: Not the hacking and gagging and spitting part.  Please.


In [None]:
import torch
from torch.utils.data import Dataset
from transformers import GPT2Tokenizer, GPT2LMHeadModel, Trainer, TrainingArguments, DataCollatorForLanguageModeling

# Load the GPT-2 tokenizer
tokenizer = GPT2Tokenizer.from_pretrained("gpt2")
# Reuse the EOS token as the padding token. Pad and EOS therefore share one
# token, which matters wherever padding is later masked or skipped.
tokenizer.pad_token = tokenizer.eos_token  # Set pad_token as eos_token

# Use GPU or CPU
# NOTE(review): this repeats the device selection already done in the first cell.
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
print(f"Using device: {device}")

# Custom Dataset class for tokenized inputs
class CustomDataset(Dataset):
    """Causal-LM dataset that encodes each context and its reply as ONE sequence.

    A causal language model predicts token ``t+1`` from tokens ``<= t`` of the
    same sequence, so ``labels`` must be the ``input_ids`` themselves (with
    padding ignored). Tokenising the reply separately and using it as labels
    would pair every context token with an unrelated reply token, so each
    example is built as ``"<context> <reply><eos>"`` instead.

    Args:
        input_texts: Sequence of context strings.
        target_texts: Sequence of reply strings, parallel to ``input_texts``.
        tokenizer: Callable tokenizer exposing ``eos_token``; it is invoked with
            ``padding="max_length"``, ``truncation=True`` and ``return_tensors="pt"``.
        max_length: Fixed length every example is padded/truncated to.
            NOTE(review): if context + reply exceeds this length, truncation
            presumably drops the end of the reply -- confirm the tokenizer's
            truncation side and raise ``max_length`` if replies get cut.
    """

    IGNORE_INDEX = -100  # label value for positions the loss should skip (padding)

    def __init__(self, input_texts, target_texts, tokenizer, max_length=128):
        self.input_texts = input_texts
        self.target_texts = target_texts
        self.tokenizer = tokenizer
        self.max_length = max_length

    def __len__(self):
        return len(self.input_texts)

    def __getitem__(self, idx):
        """Return ``input_ids``, ``attention_mask`` and ``labels`` as 1-D CPU tensors."""
        # Context and reply share one sequence; EOS marks where the reply ends.
        text = f"{self.input_texts[idx]} {self.target_texts[idx]}{self.tokenizer.eos_token}"
        encoded = self.tokenizer(
            text,
            padding="max_length",
            truncation=True,
            max_length=self.max_length,
            return_tensors="pt",
        )

        # Drop the batch dimension added by return_tensors="pt". Tensors stay on
        # the CPU: the training loop/DataLoader is responsible for moving batches
        # to the accelerator, and CUDA tensors created here cannot be pinned or
        # shared with worker processes.
        item = {key: value.squeeze(0) for key, value in encoded.items()}

        # Labels mirror the inputs; padded positions are excluded via the attention
        # mask (not via the token id, because pad and EOS may share one id).
        labels = item["input_ids"].clone()
        labels[item["attention_mask"] == 0] = self.IGNORE_INDEX
        item["labels"] = labels

        return item

# Create the dataset for training and evaluation
TRAIN_SIZE = 5000   # number of leading pairs used for training
EVAL_SIZE = 1000    # number of pairs, taken right after the training slice, used for evaluation
eval_end = TRAIN_SIZE + EVAL_SIZE

train_input_texts = input_texts[:TRAIN_SIZE]
train_target_texts = target_texts[:TRAIN_SIZE]

# Inputs and targets must be sliced with identical bounds so that pair i of the
# evaluation set is still (input i, target i). The previous target slice started
# at 500 instead of 5000, which misaligned every pair and overlapped the training data.
eval_input_texts = input_texts[TRAIN_SIZE:eval_end]
eval_target_texts = target_texts[TRAIN_SIZE:eval_end]

# Guard against the inputs and targets drifting apart again
assert len(train_input_texts) == len(train_target_texts), "train inputs/targets are misaligned"
assert len(eval_input_texts) == len(eval_target_texts), "eval inputs/targets are misaligned"

# Instantiate custom datasets
train_dataset = CustomDataset(train_input_texts, train_target_texts, tokenizer)
eval_dataset = CustomDataset(eval_input_texts, eval_target_texts, tokenizer)

# Define training arguments
# Evaluation runs every 50 steps and logging every 10 steps. Checkpoints use the
# "steps" strategy, but no save_steps is given here.
# NOTE(review): checkpoints are therefore written at the library's default
# interval -- confirm that it is compatible with eval_steps=50, since
# load_best_model_at_end can only pick among checkpoints that were actually saved.
# NOTE(review): no seed is set explicitly, so the library default is used -- confirm.
training_args = TrainingArguments(
    output_dir='./results',                  # Output directory for saving model and logs
    evaluation_strategy="steps",             # Evaluate on a step schedule (see eval_steps)
    save_strategy="steps",                   # Save checkpoints on a step schedule
    logging_dir='./logs',                    # Directory for storing logs
    logging_steps=10,                        # Log every 10 steps
    eval_steps=50,                           # Evaluate every 50 steps
    per_device_train_batch_size=16,          # Batch size for training
    per_device_eval_batch_size=16,           # Batch size for evaluation
    num_train_epochs=3,                      # Number of training epochs
    weight_decay=0.01,                       # Apply weight decay to prevent overfitting
    learning_rate=5e-6,                      # Deliberately small learning rate for fine-tuning
    load_best_model_at_end=True,             # Reload the best checkpoint once training finishes
    save_total_limit=2,                      # Keep at most 2 checkpoints on disk
    report_to="none"                         # Disable reporting to external tools (optional)
)

# Data collator for batching the data; mlm=False selects the causal (non-masked) LM setup.
# NOTE(review): with mlm=False this collator is documented to build `labels` from
# `input_ids` (masking pad tokens), which would override any `labels` the dataset
# supplies -- confirm against the installed transformers version.
data_collator = DataCollatorForLanguageModeling(tokenizer=tokenizer, mlm=False)

# Load the pre-trained GPT-2 model
model = GPT2LMHeadModel.from_pretrained("gpt2")
model = model.to(device)  # Move model to the GPU if available


The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


tokenizer_config.json:   0%|          | 0.00/26.0 [00:00<?, ?B/s]

vocab.json:   0%|          | 0.00/1.04M [00:00<?, ?B/s]

merges.txt:   0%|          | 0.00/456k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/1.36M [00:00<?, ?B/s]

config.json:   0%|          | 0.00/665 [00:00<?, ?B/s]



Using device: cuda


model.safetensors:   0%|          | 0.00/548M [00:00<?, ?B/s]

generation_config.json:   0%|          | 0.00/124 [00:00<?, ?B/s]

In [None]:
from transformers import EarlyStoppingCallback

# Initialize the Trainer with train and eval datasets.
# Early stopping is evaluated at each evaluation round (every `eval_steps` steps
# per the training arguments) and halts training after 3 rounds without improvement.
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=train_dataset,      # Use the custom train_dataset
    eval_dataset=eval_dataset,        # Use the custom eval_dataset
    data_collator=data_collator,      # Collate examples into batches
    callbacks=[EarlyStoppingCallback(early_stopping_patience=3)]  # Stop if no improvement for 3 evaluations
)


# Train the model (long-running; as the last expression of the cell, the returned
# TrainOutput is displayed below the progress table)
trainer.train()

Step,Training Loss,Validation Loss
50,3.9748,3.862643
100,3.8537,3.816214
150,3.7816,3.79462
200,3.644,3.780359
250,3.6941,3.770197
300,3.7031,3.76104
350,3.6463,3.759307
400,3.6733,3.755812
450,3.7887,3.753564
500,3.6555,3.750248


There were missing keys in the checkpoint model loaded: ['lm_head.weight'].


TrainOutput(global_step=939, training_loss=3.70177989061902, metrics={'train_runtime': 730.2872, 'train_samples_per_second': 20.54, 'train_steps_per_second': 1.286, 'total_flos': 979845120000000.0, 'train_loss': 3.70177989061902, 'epoch': 3.0})

In [None]:
# Function to generate responses using the fine-tuned model
def generate_response(prompt, model, tokenizer, max_length=50, include_prompt=True):
    """Generate a one-sentence reply to ``prompt`` with a causal language model.

    Args:
        prompt: User text the model should continue.
        model: Model exposing ``.device`` and ``.generate(...)``.
        tokenizer: Tokenizer matching ``model``; called with ``return_tensors="pt"``
            and used to decode the generated ids.
        max_length: Upper bound on the TOTAL sequence length (prompt tokens plus
            generated tokens), as passed to ``model.generate``.
        include_prompt: When True (default, the historical behaviour) the
            returned string starts with ``prompt``; when False only the
            generated reply is returned, stripped of surrounding whitespace.

    Returns:
        The reply, cut after the first ``.``, ``!`` or ``?`` that occurs in the
        generated continuation (the whole continuation if none occurs).

    Notes:
        Sampling is stochastic; seed the RNG beforehand for reproducible output.
    """
    # Use the model's own device rather than a notebook-level global so the
    # function also works when imported elsewhere.
    encoded = tokenizer(prompt, return_tensors="pt").to(model.device)
    input_ids = encoded["input_ids"]
    prompt_len = input_ids.shape[-1]

    # Generate response with modified settings
    outputs = model.generate(
        input_ids,
        attention_mask=encoded["attention_mask"],  # explicit mask: pad and EOS share an id
        max_length=max_length,                 # Limit total (prompt + response) length
        pad_token_id=tokenizer.eos_token_id,   # Ensure padding uses EOS token
        no_repeat_ngram_size=3,                # Prevent repeating 3-grams
        do_sample=True,                        # Required for top_k / top_p / temperature to take effect
        top_k=50,                              # Consider top 50 words by probability
        top_p=0.9,                             # Use nucleus sampling with 90% probability mass
        temperature=0.5,                       # Control randomness
        # early_stopping was dropped: it only applies to beam search.
    )

    # Decode only the newly generated tokens so punctuation inside the prompt
    # cannot trigger the sentence cut-off below.
    continuation = tokenizer.decode(outputs[0][prompt_len:], skip_special_tokens=True)

    # Stop at the first complete sentence: the EARLIEST terminator of any kind,
    # not the first terminator type found in a fixed priority order.
    sentence_end = re.search(r"[.!?]", continuation)
    if sentence_end:
        continuation = continuation[:sentence_end.end()]

    if include_prompt:
        return prompt + continuation
    return continuation.strip()

# Sample conversation: run a handful of prompts through the fine-tuned model
for user_input in (
    "I love movies, do you?",
    "Do you like pizza?",
    "Do you like popcorn?",
    "What is your favorite movie?",
):
    response = generate_response(user_input, model, tokenizer)
    print("Chatbot:", response)


Chatbot: I love movies, do you? I love them.
Chatbot: Do you like pizza? I'm not a pizza guy.
Chatbot: Do you like popcorn? I'm not sure.
Chatbot: What is your favorite movie?  I'm not sure.


In [None]:
import torch
import os

# Destination on the mounted Drive for the fine-tuned weights, config and tokenizer files
save_dir = '/content/drive/MyDrive/AAI520-NLP/Final_Project/FlaskApp'

# Ensure the directory exists. exist_ok=True makes this a no-op on re-runs and
# avoids the check-then-create race of testing os.path.exists first.
os.makedirs(save_dir, exist_ok=True)

# Save the model weights as a plain state_dict under the file name 'pytorch_model.bin'
model_file_path = os.path.join(save_dir, 'pytorch_model.bin')
torch.save(model.state_dict(), model_file_path)

# Save the config file
config = model.config
config.save_pretrained(save_dir)

# Save the tokenizer (last expression of the cell: the written file paths are displayed)
tokenizer.save_pretrained(save_dir)

('/content/drive/MyDrive/AAI520-NLP/Final_Project/FlaskApp/tokenizer_config.json',
 '/content/drive/MyDrive/AAI520-NLP/Final_Project/FlaskApp/special_tokens_map.json',
 '/content/drive/MyDrive/AAI520-NLP/Final_Project/FlaskApp/vocab.json',
 '/content/drive/MyDrive/AAI520-NLP/Final_Project/FlaskApp/merges.txt',
 '/content/drive/MyDrive/AAI520-NLP/Final_Project/FlaskApp/added_tokens.json')