<a href="https://colab.research.google.com/github/yinon2592/DL_Project_046211/blob/main/DL_Project.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [2]:
# Make data directory if it doesn't exist
!pip install transformers
!mkdir -p data
!wget -nc https://nyc3.digitaloceanspaces.com/ml-files-distro/v1/investigating-sentiment-analysis/data/training.1600000.processed.noemoticon.csv.zip -P data
!unzip -n -d data data/training.1600000.processed.noemoticon.csv.zip

Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/
Collecting transformers
  Downloading transformers-4.30.1-py3-none-any.whl (7.2 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m7.2/7.2 MB[0m [31m66.7 MB/s[0m eta [36m0:00:00[0m
Collecting huggingface-hub<1.0,>=0.14.1 (from transformers)
  Downloading huggingface_hub-0.15.1-py3-none-any.whl (236 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m236.8/236.8 kB[0m [31m19.4 MB/s[0m eta [36m0:00:00[0m
Collecting tokenizers!=0.11.3,<0.14,>=0.11.1 (from transformers)
  Downloading tokenizers-0.13.3-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (7.8 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m7.8/7.8 MB[0m [31m107.9 MB/s[0m eta [36m0:00:00[0m
[?25hCollecting safetensors>=0.3.1 (from transformers)
  Downloading safetensors-0.3.1-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (1.3 MB)
[2K     [90

In [3]:
import pandas as pd
import torch
from sklearn.model_selection import train_test_split
from transformers import GPT2Tokenizer, GPT2ForSequenceClassification, AdamW


# The CSV is read without a header row, so the column names are supplied here.
sentiment140_columns = ['polarity', 'id', 'date', 'query', 'user', 'text']
df = pd.read_csv(
    "data/training.1600000.processed.noemoticon.csv",
    names=sentiment140_columns,
    encoding='latin-1',
)
df.head()

ParserError: ignored

In [None]:
# Class balance of the raw polarity column.
df['polarity'].value_counts()

In [None]:
# Recode the raw polarity values so the positive class is 1 instead of 4 (0 stays 0).
polarity_recode = {0: 0, 4: 1}
df['polarity'] = df['polarity'].replace(polarity_recode)
df['polarity'].value_counts()

In [None]:
# Keep only the label and the tweet text. Selecting the wanted columns (instead of
# dropping 'id', 'date', 'query' and 'user') gives the same frame on the first run
# and does not raise a KeyError when the cell is executed a second time.
df = df[['polarity', 'text']]
df.head()

In [None]:
# Work on a small random subset so the rest of the notebook runs quickly.
# Raise the 100 below (e.g. to 500000) for a real training run.
# random_state pins the draw so a Restart & Run All reproduces the same subset;
# min(...) keeps the cell from failing if the frame has fewer rows than requested.
df = df.sample(n=min(100, len(df)), random_state=42)
df.polarity.value_counts()

In [None]:
# Persist the current frame (without the index column) next to the raw data.
subset_csv_path = "data/sentiment140-subset.csv"
df.to_csv(subset_csv_path, index=False)

In [None]:
# Hold out 20% of the rows for testing; the fixed random_state keeps the split stable.
split_parts = train_test_split(df['text'], df['polarity'], test_size=0.2, random_state=42)
train_texts, test_texts, train_labels, test_labels = split_parts

In [None]:
# Step 2: Fine-tune the GPT-2 model
tokenizer = GPT2Tokenizer.from_pretrained('gpt2')
# Register a dedicated padding token so batches can be padded to a common length.
tokenizer.add_special_tokens({'pad_token': '[PAD]'})

# hidden_size=768 was passed explicitly before; it equals the 'gpt2' checkpoint's own
# embedding width (see the printed wte shape later on), so it is left to the config.
model = GPT2ForSequenceClassification.from_pretrained('gpt2', num_labels=2)

# The '[PAD]' id is one past the end of the pretrained embedding table. Grow the table
# to the tokenizer's size, otherwise looking up a padded position indexes out of range.
model.resize_token_embeddings(len(tokenizer))
# The classification head uses the padding id to find the last real token of each
# sequence; without it, batches larger than one cannot be processed.
model.config.pad_token_id = tokenizer.pad_token_id

# Tokenise each split; padding=True pads to the longest sequence within that split.
train_encodings = tokenizer(train_texts.tolist(), truncation=True, padding=True, max_length=256)
test_encodings = tokenizer(test_texts.tolist(), truncation=True, padding=True, max_length=256)

# Each dataset item is (input_ids, attention_mask, label).
train_dataset = torch.utils.data.TensorDataset(
    torch.tensor(train_encodings['input_ids']),
    torch.tensor(train_encodings['attention_mask']),
    torch.tensor(train_labels.tolist())
)

test_dataset = torch.utils.data.TensorDataset(
    torch.tensor(test_encodings['input_ids']),
    torch.tensor(test_encodings['attention_mask']),
    torch.tensor(test_labels.tolist())
)

In [None]:
# import os
# os.environ['CUDA_LAUNCH_BLOCKING'] = '1'

# Use the first CUDA device when one is available, otherwise fall back to the CPU.
if torch.cuda.is_available():
    device = torch.device("cuda:0")
else:
    device = torch.device("cpu")

# Move the model to the chosen device and switch it to training mode.
model.to(device)
model.train()

In [None]:
train_loader = torch.utils.data.DataLoader(train_dataset, batch_size=16, shuffle=True)

# The tokenizer was extended with '[PAD]' after the checkpoint was loaded, so the
# embedding table must cover the new id and the model must know which id is padding.
# Both calls are no-ops when the model has already been prepared this way.
model.resize_token_embeddings(len(tokenizer))
model.config.pad_token_id = tokenizer.pad_token_id

# Create the optimizer only after resizing so it tracks the final embedding matrix.
# torch.optim.AdamW replaces the deprecated transformers.AdamW; eps and weight_decay
# are set to that class's defaults so the optimisation behaves the same.
optimizer = torch.optim.AdamW(model.parameters(), lr=1e-5, eps=1e-6, weight_decay=0.0)
print(model.transformer.wte.weight.shape)

# Re-enter training mode explicitly: this cell ends in eval mode, so a second run
# would otherwise train with dropout disabled.
model.train()
for epoch in range(3):  # Adjust the number of epochs as needed
    for batch in train_loader:
        input_ids, attention_mask, labels = batch
        input_ids, attention_mask, labels = input_ids.to(device), attention_mask.to(device), labels.to(device)

        optimizer.zero_grad()
        # Passing labels makes the model return the classification loss directly.
        outputs = model(input_ids=input_ids, attention_mask=attention_mask, labels=labels)
        loss = outputs.loss
        loss.backward()
        optimizer.step()

# Step 3: Evaluate the model
model.eval()
eval_loader = torch.utils.data.DataLoader(test_dataset, batch_size=16)

correct = 0
total = 0
with torch.no_grad():
    for batch in eval_loader:
        input_ids, attention_mask, labels = batch
        input_ids, attention_mask, labels = input_ids.to(device), attention_mask.to(device), labels.to(device)

        outputs = model(input_ids=input_ids, attention_mask=attention_mask)
        logits = outputs.logits
        predicted_labels = torch.argmax(logits, dim=1)
        total += labels.size(0)
        correct += (predicted_labels == labels).sum().item()

# Guard against an empty test set instead of raising ZeroDivisionError.
accuracy = correct / total if total else float("nan")
print(f"Accuracy: {accuracy}")

# Step 4: Predict sentiment on new data
new_texts = ['I love this movie!', 'This is terrible.']
new_encodings = tokenizer(new_texts, truncation=True, padding=True, return_tensors='pt')
new_input_ids = new_encodings['input_ids'].to(device)
new_attention_mask = new_encodings['attention_mask'].to(device)

with torch.no_grad():
    outputs = model(input_ids=new_input_ids, attention_mask=new_attention_mask)
    # Take the logits of THIS forward pass; the cell previously ended on a bare
    # `logits`, which still held the last evaluation batch.
    logits = outputs.logits

new_predicted_labels = torch.argmax(logits, dim=1)
# Show text -> predicted class id (1 = positive, 0 = negative after the recode above).
dict(zip(new_texts, new_predicted_labels.tolist()))

In [5]:


# Make data directory if it doesn't exist
!pip install transformers


Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/
File ‘data/training.1600000.processed.noemoticon.csv.zip’ already there; not retrieving.

Archive:  data/training.1600000.processed.noemoticon.csv.zip


In [1]:



# uninstall
!pip uninstall -y wandb

# download
# !pip install transformers


!pip uninstall transformers
!pip uninstall accelerate
!pip install transformers[torch]

!mkdir -p data
!wget -nc https://nyc3.digitaloceanspaces.com/ml-files-distro/v1/investigating-sentiment-analysis/data/training.1600000.processed.noemoticon.csv.zip -P data
!unzip -n -d data data/training.1600000.processed.noemoticon.csv.zip

# import
import re
import json
import torch
import random
import pandas as pd
from tqdm import tqdm
from torch.utils.data import Dataset
from sklearn.metrics import f1_score
from sklearn.model_selection import train_test_split

[0mFound existing installation: transformers 4.30.1
Uninstalling transformers-4.30.1:
  Would remove:
    /usr/local/bin/transformers-cli
    /usr/local/lib/python3.10/dist-packages/transformers-4.30.1.dist-info/*
    /usr/local/lib/python3.10/dist-packages/transformers/*
Proceed (Y/n)? y
  Successfully uninstalled transformers-4.30.1
Found existing installation: accelerate 0.20.3
Uninstalling accelerate-0.20.3:
  Would remove:
    /usr/local/bin/accelerate
    /usr/local/bin/accelerate-config
    /usr/local/bin/accelerate-launch
    /usr/local/lib/python3.10/dist-packages/accelerate-0.20.3.dist-info/*
    /usr/local/lib/python3.10/dist-packages/accelerate/*
Proceed (Y/n)? y
  Successfully uninstalled accelerate-0.20.3
Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/
Collecting transformers[torch]
  Using cached transformers-4.30.1-py3-none-any.whl (7.2 MB)
Collecting accelerate>=0.20.2 (from transformers[torch])
  Using cached acceler

In [2]:
# Dataset class
class SentimentDataset(Dataset):
    """Pre-tokenised review prompts paired with their sentiment name.

    Every (text, label) pair is rendered as a single prompt string that contains
    the review followed by the sentiment word, tokenised once in the constructor
    and kept in memory.

    Args:
        txt_list: iterable of review texts.
        label_list: iterable of raw labels; only 0 (negative) and 4 (positive)
            are known, anything else raises ``KeyError``.
        tokenizer: callable invoked as
            ``tokenizer(text, truncation=True, max_length=..., padding="max_length")``
            that returns a mapping with ``'input_ids'`` and ``'attention_mask'``.
        max_length: forwarded to the tokenizer as ``max_length``.
    """

    def __init__(self, txt_list, label_list, tokenizer, max_length):
        sentiment_names = {0: 'negative', 4: 'positive'}
        self.input_ids, self.attn_masks, self.labels = [], [], []

        for review, raw_label in zip(txt_list, label_list):
            sentiment = sentiment_names[raw_label]
            # The sentiment word is part of the prompt text itself.
            prompt = f'<|startoftext|>Review: {review}\nSentiment: {sentiment}<|endoftext|>'
            encoded = tokenizer(
                prompt,
                truncation=True,
                max_length=max_length,
                padding="max_length",
            )
            self.input_ids.append(torch.tensor(encoded['input_ids']))
            self.attn_masks.append(torch.tensor(encoded['attention_mask']))
            # Labels are stored as the sentiment name (a string), not as an id.
            self.labels.append(sentiment)

    def __len__(self):
        """Number of stored samples."""
        return len(self.input_ids)

    def __getitem__(self, idx):
        """Return ``(input_ids, attention_mask, sentiment_name)`` for sample ``idx``."""
        sample = (self.input_ids[idx], self.attn_masks[idx], self.labels[idx])
        return sample

# Data load function
def load_sentiment_dataset(tokenizer, random_seed = 1, file_path="data/training.1600000.processed.noemoticon.csv"):
    """Read the sentiment CSV and return a tokenised training set plus a raw test split.

    Args:
        tokenizer: needs an ``encode(text)`` method (used to measure token counts)
            and is also handed to ``SentimentDataset``.
        random_seed: seed of the train/test split only; the 10,000-row sample is
            always drawn with ``random_state=1``.
        file_path: header-less CSV; column 0 is used as the label and column 5
            as the text.

    Returns:
        ``(train_dataset, (X_test, y_test))`` where ``train_dataset`` is a
        ``SentimentDataset`` and ``X_test`` / ``y_test`` are plain lists of
        texts and raw labels.
    """
    word_limit = 250  # tried a few limits, kept 250 as max tokens was < 512

    def clip_words(string):
        # Keep at most the first `word_limit` whitespace-separated words.
        return " ".join(string.split()[:word_limit])

    # load dataset and sample 10k reviews.
    frame = pd.read_csv(file_path, encoding='ISO-8859-1', header=None)
    frame = frame[[0, 5]].rename(columns={0: 'label', 5: 'text'})
    frame = frame.sample(10000, random_state=1)
    frame['text'] = frame['text'].apply(clip_words)

    # divide into test and train (5% test, stratified on the label)
    X_train, X_test, y_train, y_test = train_test_split(
        frame['text'].tolist(),
        frame['label'].tolist(),
        shuffle=True,
        test_size=0.05,
        random_state=random_seed,
        stratify=frame['label'],
    )

    # Longest tokenised text across both splits, plus 10 for special tokens
    # (sos and eos) and fillers, but never less than 300.
    longest_text = max(len(tokenizer.encode(text)) for text in X_train + X_test)
    max_length = max(longest_text + 10, 300)
    print(f"Setting max length as {max_length}")

    # Only the training split is wrapped in SentimentDataset; the test split stays raw.
    train_dataset = SentimentDataset(X_train, y_train, tokenizer, max_length=max_length)
    return train_dataset, (X_test, y_test)

In [3]:
# import
from torch.utils.data import Dataset, random_split
from transformers import GPT2Tokenizer, TrainingArguments, Trainer, GPT2LMHeadModel

# Seed for PyTorch's global random number generator.
seed = 42
# Checkpoint name used to load both the tokenizer and the model.
model_name = "gpt2"

# Apply the seed.
torch.manual_seed(seed)

<torch._C.Generator at 0x7fc1ecf9a1d0>

In [12]:
from sklearn.metrics import f1_score

# Maps between the raw dataset labels, the sentiment words used in the prompts,
# and the integer ids used for scoring.
label_mapping = {"positive": 1, "negative": 0}
map_label = {0: 'negative', 4: 'positive'}

# Run on the GPU when there is one instead of failing on .cuda() without CUDA.
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")


def data_collator(data):
    """Batch ``(input_ids, attention_mask, sentiment_name)`` samples for causal-LM training.

    The sentiment word is already part of every tokenised prompt, so the language
    model's target is the input sequence itself. GPT2LMHeadModel expects ``labels``
    with the same shape as ``input_ids``; passing one class id per sample (as this
    collator did before) does not match that shape. Padded positions are set to
    -100 so they are ignored by the loss.
    """
    input_ids = torch.stack([f[0] for f in data])
    attention_mask = torch.stack([f[1] for f in data])
    labels = input_ids.clone()
    labels[attention_mask == 0] = -100
    return {'input_ids': input_ids, 'attention_mask': attention_mask, 'labels': labels}


# iterate for N trials
for trial_no in range(3):

    print("Loading model...")
    # load tokenizer and model
    # The special tokens were empty strings before. They must be the same markers
    # SentimentDataset writes into its prompts, and padding needs its own token.
    tokenizer = GPT2Tokenizer.from_pretrained(model_name, bos_token='<|startoftext|>',
                                              eos_token='<|endoftext|>', pad_token='[PAD]')
    model = GPT2LMHeadModel.from_pretrained(model_name)
    # Make room in the embedding table for the tokens added above.
    model.resize_token_embeddings(len(tokenizer))
    model.to(device)

    print("Loading dataset...")
    # test_dataset is a raw (texts, labels) tuple, not a torch Dataset.
    train_dataset, test_dataset = load_sentiment_dataset(tokenizer, trial_no)
    X_test, y_test = test_dataset

    # Trainer needs a tokenised dataset for its per-epoch evaluation, so hold out
    # 10% of the training samples for that. The raw test split stays untouched and
    # is only used for the generation-based scoring below.
    val_size = max(1, int(0.1 * len(train_dataset)))
    train_subset, val_subset = random_split(
        train_dataset,
        [len(train_dataset) - val_size, val_size],
        generator=torch.Generator().manual_seed(seed + trial_no),
    )

    print("Start training...")
    training_args = TrainingArguments(
        output_dir='results',
        num_train_epochs=2,
        logging_steps=10,
        load_best_model_at_end=True,
        save_strategy="epoch",
        evaluation_strategy="epoch",  # Set the evaluation strategy to 'epoch'
        per_device_train_batch_size=2,
        per_device_eval_batch_size=2,
        warmup_steps=100,
        weight_decay=0.01,
        logging_dir='logs'
    )

    trainer = Trainer(model=model, args=training_args, train_dataset=train_subset,
                      eval_dataset=val_subset, data_collator=data_collator)
    trainer.train()

    # test
    print("Start testing...")
    # evaluation mode for model
    model.eval()

    # compute prediction on test data
    original, predicted = [], []
    # Walk over (text, label) pairs; iterating the tuple itself would yield the
    # two lists rather than the samples.
    for text, label in tqdm(zip(X_test, y_test), total=len(X_test)):
        # Same layout as the training prompts, stopping right before the sentiment word.
        prompt = f'<|startoftext|>Review: {text}\nSentiment:'
        encoded = tokenizer(prompt, return_tensors="pt").to(device)
        # Greedy decoding: the sampling knobs (top_k, top_p, temperature) do not
        # apply, and num_return_sequences=0 was not a valid request.
        sample_outputs = model.generate(
            input_ids=encoded.input_ids,
            attention_mask=encoded.attention_mask,
            do_sample=False,
            max_length=512,
            pad_token_id=tokenizer.pad_token_id,
        )
        pred_text = tokenizer.decode(sample_outputs[0], skip_special_tokens=True)
        # extract the predicted sentiment; "None" marks an unparseable generation
        matches = re.findall("\nSentiment: (.*)", pred_text)
        pred_sentiment = matches[-1].strip() if matches else "None"
        original.append(map_label[label])
        predicted.append(pred_sentiment)

    # transform into dataframe
    df = pd.DataFrame({'original': original, 'predicted': predicted})
    df.to_csv(f"result_run_{trial_no}.csv", index=False)
    # compute f1 score
    original_labels = [label_mapping[label] for label in original]
    # Anything that is not exactly "positive"/"negative" becomes -1. It is then a
    # miss for the true class, while labels=[0, 1] keeps the macro average over the
    # two real classes only.
    predicted_labels = [label_mapping.get(label, -1) for label in predicted]
    f1 = f1_score(original_labels, predicted_labels, labels=[0, 1], average='macro')
    print(f"F1 Score: {f1}")




Loading model...


Special tokens have been added in the vocabulary, make sure the associated word embeddings are fine-tuned or trained.


Loading dataset...
Setting max length as 300
Start training...


