<a href="https://colab.research.google.com/github/yinon2592/DL_Project_046211/blob/main/section_c.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [6]:
from google.colab import drive
# drive.mount('/content/drive/my-drive/project_calculations')
drive.mount('/content/drive')

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


In [7]:
! pip install transformers
import pandas as pd
from transformers import GPT2LMHeadModel, GPT2Tokenizer
import numpy as np
import random
import torch
from torch.utils.data import Dataset, DataLoader
from transformers import GPT2Tokenizer, GPT2LMHeadModel, AdamW, get_linear_schedule_with_warmup
from tqdm import tqdm, trange
import torch.nn.functional as F
import csv

Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/


In [22]:
### Prepare data
import os
import pandas as pd
from torch.utils.data import Dataset
import re
import torch
# Step 1: Dataset Preparation
# Step 2: Data Preprocessing

# load section_c data (data already cleaned)
section_c_data_path = '/content/drive/My Drive/project_dataset/section_c_data.csv'
df = pd.read_csv(section_c_data_path)
df = df.sample(100, random_state=1)
print("dataset size is ", df.shape[0])
print(df.label.value_counts())
print(df.sample(5), "\n")

dataset size is  100
1    51
0    49
Name: label, dtype: int64
        label                                               text
506952      0                                                NaN
299056      1       video is coming along good the intro is done
45979       0  i leave friday i don t know what lydia is doin...
368636      0  really trying out kde 4 for the first time lov...
103692      0  neck hurts from a hard ass wreck wakin today n... 



In [46]:
class Texts(Dataset):
    def __init__(self, control_code, truncate=False, gpt2_type="gpt2", max_length=1024):

        self.tokenizer = GPT2Tokenizer.from_pretrained(gpt2_type)
        self.texts = []

        for _, row in df.iterrows():
          label = row['label']
          sentiment = 'positive' if label == 1 else 'negative'
          text = row['text'].split()
          text.insert(0, f"Considering positive or negative sentiment, the following sentence is classified as {sentiment}. ")
          self.texts.append(torch.tensor(
                self.tokenizer.encode(f"<|{control_code}|>{row[:max_length]}<|endoftext|>")
            ))
        if truncate:
            self.texts = self.texts[:20000]
        self.texts_count = len(self.texts)

    def __len__(self):
        return self.texts_count

    def __getitem__(self, item):
        return self.texts[item]

df.dropna(how='any', inplace=True)
dataset = Texts(df['text'], truncate=False, gpt2_type="gpt2")

In [47]:
#Get the tokenizer and model
tokenizer = GPT2Tokenizer.from_pretrained('gpt2')
model = GPT2LMHeadModel.from_pretrained('gpt2')

load_model_parameters = False
model_path = '/content/drive/My Drive/project_calculations/generative_model.pth'
if os.path.exists(model_path) and load_model_parameters:
  print("loading last model parameters..")
  model.load_state_dict(torch.load(model_path))

#Accumulated batch size (since GPT2 is so big)
def pack_tensor(new_tensor, packed_tensor, max_seq_len):
    if packed_tensor is None:
        return new_tensor, True, None
    if new_tensor.size()[1] + packed_tensor.size()[1] > max_seq_len:
        return packed_tensor, False, new_tensor
    else:
        packed_tensor = torch.cat([new_tensor, packed_tensor[:, 1:]], dim=1)
        return packed_tensor, True, None

In [48]:
def train(
    dataset, model, tokenizer,
    batch_size=16, epochs=2, lr=2e-5,
    max_seq_len=400, warmup_steps=200,
    gpt2_type="gpt2", output_dir=".", output_prefix="wreckgar",
    test_mode=False,save_model_on_epoch=False,
):
    acc_steps = 100

    device = 'cuda' if torch.cuda.is_available() else 'cpu'


    # device=torch.device("cuda")
    # model = model.cuda()
    model.to(device)
    model.train()

    optimizer = AdamW(model.parameters(), lr=lr)
    scheduler = get_linear_schedule_with_warmup(
        optimizer, num_warmup_steps=warmup_steps, num_training_steps=-1
    )

    train_dataloader = DataLoader(dataset, batch_size=1, shuffle=True)
    loss=0
    accumulating_batch_count = 0
    input_tensor = None
    best_loss = float('inf')
    for epoch in range(epochs):
      total_loss = []
      for idx, entry in tqdm(enumerate(train_dataloader)):
          (input_tensor, carry_on, remainder) = pack_tensor(entry, input_tensor, 768)

          if carry_on and idx != len(train_dataloader) - 1:
              continue

          input_tensor = input_tensor.to(device)
          outputs = model(input_tensor, labels=input_tensor)
          loss = outputs[0]
          total_loss.append(loss)
          loss.backward()

          if (accumulating_batch_count % batch_size) == 0:
              optimizer.step()
              scheduler.step()
              optimizer.zero_grad()
              model.zero_grad()

          accumulating_batch_count += 1
          input_tensor = None
      if save_model_on_epoch and loss < best_loss:
        best_loss = loss
        print("bset loss so far is ", best_loss)
        torch.save(model.state_dict(), '/content/drive/My Drive/project_calculations/generative_model.pth')
      print(f"Training epoch {epoch} , Train Loss: {torch.tensor(total_loss).mean():.3f}")

    return model

In [49]:
model = train(dataset=dataset, model=model, tokenizer=tokenizer, save_model_on_epoch=False, epochs=3)

99it [00:02, 39.17it/s]


Training epoch 0 , Train Loss: 2.660


99it [00:03, 26.66it/s]


Training epoch 1 , Train Loss: 2.630


99it [00:02, 38.83it/s]

Training epoch 2 , Train Loss: 2.624



