<a href="https://colab.research.google.com/github/tahaShm/knowledge-distillation/blob/transfer-run/transferset_generation.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [1]:
%%time
%%capture
!pip install transformers
!pip install datasets

CPU times: user 39.8 ms, sys: 15.1 ms, total: 54.9 ms
Wall time: 6.74 s


In [30]:
import pandas as pd
import torch
from torch.utils.data import Dataset, random_split
from transformers import GPT2Tokenizer, TrainingArguments, Trainer, GPT2LMHeadModel
from tqdm import tqdm

import datasets
from datasets import load_dataset

In [3]:
!nvidia-smi

Wed Dec 14 07:00:32 2022       
+-----------------------------------------------------------------------------+
| NVIDIA-SMI 460.32.03    Driver Version: 460.32.03    CUDA Version: 11.2     |
|-------------------------------+----------------------+----------------------+
| GPU  Name        Persistence-M| Bus-Id        Disp.A | Volatile Uncorr. ECC |
| Fan  Temp  Perf  Pwr:Usage/Cap|         Memory-Usage | GPU-Util  Compute M. |
|                               |                      |               MIG M. |
|   0  A100-SXM4-40GB      Off  | 00000000:00:04.0 Off |                    0 |
| N/A   36C    P0    57W / 400W |      0MiB / 40536MiB |      0%      Default |
|                               |                      |             Disabled |
+-------------------------------+----------------------+----------------------+
                                                                               
+-----------------------------------------------------------------------------+
| Proces

In [4]:
torch.manual_seed(42)

<torch._C.Generator at 0x7f21edbbdfd0>

### Loading GPT2-Medium Model from 🤗 Model Hub 

In [5]:
tokenizer = GPT2Tokenizer.from_pretrained('gpt2-medium', bos_token='<|startoftext|>',
                                          eos_token='<|endoftext|>', pad_token='<|pad|>')
model = GPT2LMHeadModel.from_pretrained('gpt2-medium').cuda()
model.resize_token_embeddings(len(tokenizer))

Special tokens have been added in the vocabulary, make sure the associated word embeddings are fine-tuned or trained.


Embedding(50259, 1024)

### Configurations

In [6]:
DATA_PATH = '../input/netflix-shows/netflix_titles.csv'
DATA_HEADER = 'description'

OUTPUT_DIR = './results'
LOGGING_DIR = './logs'

EPOCHS = 1

LOGGING_STEPS = 100
SAVE_STEPS = 1000

TRAIN_BATCH_SIZE = 8
EVAL_BATCH_SIZE = 8

WARMUP_STEPS = 10

WEIGHT_DECAY = 0.05

REPORT_TO = 'none'

### Initial Dataset

#### Glue benchmark

In [7]:
glue_train_dataset = load_dataset('glue', 'sst2', split='train')
glue_val_dataset = load_dataset('glue', 'sst2', split='validation')
glue_test_dataset = load_dataset('glue', 'sst2', split='test')



In [8]:
glue_train = pd.DataFrame(glue_train_dataset)
glue_val = pd.DataFrame(glue_val_dataset)
glue_test = pd.DataFrame(glue_test_dataset)

In [9]:
glue_train

Unnamed: 0,sentence,label,idx
0,hide new secretions from the parental units,0,0
1,"contains no wit , only labored gags",0,1
2,that loves its characters and communicates som...,1,2
3,remains utterly satisfied to remain the same t...,0,3
4,on the worst revenge-of-the-nerds clichés the ...,0,4
...,...,...,...
67344,a delightful comedy,1,67344
67345,"anguish , anger and frustration",0,67345
67346,"at achieving the modest , crowd-pleasing goals...",1,67346
67347,a patient viewer,1,67347


#### Auxiliary (Twitter, IMDB)

In [10]:
twitter_train_dataset = load_dataset('carblacac/twitter-sentiment-analysis', split='train')
twitter_val_dataset = load_dataset('carblacac/twitter-sentiment-analysis', split='validation')
twitter_test_dataset = load_dataset('carblacac/twitter-sentiment-analysis', split='test')



In [11]:
twitter_train = pd.DataFrame(twitter_train_dataset)
twitter_val = pd.DataFrame(twitter_val_dataset)
twitter_test = pd.DataFrame(twitter_test_dataset)

twitter_train = twitter_train.sample(n=20000, random_state=42)

twitter_train.rename(columns = {'text':'sentence', 'feeling': 'label'}, inplace = True)
twitter_val.rename(columns = {'text':'sentence', 'feeling': 'label'}, inplace = True)
twitter_test.rename(columns = {'text':'sentence', 'feeling': 'label'}, inplace = True)

In [12]:
twitter_train

Unnamed: 0,sentence,label
47063,Bed time. He has a name. Ryan,1
34050,twitter is being mean to me.,0
21397,@VivaLaSara Oh dear! I hope you don't either!,0
71586,"@Redfrettchen Oh no, shit! I'm getting a litt...",0
95721,@modemlooper Let me know what fake functionali...,1
...,...,...
114905,Damn you West Brom... I knew I should have put...,0
33887,getting annoyed at trying to used to twitter! ...,0
98301,Iron Chef secret ingredient tonight? Spinach. ...,1
92136,Too much tweeting and FB'ing= dead battery,0


In [13]:
# imdb_train = pd.read_csv('ts_imdb.tsv', on_bad_lines='skip')
# imdb_train = imdb_train.sample(n=20000, random_state=42)
# imdb_train

### Dataset combination

In [14]:
train = pd.concat([glue_train, twitter_train])
val = pd.concat([glue_val, twitter_val])
test = pd.concat([glue_test, twitter_test])

In [15]:
train

Unnamed: 0,sentence,label,idx
0,hide new secretions from the parental units,0,0.0
1,"contains no wit , only labored gags",0,1.0
2,that loves its characters and communicates som...,1,2.0
3,remains utterly satisfied to remain the same t...,0,3.0
4,on the worst revenge-of-the-nerds clichés the ...,0,4.0
...,...,...,...
114905,Damn you West Brom... I knew I should have put...,0,
33887,getting annoyed at trying to used to twitter! ...,0,
98301,Iron Chef secret ingredient tonight? Spinach. ...,1,
92136,Too much tweeting and FB'ing= dead battery,0,


In [16]:
descriptions = train['sentence']

In [17]:
max_length = max([len(tokenizer.encode(description)) for description in descriptions])

In [18]:
max_length

643

In [19]:
class TrainDataset(Dataset):
    def __init__(self, txt_list, tokenizer, max_length):
        self.input_ids = []
        self.attn_masks = []
        self.labels = []
        for txt in txt_list:
            encodings_dict = tokenizer('<|startoftext|>' + txt + '<|endoftext|>', truncation=True,
                                       max_length=max_length, padding="max_length")
            self.input_ids.append(torch.tensor(encodings_dict['input_ids']))
            self.attn_masks.append(torch.tensor(encodings_dict['attention_mask']))

    def __len__(self):
        return len(self.input_ids)

    def __getitem__(self, idx):
        return self.input_ids[idx], self.attn_masks[idx]

In [20]:
dataset = TrainDataset(descriptions, tokenizer, max_length=max_length)
train_size = int(0.2 * len(dataset))
train_dataset, val_dataset = random_split(dataset, [train_size, len(dataset) - train_size])

In [21]:
# for garbage collection

import gc
gc.collect()

0

In [22]:
torch.cuda.empty_cache()

In [23]:
training_args = TrainingArguments(output_dir=OUTPUT_DIR, num_train_epochs=EPOCHS, logging_steps=LOGGING_STEPS, 
                                  save_steps=SAVE_STEPS, per_device_train_batch_size=TRAIN_BATCH_SIZE, 
                                  per_device_eval_batch_size=EVAL_BATCH_SIZE, warmup_steps=WARMUP_STEPS, 
                                  weight_decay=WEIGHT_DECAY, logging_dir=LOGGING_DIR, report_to = REPORT_TO)


In [24]:
model_trainer = Trainer(model=model,  args=training_args, train_dataset=train_dataset, 
        eval_dataset=val_dataset, data_collator=lambda data: {'input_ids': torch.stack([f[0] for f in data]),
                                                              'attention_mask': torch.stack([f[1] for f in data]),
                                                              'labels': torch.stack([f[0] for f in data])})
model_trainer.train()

***** Running training *****
  Num examples = 17469
  Num Epochs = 1
  Instantaneous batch size per device = 8
  Total train batch size (w. parallel, distributed & accumulation) = 8
  Gradient Accumulation steps = 1
  Total optimization steps = 2184
  Number of trainable parameters = 354825216


Step,Training Loss
100,0.2802
200,0.1075
300,0.1032
400,0.1007
500,0.0963
600,0.0993
700,0.0974
800,0.0982
900,0.0983
1000,0.099


Saving model checkpoint to ./results/checkpoint-1000
Configuration saved in ./results/checkpoint-1000/config.json
Model weights saved in ./results/checkpoint-1000/pytorch_model.bin
Saving model checkpoint to ./results/checkpoint-2000
Configuration saved in ./results/checkpoint-2000/config.json
Model weights saved in ./results/checkpoint-2000/pytorch_model.bin


Training completed. Do not forget to share your model on huggingface.co/models =)




TrainOutput(global_step=2184, training_loss=0.10453161127838023, metrics={'train_runtime': 2134.323, 'train_samples_per_second': 8.185, 'train_steps_per_second': 1.023, 'total_flos': 2.037439994967245e+16, 'train_loss': 0.10453161127838023, 'epoch': 1.0})

In [25]:
model_trainer.save_model('/final_model')

Saving model checkpoint to /final_model
Configuration saved in /final_model/config.json
Model weights saved in /final_model/pytorch_model.bin


In [26]:
# fetched_model = AutoModelForSequenceClassification.from_pretrained("/final_model.bin")

### GPT Generated Description

In [27]:
generated = tokenizer("<|startoftext|> ", return_tensors="pt").input_ids.cuda()

In [38]:
for idx in range(10):
  current_batch_samples = []
  for i in tqdm(range(50)):
    sample_outputs = model.generate(generated, do_sample=True, top_k=50, 
                                max_length=200, min_length=5, top_p=0.95, temperature=1, num_return_sequences=200)
    current_batch_samples.extend(sample_outputs)
  sample_decode_output = [tokenizer.decode(sample_output, skip_special_tokens=True) for sample_output in current_batch_samples]
  df = pd.DataFrame(columns=['sentence']) 
  df['sentence'] = sample_decode_output
  df.to_csv('generated_' + str(idx+1) + '.csv', index=False)

  0%|          | 0/50 [00:00<?, ?it/s]The attention mask and the pad token id were not set. As a consequence, you may observe unexpected behavior. Please pass your input's `attention_mask` to obtain reliable results.
Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.
  2%|▏         | 1/50 [00:08<07:06,  8.71s/it]The attention mask and the pad token id were not set. As a consequence, you may observe unexpected behavior. Please pass your input's `attention_mask` to obtain reliable results.
Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.
  4%|▍         | 2/50 [00:17<06:55,  8.65s/it]The attention mask and the pad token id were not set. As a consequence, you may observe unexpected behavior. Please pass your input's `attention_mask` to obtain reliable results.
Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.
  6%|▌         | 3/50 [00:25<06:46,  8.65s/it]The attention mask and the pad token id were not set. As a consequence,

In [39]:
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


In [40]:
model_trainer.save_model('/content/drive/MyDrive/571F_models/sst2')

Saving model checkpoint to /content/drive/MyDrive/571F_models/sst2
Configuration saved in /content/drive/MyDrive/571F_models/sst2/config.json
Model weights saved in /content/drive/MyDrive/571F_models/sst2/pytorch_model.bin


In [42]:
test_model = GPT2LMHeadModel.from_pretrained('/content/drive/MyDrive/571F_models/sst2').cuda()

loading configuration file /content/drive/MyDrive/571F_models/sst2/config.json
Model config GPT2Config {
  "_name_or_path": "gpt2-medium",
  "activation_function": "gelu_new",
  "architectures": [
    "GPT2LMHeadModel"
  ],
  "attn_pdrop": 0.1,
  "bos_token_id": 50256,
  "embd_pdrop": 0.1,
  "eos_token_id": 50256,
  "initializer_range": 0.02,
  "layer_norm_epsilon": 1e-05,
  "model_type": "gpt2",
  "n_ctx": 1024,
  "n_embd": 1024,
  "n_head": 16,
  "n_inner": null,
  "n_layer": 24,
  "n_positions": 1024,
  "n_special": 0,
  "predict_special_tokens": true,
  "reorder_and_upcast_attn": false,
  "resid_pdrop": 0.1,
  "scale_attn_by_inverse_layer_idx": false,
  "scale_attn_weights": true,
  "summary_activation": null,
  "summary_first_dropout": 0.1,
  "summary_proj_to_labels": true,
  "summary_type": "cls_index",
  "summary_use_proj": true,
  "task_specific_params": {
    "text-generation": {
      "do_sample": true,
      "max_length": 50
    }
  },
  "torch_dtype": "float32",
  "transfor