In [54]:
import pandas as pd
import torch
from torch.utils.data import Dataset, random_split
from transformers import GPT2Tokenizer, TrainingArguments, Trainer, GPT2LMHeadModel

from sklearn.model_selection import train_test_split
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
device

device(type='cuda')

In [55]:
tokenizer = GPT2Tokenizer.from_pretrained('distilgpt2', bos_token='<|startoftext|>',
                                          eos_token='<|endoftext|>', pad_token='<|pad|>')
model = GPT2LMHeadModel.from_pretrained('distilgpt2')
model.resize_token_embeddings(len(tokenizer))
model.to(device)

loading file https://huggingface.co/distilgpt2/resolve/main/vocab.json from cache at C:\Users\tharh/.cache\huggingface\transformers\55051ac97dcc32f0a736d21a32a4d42b0d9b90f117ca7c38e65038b04bd5c3f5.c7ed1f96aac49e745788faa77ba0a26a392643a50bb388b9c04ff469e555241f
loading file https://huggingface.co/distilgpt2/resolve/main/merges.txt from cache at C:\Users\tharh/.cache\huggingface\transformers\9dfb299b74cdf7601ba7cd3a8073dbdac351caec0ed7ab5849b098b3c8ae3d57.5d12962c5ee615a4c803841266e9c3be9a691a924f72d395d3a6c6c81157788b
loading file https://huggingface.co/distilgpt2/resolve/main/added_tokens.json from cache at None
loading file https://huggingface.co/distilgpt2/resolve/main/special_tokens_map.json from cache at None
loading file https://huggingface.co/distilgpt2/resolve/main/tokenizer_config.json from cache at None
loading configuration file https://huggingface.co/distilgpt2/resolve/main/config.json from cache at C:\Users\tharh/.cache\huggingface\transformers\f985248d2791fcff97732e4ee263

RuntimeError: CUDA out of memory. Tried to allocate 148.00 MiB (GPU 0; 8.00 GiB total capacity; 7.22 GiB already allocated; 0 bytes free; 7.29 GiB reserved in total by PyTorch) If reserved memory is >> allocated memory try setting max_split_size_mb to avoid fragmentation.  See documentation for Memory Management and PYTORCH_CUDA_ALLOC_CONF

In [None]:
data_path = "IMDB Dataset.csv"
df = pd.read_csv(data_path)
df.head()
#titles = pd.read_csv(data_path)['title']
#titles.head()

Unnamed: 0,review,sentiment
0,One of the other reviewers has mentioned that ...,positive
1,A wonderful little production. <br /><br />The...,positive
2,I thought this was a wonderful way to spend ti...,positive
3,Basically there's a family where a little boy ...,negative
4,"Petter Mattei's ""Love in the Time of Money"" is...",positive


In [None]:
#df = df[:5000]
df.sentiment.unique()

array(['positive', 'negative'], dtype=object)

In [None]:
df['sentiment'].value_counts()

positive    25000
negative    25000
Name: sentiment, dtype: int64

In [None]:
import re
#Creating for reviews
def clean_text(text):
    clean = re.compile(r'<.*?>')
    return re.sub(clean,'',text)

df["review"] = df["review"].apply(clean_text)

def remove_url(text):
    re_url = re.compile('https?://\S+|www\.\S+')
    return re_url.sub('', text).strip()

df["review"] = df["review"].apply(remove_url)

In [None]:
titles = df
titles.head()

Unnamed: 0,review,sentiment
0,One of the other reviewers has mentioned that ...,positive
1,A wonderful little production. The filming tec...,positive
2,I thought this was a wonderful way to spend ti...,positive
3,Basically there's a family where a little boy ...,negative
4,"Petter Mattei's ""Love in the Time of Money"" is...",positive


In [None]:
max_length = max([len(tokenizer.encode(title, truncation=True)) for title in titles['review']])
print("max_length : ",max_length)


max_length :  1024


In [None]:
max_length = 1022

In [None]:
train_titles, test_titles = train_test_split(titles, test_size=10)

In [None]:
class NetflixDataset(Dataset):
    def __init__(self, txt_list, tokenizer, max_length):
        self.input_ids = []
        self.attn_masks = []
        self.labels = []

        pos_flag = torch.tensor([1,0])
        ng_flag = torch.tensor([0,1])


        for index, txt in txt_list.iterrows():
            encodings_dict = tokenizer('<|startoftext|>' + txt['review'] + '<|endoftext|>',
                                       max_length=max_length, padding="max_length", truncation=True)
            
            if txt['sentiment'] == 'positive':
                temp_input = torch.cat((torch.tensor(encodings_dict['input_ids']), pos_flag))
                temp_mask = torch.cat((torch.tensor(encodings_dict['attention_mask']), pos_flag))
            else:
                temp_input = torch.cat((torch.tensor(encodings_dict['input_ids']), ng_flag))
                temp_mask = torch.cat((torch.tensor(encodings_dict['attention_mask']), ng_flag))

            self.input_ids.append(temp_input)
            self.attn_masks.append(temp_mask)


    def __len__(self):
        return len(self.input_ids)

    def __getitem__(self, idx):
        return self.input_ids[idx], self.attn_masks[idx]

In [None]:
dataset = NetflixDataset(train_titles, tokenizer, max_length=max_length)
train_size = int(0.9 * len(dataset))
train_dataset, val_dataset = random_split(dataset, [train_size, len(dataset) - train_size])

In [None]:
for temp in val_dataset:
    print(temp[1])
    print(len(temp[1]))
    print(temp[0])
    print(len(temp[0]))
    break

tensor([1, 1, 1,  ..., 0, 0, 1])
1024
tensor([50257,   464,   582,  ..., 50258,     0,     1])
1024


In [None]:
print(val_dataset[200][1])
print(len(val_dataset[200][1]))
print(val_dataset[200][0])
print(len(val_dataset[200][0]))

tensor([1, 1, 1,  ..., 0, 1, 0])
1024
tensor([50257,  1870,   494,  ..., 50258,     1,     0])
1024


In [None]:
training_args = TrainingArguments(output_dir='./results', num_train_epochs=1, logging_steps=500, save_steps=2000,
                                  per_device_train_batch_size=1, per_device_eval_batch_size=1,
                                  warmup_steps=10, weight_decay=0.05, logging_dir='./logs', report_to = 'none')

PyTorch: setting up devices


In [None]:

torch.cuda.empty_cache()


In [None]:
Trainer(model=model,  args=training_args, train_dataset=train_dataset, 
        eval_dataset=val_dataset, data_collator=lambda data: {'input_ids': torch.stack([f[0] for f in data]),
                                                              'attention_mask': torch.stack([f[1] for f in data]),
                                                              'labels': torch.stack([f[0] for f in data])}).train()

***** Running training *****
  Num examples = 44991
  Num Epochs = 1
  Instantaneous batch size per device = 1
  Total train batch size (w. parallel, distributed & accumulation) = 1
  Gradient Accumulation steps = 1
  Total optimization steps = 44991


  0%|          | 0/44991 [00:00<?, ?it/s]

RuntimeError: CUDA out of memory. Tried to allocate 20.00 MiB (GPU 0; 8.00 GiB total capacity; 7.22 GiB already allocated; 0 bytes free; 7.29 GiB reserved in total by PyTorch) If reserved memory is >> allocated memory try setting max_split_size_mb to avoid fragmentation.  See documentation for Memory Management and PYTORCH_CUDA_ALLOC_CONF

In [None]:
model = GPT2LMHeadModel.from_pretrained('results/checkpoint-4000/pytorch_model.bin', config='results/checkpoint-4000/config.json')

In [None]:
test_str = "The movie "
generated = tokenizer("<|startoftext|> "+ test_str, return_tensors="pt").input_ids
print(generated)
generated = generated[0]
#print(generated)

pos_flag = torch.tensor([1,0])
ng_flag = torch.tensor([0,1])
#print(movie_flag)

generated = torch.cat((generated, pos_flag))
#print(generated)

temp = torch.tensor([generated.numpy()])
#print(temp)



In [None]:
sample_outputs = model.generate(temp,no_repeat_ngram_size = 1,num_beams=20, num_return_sequences=2, max_new_tokens = 30)

print(sample_outputs)
print(tokenizer.decode(sample_outputs[0], skip_special_tokens=True))

In [None]:
test_str = "The movie "
generated = tokenizer("<|startoftext|> "+ test_str, return_tensors="pt").input_ids
generated = generated[0]
#print(generated)

pos_flag = torch.tensor([1,0])
ng_flag = torch.tensor([0,1])
#print(movie_flag)

generated = torch.cat((generated, ng_flag))
#print(generated)

temp = torch.tensor([generated.numpy()])
print(temp)

In [None]:
sample_outputs = model.generate(temp,no_repeat_ngram_size = 1,num_beams=20, num_return_sequences=2, max_new_tokens = 30)

print(sample_outputs[0])
print("result ", tokenizer.decode(sample_outputs[0], skip_special_tokens=True))

In [None]:
results = []
for index, title1 in test_titles.iterrows():

    title = title1['review']
    new_titles = {
        'seed': f'{title.split()[0]} _ {title1.type}',
        'predictions': []
    }
    generated = tokenizer("<|startoftext|> "+ title.split()[0], return_tensors="pt").input_ids
    generated = generated[0]

    if title1['sentiment'] == 'positive':
        generated = torch.cat((generated, pos_flag))
    else:
        generated = torch.cat((generated, ng_flag))
    
    generated = torch.tensor([generated.numpy()])
    sample_outputs = model.generate(generated,no_repeat_ngram_size = 1,num_beams=20, num_return_sequences=2, max_new_tokens = 50)
    
    new_titles['predictions'] = sample_outputs
    results.append(new_titles)

In [None]:
for new_title in results:
    print(f"seed: {new_title['seed']}")
    for i, pred in enumerate(new_title['predictions']):
        print(f"{i+1}: {tokenizer.decode(pred, skip_special_tokens=True)}")

In [None]:
df.tail()