In [1]:
import pandas as pd
import torch
from torch.utils.data import Dataset, random_split
from transformers import GPT2Tokenizer, TrainingArguments, Trainer, GPT2LMHeadModel

from sklearn.model_selection import train_test_split
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
device

device(type='cuda')

In [2]:
tokenizer = GPT2Tokenizer.from_pretrained('distilgpt2', bos_token='<|startoftext|>',
                                          eos_token='<|endoftext|>', pad_token='<|pad|>')
model = GPT2LMHeadModel.from_pretrained('distilgpt2')
model.resize_token_embeddings(len(tokenizer))
model.to(device)

Special tokens have been added in the vocabulary, make sure the associated word embeddings are fine-tuned or trained.


GPT2LMHeadModel(
  (transformer): GPT2Model(
    (wte): Embedding(50259, 768)
    (wpe): Embedding(1024, 768)
    (drop): Dropout(p=0.1, inplace=False)
    (h): ModuleList(
      (0): GPT2Block(
        (ln_1): LayerNorm((768,), eps=1e-05, elementwise_affine=True)
        (attn): GPT2Attention(
          (c_attn): Conv1D()
          (c_proj): Conv1D()
          (attn_dropout): Dropout(p=0.1, inplace=False)
          (resid_dropout): Dropout(p=0.1, inplace=False)
        )
        (ln_2): LayerNorm((768,), eps=1e-05, elementwise_affine=True)
        (mlp): GPT2MLP(
          (c_fc): Conv1D()
          (c_proj): Conv1D()
          (act): NewGELUActivation()
          (dropout): Dropout(p=0.1, inplace=False)
        )
      )
      (1): GPT2Block(
        (ln_1): LayerNorm((768,), eps=1e-05, elementwise_affine=True)
        (attn): GPT2Attention(
          (c_attn): Conv1D()
          (c_proj): Conv1D()
          (attn_dropout): Dropout(p=0.1, inplace=False)
          (resid_dropout): Dro

In [3]:
data_path = "IMDB Dataset.csv"
df = pd.read_csv(data_path)
df.head()
#titles = pd.read_csv(data_path)['title']
#titles.head()

Unnamed: 0,review,sentiment
0,One of the other reviewers has mentioned that ...,positive
1,A wonderful little production. <br /><br />The...,positive
2,I thought this was a wonderful way to spend ti...,positive
3,Basically there's a family where a little boy ...,negative
4,"Petter Mattei's ""Love in the Time of Money"" is...",positive


In [4]:
#df = df[:5000]
df.sentiment.unique()

array(['positive', 'negative'], dtype=object)

In [5]:
df['sentiment'].value_counts()

negative    2532
positive    2468
Name: sentiment, dtype: int64

In [6]:
import re
#Creating for reviews
def clean_text(text):
    clean = re.compile(r'<.*?>')
    return re.sub(clean,'',text)

df["review"] = df["review"].apply(clean_text)

def remove_url(text):
    re_url = re.compile('https?://\S+|www\.\S+')
    return re_url.sub('', text).strip()

df["review"] = df["review"].apply(remove_url)

In [7]:
titles = df
titles.head()

Unnamed: 0,review,sentiment
0,One of the other reviewers has mentioned that ...,positive
1,A wonderful little production. The filming tec...,positive
2,I thought this was a wonderful way to spend ti...,positive
3,Basically there's a family where a little boy ...,negative
4,"Petter Mattei's ""Love in the Time of Money"" is...",positive


In [8]:
max_length = max([len(tokenizer.encode(title, truncation=True)) for title in titles['review']])
print("max_length : ",max_length)


max_length :  1024


In [9]:
max_length = 1022

In [10]:
train_titles, test_titles = train_test_split(titles, test_size=10)

In [11]:
class NetflixDataset(Dataset):
    def __init__(self, txt_list, tokenizer, max_length):
        self.input_ids = []
        self.attn_masks = []
        self.labels = []

        pos_flag = torch.tensor([1,0])
        ng_flag = torch.tensor([0,1])


        for index, txt in txt_list.iterrows():
            encodings_dict = tokenizer('<|startoftext|>' + txt['review'] + '<|endoftext|>',
                                       max_length=max_length, padding="max_length", truncation=True)
            
            if txt['sentiment'] == 'positive':
                temp_input = torch.cat((torch.tensor(encodings_dict['input_ids']), pos_flag))
                temp_mask = torch.cat((torch.tensor(encodings_dict['attention_mask']), pos_flag))
            else:
                temp_input = torch.cat((torch.tensor(encodings_dict['input_ids']), ng_flag))
                temp_mask = torch.cat((torch.tensor(encodings_dict['attention_mask']), ng_flag))

            self.input_ids.append(temp_input)
            self.attn_masks.append(temp_mask)


    def __len__(self):
        return len(self.input_ids)

    def __getitem__(self, idx):
        return self.input_ids[idx], self.attn_masks[idx]

In [12]:
dataset = NetflixDataset(train_titles, tokenizer, max_length=max_length)
train_size = int(0.9 * len(dataset))
train_dataset, val_dataset = random_split(dataset, [train_size, len(dataset) - train_size])

In [13]:
for temp in val_dataset:
    print(temp[1])
    print(len(temp[1]))
    print(temp[0])
    print(len(temp[0]))
    break

tensor([1, 1, 1,  ..., 0, 0, 1])
1024
tensor([50257, 14698, 16399,  ..., 50258,     0,     1])
1024


In [14]:
print(val_dataset[200][1])
print(len(val_dataset[200][1]))
print(val_dataset[200][0])
print(len(val_dataset[200][0]))

tensor([1, 1, 1,  ..., 0, 1, 0])
1024
tensor([50257,    72,   892,  ..., 50258,     1,     0])
1024


In [15]:
training_args = TrainingArguments(output_dir='./results', num_train_epochs=1, logging_steps=500, save_steps=2000,
                                  per_device_train_batch_size=1, per_device_eval_batch_size=1,
                                  warmup_steps=10, weight_decay=0.05, logging_dir='./logs', report_to = 'none')

In [16]:

torch.cuda.empty_cache()


In [17]:
Trainer(model=model,  args=training_args, train_dataset=train_dataset, 
        eval_dataset=val_dataset, data_collator=lambda data: {'input_ids': torch.stack([f[0] for f in data]),
                                                              'attention_mask': torch.stack([f[1] for f in data]),
                                                              'labels': torch.stack([f[0] for f in data])}).train()

***** Running training *****
  Num examples = 4491
  Num Epochs = 1
  Instantaneous batch size per device = 1
  Total train batch size (w. parallel, distributed & accumulation) = 1
  Gradient Accumulation steps = 1
  Total optimization steps = 4491


  0%|          | 0/4491 [00:00<?, ?it/s]

{'loss': 1.6873, 'learning_rate': 4.453247043070744e-05, 'epoch': 0.11}
{'loss': 1.1534, 'learning_rate': 3.895335862530685e-05, 'epoch': 0.22}
{'loss': 1.1471, 'learning_rate': 3.337424681990627e-05, 'epoch': 0.33}


Saving model checkpoint to ./results\checkpoint-2000
Configuration saved in ./results\checkpoint-2000\config.json


{'loss': 1.1686, 'learning_rate': 2.779513501450569e-05, 'epoch': 0.45}


Model weights saved in ./results\checkpoint-2000\pytorch_model.bin


{'loss': 1.197, 'learning_rate': 2.221602320910511e-05, 'epoch': 0.56}
{'loss': 1.0805, 'learning_rate': 1.663691140370453e-05, 'epoch': 0.67}
{'loss': 1.0942, 'learning_rate': 1.105779959830395e-05, 'epoch': 0.78}


Saving model checkpoint to ./results\checkpoint-4000
Configuration saved in ./results\checkpoint-4000\config.json


{'loss': 1.0909, 'learning_rate': 5.4786877929033695e-06, 'epoch': 0.89}


Model weights saved in ./results\checkpoint-4000\pytorch_model.bin


Training completed. Do not forget to share your model on huggingface.co/models =)




{'train_runtime': 2687.8625, 'train_samples_per_second': 1.671, 'train_steps_per_second': 1.671, 'train_loss': 1.1911172062005957, 'epoch': 1.0}


TrainOutput(global_step=4491, training_loss=1.1911172062005957, metrics={'train_runtime': 2687.8625, 'train_samples_per_second': 1.671, 'train_steps_per_second': 1.671, 'train_loss': 1.1911172062005957, 'epoch': 1.0})

In [None]:
model = GPT2LMHeadModel.from_pretrained('results/checkpoint-4000/pytorch_model.bin', config='results/checkpoint-4000/config.json')

In [None]:
test_str = "The movie "
generated = tokenizer("<|startoftext|> "+ test_str, return_tensors="pt").input_ids
print(generated)
generated = generated[0]
#print(generated)

pos_flag = torch.tensor([1,0])
ng_flag = torch.tensor([0,1])
#print(movie_flag)

generated = torch.cat((generated, pos_flag))
#print(generated)

temp = torch.tensor([generated.numpy()])
#print(temp)



In [None]:
sample_outputs = model.generate(temp,no_repeat_ngram_size = 1,num_beams=20, num_return_sequences=2, max_new_tokens = 30)

print(sample_outputs)
print(tokenizer.decode(sample_outputs[0], skip_special_tokens=True))

In [None]:
test_str = "The movie "
generated = tokenizer("<|startoftext|> "+ test_str, return_tensors="pt").input_ids
generated = generated[0]
#print(generated)

pos_flag = torch.tensor([1,0])
ng_flag = torch.tensor([0,1])
#print(movie_flag)

generated = torch.cat((generated, ng_flag))
#print(generated)

temp = torch.tensor([generated.numpy()])
print(temp)

In [None]:
sample_outputs = model.generate(temp,no_repeat_ngram_size = 1,num_beams=20, num_return_sequences=2, max_new_tokens = 30)

print(sample_outputs[0])
print("result ", tokenizer.decode(sample_outputs[0], skip_special_tokens=True))

In [None]:
results = []
for index, title1 in test_titles.iterrows():

    title = title1['review']
    new_titles = {
        'seed': f'{title.split()[0]} _ {title1.type}',
        'predictions': []
    }
    generated = tokenizer("<|startoftext|> "+ title.split()[0], return_tensors="pt").input_ids
    generated = generated[0]

    if title1['sentiment'] == 'positive':
        generated = torch.cat((generated, pos_flag))
    else:
        generated = torch.cat((generated, ng_flag))
    
    generated = torch.tensor([generated.numpy()])
    sample_outputs = model.generate(generated,no_repeat_ngram_size = 1,num_beams=20, num_return_sequences=2, max_new_tokens = 50)
    
    new_titles['predictions'] = sample_outputs
    results.append(new_titles)

In [None]:
for new_title in results:
    print(f"seed: {new_title['seed']}")
    for i, pred in enumerate(new_title['predictions']):
        print(f"{i+1}: {tokenizer.decode(pred, skip_special_tokens=True)}")

In [None]:
df.tail()