In [1]:
# Imports for loading the review CSV, fine-tuning GPT-2, and splitting the data
import pandas as pd
from transformers import TextDataset, DataCollatorForLanguageModeling
from transformers import GPT2Tokenizer, GPT2LMHeadModel
from transformers import Trainer, TrainingArguments
# GPT-2 tokenizer and model classes from Hugging Face (repeats the import above).
from transformers import GPT2Tokenizer, GPT2LMHeadModel
from transformers import GPT2LMHeadModel, AutoTokenizer, DataCollatorForSeq2Seq
from sklearn.model_selection import train_test_split
import torch

In [2]:
# Use the GPU when torch can see one, otherwise fall back to the CPU.
device = torch.device("cuda") if torch.cuda.is_available() else torch.device("cpu")
# Load the raw reviews table from the local dataset folder.
review_df = pd.read_csv('dataset/Reviews.csv')

In [3]:
# Drop the metadata columns; only the review body ('Text') and its short
# title ('Summary') are used below.
review_df = review_df.drop(columns=['Id', 'ProductId', 'UserId', 'ProfileName', 'HelpfulnessNumerator', 'HelpfulnessDenominator', 'Time', 'Score'])
review_df = review_df.dropna()

# Separator inserted between the review and its summary in every example.
TLDR_SEPARATOR = ' TL;DR: '
# Upper bound on len(Text) + len(Summary), measured in characters (not tokens).
MAX_PAIR_CHARS = 800

# Discard rows whose review and summary together exceed the character budget,
# then build the 'concat' column in one vectorized step instead of a
# row-by-row Python loop.
# Because of the filter, every concatenation is at most
# MAX_PAIR_CHARS + len(TLDR_SEPARATOR) = 808 characters, so no further
# character-level truncation is needed.
pair_chars = review_df['Text'].str.len() + review_df['Summary'].str.len()
review_df = review_df.loc[~(pair_chars > MAX_PAIR_CHARS)].assign(
    concat=lambda frame: frame['Text'] + TLDR_SEPARATOR + frame['Summary']
)

In [4]:
# Fixed seed so the subsample and the train/test split come out the same on
# every Restart & Run All.
SEED = 42
SAMPLE_SIZE = 10000

# Work on a random subset of the reviews. Never request more rows than the
# frame holds, because DataFrame.sample raises in that case.
review_df = review_df.sample(n=min(SAMPLE_SIZE, len(review_df)), random_state=SEED)
# test train split: hold out 25% of the subsample
train_df, test_df = train_test_split(review_df, test_size=0.25, random_state=SEED)

In [5]:
# Character length of every concatenated test example; the largest value is
# stored as MAX_LENGTH and echoed for inspection.
# NOTE(review): this is a character count taken from the test split only, yet
# it is used downstream as a tokenizer length limit -- confirm that is intended.
concat_char_counts = test_df['concat'].str.len()
MAX_LENGTH = concat_char_counts.max()
print(MAX_LENGTH)

808


In [6]:

# Load the pretrained GPT-2 weights and the matching tokenizer.
model = GPT2LMHeadModel.from_pretrained("gpt2")
tokenizer = GPT2Tokenizer.from_pretrained("gpt2")

# Register '[PAD]' as the padding token, then resize the embedding matrix so
# the model covers the tokenizer's enlarged vocabulary.
pad_token_spec = {'pad_token': '[PAD]'}
tokenizer.add_special_tokens(pad_token_spec)
model.resize_token_embeddings(len(tokenizer))

# Move the model to the selected device; as the cell's last expression, its
# repr is displayed.
model.to(device)

GPT2LMHeadModel(
  (transformer): GPT2Model(
    (wte): Embedding(50258, 768)
    (wpe): Embedding(1024, 768)
    (drop): Dropout(p=0.1, inplace=False)
    (h): ModuleList(
      (0-11): 12 x GPT2Block(
        (ln_1): LayerNorm((768,), eps=1e-05, elementwise_affine=True)
        (attn): GPT2Attention(
          (c_attn): Conv1D()
          (c_proj): Conv1D()
          (attn_dropout): Dropout(p=0.1, inplace=False)
          (resid_dropout): Dropout(p=0.1, inplace=False)
        )
        (ln_2): LayerNorm((768,), eps=1e-05, elementwise_affine=True)
        (mlp): GPT2MLP(
          (c_fc): Conv1D()
          (c_proj): Conv1D()
          (act): NewGELUActivation()
          (dropout): Dropout(p=0.1, inplace=False)
        )
      )
    )
    (ln_f): LayerNorm((768,), eps=1e-05, elementwise_affine=True)
  )
  (lm_head): Linear(in_features=768, out_features=50258, bias=False)
)

In [7]:
from torch.utils.data import Dataset, DataLoader

class SummaryDataset(Dataset):
    """Token-id dataset built from the ``concat`` column of a DataFrame.

    Each ``concat`` string is encoded with ``tokenizer``, truncated to a token
    budget, and terminated with the tokenizer's EOS id.

    Args:
        df: DataFrame with a ``concat`` column holding the training strings.
        tokenizer: object exposing ``encode(...)``, ``eos_token_id`` and
            ``model_max_length`` (for example a Hugging Face tokenizer).
        max_length: maximum number of text tokens kept per example before the
            EOS id is appended. ``None`` means "use the tokenizer's limit".
    """

    def __init__(self, df, tokenizer, max_length):
        # Reserve one position for the EOS id so the finished sequence never
        # exceeds the tokenizer's own sequence limit.
        hard_limit = tokenizer.model_max_length - 1
        token_budget = hard_limit if max_length is None else min(int(max_length), hard_limit)
        eos_id = tokenizer.eos_token_id

        self.data = []
        for text in df['concat']:
            # Truncation must be requested explicitly; max_length alone does
            # not shorten the sequence. Padding is left to the data collator.
            token_ids = tokenizer.encode(
                text,
                add_special_tokens=True,
                truncation=True,
                max_length=token_budget,
            )
            token_ids.append(eos_id)
            self.data.append(token_ids)

    def __len__(self):
        """Return the number of encoded examples."""
        return len(self.data)

    def __getitem__(self, idx):
        """Return example ``idx`` as a 1-D ``torch.long`` tensor of token ids."""
        return torch.tensor(self.data[idx], dtype=torch.long)

In [8]:
# Collator for the Trainer; mlm=False selects the non-masked (causal)
# language-modeling mode.
data_collator = DataCollatorForLanguageModeling(tokenizer=tokenizer, mlm=False)
# Encoded training examples built from the training split.
train_dataset = SummaryDataset(train_df, tokenizer, MAX_LENGTH)



In [9]:
# Fine-tuning configuration.
training_args = TrainingArguments(
        output_dir="./results",
        per_device_train_batch_size=1,
        num_train_epochs=3,
        learning_rate=5.6e-5,            # learning rate
        save_strategy="no",
        use_cpu=False,
        # `device` is a torch.device, so comparing it with the string "cuda"
        # never matches; inspect its `.type` to enable mixed precision on GPU.
        fp16=device.type == "cuda",
)

trainer = Trainer(
        model=model,
        args=training_args,
        data_collator=data_collator,
        train_dataset=train_dataset,
)

# Run fine-tuning, then save the final model through the Trainer.
trainer.train()
trainer.save_model()

Step,Training Loss
500,4.5391
1000,3.5234
1500,3.4354
2000,3.3755
2500,3.3928
3000,3.3751
3500,3.3583
4000,3.323
4500,3.3509
5000,3.32


In [14]:
# Write the fine-tuned weights and the tokenizer files into the same
# directory so they can be reloaded together.
EXPORT_DIR = 'model'
model.save_pretrained(EXPORT_DIR)
tokenizer.save_pretrained(EXPORT_DIR)

Non-default generation parameters: {'max_length': 50, 'do_sample': True}


('model/tokenizer_config.json',
 'model/special_tokens_map.json',
 'model/vocab.json',
 'model/merges.txt',
 'model/added_tokens.json')

In [15]:
# Reload the tokenizer and the fine-tuned weights from the export directory.
from transformers import GPT2LMHeadModel, GPT2Tokenizer
checkpoint_dir = 'model'
tokenizer = GPT2Tokenizer.from_pretrained(checkpoint_dir)
model = GPT2LMHeadModel.from_pretrained(checkpoint_dir)

In [21]:
from transformers import pipeline, set_seed
from rouge import Rouge

# Number of held-out reviews to score and the prompt suffix that asks the
# model for a summary.
EVAL_ROWS = 100
PROMPT_SUFFIX = " TL;DR:"

# Generation samples (the saved generation parameters include do_sample=True),
# so fix the seed to make the summaries and the ROUGE scores repeatable.
set_seed(42)

# Run generation on the same device chosen for training instead of the
# pipeline's default.
summarizer = pipeline("text-generation", model=model, tokenizer=tokenizer, device=device)

prediction = []
actual = []

eval_df = test_df[:EVAL_ROWS]
for review_text, reference in zip(eval_df['Text'], eval_df['Summary']):
    prompt = review_text + PROMPT_SUFFIX

    # NOTE(review): the reference summaries look like short titles; forcing at
    # least 20 new tokens may depress ROUGE precision -- consider lowering it.
    ans = summarizer(
        prompt,
        min_new_tokens=20,
        max_new_tokens=80,
        top_k=2,
        return_full_text=False,   # return only the continuation, not the prompt
    )

    prediction.append(ans[0]['generated_text'].strip())
    actual.append(reference)

# ignore_empty skips pairs in which either string is empty; Rouge raises on
# those otherwise.
rouge = Rouge()
scores = rouge.get_scores(prediction, actual, avg=True, ignore_empty=True)
print(scores)



{'rouge-1': {'r': 0.07823015873015873, 'p': 0.030237229273993974, 'f': 0.041010727852751866}, 'rouge-2': {'r': 0.005833333333333333, 'p': 0.0024431818181818183, 'f': 0.00335294109879585}, 'rouge-l': {'r': 0.0768015873015873, 'p': 0.029126118162882866, 'f': 0.039760727852751865}}


In [22]:
# Take the second held-out review and turn it into a summarization prompt.
sample_row = test_df.iloc[1]
SUMMARY = sample_row['Summary']
ARTICLE = sample_row['Text'] + " TL;DR:"

# Generate a continuation and keep only the text that follows the prompt.
ans = summarizer(ARTICLE, min_new_tokens=20, max_new_tokens=100, top_k=2)
generated_text = ans[0]['generated_text']
prediction = generated_text[len(ARTICLE):]
actual = SUMMARY

# Show the prompt, the reference summary and the model's summary in turn.
for label, value in (
    ("Article: ", ARTICLE),
    ("Actual Summary: ", SUMMARY),
    ("Predicted Summary: ", prediction),
):
    print(label, value)


Article:  We love these organic corn chips. They have an excellent crunch big chips great for dipping. Try with  Salsa, Mild, Organic, 17.5 oz. or homemade salsa. We have these on suscribe and save so we never run out. Very delicious corn taste similar to fritos. TL;DR:
Actual Summary:  Most Excellent Organic yellow corn chips. Great with homemade salsa.
Predicted Summary:   Delicious! Great taste and great price! Great for dipping. Great for dipping in salsa. Great for dipping in salsa.
