# Install packages

In [1]:
! pip install transformers
! pip install torch

Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/
Collecting transformers
  Downloading transformers-4.28.1-py3-none-any.whl (7.0 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m7.0/7.0 MB[0m [31m101.8 MB/s[0m eta [36m0:00:00[0m
Collecting tokenizers!=0.11.3,<0.14,>=0.11.1
  Downloading tokenizers-0.13.3-cp39-cp39-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (7.8 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m7.8/7.8 MB[0m [31m107.6 MB/s[0m eta [36m0:00:00[0m
Collecting huggingface-hub<1.0,>=0.11.0
  Downloading huggingface_hub-0.13.4-py3-none-any.whl (200 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m200.1/200.1 kB[0m [31m26.7 MB/s[0m eta [36m0:00:00[0m
Installing collected packages: tokenizers, huggingface-hub, transformers
Successfully installed huggingface-hub-0.13.4 tokenizers-0.13.3 transformers-4.28.1
Looking in indexes: https://pypi.org/simple, https://

In [2]:
!pwd

/content


# import all packages

In [3]:
from transformers import BertTokenizer, GPT2LMHeadModel, TextGenerationPipeline,GPT2Tokenizer
from transformers import TextDataset, DataCollatorForLanguageModeling
from transformers import Trainer, TrainingArguments

import re
import pandas as pd
import datetime
import torch

## Define data cleaning function to remove symbols

In [4]:
def cleaning(s):
    s = str(s)
    s = re.sub('\s\W',' ',s)
    s = re.sub('\W,\s',' ',s)
    s = re.sub("\d+", "", s)
    s = re.sub('\s+',' ',s)
    s = re.sub('[!@#$_]', '', s)
    s = s.replace("co","")
    s = s.replace("https","")
    s = s.replace("[\w*"," ")
    return s

## Converting CVS file to TXT file

In [5]:
def cvs2txt(origin_filename,saving_file):

    df = pd.read_csv(origin_filename, encoding="ISO-8859-1") 
    df = df.dropna()
    text_data = open(saving_file, 'w')
    for idx, item in df.iterrows():
        article = cleaning(item["Article"])
        text_data.write(article)
    text_data.close()
    print('txt data is saved!')

In [6]:
def load_dataset(file_path, tokenizer, block_size = 128):
    dataset = TextDataset(
        tokenizer = tokenizer,
        file_path = file_path,
        block_size = block_size
    )
    return dataset

In [7]:
def load_data_collator(tokenizer, mlm = False):
    data_collator = DataCollatorForLanguageModeling(
        tokenizer=tokenizer, 
        mlm=mlm,
    )
    return data_collator

## Define Training function

In [8]:
def train(train_file_path,model_name,output_dir,overwrite_output_dir,per_device_train_batch_size,num_train_epochs,save_steps,date):
    device = 'cuda' if torch.cuda.is_available() else 'cpu'
    print('>>>>>> Using %s to train your model <<<<<<'%device)
    tokenizer = GPT2Tokenizer.from_pretrained(model_name)
    train_dataset = load_dataset(train_file_path, tokenizer)
    data_collator = load_data_collator(tokenizer)
    # tokenizer.save_pretrained(output_dir)
    model = GPT2LMHeadModel.from_pretrained(model_name)
    # model.save_pretrained(output_dir)
    training_args = TrainingArguments(
        output_dir=output_dir,
        overwrite_output_dir=overwrite_output_dir,
        per_device_train_batch_size=per_device_train_batch_size,
        num_train_epochs=num_train_epochs,
        optim='adamw_torch',
        )
    
    trainer = Trainer(
        model=model,
        args=training_args,
        data_collator=data_collator,
        train_dataset=train_dataset,
        )
    model_path = output_dir+'_'+str(save_steps)+'_'+str(num_train_epochs)+'_'+str(date)
    trainer.train()
    trainer.save_model(model_path)
    print('>>>>> New model is saved! <<<<<')

In [9]:
import os
origin_filename = "Articles.csv"
saving_file = "Articles.txt"
cvs2txt(origin_filename,saving_file)

today = datetime.date.today()
train_file_path = "Articles.txt"


txt data is saved!


In [15]:
os.environ["WANDB_DISABLED"] = "true"
model_name = 'gpt2'
output_dir = 'gpt2-fine-tune/'
overwrite_output_dir = False
per_device_train_batch_size = 8
num_train_epochs = 5.0
save_steps = 500

In [16]:
path = os.getcwd()
new_output_dir = output_dir+str(today)
path = os.path.join(path,new_output_dir)
# print(f'path:{path}')
isExist = os.path.exists(path)

In [17]:
if not isExist:
    os.makedirs(path)
    print('>>>>>> Model Folder is created <<<<<<')

>>>>>> Model Folder is created <<<<<<


# Model training

In [18]:
train(
  train_file_path=train_file_path,
  model_name=model_name,
  output_dir=path,
  overwrite_output_dir=overwrite_output_dir,
  per_device_train_batch_size=per_device_train_batch_size,
  num_train_epochs=num_train_epochs,
  save_steps=save_steps,
  date=today)

>>>>>> Using cuda to train your model <<<<<<


Downloading (…)olve/main/vocab.json:   0%|          | 0.00/1.04M [00:00<?, ?B/s]

Downloading (…)olve/main/merges.txt:   0%|          | 0.00/456k [00:00<?, ?B/s]

Downloading (…)lve/main/config.json:   0%|          | 0.00/665 [00:00<?, ?B/s]



Downloading pytorch_model.bin:   0%|          | 0.00/548M [00:00<?, ?B/s]

Downloading (…)neration_config.json:   0%|          | 0.00/124 [00:00<?, ?B/s]

Using the `WANDB_DISABLED` environment variable is deprecated and will be removed in v5. Use the --report_to flag to control the integrations used for logging result (for instance --report_to none).


Step,Training Loss
500,3.6997
1000,3.4093
1500,3.1665
2000,3.1266
2500,2.9756
3000,2.9593
3500,2.8563
4000,2.8528
4500,2.7917
5000,2.7824


>>>>> New model is saved! <<<<<


# Model Evaluation

### Before training

In [19]:
tokenizer = GPT2Tokenizer.from_pretrained(model_name)
model = GPT2LMHeadModel.from_pretrained(model_name)


text_generator = TextGenerationPipeline(model, tokenizer)   
output = text_generator("The Sindh government", max_length=100, do_sample=True)
print(f'output:{output}')

Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.


output:[{'generated_text': 'The Sindh government on Thursday said it will stop mining in its territory, even where the mining permits belong without the approval of Congress. The Sindh government on Thursday said it will stop mining IOUs in its territory, even where the mining permits belong without the approval of Congress.\n\nIn 2012, the Supreme Court cleared an exemption for the mining in Haryana and Rajasthan. Since then, the Haryana government has said that "it has not taken any action on the'}]


### After training

In [21]:
checkpoint = 'checkpoint-5000' ### choose the saved checkpoint
saved_model = output_dir + checkpoint
tokenizer = GPT2Tokenizer.from_pretrained(model_name)
model = GPT2LMHeadModel.from_pretrained(saved_model)


text_generator = TextGenerationPipeline(model, tokenizer)   
output = text_generator("The Sindh government", max_length=100, do_sample=True)
print(f'output:{output}')

Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.


output:[{'generated_text': 'The Sindh government and media have been saying for the last four days that it uld take action.On December, Punjab government issued a notification asking the government of Sindh to send Rs million annually to K-Electric, a small utility mpany belonging to Pakistan State Power Corporation PBS). PBS was set up in March.Pakistani Power has since been shut down and K-Electric is facing closure.strong>NEW DELHI: Cricket´s governing body has approved India´s cricket team´'}]
