In [1]:
import torch
import numpy as np
import pandas as pd

In [2]:
!pip install transformers

Collecting transformers
  Downloading transformers-4.9.2-py3-none-any.whl (2.6 MB)
[K     |████████████████████████████████| 2.6 MB 11.0 MB/s 
Collecting pyyaml>=5.1
  Downloading PyYAML-5.4.1-cp37-cp37m-manylinux1_x86_64.whl (636 kB)
[K     |████████████████████████████████| 636 kB 27.9 MB/s 
[?25hCollecting huggingface-hub==0.0.12
  Downloading huggingface_hub-0.0.12-py3-none-any.whl (37 kB)
Collecting tokenizers<0.11,>=0.10.1
  Downloading tokenizers-0.10.3-cp37-cp37m-manylinux_2_5_x86_64.manylinux1_x86_64.manylinux_2_12_x86_64.manylinux2010_x86_64.whl (3.3 MB)
[K     |████████████████████████████████| 3.3 MB 42.5 MB/s 
[?25hCollecting sacremoses
  Downloading sacremoses-0.0.45-py3-none-any.whl (895 kB)
[K     |████████████████████████████████| 895 kB 43.1 MB/s 
Installing collected packages: tokenizers, sacremoses, pyyaml, huggingface-hub, transformers
  Attempting uninstall: pyyaml
    Found existing installation: PyYAML 3.13
    Uninstalling PyYAML-3.13:
      Successfully 

In [3]:
from transformers import AutoTokenizer, AutoModelForCausalLM
from transformers import Trainer, TrainingArguments, DataCollatorForLanguageModeling, TextDataset

In [4]:
#gpt2_tokenizer = AutoTokenizer.from_pretrained("gpt2")
gpt2_tokenizer = AutoTokenizer.from_pretrained("gpt2-medium")
_ = gpt2_tokenizer.add_special_tokens({'pad_token': '<|endoftext|>'}) #Later we will need to have set a pad_token
_ = gpt2_tokenizer.add_special_tokens({'additional_special_tokens': ['<|tweet|>']}) #Each comment is made to start with this token

#TODO try BART, BERTweet, GPT Neo instead

Downloading:   0%|          | 0.00/718 [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/1.04M [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/456k [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/1.36M [00:00<?, ?B/s]

In [5]:
#gpt2_model = AutoModelForCausalLM.from_pretrained("gpt2") #500MB
gpt2_model = AutoModelForCausalLM.from_pretrained("gpt2-medium") #1.5G

Downloading:   0%|          | 0.00/1.52G [00:00<?, ?B/s]

In [6]:
gpt2_data_collator = DataCollatorForLanguageModeling(tokenizer=gpt2_tokenizer, mlm=False)

In [7]:
class TweetDataset(torch.utils.data.Dataset):
    def __init__(self, encodings):
        self.input_ids = encodings['input_ids']
        self.attention_mask = encodings['attention_mask']

    def __getitem__(self, idx):
        return {'input_ids': torch.tensor(self.input_ids[idx]), 'attention_mask': torch.tensor(self.attention_mask[idx])}

    def __len__(self):
        return len(self.input_ids)

In [8]:
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


In [9]:
!ls drive/MyDrive/LM_finetuning

 alliance_tweets_train25k_1epoch_gpt2medium_1000samples.csv
 alliance_tweets_train25k_1epoch_gpt2medium_reppen1pt5_1000samples.csv
 alliance_tweets_train25k_1epoch_gpt2small_1000samples.csv
 dup_model_train13k_1epoch_gpt2medium
 dup_tweets_train13k_1epoch_gpt2medium_1000samples.csv
 dup_tweets_train13k_1epoch_gpt2medium_reppen1pt5_1000samples.csv
 dup_tweets_train13k_1epoch_gpt2small_1000samples.csv
 fifteensquared_finetune_colab.ipynb
 fifteensquared_mjj2021_comments.txt
 fifteensquared_train3500_3epochs_gpt2medium_100samples.csv
 fifteensquared_train3500_5epochs_gpt2small_100samples.csv
 mla_tweets_2020tojune2021_forLMfinetuning.csv
 mla_tweets_finetune_colab.ipynb
 sdlp_model_train17k_1epoch_gpt2medium
 sdlp_tweets_train13k_1epoch_gpt2small_1000samples.csv
 sdlp_tweets_train17k_1epoch_gpt2medium_1000samples.csv
 sdlp_tweets_train17k_1epoch_gpt2medium_reppen1pt5_1000samples.csv
'sinn fein_tweets_train31k_1epoch_gpt2medium_1000samples.csv'
'sinn fein_tweets_train31k_1epoch_gpt2medium_

In [10]:
all_tweets = pd.read_csv('drive/MyDrive/LM_finetuning/mla_tweets_2020tojune2021_forLMfinetuning.csv')
print(all_tweets.shape)
print(f"Mean length = {all_tweets.text.apply(len).mean():.0f} characters")

(110355, 3)
Mean length = 167 characters


In [None]:
all_tweets.sample(5)

Unnamed: 0,user_id,party,text
57733,1016261363646246912,Alliance,<|tweet|>@CliffdotMac @jim_mouup @ANBorough As...
57308,583327329,Alliance,<|tweet|>Worth stressing a few things. Doesn't...
31736,44330731,SDLP,<|tweet|>Absolutely incredible#Brexitstupidity
74893,275636162,UUP,<|tweet|>@GavNix Might try and hunt that out. ...
3265,275636162,UUP,<|tweet|>@NITermite I show it for context like...


See how the base model would finish some examples. Tweets are short and unpredictable so this would be hard in any instance but from the fine tuned model we will be looking for a bit of NI specific language.

In [None]:
print(all_tweets.sample(1).to_string())

        user_id     party                                                                                                                                                                                                                                    text
67423  97452095  Alliance  <|tweet|>@MichelleGuy4 @AldermanAGrehan @Chris_McClem @SandiMcBe @tweeeetieeee @mynextproject @ELMQ86 @HamiltonAoife @paulamjennings @Joannemcneill1 @APiadlo No imposter about it. You're an inspiration. Keep on doing what you do x


In [None]:
inputs = gpt2_tokenizer.encode("""I've spoken with the individual targeted by a crude device in Ballymacash Drive""", add_special_tokens=False, return_tensors="pt")

outputs = gpt2_model.generate(inputs, 
                              max_length=128, do_sample=True, top_p=0.95, top_k=40, pad_token_id=gpt2_tokenizer.pad_token_id,
                              num_return_sequences=3)

for o in outputs:
    print('---- '+gpt2_tokenizer.decode(o)+'\n')

---- I've spoken with the individual targeted by a crude device in Ballymacash Drive-Thru. There's a video of him walking to his car with his gun drawn, then running down the road with an off-duty police officer.

A woman driving her Honda Civic got her vehicle into reverse. Police say she hit and killed the guy and is now on administrative leave. We can't reveal who this guy is, how he got into his car, but we can report that he's not a violent person. Anyone with more information is asked to call (904) 722-2236 or email dk_

---- I've spoken with the individual targeted by a crude device in Ballymacash Drive and they told me about the phone's battery. So you don't have to worry about a cell phone doing anything that would cause you to lose your life if it's in your car."

The woman went on to say that the device has been running off batteries, and that the company is not aware of any "other significant" cases where it's been found to be running off batteries.

"I do not believe this 

In [None]:
print(all_tweets.sample(1).to_string())

        user_id     party                                                                                              text
28388  41799103  Alliance  <|tweet|>NI Executive will deliver 100% rate relief for Childcare providers until 31 March 2021.


In [None]:
inputs = gpt2_tokenizer.encode("""NI Executive will deliver 100% rate relief""", add_special_tokens=False, return_tensors="pt")

outputs = gpt2_model.generate(inputs, 
                              max_length=128, do_sample=True, top_p=0.95, top_k=40, pad_token_id=gpt2_tokenizer.pad_token_id,
                              num_return_sequences=3)

for o in outputs:
    print('---- '+gpt2_tokenizer.decode(o)+'\n')

---- NI Executive will deliver 100% rate relief for these individuals," said the firm in a statement.

"I strongly urge the United States Congress to extend the funding of the Department of Health and Human Services to help ensure that the costs of care for those with pre-existing conditions do not skyrocket, which could significantly reduce the cost of prescription drug coverage."

The Senate's healthcare committee is expected to consider a similar budget resolution Thursday.<|endoftext|><|endoftext|><|endoftext|><|endoftext|><|endoftext|><|endoftext|><|endoftext|><|endoftext|><|endoftext|><|endoftext|><|endoftext|><|endoftext|><|endoftext|><|endoftext|><|endoftext|><|endoftext|><|endoftext|><|endoftext|><|endoftext|><|endoftext|><|endoftext|><|endoftext|><|endoftext|><|endoftext|><|endoftext|><|endoftext|><|endoftext|><|endoftext|><|endoftext|><|endoftext|><|endoftext|><|endoftext|><|endoftext|><|endoftext|><|endoftext|><|endoftext|><|endoftext|><|endoftext|><|endoftext|><|endoftext|

In [None]:
print(all_tweets.sample(1).to_string())

         user_id party                                                                                                                                                                        text
60201  275636162   UUP  <|tweet|>@hampton_alan @AlisonW37619617 Again we don't prop them up.... if we step out all that happens is you get another SF Minister. You're venting at the wrong party.


In [None]:
inputs = gpt2_tokenizer.encode("""@hampton_alan @AlisonW37619617 Again we don't prop them up""", add_special_tokens=False, return_tensors="pt")

outputs = gpt2_model.generate(inputs, 
                              max_length=128, do_sample=True, top_p=0.95, top_k=40, pad_token_id=gpt2_tokenizer.pad_token_id,
                              num_return_sequences=3)

for o in outputs:
    print('---- '+gpt2_tokenizer.decode(o)+'\n')

---- @hampton_alan @AlisonW37619617 Again we don't prop them up. We just use their names, the same way they are used in other places. They will never get the trust. They will never be safe. It is hard to imagine that they will take your money from anybody and still put you to it. The problem is people don't trust us, the people that they say they trust will never trust them. And that is just not true. People have been going about their business since at least the days of The People's Party. Some days they have tried to get some of the money they got

---- @hampton_alan @AlisonW37619617 Again we don't prop them up at home and leave them here. It doesn't get any better.

11/10/2013 2:40:54 Mr.Lax_N_Polls says: I'd be willing to bet he would agree with me if he knew what we're talking about.

11/10/2013 2:42:03 TheOneHole says: I'd be willing to bet he would agree with me if he knew what we're talking about.

11/10/2013 2:42:23 TheOne

---- @hampton_alan @AlisonW37619617 Again we don't pr

Fine tune GPT2 on DUP tweets. 95th percentile token number is 66 if max_length=128, so shorten max_length to 64.

In [11]:
training_encoded_dup = gpt2_tokenizer(all_tweets[all_tweets.party=='DUP'].text.tolist(), padding=True, truncation=True, max_length=64)
print(f"Now have {len(training_encoded_dup['input_ids'])} encoded DUP tweets")
print(f"Median number of tokens = {np.median([len(l) for l in training_encoded_dup['input_ids']]):g}")
print(f"but excluding padding, median = {np.median([sum([e != 50256 for e in l]) for l in training_encoded_dup['input_ids']]):g}")
print(f"and 95% point = {np.quantile([sum([e != 50256 for e in l]) for l in training_encoded_dup['input_ids']], 0.95):g}")

Now have 16223 encoded DUP tweets
Median number of tokens = 64
but excluding padding, median = 38
and 95% point = 64


In [12]:
#Now expand the vocab (embedding size is 768 for gpt2, 1024 for gpt2-medium)

gpt2_model.resize_token_embeddings(len(gpt2_tokenizer))

Embedding(50258, 1024)

In [13]:
training_dataset_dup = TweetDataset(training_encoded_dup)

In [14]:
train_dataset_dup, eval_dataset_dup = torch.utils.data.random_split(training_dataset_dup, [int(len(training_dataset_dup)*0.8), len(training_dataset_dup)-int(len(training_dataset_dup)*0.8)])

In [15]:
#gpt2 (small):
#One epoch of 13000 with batch size 8, max_length 64, takes 4-7 mins on Colab
#A 'step' in args here is a batch, i.e. 13000/8 = 1625 steps is one epoch

#gpt2-medium: one epoch takes ~15 mins

training_args = TrainingArguments(
    num_train_epochs=1,              # total number of training epochs
    do_train=True,
    do_eval=True,
    evaluation_strategy='steps',
    eval_steps=300,
    per_device_train_batch_size=8,  # batch size per device during training
    per_device_eval_batch_size=64,   # batch size for evaluation
    warmup_steps=500,                # number of warmup steps for learning rate scheduler
    weight_decay=0.01,               # strength of weight decay
    learning_rate=5e-5,   #default lr value
    save_strategy='no',
    output_dir='./results',          # output directory
    logging_dir='./logs',            # directory for storing logs
    logging_steps=500
)

In [None]:
trainer = Trainer(
    model=gpt2_model, 
    args=training_args,
    train_dataset=train_dataset_dup,
    eval_dataset=eval_dataset_dup,
    tokenizer = gpt2_tokenizer,
    data_collator = gpt2_data_collator
)

In [None]:
#1 epoch seems that it might be enough
trainer.train()

***** Running training *****
  Num examples = 12978
  Num Epochs = 1
  Instantaneous batch size per device = 8
  Total train batch size (w. parallel, distributed & accumulation) = 8
  Gradient Accumulation steps = 1
  Total optimization steps = 1623


Step,Training Loss,Validation Loss
300,No log,3.995176
600,9.391100,3.740435
900,9.391100,3.595028
1200,3.765500,3.508511
1500,3.624900,3.461737


***** Running Evaluation *****
  Num examples = 3245
  Batch size = 64
***** Running Evaluation *****
  Num examples = 3245
  Batch size = 64
***** Running Evaluation *****
  Num examples = 3245
  Batch size = 64
***** Running Evaluation *****
  Num examples = 3245
  Batch size = 64
***** Running Evaluation *****
  Num examples = 3245
  Batch size = 64


Training completed. Do not forget to share your model on huggingface.co/models =)




TrainOutput(global_step=1623, training_loss=5.4415816837670405, metrics={'train_runtime': 2019.6666, 'train_samples_per_second': 6.426, 'train_steps_per_second': 0.804, 'total_flos': 1506584701698048.0, 'train_loss': 5.4415816837670405, 'epoch': 1.0})

Generate DUP samples from the <|tweet|> starter. After one epoch, the tweets are sensible; use a lot of mentions, most of which are real like peterweirmla, DUPleader, others are made up like SteveDoddsMLA; a lot of hashtags too like ProudofNI, EastBelfast, VetsDUP

In [None]:
stub_input = gpt2_tokenizer.encode("<|tweet|>", add_special_tokens=False, return_tensors="pt").to('cuda')

max_length_used_in_training = np.max([len(s.tokens) for s in training_encoded_dup[:50]])
outputs = gpt2_model.generate(stub_input, 
                              max_length=max_length_used_in_training*0.6, 
                              do_sample=True, top_p=0.95, top_k=40, 
                              pad_token_id=gpt2_tokenizer.pad_token_id, eos_token_id=gpt2_tokenizer.eos_token_id,
                              repetition_penalty = 1.5,  #reduces overuse of mentions and hashtags
                              num_return_sequences=8)

for i,o in enumerate(outputs):
    print(len(gpt2_tokenizer.decode(o)),'characters')
    print(str(i+1)+'   '+gpt2_tokenizer.decode(o)+'\n')

150 characters
1   <|tweet|>@Bearded_Dog @MicheleStewart1 That's a pity - he is probably the last dog who will needlessly go through this process, so much deserved. I am

129 characters
2   <|tweet|>@sinead_mccau @BelfastChamber Well done SINEAD! Keep up the great work - keep us safe! Stay Safe and Healthy!! #StaySafe

156 characters
3   <|tweet|>Good to join @PoppyCemetery today. #proudofNI is always a bit of an issue and it's not so easy for the community but also those who have been there

189 characters
4   <|tweet|>Another big day for our @LambertCovens who have had their licence removed by the Health Minister. Many of those in my constituency believe that this would be devastating if it were

207 characters
5   <|tweet|>@DUPleader Thanks DUP leadership for taking the opportunity to talk about how they want Northern Ireland's future protected and<|tweet|> changed. We have a choice between Brexit or Unionism, we need

168 characters
6   <|tweet|>@drewstevens1 Yes! The key to any re

In [None]:
outputs[1]

tensor([50257,    31,    82,   500,   324,    62,    76,   535,   559,  2488,
           33,  7046,   459,  1925,  7789,  3894,  1760,   311,  8881,  2885,
            0,  9175,   510,   262,  1049,   670,   532,  1394,   514,  3338,
         5145, 16160, 19978,   290, 30840,  3228,  1303, 25681, 31511],
       device='cuda:0')

In [None]:
#Save an output sample for analysis
#Force length to 0.6*number of training tokens because it is giving too many characters (>=250) otherwise; mean should be ~160

#Generating in small chunks avoids memory 
outputs1000 = [gpt2_model.generate(stub_input, 
                                 max_length=max_length_used_in_training*0.6, 
                                 do_sample=True, top_p=0.95, top_k=40, 
                                 pad_token_id=gpt2_tokenizer.pad_token_id, eos_token_id=gpt2_tokenizer.eos_token_id,
                                 repetition_penalty = 1.5,
                                 num_return_sequences=100) for _ in range(10)]
outputs1000 = torch.vstack(outputs1000)

In [None]:
outputs1000 = [gpt2_tokenizer.decode(o) for o in outputs1000]
outputs1000_df = pd.DataFrame({'sample_number': range(1,1001),
                              'generated_text': outputs1000})
#outputs1000_df.head(3)
#outputs1000_df.to_csv('drive/MyDrive/LM_finetuning/dup_tweets_train13k_1epoch_gpt2small_1000samples.csv', index=False)
#outputs1000_df.to_csv('drive/MyDrive/LM_finetuning/dup_tweets_train13k_1epoch_gpt2medium_1000samples.csv', index=False)
outputs1000_df.to_csv('drive/MyDrive/LM_finetuning/dup_tweets_train13k_1epoch_gpt2medium_reppen1pt5_1000samples.csv', index=False)

In [None]:
#Save, at least temporarily while iterating on the parameters
gpt2_model.save_pretrained(save_directory='drive/MyDrive/LM_finetuning/dup_model_train13k_1epoch_gpt2medium')

Configuration saved in drive/MyDrive/LM_finetuning/dup_model_train13k_1epoch_gpt2medium/config.json
Model weights saved in drive/MyDrive/LM_finetuning/dup_model_train13k_1epoch_gpt2medium/pytorch_model.bin


Do the same for SDLP

In [None]:
training_encoded_sdlp = gpt2_tokenizer(all_tweets[all_tweets.party=='SDLP'].text.tolist(), padding=True, truncation=True, max_length=64)
print(f"Now have {len(training_encoded_sdlp['input_ids'])} encoded SDLP tweets")

Now have 20725 encoded SDLP tweets


In [None]:
training_dataset_sdlp = TweetDataset(training_encoded_sdlp)

In [None]:
#Reload base model
#gpt2_model = AutoModelForCausalLM.from_pretrained("gpt2") #500MB

#Resize embeddings again
#gpt2_model.resize_token_embeddings(len(gpt2_tokenizer))

In [None]:
train_dataset_sdlp, eval_dataset_sdlp = torch.utils.data.random_split(training_dataset_sdlp, [int(len(training_dataset_sdlp)*0.8), len(training_dataset_sdlp)-int(len(training_dataset_sdlp)*0.8)])
len(train_dataset_sdlp), len(eval_dataset_sdlp)

(16580, 4145)

In [None]:
trainer = Trainer(
    model=gpt2_model, 
    args=training_args,
    train_dataset=train_dataset_sdlp,
    eval_dataset=eval_dataset_sdlp,
    tokenizer = gpt2_tokenizer,
    data_collator = gpt2_data_collator
)

In [None]:
#1 epoch is fine but could maybe go a bit more
trainer.train()

***** Running training *****
  Num examples = 16580
  Num Epochs = 1
  Instantaneous batch size per device = 8
  Total train batch size (w. parallel, distributed & accumulation) = 8
  Gradient Accumulation steps = 1
  Total optimization steps = 2073


Step,Training Loss,Validation Loss
300,No log,3.798091
600,3.956600,3.692774
900,3.956600,3.629023
1200,3.776300,3.575052
1500,3.678400,3.532298
1800,3.678400,3.51005


***** Running Evaluation *****
  Num examples = 4145
  Batch size = 64
***** Running Evaluation *****
  Num examples = 4145
  Batch size = 64
***** Running Evaluation *****
  Num examples = 4145
  Batch size = 64
***** Running Evaluation *****
  Num examples = 4145
  Batch size = 64
***** Running Evaluation *****
  Num examples = 4145
  Batch size = 64
***** Running Evaluation *****
  Num examples = 4145
  Batch size = 64


Training completed. Do not forget to share your model on huggingface.co/models =)




TrainOutput(global_step=2073, training_loss=3.757450985206939, metrics={'train_runtime': 2702.6479, 'train_samples_per_second': 6.135, 'train_steps_per_second': 0.767, 'total_flos': 1924732189409280.0, 'train_loss': 3.757450985206939, 'epoch': 1.0})

SDLP samples: mentions are correct as expected, including @dptinfra (real one is deptinfra); mention of Derry, 'urgent meeting with British government'; seems more positive

In [None]:
stub_input = gpt2_tokenizer.encode("<|tweet|>", add_special_tokens=False, return_tensors="pt").to('cuda')

max_length_used_in_training = np.max([len(s.tokens) for s in training_encoded_sdlp[:50]])
outputs = gpt2_model.generate(stub_input, 
                              do_sample=True, top_p=0.95, top_k=40, 
                              pad_token_id=gpt2_tokenizer.pad_token_id, eos_token_id=gpt2_tokenizer.eos_token_id,
                              max_length=max_length_used_in_training*0.6, 
                              repetition_penalty = 1.5,
                              num_return_sequences=10)

for i,o in enumerate(outputs):
    print(str(i+1)+'   '+gpt2_tokenizer.decode(o)+'\n')
    print(len(gpt2_tokenizer.decode(o)))

1   <|tweet|>@clionamccarney I have made my peace with the SDLP. The DUP must change and become a party which offers real leadership on Brexit, with an alternative to continuing political

183
2   <|tweet|>So proud of the work being done by all those volunteers and students. You can help too - donate now to help keep our communities safe for future generations! #Pledge4Derry @

182
3   <|tweet|>At #EducationCommittee questioning Minister Weir about the COVID-19 crisis @DfE_HQ Director, Dr John McGeeney spoke of his concerns regarding Covid impact on

166
4   <|tweet|>I know that some have to take the responsibility for what happened, but this is an entirely unfair narrative. It has nothing about sectarian hatred or lack of respect; it's because there are so

202
5   <|tweet|>@SMcLaughlinmla It's still a work in progress so I would recommend you check the status on it to be sure - for example @uuponline.com is currently

155
6   <|tweet|>@LeonaONeill1 I'd prefer they say yes. But this i

In [None]:
#Save output samples
outputs1000 = [gpt2_model.generate(stub_input, 
                                 max_length=max_length_used_in_training*0.6, 
                                 do_sample=True, top_p=0.95, top_k=40, 
                                 pad_token_id=gpt2_tokenizer.pad_token_id, eos_token_id=gpt2_tokenizer.eos_token_id,
                                 repetition_penalty = 1.5,
                                 num_return_sequences=100) for _ in range(10)]
outputs1000 = torch.vstack(outputs1000)

outputs1000 = [gpt2_tokenizer.decode(o) for o in outputs1000]
outputs1000_df = pd.DataFrame({'sample_number': range(1,1001),
                              'generated_text': outputs1000})
#outputs1000_df.head(3)
#outputs1000_df.to_csv('drive/MyDrive/LM_finetuning/sdlp_tweets_train17k_1epoch_gpt2small_1000samples.csv', index=False)
#outputs1000_df.to_csv('drive/MyDrive/LM_finetuning/sdlp_tweets_train17k_1epoch_gpt2medium_1000samples.csv', index=False)
outputs1000_df.to_csv('drive/MyDrive/LM_finetuning/sdlp_tweets_train17k_1epoch_gpt2medium_reppen1pt5_1000samples.csv', index=False)

In [None]:
gpt2_model.save_pretrained(save_directory='drive/MyDrive/LM_finetuning/sdlp_model_train17k_1epoch_gpt2medium')

Configuration saved in drive/MyDrive/LM_finetuning/sdlp_model_train17k_1epoch_gpt2medium/config.json
Model weights saved in drive/MyDrive/LM_finetuning/sdlp_model_train17k_1epoch_gpt2medium/pytorch_model.bin


In [None]:
all_tweets.party.value_counts()

Sinn Fein    30671
Alliance     24673
SDLP         20725
UUP          18063
DUP          16223
Name: party, dtype: int64

In [None]:
#And for the other 3 parties - I tried in a loop but got an error; not sure why; 
#  maybe the trainer.train visualisation needs to be last line in a cell

#Make sure to have created training_args and run the embedding resizing step above

party = 'UUP'
training_encoded_tmp = gpt2_tokenizer(all_tweets[all_tweets.party==party].text.tolist(), padding=True, truncation=True, max_length=64)
print(f"Now have {len(training_encoded_tmp['input_ids'])} encoded {party} tweets")

training_dataset_tmp = TweetDataset(training_encoded_tmp)

train_dataset_tmp, eval_dataset_tmp = torch.utils.data.random_split(training_dataset_tmp, [int(len(training_dataset_tmp)*0.8), len(training_dataset_tmp)-int(len(training_dataset_tmp)*0.8)])
print(len(train_dataset_tmp), len(eval_dataset_tmp))

trainer = Trainer(
    model=gpt2_model, 
    args=training_args,
    train_dataset=train_dataset_tmp,
    eval_dataset=eval_dataset_tmp,
    tokenizer = gpt2_tokenizer,
    data_collator = gpt2_data_collator
)

Now have 18063 encoded UUP tweets
14450 3613


In [None]:
#do 1 epoch
trainer.train()

***** Running training *****
  Num examples = 14450
  Num Epochs = 1
  Instantaneous batch size per device = 8
  Total train batch size (w. parallel, distributed & accumulation) = 8
  Gradient Accumulation steps = 1
  Total optimization steps = 1807


Step,Training Loss,Validation Loss
300,No log,3.88945
600,4.010600,3.736558
900,4.010600,3.633342
1200,3.767500,3.563327
1500,3.648700,3.516448
1800,3.648700,3.498103


***** Running Evaluation *****
  Num examples = 3613
  Batch size = 64
***** Running Evaluation *****
  Num examples = 3613
  Batch size = 64
***** Running Evaluation *****
  Num examples = 3613
  Batch size = 64
***** Running Evaluation *****
  Num examples = 3613
  Batch size = 64
***** Running Evaluation *****
  Num examples = 3613
  Batch size = 64
***** Running Evaluation *****
  Num examples = 3613
  Batch size = 64


Training completed. Do not forget to share your model on huggingface.co/models =)




TrainOutput(global_step=1807, training_loss=3.7745653837506916, metrics={'train_runtime': 2356.245, 'train_samples_per_second': 6.133, 'train_steps_per_second': 0.767, 'total_flos': 1677465629491200.0, 'train_loss': 3.7745653837506916, 'epoch': 1.0})

In [None]:
#Save output samples
outputs1000 = [gpt2_model.generate(stub_input, 
                                max_length=max_length_used_in_training*0.6, 
                                do_sample=True, top_p=0.95, top_k=40, 
                                pad_token_id=gpt2_tokenizer.pad_token_id, eos_token_id=gpt2_tokenizer.eos_token_id,
                                repetition_penalty = 1.5,
                                num_return_sequences=100) for _ in range(10)]
outputs1000 = torch.vstack(outputs1000)

outputs1000 = [gpt2_tokenizer.decode(o) for o in outputs1000]
outputs1000_df = pd.DataFrame({'sample_number': range(1,1001),
                              'generated_text': outputs1000})
#outputs1000_df.to_csv(f'drive/MyDrive/LM_finetuning/{party.lower()}_tweets_train{np.round(len(training_dataset_tmp), -3) // 1000}k_1epoch_gpt2small_1000samples.csv', index=False)
#outputs1000_df.to_csv(f'drive/MyDrive/LM_finetuning/{party.lower()}_tweets_train{np.round(len(training_dataset_tmp), -3) // 1000}k_1epoch_gpt2medium_1000samples.csv', index=False)
outputs1000_df.to_csv(f'drive/MyDrive/LM_finetuning/{party.lower()}_tweets_train{np.round(len(training_dataset_tmp), -3) // 1000}k_1epoch_gpt2medium_reppen1pt5_1000samples.csv', index=False)

In [None]:
gpt2_model.save_pretrained(save_directory=f'drive/MyDrive/LM_finetuning/{party.lower()}_model_train{np.round(len(training_dataset_tmp), -3) // 1000}k_1epoch_gpt2medium')

Configuration saved in drive/MyDrive/LM_finetuning/uup_model_train18k_1epoch_gpt2medium/config.json
Model weights saved in drive/MyDrive/LM_finetuning/uup_model_train18k_1epoch_gpt2medium/pytorch_model.bin


In [16]:
party = 'Alliance'
training_encoded_tmp = gpt2_tokenizer(all_tweets[all_tweets.party==party].text.tolist(), padding=True, truncation=True, max_length=64)
print(f"Now have {len(training_encoded_tmp['input_ids'])} encoded {party} tweets")

training_dataset_tmp = TweetDataset(training_encoded_tmp)

train_dataset_tmp, eval_dataset_tmp = torch.utils.data.random_split(training_dataset_tmp, [int(len(training_dataset_tmp)*0.8), len(training_dataset_tmp)-int(len(training_dataset_tmp)*0.8)])
print(len(train_dataset_tmp), len(eval_dataset_tmp))

trainer = Trainer(
    model=gpt2_model, 
    args=training_args,
    train_dataset=train_dataset_tmp,
    eval_dataset=eval_dataset_tmp,
    tokenizer = gpt2_tokenizer,
    data_collator = gpt2_data_collator
)

Now have 24673 encoded Alliance tweets
19738 4935


In [17]:
trainer.train()

***** Running training *****
  Num examples = 19738
  Num Epochs = 1
  Instantaneous batch size per device = 8
  Total train batch size (w. parallel, distributed & accumulation) = 8
  Gradient Accumulation steps = 1
  Total optimization steps = 2468


Step,Training Loss,Validation Loss
300,No log,4.043536
600,9.140400,3.715711
900,9.140400,3.540142
1200,3.728700,3.440847
1500,3.556300,3.380584
1800,3.556300,3.332803
2100,3.437600,3.300436
2400,3.437600,3.284289


***** Running Evaluation *****
  Num examples = 4935
  Batch size = 64
***** Running Evaluation *****
  Num examples = 4935
  Batch size = 64
***** Running Evaluation *****
  Num examples = 4935
  Batch size = 64
***** Running Evaluation *****
  Num examples = 4935
  Batch size = 64
***** Running Evaluation *****
  Num examples = 4935
  Batch size = 64
***** Running Evaluation *****
  Num examples = 4935
  Batch size = 64
***** Running Evaluation *****
  Num examples = 4935
  Batch size = 64
***** Running Evaluation *****
  Num examples = 4935
  Batch size = 64


Training completed. Do not forget to share your model on huggingface.co/models =)




TrainOutput(global_step=2468, training_loss=4.668042002079746, metrics={'train_runtime': 3287.0346, 'train_samples_per_second': 6.005, 'train_steps_per_second': 0.751, 'total_flos': 2291336788574208.0, 'train_loss': 4.668042002079746, 'epoch': 1.0})

In [20]:
#Save output samples
stub_input = gpt2_tokenizer.encode("<|tweet|>", add_special_tokens=False, return_tensors="pt").to('cuda')
max_length_used_in_training = np.max([len(s.tokens) for s in training_encoded_tmp[:50]])

outputs1000 = [gpt2_model.generate(stub_input, 
                                max_length=max_length_used_in_training*0.6, 
                                do_sample=True, top_p=0.95, top_k=40, 
                                pad_token_id=gpt2_tokenizer.pad_token_id, eos_token_id=gpt2_tokenizer.eos_token_id,
                                repetition_penalty = 1.5,
                                num_return_sequences=100) for _ in range(10)]
outputs1000 = torch.vstack(outputs1000)

outputs1000 = [gpt2_tokenizer.decode(o) for o in outputs1000]
outputs1000_df = pd.DataFrame({'sample_number': range(1,1001),
                              'generated_text': outputs1000})
#outputs1000_df.to_csv(f'drive/MyDrive/LM_finetuning/{party.lower()}_tweets_train{np.round(len(training_dataset_tmp), -3) // 1000}k_1epoch_gpt2small_1000samples.csv', index=False)
#outputs1000_df.to_csv(f'drive/MyDrive/LM_finetuning/{party.lower()}_tweets_train{np.round(len(training_dataset_tmp), -3) // 1000}k_1epoch_gpt2medium_1000samples.csv', index=False)
outputs1000_df.to_csv(f'drive/MyDrive/LM_finetuning/{party.lower()}_tweets_train{np.round(len(training_dataset_tmp), -3) // 1000}k_1epoch_gpt2medium_reppen1pt5_1000samples.csv', index=False)

In [21]:
gpt2_model.save_pretrained(save_directory=f'drive/MyDrive/LM_finetuning/{party.lower()}_model_train{np.round(len(training_dataset_tmp), -3) // 1000}k_1epoch_gpt2medium')

Configuration saved in drive/MyDrive/LM_finetuning/alliance_model_train25k_1epoch_gpt2medium/config.json
Model weights saved in drive/MyDrive/LM_finetuning/alliance_model_train25k_1epoch_gpt2medium/pytorch_model.bin


In [22]:
party = 'Sinn Fein'
training_encoded_tmp = gpt2_tokenizer(all_tweets[all_tweets.party==party].text.tolist(), padding=True, truncation=True, max_length=64)
print(f"Now have {len(training_encoded_tmp['input_ids'])} encoded {party} tweets")

training_dataset_tmp = TweetDataset(training_encoded_tmp)

train_dataset_tmp, eval_dataset_tmp = torch.utils.data.random_split(training_dataset_tmp, [int(len(training_dataset_tmp)*0.8), len(training_dataset_tmp)-int(len(training_dataset_tmp)*0.8)])
print(len(train_dataset_tmp), len(eval_dataset_tmp))

trainer = Trainer(
    model=gpt2_model, 
    args=training_args,
    train_dataset=train_dataset_tmp,
    eval_dataset=eval_dataset_tmp,
    tokenizer = gpt2_tokenizer,
    data_collator = gpt2_data_collator
)

Now have 30671 encoded Sinn Fein tweets
24536 6135


In [23]:
trainer.train()

***** Running training *****
  Num examples = 24536
  Num Epochs = 1
  Instantaneous batch size per device = 8
  Total train batch size (w. parallel, distributed & accumulation) = 8
  Gradient Accumulation steps = 1
  Total optimization steps = 3067


Step,Training Loss,Validation Loss
300,No log,3.94218
600,4.092800,3.816211
900,4.092800,3.732922
1200,3.869700,3.669698
1500,3.782700,3.626852
1800,3.782700,3.5874
2100,3.671600,3.556738
2400,3.671600,3.535959
2700,3.614200,3.520074
3000,3.613600,3.512507


***** Running Evaluation *****
  Num examples = 6135
  Batch size = 64
***** Running Evaluation *****
  Num examples = 6135
  Batch size = 64
***** Running Evaluation *****
  Num examples = 6135
  Batch size = 64
***** Running Evaluation *****
  Num examples = 6135
  Batch size = 64
***** Running Evaluation *****
  Num examples = 6135
  Batch size = 64
***** Running Evaluation *****
  Num examples = 6135
  Batch size = 64
***** Running Evaluation *****
  Num examples = 6135
  Batch size = 64
***** Running Evaluation *****
  Num examples = 6135
  Batch size = 64
***** Running Evaluation *****
  Num examples = 6135
  Batch size = 64
***** Running Evaluation *****
  Num examples = 6135
  Batch size = 64


Training completed. Do not forget to share your model on huggingface.co/models =)




TrainOutput(global_step=3067, training_loss=3.7687769872107566, metrics={'train_runtime': 4386.8919, 'train_samples_per_second': 5.593, 'train_steps_per_second': 0.699, 'total_flos': 2848325030117376.0, 'train_loss': 3.7687769872107566, 'epoch': 1.0})

In [24]:
#Save output samples
stub_input = gpt2_tokenizer.encode("<|tweet|>", add_special_tokens=False, return_tensors="pt").to('cuda')
max_length_used_in_training = np.max([len(s.tokens) for s in training_encoded_tmp[:50]])

outputs1000 = [gpt2_model.generate(stub_input, 
                                max_length=max_length_used_in_training*0.6, 
                                do_sample=True, top_p=0.95, top_k=40, 
                                pad_token_id=gpt2_tokenizer.pad_token_id, eos_token_id=gpt2_tokenizer.eos_token_id,
                                repetition_penalty = 1.5,
                                #forced_eos_token_id = 50256,
                                num_return_sequences=100) for _ in range(10)]
#with repetition_penalty = 1.0, 0.9% rows include 'xx xx'; 0.6% include 'xx xx xx' 188 total (xx, xxx, xo) words
#with repetition_penalty = 1.5, 1.1% rows include 'xx xx'; 0.0% include 'xx xx xx'; 98 total (xx, xxx, xo) words
#with repetition_penalty = 3.0, 1.0% rows include 'xx xx'; 0.0% include 'xx xx xx'; 115 total (xx, xxx, xo) words
#< 1.0 gives bad results

#forced_eos_token_id = 50256?

outputs1000 = torch.vstack(outputs1000)

outputs1000 = [gpt2_tokenizer.decode(o) for o in outputs1000]
outputs1000_df = pd.DataFrame({'sample_number': range(1,1001),
                              'generated_text': outputs1000})


In [25]:
#outputs1000_df.to_csv(f'drive/MyDrive/LM_finetuning/{party.lower()}_tweets_train{np.round(len(training_dataset_tmp), -3) // 1000}k_1epoch_gpt2small_1000samples.csv', index=False)
#outputs1000_df.to_csv(f'drive/MyDrive/LM_finetuning/{party.lower()}_tweets_train{np.round(len(training_dataset_tmp), -3) // 1000}k_1epoch_gpt2medium_1000samples.csv', index=False)

outputs1000_df.to_csv(f'drive/MyDrive/LM_finetuning/{party.lower()}_tweets_train{np.round(len(training_dataset_tmp), -3) // 1000}k_1epoch_gpt2medium_reppen1pt5_1000samples.csv', index=False)

In [26]:
gpt2_model.save_pretrained(save_directory=f'drive/MyDrive/LM_finetuning/{party.lower()}_model_train{np.round(len(training_dataset_tmp), -3) // 1000}k_1epoch_gpt2medium')

Configuration saved in drive/MyDrive/LM_finetuning/sinn fein_model_train31k_1epoch_gpt2medium/config.json
Model weights saved in drive/MyDrive/LM_finetuning/sinn fein_model_train31k_1epoch_gpt2medium/pytorch_model.bin
