In [1]:
from transformers import GPT2Tokenizer, TextDataset, DataCollatorForLanguageModeling, GPT2LMHeadModel, pipeline, \
                         Trainer, TrainingArguments


  from .autonotebook import tqdm as notebook_tqdm


In [2]:
tokenizer = GPT2Tokenizer.from_pretrained('gpt2')  # load up a standard gpt2 model

tokenizer.pad_token = tokenizer.eos_token  
# set our pad token to be the eos token. This lets gpt know how to fill space


In [5]:
# load up our data into a dataset
pds_data = TextDataset(
    tokenizer=tokenizer,
    file_path='./Data/PDS2.txt',  # Principles of Data Science - Sinan Ozdemir
    block_size=64  # length of each chunk of text to use as a datapoint
)

In [6]:
pds_data[0], pds_data[0].shape  # inspect the first point

(tensor([  198,   464,  1366,  3783,   569,  1697, 16362,   198,   464, 10688,
           198,   198, 16281,   784, 10922,   263,    12,  8344,  4872,  4981,
           198,   198, 34556,  8300,   198,  5195, 11361,    30,   198,   198,
         37906,  6593,   198, 16281,   286,  4096, 11361,   198, 16281,   784,
         32096,   257,  2060,  6126,   198,   198, 43961,  3725,   198,   198,
          4366,   517, 29191,   198,  6601,  3783,  1339,  3640,   198,   198,
         20448,  2050,   784,  3557]),
 torch.Size([64]))

In [7]:
print(tokenizer.decode(pds_data[0]))


The data science Venn diagram
The math

Example – spawner-recruit models

Computer programming
Why Python?

Python practices
Example of basic Python
Example – parsing a single tweet

Domain knowledge

Some more terminology
Data science case studies

Case study – autom


In [8]:
data_collator = DataCollatorForLanguageModeling(
    tokenizer=tokenizer, mlm=False,  
    # MLM is Masked Language Modelling (for BERT + auto-encoding tasks)
)

In [9]:
# example of how collator pads data dynamically
collator_example = data_collator([tokenizer('I am an input'), tokenizer('So am I')])

collator_example

{'input_ids': tensor([[   40,   716,   281,  5128],
        [ 2396,   716,   314, 50256]]), 'attention_mask': tensor([[1, 1, 1, 1],
        [1, 1, 1, 0]]), 'labels': tensor([[  40,  716,  281, 5128],
        [2396,  716,  314, -100]])}

In [10]:
collator_example.input_ids  # 50256 is our pad token id

tensor([[   40,   716,   281,  5128],
        [ 2396,   716,   314, 50256]])

In [11]:
tokenizer.pad_token_id

50256

In [12]:
collator_example.attention_mask  # Note the 0 in the attention mask where we have a pad token

tensor([[1, 1, 1, 1],
        [1, 1, 1, 0]])

In [13]:
collator_example.labels  # note the -100 to ignore loss calculation for the padded token
# Labels are shifted inside the GPT model so we don't need to worry about that

tensor([[  40,  716,  281, 5128],
        [2396,  716,  314, -100]])

In [14]:
model = GPT2LMHeadModel.from_pretrained('gpt2')  # load up a GPT2 model

pretrained_generator = pipeline(  # create a generator with built in params
    'text-generation', model=model, tokenizer='gpt2',
    config={'max_length': 200, 'do_sample': True, 'top_p': 0.9, 'temperature': 0.7, 'top_k': 10}
)

Hardware accelerator e.g. GPU is available in the environment, but no `device` argument is passed to the `Pipeline` object. Model will be on CPU.


In [15]:
print('----------')
for generated_sequence in pretrained_generator('This dataset shows the relationship', num_return_sequences=3):
    print(generated_sequence['generated_text'])
    print('----------')

Setting `pad_token_id` to `eos_token_id`:None for open-end generation.


----------
This dataset shows the relationship between the extent to which the economic effects of the recession on employment actually widened after the onset of the program and were likely to widen after about three years.

Figure 1. Effects of the recession on employment since the financial
----------
This dataset shows the relationship between a positive number (G1) − G2, which has a positive C-value of −1 to mean that there are no significant differences in the observed relationship between G1 and G2 (the only major parameter
----------
This dataset shows the relationship between mortality and IQ in all high school students at high school and grade eight between 2001 and 2006. Students who were already enrolled at higher grades were about twice as likely to die in high school, while those who completed high school
----------


In [16]:
training_args = TrainingArguments(
    output_dir="./gpt2_pds", #The output directory
    overwrite_output_dir=True, #overwrite the content of the output directory
    num_train_epochs=3, # number of training epochs
    per_device_train_batch_size=32, # batch size for training
    per_device_eval_batch_size=32,  # batch size for evaluation
    logging_steps=10,
    load_best_model_at_end=True,
    evaluation_strategy='epoch',
    save_strategy='epoch'
)

trainer = Trainer(
    model=model,
    args=training_args,
    data_collator=data_collator,
    train_dataset=pds_data.examples[:int(len(pds_data.examples)*.8)],
    eval_dataset=pds_data.examples[int(len(pds_data.examples)*.8):]
)

trainer.evaluate()

  0%|          | 0/15 [00:00<?, ?it/s]huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)
100%|██████████| 15/15 [00:05<00:00,  2.74it/s]


{'eval_loss': 4.50043249130249,
 'eval_model_preparation_time': 0.0013,
 'eval_runtime': 6.1543,
 'eval_samples_per_second': 75.557,
 'eval_steps_per_second': 2.437}

In [17]:
trainer.train()

  6%|▌         | 10/177 [00:13<03:28,  1.25s/it]

{'loss': 3.8623, 'grad_norm': 5.053485870361328, 'learning_rate': 4.717514124293785e-05, 'epoch': 0.17}


 11%|█▏        | 20/177 [00:25<03:16,  1.25s/it]

{'loss': 3.5642, 'grad_norm': 5.001654148101807, 'learning_rate': 4.435028248587571e-05, 'epoch': 0.34}


 17%|█▋        | 30/177 [00:37<02:56,  1.20s/it]

{'loss': 3.4409, 'grad_norm': 5.430204391479492, 'learning_rate': 4.152542372881356e-05, 'epoch': 0.51}


 23%|██▎       | 40/177 [00:49<02:45,  1.21s/it]

{'loss': 3.3577, 'grad_norm': 4.701898574829102, 'learning_rate': 3.8700564971751415e-05, 'epoch': 0.68}


 28%|██▊       | 50/177 [01:02<02:36,  1.23s/it]

{'loss': 3.3171, 'grad_norm': 5.026031017303467, 'learning_rate': 3.587570621468927e-05, 'epoch': 0.85}


                                                
 33%|███▎      | 59/177 [01:18<02:06,  1.07s/it]

{'eval_loss': 3.4743077754974365, 'eval_model_preparation_time': 0.0013, 'eval_runtime': 5.3561, 'eval_samples_per_second': 86.817, 'eval_steps_per_second': 2.801, 'epoch': 1.0}


 34%|███▍      | 60/177 [01:21<06:45,  3.47s/it]

{'loss': 3.2698, 'grad_norm': 6.145288467407227, 'learning_rate': 3.305084745762712e-05, 'epoch': 1.02}


 40%|███▉      | 70/177 [01:34<02:22,  1.33s/it]

{'loss': 3.0595, 'grad_norm': 5.348328590393066, 'learning_rate': 3.022598870056497e-05, 'epoch': 1.19}


 45%|████▌     | 80/177 [01:47<02:06,  1.30s/it]

{'loss': 3.0682, 'grad_norm': 6.0018181800842285, 'learning_rate': 2.7401129943502824e-05, 'epoch': 1.36}


 51%|█████     | 90/177 [02:00<01:51,  1.28s/it]

{'loss': 3.1048, 'grad_norm': 5.714569568634033, 'learning_rate': 2.457627118644068e-05, 'epoch': 1.53}


 56%|█████▋    | 100/177 [02:15<02:04,  1.62s/it]

{'loss': 3.0811, 'grad_norm': 5.528576850891113, 'learning_rate': 2.175141242937853e-05, 'epoch': 1.69}


 62%|██████▏   | 110/177 [02:35<02:12,  1.97s/it]

{'loss': 2.9887, 'grad_norm': 5.918603420257568, 'learning_rate': 1.8926553672316387e-05, 'epoch': 1.86}


                                                 
 67%|██████▋   | 118/177 [03:00<01:40,  1.71s/it]

{'eval_loss': 3.4423489570617676, 'eval_model_preparation_time': 0.0013, 'eval_runtime': 9.6868, 'eval_samples_per_second': 48.004, 'eval_steps_per_second': 1.549, 'epoch': 2.0}


 68%|██████▊   | 120/177 [03:07<04:16,  4.49s/it]

{'loss': 3.0536, 'grad_norm': 5.424740791320801, 'learning_rate': 1.6101694915254237e-05, 'epoch': 2.03}


 73%|███████▎  | 130/177 [03:27<01:37,  2.08s/it]

{'loss': 2.9814, 'grad_norm': 5.927330493927002, 'learning_rate': 1.3276836158192092e-05, 'epoch': 2.2}


 79%|███████▉  | 140/177 [03:48<01:14,  2.01s/it]

{'loss': 2.9502, 'grad_norm': 6.3978753089904785, 'learning_rate': 1.0451977401129943e-05, 'epoch': 2.37}


 85%|████████▍ | 150/177 [04:09<00:58,  2.16s/it]

{'loss': 2.9388, 'grad_norm': 5.831429481506348, 'learning_rate': 7.627118644067798e-06, 'epoch': 2.54}


 90%|█████████ | 160/177 [04:31<00:38,  2.27s/it]

{'loss': 2.8969, 'grad_norm': 5.965714931488037, 'learning_rate': 4.80225988700565e-06, 'epoch': 2.71}


 96%|█████████▌| 170/177 [04:54<00:15,  2.26s/it]

{'loss': 2.9128, 'grad_norm': 5.89216423034668, 'learning_rate': 1.977401129943503e-06, 'epoch': 2.88}


                                                 
100%|██████████| 177/177 [05:19<00:00,  1.58s/it]

{'eval_loss': 3.438119649887085, 'eval_model_preparation_time': 0.0013, 'eval_runtime': 10.4228, 'eval_samples_per_second': 44.614, 'eval_steps_per_second': 1.439, 'epoch': 3.0}


There were missing keys in the checkpoint model loaded: ['lm_head.weight'].
100%|██████████| 177/177 [05:23<00:00,  1.82s/it]

{'train_runtime': 322.9916, 'train_samples_per_second': 17.267, 'train_steps_per_second': 0.548, 'train_loss': 3.1584624597581765, 'epoch': 3.0}





TrainOutput(global_step=177, training_loss=3.1584624597581765, metrics={'train_runtime': 322.9916, 'train_samples_per_second': 17.267, 'train_steps_per_second': 0.548, 'total_flos': 182153207808000.0, 'train_loss': 3.1584624597581765, 'epoch': 3.0})

In [18]:
trainer.evaluate()  # loss decrease is slowing down so we are hitting our limit

100%|██████████| 15/15 [00:09<00:00,  1.60it/s]


{'eval_loss': 3.438119649887085,
 'eval_model_preparation_time': 0.0013,
 'eval_runtime': 9.3729,
 'eval_samples_per_second': 49.611,
 'eval_steps_per_second': 1.6,
 'epoch': 3.0}

In [19]:
trainer.save_model()

In [20]:
loaded_model = GPT2LMHeadModel.from_pretrained('./gpt2_pds')

finetuned_generator = pipeline(
    'text-generation', model=loaded_model, tokenizer=tokenizer,
    config={'max_length': 200, 'do_sample': True, 'top_p': 0.9, 'temperature': 0.7, 'top_k': 10}
)

Hardware accelerator e.g. GPU is available in the environment, but no `device` argument is passed to the `Pipeline` object. Model will be on CPU.


In [21]:
# examples are now sustainably about data
print('----------')
for generated_sequence in finetuned_generator('This dataset shows the relationship', num_return_sequences=3):
    print(generated_sequence['generated_text'])
    print('----------')

----------
This dataset shows the relationship between the number of children and the number of people who read
a book with at least five characters. The most common interpretation would be to say that the book is being
written by a single person – usually a teenager.
----------
This dataset shows the relationship between mean and variance versus the mean and variance of a population.
How can we make data sense? By looking at the data with
meaning. When you look at population or population types, it usually comes down to whether
----------
This dataset shows the relationship between the mean and standard deviation (SD) on a continuous-time (covariate) scale:
y_squared = np.mean(squared_squared)
totals = np.
----------


In [22]:
from transformers import GPT2Tokenizer, DataCollatorForLanguageModeling, TrainingArguments, Trainer, \
                         GPT2LMHeadModel, pipeline
from datasets import Dataset
import pandas as pd

In [23]:
data = pd.read_csv('./Data/english_to_latex.csv')

print(data.shape)

data.head(2)

(50, 2)


Unnamed: 0,English,LaTeX
0,integral from a to b of x squared,"\int_{a}^{b} x^2 \,dx"
1,integral from negative 1 to 1 of x squared,"\int_{-1}^{1} x^2 \,dx"


In [24]:
data.head(10)

Unnamed: 0,English,LaTeX
0,integral from a to b of x squared,"\int_{a}^{b} x^2 \,dx"
1,integral from negative 1 to 1 of x squared,"\int_{-1}^{1} x^2 \,dx"
2,integral from negative 1 to infinity of x cubed,"\int_{-1}^{\inf} x^3 \,dx"
3,integral from 0 to infinity of x squared,"\int_{0}^{\inf} x^2 \,dx"
4,integral from 0 to infinity of y squared,"\int_{0}^{\inf} y^2 \,dy"
5,integral from 1 to 2 of x over 2,"\int_{1}^{2} \frac{x}{2} \,dx"
6,f of x equals x squared,f(x) = x^2
7,h of x equals x squared,h(x) = x^2
8,g of x equals x squared,g(x) = x^2
9,g of x equals x to the eighth power,g(x) = x^8


In [25]:
tokenizer = GPT2Tokenizer.from_pretrained('gpt2')

tokenizer.pad_token = tokenizer.eos_token

# Add our singular prompt
CONVERSION_PROMPT = 'LCT\n'  # LaTeX conversion task

CONVERSION_TOKEN = 'LaTeX:'


In [26]:
# This is our "training prompt" that we want GPT2 to recognize and learn
training_examples = f'{CONVERSION_PROMPT}English: ' + data['English'] + '\n' + CONVERSION_TOKEN + ' ' + data['LaTeX'].astype(str)

print(training_examples[0])


LCT
English: integral from a to b of x squared
LaTeX: \int_{a}^{b} x^2 \,dx


In [27]:
task_df = pd.DataFrame({'text': training_examples})

task_df.head(2)

Unnamed: 0,text
0,LCT\nEnglish: integral from a to b of x square...
1,LCT\nEnglish: integral from negative 1 to 1 of...


In [28]:
# adding the EOS token at the end so the model knows when to stop predicting

task_df['text'] = task_df['text'].map(lambda x: f'{x}{tokenizer.eos_token}')

In [29]:
latex_data = Dataset.from_pandas(task_df)  # turn a pandas DataFrame into a Dataset

def preprocess(examples):  
    # tokenize our text but don't pad because our collator will pad for us dynamically
    return tokenizer(examples['text'], truncation=True)

latex_data = latex_data.map(preprocess, batched=True)

latex_data = latex_data.train_test_split(train_size=.8)

100%|██████████| 1/1 [00:00<00:00, 33.55ba/s]


In [30]:
latex_data['train'][0]

{'text': 'LCT\nEnglish: g of x equals pi cubed\nLaTeX: g(x) = \\pi^3<|endoftext|>',
 'input_ids': [43,
  4177,
  198,
  15823,
  25,
  308,
  286,
  2124,
  21767,
  31028,
  13617,
  276,
  198,
  14772,
  49568,
  25,
  308,
  7,
  87,
  8,
  796,
  3467,
  14415,
  61,
  18,
  50256],
 'attention_mask': [1,
  1,
  1,
  1,
  1,
  1,
  1,
  1,
  1,
  1,
  1,
  1,
  1,
  1,
  1,
  1,
  1,
  1,
  1,
  1,
  1,
  1,
  1,
  1,
  1,
  1]}

In [31]:
data_collator = DataCollatorForLanguageModeling(tokenizer=tokenizer, mlm=False)

In [32]:
latex_gpt2 = GPT2LMHeadModel.from_pretrained('gpt2')

In [33]:
latex_data

DatasetDict({
    train: Dataset({
        features: ['text', 'input_ids', 'attention_mask'],
        num_rows: 40
    })
    test: Dataset({
        features: ['text', 'input_ids', 'attention_mask'],
        num_rows: 10
    })
})

In [34]:
training_args = TrainingArguments(
    output_dir="./english_to_latex",
    overwrite_output_dir=True, # overwrite the content of the output directory
    num_train_epochs=5, # number of training epochs
    per_device_train_batch_size=2, # batch size for training
    per_device_eval_batch_size=20,  # batch size for evaluation
    load_best_model_at_end=True,
    logging_steps=5,
    log_level='info',
    evaluation_strategy='epoch',
    save_strategy='epoch'
)

trainer = Trainer(
    model=latex_gpt2,
    args=training_args,
    train_dataset=latex_data["train"],
    eval_dataset=latex_data["test"],
    data_collator=data_collator,
)

trainer.evaluate()

The following columns in the evaluation set don't have a corresponding argument in `GPT2LMHeadModel.forward` and have been ignored: text. If text are not expected by `GPT2LMHeadModel.forward`,  you can safely ignore this message.

***** Running Evaluation *****
  Num examples = 10
  Batch size = 20
100%|██████████| 1/1 [00:00<00:00,  7.39it/s]


{'eval_loss': 4.891357421875,
 'eval_model_preparation_time': 0.0011,
 'eval_runtime': 0.8782,
 'eval_samples_per_second': 11.387,
 'eval_steps_per_second': 1.139}

In [35]:
trainer.train()

The following columns in the training set don't have a corresponding argument in `GPT2LMHeadModel.forward` and have been ignored: text. If text are not expected by `GPT2LMHeadModel.forward`,  you can safely ignore this message.
***** Running training *****
  Num examples = 40
  Num Epochs = 5
  Instantaneous batch size per device = 2
  Total train batch size (w. parallel, distributed & accumulation) = 2
  Gradient Accumulation steps = 1
  Total optimization steps = 100
  Number of trainable parameters = 124,439,808
  5%|▌         | 5/100 [00:03<01:00,  1.57it/s]

{'loss': 4.4746, 'grad_norm': 41.448509216308594, 'learning_rate': 4.75e-05, 'epoch': 0.25}


 11%|█         | 11/100 [00:06<00:40,  2.19it/s]

{'loss': 2.9464, 'grad_norm': 22.81044578552246, 'learning_rate': 4.5e-05, 'epoch': 0.5}


 16%|█▌        | 16/100 [00:07<00:27,  3.05it/s]

{'loss': 2.4008, 'grad_norm': 16.88887596130371, 'learning_rate': 4.25e-05, 'epoch': 0.75}


 20%|██        | 20/100 [00:09<00:22,  3.59it/s]The following columns in the evaluation set don't have a corresponding argument in `GPT2LMHeadModel.forward` and have been ignored: text. If text are not expected by `GPT2LMHeadModel.forward`,  you can safely ignore this message.

***** Running Evaluation *****
  Num examples = 10
  Batch size = 20


{'loss': 1.4558, 'grad_norm': 21.767358779907227, 'learning_rate': 4e-05, 'epoch': 1.0}


                                                
 20%|██        | 20/100 [00:09<00:22,  3.59it/s]Saving model checkpoint to ./english_to_latex/checkpoint-20
Configuration saved in ./english_to_latex/checkpoint-20/config.json
Configuration saved in ./english_to_latex/checkpoint-20/generation_config.json


{'eval_loss': 1.207176923751831, 'eval_model_preparation_time': 0.0011, 'eval_runtime': 0.0907, 'eval_samples_per_second': 110.228, 'eval_steps_per_second': 11.023, 'epoch': 1.0}


Model weights saved in ./english_to_latex/checkpoint-20/model.safetensors
 26%|██▌       | 26/100 [00:12<00:25,  2.88it/s]

{'loss': 1.1389, 'grad_norm': 15.813325881958008, 'learning_rate': 3.7500000000000003e-05, 'epoch': 1.25}


 31%|███       | 31/100 [00:13<00:17,  3.97it/s]

{'loss': 1.111, 'grad_norm': 13.392133712768555, 'learning_rate': 3.5e-05, 'epoch': 1.5}


 36%|███▌      | 36/100 [00:14<00:15,  4.20it/s]

{'loss': 0.9225, 'grad_norm': 17.603906631469727, 'learning_rate': 3.2500000000000004e-05, 'epoch': 1.75}


 40%|████      | 40/100 [00:15<00:13,  4.40it/s]The following columns in the evaluation set don't have a corresponding argument in `GPT2LMHeadModel.forward` and have been ignored: text. If text are not expected by `GPT2LMHeadModel.forward`,  you can safely ignore this message.

***** Running Evaluation *****
  Num examples = 10
  Batch size = 20


{'loss': 1.006, 'grad_norm': 17.051355361938477, 'learning_rate': 3e-05, 'epoch': 2.0}


                                                
 40%|████      | 40/100 [00:15<00:13,  4.40it/s]Saving model checkpoint to ./english_to_latex/checkpoint-40
Configuration saved in ./english_to_latex/checkpoint-40/config.json
Configuration saved in ./english_to_latex/checkpoint-40/generation_config.json


{'eval_loss': 0.94855135679245, 'eval_model_preparation_time': 0.0011, 'eval_runtime': 0.0909, 'eval_samples_per_second': 109.955, 'eval_steps_per_second': 10.996, 'epoch': 2.0}


Model weights saved in ./english_to_latex/checkpoint-40/model.safetensors
 46%|████▌     | 46/100 [00:19<00:18,  2.97it/s]

{'loss': 0.8683, 'grad_norm': 13.961539268493652, 'learning_rate': 2.7500000000000004e-05, 'epoch': 2.25}


 51%|█████     | 51/100 [00:20<00:12,  3.86it/s]

{'loss': 0.9374, 'grad_norm': 22.802194595336914, 'learning_rate': 2.5e-05, 'epoch': 2.5}


 56%|█████▌    | 56/100 [00:21<00:10,  4.05it/s]

{'loss': 0.7291, 'grad_norm': 12.476018905639648, 'learning_rate': 2.25e-05, 'epoch': 2.75}


 60%|██████    | 60/100 [00:22<00:09,  4.26it/s]The following columns in the evaluation set don't have a corresponding argument in `GPT2LMHeadModel.forward` and have been ignored: text. If text are not expected by `GPT2LMHeadModel.forward`,  you can safely ignore this message.

***** Running Evaluation *****
  Num examples = 10
  Batch size = 20


{'loss': 0.5303, 'grad_norm': 12.958332061767578, 'learning_rate': 2e-05, 'epoch': 3.0}


                                                
 60%|██████    | 60/100 [00:22<00:09,  4.26it/s]Saving model checkpoint to ./english_to_latex/checkpoint-60
Configuration saved in ./english_to_latex/checkpoint-60/config.json
Configuration saved in ./english_to_latex/checkpoint-60/generation_config.json


{'eval_loss': 0.8704949617385864, 'eval_model_preparation_time': 0.0011, 'eval_runtime': 0.1278, 'eval_samples_per_second': 78.261, 'eval_steps_per_second': 7.826, 'epoch': 3.0}


Model weights saved in ./english_to_latex/checkpoint-60/model.safetensors
 66%|██████▌   | 66/100 [00:26<00:11,  2.84it/s]

{'loss': 0.5288, 'grad_norm': 14.60134220123291, 'learning_rate': 1.75e-05, 'epoch': 3.25}


 71%|███████   | 71/100 [00:27<00:07,  3.93it/s]

{'loss': 0.6414, 'grad_norm': 11.609938621520996, 'learning_rate': 1.5e-05, 'epoch': 3.5}


 76%|███████▌  | 76/100 [00:28<00:05,  4.35it/s]

{'loss': 0.6048, 'grad_norm': 23.53147315979004, 'learning_rate': 1.25e-05, 'epoch': 3.75}


 80%|████████  | 80/100 [00:29<00:04,  4.45it/s]The following columns in the evaluation set don't have a corresponding argument in `GPT2LMHeadModel.forward` and have been ignored: text. If text are not expected by `GPT2LMHeadModel.forward`,  you can safely ignore this message.

***** Running Evaluation *****
  Num examples = 10
  Batch size = 20


{'loss': 0.7961, 'grad_norm': 12.639779090881348, 'learning_rate': 1e-05, 'epoch': 4.0}


                                                
 80%|████████  | 80/100 [00:29<00:04,  4.45it/s]Saving model checkpoint to ./english_to_latex/checkpoint-80
Configuration saved in ./english_to_latex/checkpoint-80/config.json
Configuration saved in ./english_to_latex/checkpoint-80/generation_config.json


{'eval_loss': 0.8538076281547546, 'eval_model_preparation_time': 0.0011, 'eval_runtime': 0.0896, 'eval_samples_per_second': 111.663, 'eval_steps_per_second': 11.166, 'epoch': 4.0}


Model weights saved in ./english_to_latex/checkpoint-80/model.safetensors
 86%|████████▌ | 86/100 [00:32<00:04,  3.18it/s]

{'loss': 0.5927, 'grad_norm': 15.183195114135742, 'learning_rate': 7.5e-06, 'epoch': 4.25}


 91%|█████████ | 91/100 [00:33<00:02,  3.49it/s]

{'loss': 0.639, 'grad_norm': 15.707347869873047, 'learning_rate': 5e-06, 'epoch': 4.5}


 96%|█████████▌| 96/100 [00:35<00:00,  4.18it/s]

{'loss': 0.5481, 'grad_norm': 10.696751594543457, 'learning_rate': 2.5e-06, 'epoch': 4.75}


100%|██████████| 100/100 [00:36<00:00,  4.49it/s]Saving model checkpoint to ./english_to_latex/checkpoint-100
Configuration saved in ./english_to_latex/checkpoint-100/config.json
Configuration saved in ./english_to_latex/checkpoint-100/generation_config.json


{'loss': 0.5521, 'grad_norm': 15.534584045410156, 'learning_rate': 0.0, 'epoch': 5.0}


Model weights saved in ./english_to_latex/checkpoint-100/model.safetensors
The following columns in the evaluation set don't have a corresponding argument in `GPT2LMHeadModel.forward` and have been ignored: text. If text are not expected by `GPT2LMHeadModel.forward`,  you can safely ignore this message.

***** Running Evaluation *****
  Num examples = 10
  Batch size = 20
                                                 
100%|██████████| 100/100 [00:37<00:00,  4.49it/s]Saving model checkpoint to ./english_to_latex/checkpoint-100
Configuration saved in ./english_to_latex/checkpoint-100/config.json
Configuration saved in ./english_to_latex/checkpoint-100/generation_config.json


{'eval_loss': 0.8338748216629028, 'eval_model_preparation_time': 0.0011, 'eval_runtime': 0.1468, 'eval_samples_per_second': 68.112, 'eval_steps_per_second': 6.811, 'epoch': 5.0}


Model weights saved in ./english_to_latex/checkpoint-100/model.safetensors


Training completed. Do not forget to share your model on huggingface.co/models =)


Loading best model from ./english_to_latex/checkpoint-100 (score: 0.8338748216629028).
There were missing keys in the checkpoint model loaded: ['lm_head.weight'].
100%|██████████| 100/100 [00:41<00:00,  2.44it/s]

{'train_runtime': 41.0559, 'train_samples_per_second': 4.871, 'train_steps_per_second': 2.436, 'train_loss': 1.1712066221237183, 'epoch': 5.0}





TrainOutput(global_step=100, training_loss=1.1712066221237183, metrics={'train_runtime': 41.0559, 'train_samples_per_second': 4.871, 'train_steps_per_second': 2.436, 'total_flos': 3108966912000.0, 'train_loss': 1.1712066221237183, 'epoch': 5.0})

In [36]:
trainer.evaluate()  # best loss of 0.8818739

The following columns in the evaluation set don't have a corresponding argument in `GPT2LMHeadModel.forward` and have been ignored: text. If text are not expected by `GPT2LMHeadModel.forward`,  you can safely ignore this message.

***** Running Evaluation *****
  Num examples = 10
  Batch size = 20
100%|██████████| 1/1 [00:00<00:00,  2.91it/s]


{'eval_loss': 0.8338748216629028,
 'eval_model_preparation_time': 0.0011,
 'eval_runtime': 0.7886,
 'eval_samples_per_second': 12.681,
 'eval_steps_per_second': 1.268,
 'epoch': 5.0}

In [None]:
# Let's try fine-tuning it again but first let's have the model read a math book

In [37]:
# Linear Algebra book by Jim Hefferon written in LaTeX for free - https://joshua.smcvt.edu/linearalgebra

book_data = TextDataset(
    tokenizer=tokenizer,
    file_path='./Data/latex_cheat_sheet.tex',  # train on a LaTeX cheat sheet they made
    block_size=128
)

data_collator = DataCollatorForLanguageModeling(
    tokenizer=tokenizer, mlm=False,  # MLM is Masked Language Modelling
)

latex_gpt2 = GPT2LMHeadModel.from_pretrained('gpt2')

training_args = TrainingArguments(
    output_dir="./math_book",
    overwrite_output_dir=True, # overwrite the content of the output directory
    num_train_epochs=2, # number of training epochs
    per_device_train_batch_size=32, # batch size for training
    per_device_eval_batch_size=32,  # batch size for evaluation
    load_best_model_at_end=True,
    logging_steps=1,
    eval_steps=1,
    evaluation_strategy='epoch',
    save_strategy='epoch'
)

trainer = Trainer(
    model=latex_gpt2,
    args=training_args,
    data_collator=data_collator,
    train_dataset=book_data.examples[:int(len(book_data.examples)*.8)],
    eval_dataset=book_data.examples[int(len(book_data.examples)*.8):]
)

Creating features from dataset file at ./Data
Saving features into cached file ./Data/cached_lm_GPT2Tokenizer_128_latex_cheat_sheet.tex [took 0.001 s]
loading configuration file config.json from cache at /Users/vishalsankarram/.cache/huggingface/hub/models--gpt2/snapshots/607a30d783dfa663caf39e06633721c8d4cfcd7e/config.json
Model config GPT2Config {
  "activation_function": "gelu_new",
  "architectures": [
    "GPT2LMHeadModel"
  ],
  "attn_pdrop": 0.1,
  "bos_token_id": 50256,
  "embd_pdrop": 0.1,
  "eos_token_id": 50256,
  "initializer_range": 0.02,
  "layer_norm_epsilon": 1e-05,
  "model_type": "gpt2",
  "n_ctx": 1024,
  "n_embd": 768,
  "n_head": 12,
  "n_inner": null,
  "n_layer": 12,
  "n_positions": 1024,
  "reorder_and_upcast_attn": false,
  "resid_pdrop": 0.1,
  "scale_attn_by_inverse_layer_idx": false,
  "scale_attn_weights": true,
  "summary_activation": null,
  "summary_first_dropout": 0.1,
  "summary_proj_to_labels": true,
  "summary_type": "cls_index",
  "summary_use_proj

In [38]:
trainer.evaluate()  # initial loss for the math book


***** Running Evaluation *****
  Num examples = 21
  Batch size = 32
100%|██████████| 1/1 [00:00<00:00,  1.05it/s]


{'eval_loss': 3.3151752948760986,
 'eval_model_preparation_time': 0.0033,
 'eval_runtime': 1.118,
 'eval_samples_per_second': 18.783,
 'eval_steps_per_second': 0.894}

In [39]:
trainer.train()

***** Running training *****
  Num examples = 80
  Num Epochs = 2
  Instantaneous batch size per device = 32
  Total train batch size (w. parallel, distributed & accumulation) = 32
  Gradient Accumulation steps = 1
  Total optimization steps = 6
  Number of trainable parameters = 124,439,808
 17%|█▋        | 1/6 [00:03<00:17,  3.42s/it]

{'loss': 3.0569, 'grad_norm': 22.324445724487305, 'learning_rate': 4.166666666666667e-05, 'epoch': 0.33}


 33%|███▎      | 2/6 [00:18<00:39,  9.97s/it]

{'loss': 3.0514, 'grad_norm': 8.023173332214355, 'learning_rate': 3.3333333333333335e-05, 'epoch': 0.67}


 50%|█████     | 3/6 [00:28<00:30, 10.23s/it]
***** Running Evaluation *****
  Num examples = 21
  Batch size = 32


{'loss': 2.5646, 'grad_norm': 6.845594882965088, 'learning_rate': 2.5e-05, 'epoch': 1.0}



 50%|█████     | 3/6 [00:31<00:30, 10.23s/it]

{'eval_loss': 2.796145439147949, 'eval_model_preparation_time': 0.0033, 'eval_runtime': 2.4233, 'eval_samples_per_second': 8.666, 'eval_steps_per_second': 0.413, 'epoch': 1.0}


Saving model checkpoint to ./math_book/checkpoint-3
Configuration saved in ./math_book/checkpoint-3/config.json
Configuration saved in ./math_book/checkpoint-3/generation_config.json
Model weights saved in ./math_book/checkpoint-3/model.safetensors
 67%|██████▋   | 4/6 [01:06<00:42, 21.23s/it]

{'loss': 2.5167, 'grad_norm': 6.179595947265625, 'learning_rate': 1.6666666666666667e-05, 'epoch': 1.33}


 83%|████████▎ | 5/6 [01:20<00:18, 18.54s/it]

{'loss': 2.3915, 'grad_norm': 6.259373188018799, 'learning_rate': 8.333333333333334e-06, 'epoch': 1.67}


100%|██████████| 6/6 [01:23<00:00, 13.32s/it]Saving model checkpoint to ./math_book/checkpoint-6
Configuration saved in ./math_book/checkpoint-6/config.json
Configuration saved in ./math_book/checkpoint-6/generation_config.json


{'loss': 1.9878, 'grad_norm': 7.2348222732543945, 'learning_rate': 0.0, 'epoch': 2.0}


Model weights saved in ./math_book/checkpoint-6/model.safetensors

***** Running Evaluation *****
  Num examples = 21
  Batch size = 32

100%|██████████| 6/6 [01:30<00:00, 13.32s/it]Saving model checkpoint to ./math_book/checkpoint-6


{'eval_loss': 2.6824653148651123, 'eval_model_preparation_time': 0.0033, 'eval_runtime': 3.086, 'eval_samples_per_second': 6.805, 'eval_steps_per_second': 0.324, 'epoch': 2.0}


Configuration saved in ./math_book/checkpoint-6/config.json
Configuration saved in ./math_book/checkpoint-6/generation_config.json
Model weights saved in ./math_book/checkpoint-6/model.safetensors


Training completed. Do not forget to share your model on huggingface.co/models =)


Loading best model from ./math_book/checkpoint-6 (score: 2.6824653148651123).
There were missing keys in the checkpoint model loaded: ['lm_head.weight'].
100%|██████████| 6/6 [01:38<00:00, 16.37s/it]

{'train_runtime': 98.1668, 'train_samples_per_second': 1.63, 'train_steps_per_second': 0.061, 'train_loss': 2.5948264996210733, 'epoch': 2.0}





TrainOutput(global_step=6, training_loss=2.5948264996210733, metrics={'train_runtime': 98.1668, 'train_samples_per_second': 1.63, 'train_steps_per_second': 0.061, 'total_flos': 10451681280000.0, 'train_loss': 2.5948264996210733, 'epoch': 2.0})

In [40]:
trainer.save_model()  # 2 epochs led to a pretty good drop in loss

Saving model checkpoint to ./math_book


Configuration saved in ./math_book/config.json
Configuration saved in ./math_book/generation_config.json
Model weights saved in ./math_book/model.safetensors


In [41]:
math_latex_gpt2 = GPT2LMHeadModel.from_pretrained('./math_book')  # load up our gpt pre-trained

training_args = TrainingArguments(
    output_dir="./math_english_to_latex",
    overwrite_output_dir=True, #overwrite the content of the output directory
    num_train_epochs=5, # number of training epochs
    per_device_train_batch_size=4, # batch size for training
    per_device_eval_batch_size=20,  # batch size for evaluation
    load_best_model_at_end=True,
    logging_steps=5,
    log_level='info',
    evaluation_strategy='epoch',
    save_strategy='epoch'
)

trainer = Trainer(
    model=math_latex_gpt2,
    args=training_args,
    train_dataset=latex_data["train"],
    eval_dataset=latex_data["test"],
    data_collator=data_collator,
)

trainer.evaluate()  # loss is starting slightly lower than before

loading configuration file ./math_book/config.json
Model config GPT2Config {
  "_name_or_path": "gpt2",
  "activation_function": "gelu_new",
  "architectures": [
    "GPT2LMHeadModel"
  ],
  "attn_pdrop": 0.1,
  "bos_token_id": 50256,
  "embd_pdrop": 0.1,
  "eos_token_id": 50256,
  "initializer_range": 0.02,
  "layer_norm_epsilon": 1e-05,
  "model_type": "gpt2",
  "n_ctx": 1024,
  "n_embd": 768,
  "n_head": 12,
  "n_inner": null,
  "n_layer": 12,
  "n_positions": 1024,
  "reorder_and_upcast_attn": false,
  "resid_pdrop": 0.1,
  "scale_attn_by_inverse_layer_idx": false,
  "scale_attn_weights": true,
  "summary_activation": null,
  "summary_first_dropout": 0.1,
  "summary_proj_to_labels": true,
  "summary_type": "cls_index",
  "summary_use_proj": true,
  "task_specific_params": {
    "text-generation": {
      "do_sample": true,
      "max_length": 50
    }
  },
  "torch_dtype": "float32",
  "transformers_version": "4.46.2",
  "use_cache": true,
  "vocab_size": 50257
}

loading weights f

{'eval_loss': 4.3709306716918945,
 'eval_model_preparation_time': 0.005,
 'eval_runtime': 3.6385,
 'eval_samples_per_second': 2.748,
 'eval_steps_per_second': 0.275}

In [42]:
trainer.train()

The following columns in the training set don't have a corresponding argument in `GPT2LMHeadModel.forward` and have been ignored: text. If text are not expected by `GPT2LMHeadModel.forward`,  you can safely ignore this message.
***** Running training *****
  Num examples = 40
  Num Epochs = 5
  Instantaneous batch size per device = 4
  Total train batch size (w. parallel, distributed & accumulation) = 4
  Gradient Accumulation steps = 1
  Total optimization steps = 50
  Number of trainable parameters = 124,439,808
 12%|█▏        | 6/50 [00:04<00:21,  2.02it/s]

{'loss': 3.861, 'grad_norm': 21.971349716186523, 'learning_rate': 4.5e-05, 'epoch': 0.5}


 20%|██        | 10/50 [00:05<00:14,  2.79it/s]The following columns in the evaluation set don't have a corresponding argument in `GPT2LMHeadModel.forward` and have been ignored: text. If text are not expected by `GPT2LMHeadModel.forward`,  you can safely ignore this message.

***** Running Evaluation *****
  Num examples = 10
  Batch size = 20


{'loss': 2.4745, 'grad_norm': 18.61128807067871, 'learning_rate': 4e-05, 'epoch': 1.0}



 20%|██        | 10/50 [00:05<00:14,  2.79it/s]Saving model checkpoint to ./math_english_to_latex/checkpoint-10
Configuration saved in ./math_english_to_latex/checkpoint-10/config.json
Configuration saved in ./math_english_to_latex/checkpoint-10/generation_config.json


{'eval_loss': 1.7024288177490234, 'eval_model_preparation_time': 0.005, 'eval_runtime': 0.0906, 'eval_samples_per_second': 110.389, 'eval_steps_per_second': 11.039, 'epoch': 1.0}


Model weights saved in ./math_english_to_latex/checkpoint-10/model.safetensors
 30%|███       | 15/50 [00:08<00:14,  2.40it/s]

{'loss': 1.6669, 'grad_norm': 15.684871673583984, 'learning_rate': 3.5e-05, 'epoch': 1.5}


 40%|████      | 20/50 [00:10<00:09,  3.32it/s]The following columns in the evaluation set don't have a corresponding argument in `GPT2LMHeadModel.forward` and have been ignored: text. If text are not expected by `GPT2LMHeadModel.forward`,  you can safely ignore this message.

***** Running Evaluation *****
  Num examples = 10
  Batch size = 20


{'loss': 1.1208, 'grad_norm': 12.949471473693848, 'learning_rate': 3e-05, 'epoch': 2.0}



 40%|████      | 20/50 [00:10<00:09,  3.32it/s]Saving model checkpoint to ./math_english_to_latex/checkpoint-20
Configuration saved in ./math_english_to_latex/checkpoint-20/config.json
Configuration saved in ./math_english_to_latex/checkpoint-20/generation_config.json


{'eval_loss': 1.052680253982544, 'eval_model_preparation_time': 0.005, 'eval_runtime': 0.0918, 'eval_samples_per_second': 108.957, 'eval_steps_per_second': 10.896, 'epoch': 2.0}


Model weights saved in ./math_english_to_latex/checkpoint-20/model.safetensors
 50%|█████     | 25/50 [00:14<00:12,  2.01it/s]

{'loss': 1.0311, 'grad_norm': 12.243928909301758, 'learning_rate': 2.5e-05, 'epoch': 2.5}


 60%|██████    | 30/50 [00:15<00:05,  3.35it/s]The following columns in the evaluation set don't have a corresponding argument in `GPT2LMHeadModel.forward` and have been ignored: text. If text are not expected by `GPT2LMHeadModel.forward`,  you can safely ignore this message.

***** Running Evaluation *****
  Num examples = 10
  Batch size = 20


{'loss': 0.8001, 'grad_norm': 11.965880393981934, 'learning_rate': 2e-05, 'epoch': 3.0}



 60%|██████    | 30/50 [00:15<00:05,  3.35it/s]Saving model checkpoint to ./math_english_to_latex/checkpoint-30
Configuration saved in ./math_english_to_latex/checkpoint-30/config.json
Configuration saved in ./math_english_to_latex/checkpoint-30/generation_config.json


{'eval_loss': 0.892163872718811, 'eval_model_preparation_time': 0.005, 'eval_runtime': 0.0906, 'eval_samples_per_second': 110.395, 'eval_steps_per_second': 11.04, 'epoch': 3.0}


Model weights saved in ./math_english_to_latex/checkpoint-30/model.safetensors
 70%|███████   | 35/50 [00:20<00:07,  1.95it/s]

{'loss': 0.8172, 'grad_norm': 11.71354866027832, 'learning_rate': 1.5e-05, 'epoch': 3.5}


 80%|████████  | 40/50 [00:21<00:03,  3.23it/s]The following columns in the evaluation set don't have a corresponding argument in `GPT2LMHeadModel.forward` and have been ignored: text. If text are not expected by `GPT2LMHeadModel.forward`,  you can safely ignore this message.

***** Running Evaluation *****
  Num examples = 10
  Batch size = 20


{'loss': 0.7729, 'grad_norm': 9.02138614654541, 'learning_rate': 1e-05, 'epoch': 4.0}



 80%|████████  | 40/50 [00:21<00:03,  3.23it/s]Saving model checkpoint to ./math_english_to_latex/checkpoint-40
Configuration saved in ./math_english_to_latex/checkpoint-40/config.json
Configuration saved in ./math_english_to_latex/checkpoint-40/generation_config.json


{'eval_loss': 0.8590221405029297, 'eval_model_preparation_time': 0.005, 'eval_runtime': 0.0978, 'eval_samples_per_second': 102.287, 'eval_steps_per_second': 10.229, 'epoch': 4.0}


Model weights saved in ./math_english_to_latex/checkpoint-40/model.safetensors
 92%|█████████▏| 46/50 [00:25<00:01,  2.66it/s]

{'loss': 0.6986, 'grad_norm': 9.55556869506836, 'learning_rate': 5e-06, 'epoch': 4.5}


100%|██████████| 50/50 [00:26<00:00,  3.52it/s]Saving model checkpoint to ./math_english_to_latex/checkpoint-50
Configuration saved in ./math_english_to_latex/checkpoint-50/config.json
Configuration saved in ./math_english_to_latex/checkpoint-50/generation_config.json


{'loss': 0.677, 'grad_norm': 11.018890380859375, 'learning_rate': 0.0, 'epoch': 5.0}


Model weights saved in ./math_english_to_latex/checkpoint-50/model.safetensors
The following columns in the evaluation set don't have a corresponding argument in `GPT2LMHeadModel.forward` and have been ignored: text. If text are not expected by `GPT2LMHeadModel.forward`,  you can safely ignore this message.

***** Running Evaluation *****
  Num examples = 10
  Batch size = 20

100%|██████████| 50/50 [00:28<00:00,  3.52it/s]Saving model checkpoint to ./math_english_to_latex/checkpoint-50
Configuration saved in ./math_english_to_latex/checkpoint-50/config.json
Configuration saved in ./math_english_to_latex/checkpoint-50/generation_config.json


{'eval_loss': 0.8264763951301575, 'eval_model_preparation_time': 0.005, 'eval_runtime': 0.2069, 'eval_samples_per_second': 48.339, 'eval_steps_per_second': 4.834, 'epoch': 5.0}


Model weights saved in ./math_english_to_latex/checkpoint-50/model.safetensors


Training completed. Do not forget to share your model on huggingface.co/models =)


Loading best model from ./math_english_to_latex/checkpoint-50 (score: 0.8264763951301575).
There were missing keys in the checkpoint model loaded: ['lm_head.weight'].
100%|██████████| 50/50 [00:31<00:00,  1.61it/s]

{'train_runtime': 31.0552, 'train_samples_per_second': 6.44, 'train_steps_per_second': 1.61, 'train_loss': 1.3920131921768188, 'epoch': 5.0}





TrainOutput(global_step=50, training_loss=1.3920131921768188, metrics={'train_runtime': 31.0552, 'train_samples_per_second': 6.44, 'train_steps_per_second': 1.61, 'total_flos': 3511111680000.0, 'train_loss': 1.3920131921768188, 'epoch': 5.0})

In [43]:
trainer.evaluate()  # pre-training on the book for one epoch led to a minor drop in loss

The following columns in the evaluation set don't have a corresponding argument in `GPT2LMHeadModel.forward` and have been ignored: text. If text are not expected by `GPT2LMHeadModel.forward`,  you can safely ignore this message.

***** Running Evaluation *****
  Num examples = 10
  Batch size = 20
100%|██████████| 1/1 [00:00<00:00,  5.24it/s]


{'eval_loss': 0.8264763951301575,
 'eval_model_preparation_time': 0.005,
 'eval_runtime': 0.2497,
 'eval_samples_per_second': 40.055,
 'eval_steps_per_second': 4.006,
 'epoch': 5.0}

In [44]:
trainer.save_model()  # save this model

Saving model checkpoint to ./math_english_to_latex
Configuration saved in ./math_english_to_latex/config.json
Configuration saved in ./math_english_to_latex/generation_config.json
Model weights saved in ./math_english_to_latex/model.safetensors


In [45]:
loaded_model = GPT2LMHeadModel.from_pretrained('./math_english_to_latex')
latex_generator = pipeline('text-generation', model=loaded_model, tokenizer=tokenizer)

loading configuration file ./math_english_to_latex/config.json
Model config GPT2Config {
  "_name_or_path": "./math_book",
  "activation_function": "gelu_new",
  "architectures": [
    "GPT2LMHeadModel"
  ],
  "attn_pdrop": 0.1,
  "bos_token_id": 50256,
  "embd_pdrop": 0.1,
  "eos_token_id": 50256,
  "initializer_range": 0.02,
  "layer_norm_epsilon": 1e-05,
  "model_type": "gpt2",
  "n_ctx": 1024,
  "n_embd": 768,
  "n_head": 12,
  "n_inner": null,
  "n_layer": 12,
  "n_positions": 1024,
  "reorder_and_upcast_attn": false,
  "resid_pdrop": 0.1,
  "scale_attn_by_inverse_layer_idx": false,
  "scale_attn_weights": true,
  "summary_activation": null,
  "summary_first_dropout": 0.1,
  "summary_proj_to_labels": true,
  "summary_type": "cls_index",
  "summary_use_proj": true,
  "task_specific_params": {
    "text-generation": {
      "do_sample": true,
      "max_length": 50
    }
  },
  "torch_dtype": "float32",
  "transformers_version": "4.46.2",
  "use_cache": true,
  "vocab_size": 50257
}

In [46]:
text_sample = 'g of x equals integral from 0 to 1 of x squared'
conversion_text_sample = f'{CONVERSION_PROMPT}English: {text_sample}\n{CONVERSION_TOKEN}'

print(conversion_text_sample)

LCT
English: g of x equals integral from 0 to 1 of x squared
LaTeX:


In [47]:
print(latex_generator(
    conversion_text_sample, num_beams=5, early_stopping=True, temperature=0.7,
    max_length=len(tokenizer.encode(conversion_text_sample)) + 20
)[0]['generated_text'])

Truncation was not explicitly activated but `max_length` is provided a specific value, please use `truncation=True` to explicitly truncate examples to max length. Defaulting to 'longest_first' truncation strategy. If you encode pairs of sequences (GLUE-style) with the tokenizer you can select this strategy more precisely by providing a specific strategy to `truncation`.


LCT
English: g of x equals integral from 0 to 1 of x squared
LaTeX: g(x) = \int_{0}^{1} x^2 \,dx^


In [48]:
# Another example
text_sample = 'r of x is sum from 0 to x of x squared'
conversion_text_sample = f'{CONVERSION_PROMPT}English: {text_sample}\n{CONVERSION_TOKEN}'

print(latex_generator(
    conversion_text_sample, num_beams=5, early_stopping=True, temperature=0.7,
    max_length=len(tokenizer.encode(conversion_text_sample)) + 20
)[0]['generated_text'])

LCT
English: r of x is sum from 0 to x of x squared
LaTeX: r(x) = x^2 \,dx^2 \,dx^2 \,


In [49]:
print(latex_generator(
    text_sample, num_beams=5, early_stopping=True, temperature=0.7,
    max_length=len(tokenizer.encode(conversion_text_sample)) + 20
)[0]['generated_text'])

r of x is sum from 0 to x of x squared
LaTeX: \sum_{0}^{x} x^2 \,dx^2 \,dx^2 \,dx^


In [50]:
# Another example
text_sample = 'pi to the 8th power'
conversion_text_sample = f'{CONVERSION_PROMPT}English: {text_sample}\n{CONVERSION_TOKEN}'

print(latex_generator(
    conversion_text_sample, num_beams=5, early_stopping=True, temperature=.7,
    max_length=len(tokenizer.encode(conversion_text_sample)) + 20
)[0]['generated_text'])

LCT
English: pi to the 8th power
LaTeX: \pi^8 \,dx^2 \,dx^3 \,dx^4 \


In [51]:
# Sanity check that a non-finetuned model could not have done this
non_finetuned_latex_generator = pipeline(
    'text-generation', 
    model=GPT2LMHeadModel.from_pretrained('gpt2'),  # not fine-tuned!
    tokenizer=tokenizer
)

loading configuration file config.json from cache at /Users/vishalsankarram/.cache/huggingface/hub/models--gpt2/snapshots/607a30d783dfa663caf39e06633721c8d4cfcd7e/config.json
Model config GPT2Config {
  "activation_function": "gelu_new",
  "architectures": [
    "GPT2LMHeadModel"
  ],
  "attn_pdrop": 0.1,
  "bos_token_id": 50256,
  "embd_pdrop": 0.1,
  "eos_token_id": 50256,
  "initializer_range": 0.02,
  "layer_norm_epsilon": 1e-05,
  "model_type": "gpt2",
  "n_ctx": 1024,
  "n_embd": 768,
  "n_head": 12,
  "n_inner": null,
  "n_layer": 12,
  "n_positions": 1024,
  "reorder_and_upcast_attn": false,
  "resid_pdrop": 0.1,
  "scale_attn_by_inverse_layer_idx": false,
  "scale_attn_weights": true,
  "summary_activation": null,
  "summary_first_dropout": 0.1,
  "summary_proj_to_labels": true,
  "summary_type": "cls_index",
  "summary_use_proj": true,
  "task_specific_params": {
    "text-generation": {
      "do_sample": true,
      "max_length": 50
    }
  },
  "transformers_version": "4.4

In [52]:
few_shot_prompt = """LCT
English: f of x is sum from 0 to x of x squared
LaTeX: f(x) = \sum_{0}^{x} x^2 \,dx \
###
LCT
English: f of x equals integral from 0 to pi of x to the fourth power
LaTeX: f(x) = \int_{0}^{\pi} x^4 \,dx \
###
LCT
English: pi to the 8th power
LaTeX:"""

print(non_finetuned_latex_generator(
    few_shot_prompt, num_beams=5, early_stopping=True, temperature=0.7,
    max_length=len(tokenizer.encode(few_shot_prompt)) + 20
)[0]['generated_text'])

LCT
English: f of x is sum from 0 to x of x squared
LaTeX: f(x) = \sum_{0}^{x} x^2 \,dx ###
LCT
English: f of x equals integral from 0 to pi of x to the fourth power
LaTeX: f(x) = \int_{0}^{\pi} x^4 \,dx ###
LCT
English: pi to the 8th power
LaTeX: f(x) = \int_{0}^{\pi} x^2 \,dx


In [53]:
# try another prompt
few_shot_prompt = """English to LaTeX
English: f of x is sum from 0 to x of x squared
LaTeX: f(x) = \sum_{0}^{x} x^2 \,dx \
###
LCT
English: f of x equals integral from 0 to pi of x to the fourth power
LaTeX: f(x) = \int_{0}^{\pi} x^4 \,dx \
###
LCT
English: x to the eighth power
LaTeX:"""

print(non_finetuned_latex_generator(
    few_shot_prompt, num_beams=5, early_stopping=True, temperature=0.7,
    max_length=len(tokenizer.encode(few_shot_prompt)) + 20
)[0]['generated_text'])

English to LaTeX
English: f of x is sum from 0 to x of x squared
LaTeX: f(x) = \sum_{0}^{x} x^2 \,dx ###
LCT
English: f of x equals integral from 0 to pi of x to the fourth power
LaTeX: f(x) = \int_{0}^{\pi} x^4 \,dx ###
LCT
English: x to the eighth power
LaTeX: f(x) = \int_{0}^{\pi} x^4 \,dx


In [54]:
print(non_finetuned_latex_generator(
    conversion_text_sample, num_beams=5, early_stopping=True, temperature=0.7,
    max_length=len(tokenizer.encode(conversion_text_sample)) + 20
)[0]['generated_text'])

LCT
English: pi to the 8th power
LaTeX: pi to the 8th power

LaTeX: pi to the 8th power

La
