In [None]:
# !pip install wandb pytorch-lightning transformers
# !git clone https://github.com/TheAlgorithms/Python.git

In [None]:
import glob
import os
import re

import pytorch_lightning as pl
import torch
import transformers
import wandb
from pytorch_lightning.loggers import WandbLogger
from torch.utils.data import DataLoader, Dataset, Subset, random_split
from transformers import AdamW, AutoModelForCausalLM, AutoTokenizer, DataCollatorWithPadding, get_scheduler

In [None]:
# Authenticate this session with Weights & Biases before the logger is created.
wandb.login()

[34m[1mwandb[0m: Currently logged in as: [33mzer0sh0t[0m (use `wandb login --relogin` to force relogin)


True

In [None]:
# Seed the random number generators through Lightning's helper so runs are repeatable.
pl.seed_everything(42)
# Checkpoint name handed to AutoTokenizer / AutoModelForCausalLM.from_pretrained.
model_name = 'gpt2'
# Number of space-separated words per sample; also the tokenizer's max_length.
max_seq_len = 32
# Fraction of the samples held out for validation.
val_pct = 0.3
# DataLoader batch size for both training and validation.
batch_size = 16
# Epoch cap for the Trainer; also used to size the learning-rate schedule.
max_epochs = 20
# Starting learning rate for AdamW (the trainer is built with auto_lr_find=True,
# so trainer.tune may replace it).
lr = 5e-5

Global seed set to 42


In [None]:
class PythonDataset(Dataset):
    """Sliding-window dataset over the concatenated text of a tree of ``.py`` files.

    Every ``*.py`` file found directly inside a sub-directory of
    ``root_dir_path`` is read and concatenated, each followed by a newline.
    The text is then split on single spaces into "words"; sample ``i`` is the
    tokenization of the ``max_seq_len`` words starting at word ``i``.

    Args:
        root_dir_path: Directory to walk. Files located directly in this
            directory are skipped, as is every directory whose path contains
            the substring ``'.git'``.
        tokenizer: Callable invoked as
            ``tokenizer(text, max_length=max_seq_len, truncation=True)``.
        max_seq_len: Number of words per window; also passed as ``max_length``.
        max_samples: Upper bound on the dataset length. Defaults to 30000
            (the value that used to be hard-coded); ``None`` exposes every
            available window.
    """

    def __init__(self, root_dir_path, tokenizer, max_seq_len, max_samples=30000):
        # Sort directories and files so the corpus order (and therefore the
        # index -> sample mapping) does not depend on filesystem enumeration order.
        dir_paths = sorted(
            root
            for root, _, _ in os.walk(root_dir_path)
            if '.git' not in root and root != root_dir_path
        )

        # Collect pieces in a list and join once instead of growing a string.
        chunks = []
        file_count = 0
        for dir_path in dir_paths:
            pattern = os.path.join(glob.escape(dir_path), '*.py')
            for py_file in sorted(glob.glob(pattern)):
                file_count += 1
                # Context manager closes the handle; explicit encoding avoids
                # locale-dependent decoding, and undecodable bytes are replaced
                # rather than aborting the whole scan.
                with open(py_file, 'r', encoding='utf-8', errors='replace') as handle:
                    chunks.append(handle.read())
                chunks.append('\n')
        self.text = ''.join(chunks)
        print(f'found {file_count} .py files in the given directory!!')

        self.words = re.split(' ', self.text)
        self.tokenizer = tokenizer
        self.max_seq_len = max_seq_len
        self.max_samples = max_samples

    def __len__(self):
        """Return the number of full windows, capped at ``max_samples``."""
        # Only count start positions that still have max_seq_len words after them,
        # so a small corpus never yields empty or short samples.
        n_windows = max(0, len(self.words) - self.max_seq_len + 1)
        if self.max_samples is None:
            return n_windows
        return min(self.max_samples, n_windows)

    def __getitem__(self, index):
        """Tokenize the window of ``max_seq_len`` words starting at ``index``.

        Raises:
            IndexError: If ``index`` is outside ``[0, len(self))``.
        """
        if not 0 <= index < len(self):
            raise IndexError(f'index {index} out of range for dataset of length {len(self)}')
        content = self.words[index: index + self.max_seq_len]
        tok_con = self.tokenizer(' '.join(content), max_length=self.max_seq_len, truncation=True)
        return tok_con

In [None]:
class PythonDataModule(pl.LightningDataModule):
    """Lightning data module that serves ``PythonDataset`` windows in padded batches.

    Args:
        model_name: Name passed to ``AutoTokenizer.from_pretrained``.
        root_dir_path: Root directory handed to ``PythonDataset``.
        max_seq_len: Words per sample (and tokenizer ``max_length``).
        val_pct: Fraction of the samples reserved for validation.
        batch_size: Batch size used by both dataloaders.
    """

    def __init__(self, model_name, root_dir_path, max_seq_len, val_pct, batch_size):
        super().__init__()
        self.tokenizer = AutoTokenizer.from_pretrained(model_name)
        # The EOS token doubles as the pad token so the collator can pad batches.
        self.tokenizer.pad_token = self.tokenizer.eos_token
        self.collate_fn = DataCollatorWithPadding(tokenizer=self.tokenizer)

        self.root_dir_path = root_dir_path
        self.max_seq_len = max_seq_len
        self.val_pct = val_pct
        self.batch_size = batch_size

        # Populated by setup(); None means "not built yet".
        self.train_data = None
        self.val_data = None

    def setup(self, stage=None):
        """Build the train/validation subsets once for the fit/validate stages.

        Samples are sliding windows that advance one word at a time, so
        neighbouring indices share almost all of their text. A random split
        would therefore place near-duplicates of every validation sample in
        the training set. Instead the tail of the index range is used for
        validation and the ``max_seq_len - 1`` windows that would straddle the
        boundary are dropped from the training side.
        """
        if stage not in ('fit', 'validate', None):
            return
        # setup() may be called both manually and by the Trainer; keep the first
        # split so the corpus is not re-read and the split does not change.
        if self.train_data is not None:
            return

        data = PythonDataset(self.root_dir_path, self.tokenizer, self.max_seq_len)
        total = len(data)
        val_len = int(self.val_pct * total)
        # No gap is needed when there is no validation split at all.
        gap = self.max_seq_len - 1 if val_len > 0 else 0
        train_len = max(0, total - val_len - gap)
        self.train_data = Subset(data, range(train_len))
        self.val_data = Subset(data, range(total - val_len, total))

    def get_dataloader(self, data, split):
        """Return a DataLoader over ``data``; only the ``'train'`` split is shuffled."""
        shuffle = split == 'train'
        return DataLoader(data, batch_size=self.batch_size, shuffle=shuffle, num_workers=2, pin_memory=True, collate_fn=self.collate_fn)

    def train_dataloader(self):
        """Shuffled loader over the training subset."""
        return self.get_dataloader(self.train_data, 'train')

    def val_dataloader(self):
        """Unshuffled loader over the validation subset."""
        return self.get_dataloader(self.val_data, 'val')

In [None]:
class DecoderModel(pl.LightningModule):
    """Lightning wrapper that fine-tunes a causal language model.

    Args:
        model_name: Name passed to ``AutoModelForCausalLM.from_pretrained``.
        lr: Learning rate for AdamW.
        max_epochs: Number of training epochs, used to size the LR schedule.
        len_train_loader: Number of training batches per epoch, used to size
            the LR schedule.
    """

    def __init__(self, model_name, lr, max_epochs, len_train_loader):
        super().__init__()
        self.model = AutoModelForCausalLM.from_pretrained(model_name)
        self.lr = lr
        self.max_epochs = max_epochs
        self.len_train_loader = len_train_loader

    def configure_optimizers(self):
        """Return AdamW plus a linear-decay schedule that is stepped every batch."""
        optimizer = AdamW(self.parameters(), lr=self.lr)
        lr_scheduler = get_scheduler(
            'linear',
            optimizer=optimizer,
            num_warmup_steps=0,
            num_training_steps=self.max_epochs * self.len_train_loader,
        )
        # num_training_steps is counted in batches, so the scheduler has to be
        # stepped per batch. Lightning's default interval is per epoch, which
        # would leave the learning rate almost undecayed.
        scheduler_config = {'scheduler': lr_scheduler, 'interval': 'step', 'frequency': 1}
        return [optimizer], [scheduler_config]

    def forward(self, batch, labels=None):
        """Run the wrapped model on ``batch``; a loss is returned when ``labels`` is given."""
        return self.model(**batch, labels=labels)

    def shared_step(self, batch, split):
        """Compute and log loss/perplexity for one batch.

        Returns the loss for the ``'train'`` split and ``None`` otherwise.
        """
        # Labels are the inputs themselves. Padded positions are set to -100,
        # the value the Hugging Face causal-LM loss ignores, so padding (which
        # reuses the EOS id) does not contribute to the loss.
        labels = batch['input_ids'].clone()
        if 'attention_mask' in batch:
            labels[batch['attention_mask'] == 0] = -100

        outputs = self(batch, labels=labels)
        loss = outputs.loss
        ppl = loss.exp()
        self.log(f'{split}_loss', loss, on_epoch=True, prog_bar=True)
        self.log(f'{split}_ppl', ppl, on_epoch=True, prog_bar=True)
        if split == 'train':
            return loss

    def training_step(self, batch, batch_idx):
        """Return the training loss for backpropagation."""
        loss = self.shared_step(batch, 'train')
        return loss

    def validation_step(self, batch, batch_idx):
        """Log validation metrics; nothing is returned."""
        self.shared_step(batch, 'val')

In [None]:
# Build the data module over the 'Python' directory and run setup() by hand so
# that train_dataloader() is available here: its length (batches per epoch) is
# what the model uses to size the learning-rate schedule.
python_dm = PythonDataModule(model_name, 'Python', max_seq_len, val_pct, batch_size)
python_dm.setup()
decoder_model = DecoderModel(model_name, lr, max_epochs, len(python_dm.train_dataloader()))

found 914 .py files in the given directory!!


In [None]:
# Log metrics to the W&B project 'programmer'.
wandb_logger = WandbLogger(project='programmer')
# One GPU, 16-bit precision, deterministic mode, metrics logged every 50 steps;
# auto_lr_find=True makes trainer.tune() run the learning-rate finder.
# NOTE(review): gpus=1 presumably requires a CUDA device to be present - confirm before running on CPU.
trainer = pl.Trainer(auto_lr_find=True, max_epochs=max_epochs, gpus=1, precision=16, logger=wandb_logger, log_every_n_steps=50, deterministic=True)

GPU available: True, used: True
TPU available: False, using: 0 TPU cores
Using native 16bit precision.


In [None]:
# Run the tuner; with auto_lr_find=True on the trainer this performs the learning-rate search.
trainer.tune(decoder_model, datamodule=python_dm)

LOCAL_RANK: 0 - CUDA_VISIBLE_DEVICES: [0]

  | Name  | Type            | Params
------------------------------------------
0 | model | GPT2LMHeadModel | 124 M 
------------------------------------------
124 M     Trainable params
0         Non-trainable params
124 M     Total params
497.759   Total estimated model params size (MB)
Global seed set to 42


HBox(children=(FloatProgress(value=0.0, description='Finding best initial lr', style=ProgressStyle(description…

Restored states from the checkpoint file at /content/lr_find_temp_model.ckpt
Learning rate set to 0.0001


{'lr_find': <pytorch_lightning.tuner.lr_finder._LRFinder at 0x7fbe2e245790>}

In [None]:
# Fine-tune the model on the data module's training split, validating on its validation split.
trainer.fit(decoder_model, datamodule=python_dm)

LOCAL_RANK: 0 - CUDA_VISIBLE_DEVICES: [0]



  | Name  | Type            | Params
------------------------------------------
0 | model | GPT2LMHeadModel | 124 M 
------------------------------------------
124 M     Trainable params
0         Non-trainable params
124 M     Total params
497.759   Total estimated model params size (MB)


HBox(children=(FloatProgress(value=1.0, bar_style='info', description='Validation sanity check', layout=Layout…

Global seed set to 42




HBox(children=(FloatProgress(value=1.0, bar_style='info', description='Training', layout=Layout(flex='2'), max…

HBox(children=(FloatProgress(value=1.0, bar_style='info', description='Validating', layout=Layout(flex='2'), m…

  "Relying on `self.log('val_loss', ...)` to set the ModelCheckpoint monitor is deprecated in v1.2"


HBox(children=(FloatProgress(value=1.0, bar_style='info', description='Validating', layout=Layout(flex='2'), m…

HBox(children=(FloatProgress(value=1.0, bar_style='info', description='Validating', layout=Layout(flex='2'), m…

HBox(children=(FloatProgress(value=1.0, bar_style='info', description='Validating', layout=Layout(flex='2'), m…

HBox(children=(FloatProgress(value=1.0, bar_style='info', description='Validating', layout=Layout(flex='2'), m…

HBox(children=(FloatProgress(value=1.0, bar_style='info', description='Validating', layout=Layout(flex='2'), m…

HBox(children=(FloatProgress(value=1.0, bar_style='info', description='Validating', layout=Layout(flex='2'), m…

HBox(children=(FloatProgress(value=1.0, bar_style='info', description='Validating', layout=Layout(flex='2'), m…

HBox(children=(FloatProgress(value=1.0, bar_style='info', description='Validating', layout=Layout(flex='2'), m…

HBox(children=(FloatProgress(value=1.0, bar_style='info', description='Validating', layout=Layout(flex='2'), m…

HBox(children=(FloatProgress(value=1.0, bar_style='info', description='Validating', layout=Layout(flex='2'), m…

HBox(children=(FloatProgress(value=1.0, bar_style='info', description='Validating', layout=Layout(flex='2'), m…

HBox(children=(FloatProgress(value=1.0, bar_style='info', description='Validating', layout=Layout(flex='2'), m…

HBox(children=(FloatProgress(value=1.0, bar_style='info', description='Validating', layout=Layout(flex='2'), m…

HBox(children=(FloatProgress(value=1.0, bar_style='info', description='Validating', layout=Layout(flex='2'), m…

HBox(children=(FloatProgress(value=1.0, bar_style='info', description='Validating', layout=Layout(flex='2'), m…

HBox(children=(FloatProgress(value=1.0, bar_style='info', description='Validating', layout=Layout(flex='2'), m…

HBox(children=(FloatProgress(value=1.0, bar_style='info', description='Validating', layout=Layout(flex='2'), m…

HBox(children=(FloatProgress(value=1.0, bar_style='info', description='Validating', layout=Layout(flex='2'), m…

HBox(children=(FloatProgress(value=1.0, bar_style='info', description='Validating', layout=Layout(flex='2'), m…

In [None]:
# Mark the active W&B run as finished.
wandb.finish()

VBox(children=(Label(value=' 0.00MB of 0.00MB uploaded (0.00MB deduped)\r'), FloatProgress(value=1.0, max=1.0)…

0,1
train_loss_step,0.25551
train_ppl_step,1.29112
epoch,19.0
trainer/global_step,26260.0
_runtime,3974.0
_timestamp,1624811871.0
_step,564.0
train_loss_epoch,0.23887
train_ppl_epoch,1.27027
val_loss,0.31541


0,1
train_loss_step,█▇▄▄▂▂▂▃▂▂▂▂▂▁▂▂▁▁▁▂▁▂▁▂▁▁▁▁▂▁▁▁▁▂▂▁▁▁▁▁
train_ppl_step,█▇▄▃▂▂▂▂▂▁▂▂▂▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁
epoch,▁▁▁▁▂▂▂▂▂▂▃▃▃▃▄▄▄▄▄▄▅▅▅▅▅▅▆▆▆▆▇▇▇▇▇▇████
trainer/global_step,▁▁▁▂▂▂▂▂▂▃▃▃▃▃▃▄▄▄▄▄▅▅▅▅▅▅▆▆▆▆▆▆▇▇▇▇▇███
_runtime,▁▁▁▂▂▂▂▂▂▃▃▃▃▃▄▄▄▄▄▄▅▅▅▅▅▆▆▆▆▆▆▇▇▇▇▇▇███
_timestamp,▁▁▁▂▂▂▂▂▂▃▃▃▃▃▄▄▄▄▄▄▅▅▅▅▅▆▆▆▆▆▆▇▇▇▇▇▇███
_step,▁▁▁▂▂▂▂▂▂▃▃▃▃▃▃▄▄▄▄▄▅▅▅▅▅▅▆▆▆▆▆▆▇▇▇▇▇███
train_loss_epoch,█▃▂▂▂▂▁▁▁▁▁▁▁▁▁▁▁▁▁▁
train_ppl_epoch,█▂▂▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁
val_loss,█▄▃▂▂▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁


In [None]:
def generate(text, max_length, **generate_kwargs):
    """Continue ``text`` with the fine-tuned model and return the decoded string.

    Uses the notebook-level ``python_dm`` (tokenizer) and ``decoder_model``
    (model and device).

    Args:
        text: Prompt to continue.
        max_length: Passed to ``model.generate`` as ``max_length``.
        **generate_kwargs: Extra keyword arguments forwarded to
            ``model.generate`` (for example ``do_sample=True`` or
            ``repetition_penalty``). With none given, decoding behaves as before.

    Returns:
        The first generated sequence, decoded with special tokens removed.
    """
    tokenizer = python_dm.tokenizer
    model = decoder_model.model
    device = decoder_model.device

    encoded = tokenizer(text, return_tensors='pt')
    input_ids = encoded['input_ids'].to(device)
    # Pass the attention mask and pad id explicitly instead of letting
    # generate() guess them (and warn) on every call.
    attention_mask = encoded['attention_mask'].to(device)
    generate_kwargs.setdefault('pad_token_id', tokenizer.pad_token_id)

    # Training leaves the module in train mode; switch to eval for generation
    # so dropout is disabled, then restore the previous mode.
    was_training = model.training
    model.eval()
    try:
        outputs = model.generate(
            input_ids,
            attention_mask=attention_mask,
            max_length=max_length,
            **generate_kwargs,
        )
    finally:
        model.train(was_training)
    return tokenizer.decode(outputs[0], skip_special_tokens=True)

In [None]:
# Sample continuation of an import statement, with max_length=1000.
output = generate('''import numpy as np''', 1000)
print(output)

Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.


import numpy as np
import pandas as pd
from matplotlib import pyplot as plt
from sklearn.datasets import make_blobs, make_circles
from sklearn.ensemble import RandomForestClassifier
from sklearn.fit_transform(
from sklearn.metrics import StandardScaler
from sklearn.matrix
from tensorflow_fit_fit_transformative_fit import StandardScaler
from tensorflow_fit_transform_transform_fit_transform = StandardScaler
from tensorflow_keras.keras.samples.samples.samples.fit_transform(x_train, train_x_train, make_blobs, make_ciris, make_ciris, make_cirirterdata, make_circles
from tensorflow_x_data, make_data, make_cirgs


data_data_data_x_test_data, make_circles
from sklearn.preprocessing import StandardScaler
from sklearn.fit_transform
from tensorflow_keras.keras.samples.fit_transform.mean_transform import StandardScaler
from tensorflow_eshield
from tensorflow_transform.keras.fit_transform import StandardScaler
from tensorflow_fit_fit_transform
from tensorflow_matrix

from tensorflow_keras.keras.mea

In [None]:
# Sample continuation of a function header, with max_length=1000.
output = generate('''def reverse_str(s):\n''', 1000)
print(output)

Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.


def reverse_str(s):
        return self.samples.shape[0]


def support_vector(x):
     return [0, 0, 1]


def support_vector(x, x_train, x_test:
      """
       :param dataset:
      :param x_train: list of support vector/vectors, y_test, ndigits: int_match: int of iterations of the experiment.
      :param total_match: the digitization, ndarray must be the class
     :return: the number of documents in dataset

     :return: the digit: list of the digitization

    >>> data_result: the Document Frequency, x_polyids, and types of the same as the dataset
     :return: The digitization is pretty simple as many entries

     >>> data_def support_def get_reverse_bit_string(n)
      :return:return: the number of documents in the corpus that contain the term frequency
         @examples :return: the number of documents inverse_polynomial(def _rbf(n)
            @examples: the range of the range of a given document for _ in which   the point is the point is fixed as the target:
             

In [None]:
# Log in to the Hugging Face Hub from the shell; presumably required by the
# push_to_hub calls at the end of the notebook.
!transformers-cli login

In [None]:
# Install the git-lfs package with apt, then run its `git lfs install` setup step.
!apt-get install git-lfs
!git lfs install

In [None]:
# Set the global git identity; replace the <email> and <username> placeholders before running.
!git config --global user.email "<email>"
!git config --global user.name "<username>"

In [None]:
# Push the fine-tuned model and its tokenizer to the Hub under the same repository name.
save_model_name = 'programmer_gpt2_decoder_prototype'
decoder_model.model.push_to_hub(save_model_name)
python_dm.tokenizer.push_to_hub(save_model_name)