In [None]:
# Fetch the training corpus: TheAlgorithms/Python is cloned into ./Python, the directory GetDataset reads later.
# NOTE(review): the clone is not pinned to a commit, so the corpus (and the file count printed later) can drift between runs.
!git clone https://github.com/TheAlgorithms/Python.git

In [None]:
import gc
import os
import re
import glob
import torch
import random
import numpy as np
import transformers
from torch import nn
from tqdm import tqdm
import torch.nn.functional as F
from torch.utils.data import Dataset, random_split, DataLoader
from transformers import AutoTokenizer, AutoModelForCausalLM, DataCollatorWithPadding, AdamW, get_scheduler
# AutoModelForCausalLM is imported instead of AutoModelWithLMHead, which is deprecated

def clear_cache():
    """Free unreachable Python objects, then release PyTorch's cached CUDA memory.

    The garbage collector runs first so that tensors it frees can be handed
    back by the subsequent CUDA cache flush.
    """
    for release in (gc.collect, torch.cuda.empty_cache):
        release()

def seed_everything(seed):
    """Seed every random number generator the notebook uses.

    Seeds Python's `random`, NumPy, and PyTorch (CPU and all CUDA devices),
    and configures cuDNN for deterministic behaviour.

    Args:
        seed: integer seed applied to all generators.
    """
    random.seed(seed)
    # Only affects Python processes started after this point; the hash seed of
    # the running interpreter is fixed at start-up.
    os.environ['PYTHONHASHSEED'] = str(seed)
    np.random.seed(seed)
    torch.manual_seed(seed)
    # Seed every visible GPU, not just the current one (no-op without CUDA).
    torch.cuda.manual_seed_all(seed)
    torch.backends.cudnn.deterministic = True
    # Benchmark mode auto-tunes convolution algorithms per run and can pick
    # different kernels each time, which defeats the deterministic flag above.
    torch.backends.cudnn.benchmark = False
    
# Start from a clean state: free cached memory, fix the RNG seeds, and log the
# library versions so the run can be reproduced later.
clear_cache()
seed_everything(42)
print(torch.__version__, torch.version.cuda, transformers.__version__)

1.9.0+cu102 10.2 4.7.0


In [None]:
# Run-wide configuration. Use the GPU when one is visible, otherwise the CPU.
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
model_name = 'gpt2'  # checkpoint name passed to from_pretrained for both tokenizer and model
num_epochs = 20  # full train + validation passes
batch_size = 16  # samples per batch for both loaders
max_seq_len = 32  # words per dataset window and token cap applied by truncation
lr = 5e-5  # initial learning rate given to the optimizer
print(device)

cuda


In [None]:
tokenizer = AutoTokenizer.from_pretrained(model_name)
# The tokenizer needs a padding token for the padding collator below; adding
# '[PAD]' extends the vocabulary by one id.
tokenizer.add_special_tokens({'pad_token': '[PAD]'})
model = AutoModelForCausalLM.from_pretrained(model_name)
# Grow the embedding matrix to the new vocabulary size. Without this, the
# '[PAD]' id lies past the end of the embedding table and any padded batch
# fails with an index-out-of-range error.
model.resize_token_embeddings(len(tokenizer))
model = model.to(device)
# Pads every batch to the length of its longest sequence.
collate_fn = DataCollatorWithPadding(tokenizer=tokenizer)

In [None]:
class GetDataset(Dataset):
    """Sliding-window dataset over the concatenated text of a tree of ``.py`` files.

    Every ``.py`` file found in a sub-directory of ``root_dir_path`` is read and
    joined into one string (``self.text``), which is split on single spaces into
    ``self.words``. Sample ``i`` is the window ``words[i : i + max_seq_len]``,
    re-joined with spaces and tokenized with truncation to ``max_seq_len`` tokens.

    Files directly inside ``root_dir_path`` and any directory whose path
    contains ``'.git'`` are skipped.

    Args:
        root_dir_path: directory to search for Python sources.
        tokenizer: callable invoked as ``tokenizer(text, max_length=..., truncation=True)``.
        max_seq_len: window size in words and maximum tokens per sample.
        num_samples: upper bound on the dataset length (default 25000).
    """

    def __init__(self, root_dir_path, tokenizer, max_seq_len, num_samples=25000):
        # Sort directories and files so the corpus order (and therefore every
        # sample) does not depend on filesystem enumeration order.
        source_dirs = sorted(
            root
            for root, _, _ in os.walk(root_dir_path)
            if '.git' not in root and root != root_dir_path
        )

        file_count = 0
        pieces = []
        for dir_path in source_dirs:
            for py_file in sorted(glob.glob(f'{dir_path}/*.py')):
                file_count += 1
                # Context manager closes the handle; explicit encoding avoids
                # depending on the platform's default locale.
                with open(py_file, 'r', encoding='utf-8') as handle:
                    pieces.append(handle.read())
                pieces.append('\n')
        print(f'found {file_count} .py files in the given directory!!')

        # Join once instead of growing a string inside the loop.
        self.text = ''.join(pieces)
        # Splitting on a single space (not on all whitespace) keeps newlines and
        # indentation: runs of spaces become empty strings that are restored
        # when a window is re-joined with ' '.
        self.words = self.text.split(' ')
        self.tokenizer = tokenizer
        self.max_seq_len = max_seq_len
        self.num_samples = num_samples

    def __len__(self):
        # Never advertise more samples than there are full windows; indices past
        # the end of the corpus would otherwise yield empty samples.
        full_windows = max(0, len(self.words) - self.max_seq_len + 1)
        return min(self.num_samples, full_windows)

    def __getitem__(self, index):
        content = self.words[index: index + self.max_seq_len]
        tok_con = self.tokenizer(' '.join(content), max_length=self.max_seq_len, truncation=True)
        return tok_con

In [None]:
# Build the dataset from the cloned 'Python' directory, then show its length
# and the first 1000 characters of the concatenated corpus as a sanity check.
data = GetDataset('Python', tokenizer, max_seq_len)
print(len(data), data.text[:1000])

found 913 .py files in the given directory!!
25000 """
    Perceptron
    w = w + N * (d(k) - y) * x(k)

    Using perceptron network for oil analysis, with Measuring of 3 parameters
    that represent chemical characteristics we can classify the oil, in p1 or p2
    p1 = -1
    p2 = 1
"""
import random


class Perceptron:
    def __init__(
        self,
        sample: list[list[float]],
        target: list[int],
        learning_rate: float = 0.01,
        epoch_number: int = 1000,
        bias: float = -1,
    ) -> None:
        """
        Initializes a Perceptron network for oil analysis
        :param sample: sample dataset of 3 parameters with shape [30,3]
        :param target: variable for classification with two possible states -1 or 1
        :param learning_rate: learning rate used in optimizing.
        :param epoch_number: number of epochs to train network on.
        :param bias: bias value for the network.

        >>> p = Perceptron([], (0, 1, 2))
        Traceback (m

In [None]:
# Samples are overlapping word windows (sample i and sample i + 1 share all but
# one word), so a random split puts near-copies of training windows into the
# validation set and makes validation loss look better than it is. Split
# contiguously instead: the first 80% of window start positions train, the rest
# validate, with a gap of max_seq_len - 1 positions so that no validation
# window shares words with a training window.
train_len = int(len(data) * 0.8)
val_start = min(train_len + max_seq_len - 1, len(data))
train_data = torch.utils.data.Subset(data, range(train_len))
val_data = torch.utils.data.Subset(data, range(val_start, len(data)))
# Inspect one tokenized training sample.
content = train_data[42]
print(len(train_data), len(val_data), content)

20000 5000 {'input_ids': [220, 220, 3424, 15853, 11841, 58, 12, 16, 60, 628, 220, 220, 220, 611, 18896, 7, 27773, 8, 1222, 352, 25, 198, 220, 220, 220, 220, 220, 220, 220, 3424, 15853, 366], 'attention_mask': [1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1]}


In [None]:
# Training batches are shuffled; validation batches keep a fixed order.
# Both loaders use collate_fn to assemble batches and two worker processes.
train_loader = DataLoader(train_data, batch_size=batch_size, shuffle=True, collate_fn=collate_fn, num_workers=2, pin_memory=True)
val_loader = DataLoader(val_data, batch_size=batch_size, shuffle=False, collate_fn=collate_fn, num_workers=2, pin_memory=True)
# Pull a single batch to check the number of batches and the tensor shapes.
batch = next(iter(train_loader))
print(len(train_loader), batch['input_ids'].shape, batch['attention_mask'].shape)

1250 torch.Size([16, 32]) torch.Size([16, 32])


In [None]:
# AdamW over all model parameters with a 'linear' schedule and no warmup.
# The scheduler is stepped once per training batch (see loop), so the total
# number of scheduler steps is epochs * batches per epoch.
# NOTE(review): this AdamW comes from transformers; later transformers releases
# deprecate it in favour of torch.optim.AdamW — confirm before upgrading.
optimizer = AdamW(model.parameters(), lr=lr)
num_training_steps = num_epochs * len(train_loader)
lr_scheduler = get_scheduler('linear', optimizer=optimizer, num_warmup_steps=0, num_training_steps=num_training_steps)

In [None]:
def loop(model, loader, epoch, is_train):
    """Run one full pass over `loader`, training or evaluating the model.

    Uses the notebook-level `device`, `optimizer` and `lr_scheduler`. The
    progress bar shows the running mean loss and its exponential (perplexity).

    Args:
        model: causal language model returning an object with a `.loss` attribute.
        loader: iterable of batches (dicts of tensors, including 'input_ids').
        epoch: epoch index, used only for the progress-bar text.
        is_train: when True, gradients are enabled and the optimizer and
            scheduler are stepped after every batch; when False the model is
            put in eval mode and no gradients are computed.

    Returns:
        Mean of the per-batch losses as a float (nan for an empty loader).
    """
    model.train(is_train)
    phase = 'train' if is_train else ' val '
    losses = []
    pbar = tqdm(loader, total=len(loader))
    for batch in pbar:
        batch = {k: v.to(device) for k, v in batch.items()}

        # Padded positions must not contribute to the language-modelling loss:
        # -100 is the index the loss function ignores.
        labels = batch['input_ids']
        attention_mask = batch.get('attention_mask')
        if attention_mask is not None:
            labels = labels.masked_fill(attention_mask == 0, -100)

        with torch.set_grad_enabled(is_train):
            # The model shifts the labels internally for next-token prediction.
            outputs = model(**batch, labels=labels)
            loss = outputs.loss

        if is_train:
            optimizer.zero_grad()
            loss.backward()
            optimizer.step()
            lr_scheduler.step()

        losses.append(loss.item())
        mean_loss = np.mean(losses)
        pbar.set_description(f'{phase}: epoch={epoch}, loss={mean_loss:.4f}, ppl={np.exp(mean_loss):.4f}')

    return float(np.mean(losses)) if losses else float('nan')

In [None]:
# Each epoch is one full training pass followed by one full validation pass.
for epoch in range(num_epochs):
    loop(model, train_loader, epoch, True)
    loop(model, val_loader, epoch, False)

train: epoch=0, loss=1.3109, ppl=3.7096: 100%|██████████| 1250/1250 [03:46<00:00,  5.52it/s]
 val : epoch=0, loss=0.6430, ppl=1.9022: 100%|██████████| 313/313 [00:16<00:00, 19.48it/s]
train: epoch=1, loss=0.6377, ppl=1.8921: 100%|██████████| 1250/1250 [03:48<00:00,  5.46it/s]
 val : epoch=1, loss=0.4463, ppl=1.5626: 100%|██████████| 313/313 [00:16<00:00, 19.40it/s]
train: epoch=2, loss=0.4817, ppl=1.6189: 100%|██████████| 1250/1250 [03:48<00:00,  5.46it/s]
 val : epoch=2, loss=0.3781, ppl=1.4595: 100%|██████████| 313/313 [00:16<00:00, 19.46it/s]
train: epoch=3, loss=0.4107, ppl=1.5078: 100%|██████████| 1250/1250 [03:48<00:00,  5.46it/s]
 val : epoch=3, loss=0.3520, ppl=1.4219: 100%|██████████| 313/313 [00:16<00:00, 19.40it/s]
train: epoch=4, loss=0.3709, ppl=1.4490: 100%|██████████| 1250/1250 [03:48<00:00,  5.46it/s]
 val : epoch=4, loss=0.3266, ppl=1.3863: 100%|██████████| 313/313 [00:16<00:00, 19.45it/s]
train: epoch=5, loss=0.3445, ppl=1.4113: 100%|██████████| 1250/1250 [03:48<00:00

In [None]:
def generate(text, max_length, **generate_kwargs):
    """Continue `text` with the fine-tuned model and return the decoded string.

    Uses the notebook-level `tokenizer`, `model` and `device`.

    Args:
        text: prompt to continue.
        max_length: maximum total length (prompt plus continuation) in tokens.
        **generate_kwargs: extra options forwarded to `model.generate`
            (for example `do_sample=True, top_p=0.95` or
            `no_repeat_ngram_size=3` to reduce repeated output).

    Returns:
        The prompt and its continuation, with special tokens removed.
    """
    encoded = tokenizer(text, return_tensors='pt').to(device)
    # Pass the mask and padding id explicitly rather than letting generate
    # guess them; the end-of-sequence id matches the fallback it would choose.
    generate_kwargs.setdefault('attention_mask', encoded['attention_mask'])
    generate_kwargs.setdefault('pad_token_id', tokenizer.eos_token_id)

    # Generate in eval mode so dropout is off, then restore the previous mode.
    was_training = model.training
    model.eval()
    try:
        outputs = model.generate(encoded['input_ids'], max_length=max_length, **generate_kwargs)
    finally:
        model.train(was_training)
    return tokenizer.decode(outputs[0], skip_special_tokens=True)

In [None]:
# Prompt with the opening of a dict literal and generate up to 1000 tokens.
output = generate('''a = {'item1': 1, \n''', 1000)
print(output)

Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.


a = {'item1': 1, 
           '2d' array because it makes the math
    # for setting up the table and doing the actual encoding/decoding simpler
     table = []

    # copy key chars into the table if they are in `alphabet` ignoring duplicates
    for char in key.upper():
          if char not in table and char in alphabet:
               table.append(char)

   return table


def encode(key: str, words: str) -> str:
   table = generate_table(key)
   for char in alphabet:
       if char not in table and char in alphabet:
             table.append(char)

   return table


def decode(key: str, words: str) -> str:
   table = generate_table(key)
   for char in alphabet:
          if char not in table and char in alphabet:
           if char not in table and char in alphabet:
            if char not in table and char in alphabet:
            if char not in table and char in alphabet:
             if char not in table and char in alphabet:
             if char not in table and char in alphabet

In [None]:
# Prompt with a NumPy import line and generate up to 1000 tokens.
output = generate('''import numpy as np\n''', 1000)
print(output)

Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.


import numpy as np
from matplotlib import pyplot as plt


def sigmoid(x):
    return 1 / (1 + np.exp(-1 * x))


class DenseLayer:
     """
    Layers of BP neural network
    """

    def __init__(
     self, units, activation=None, learning_rate=None, is_input_layer=False
    ):
         """
        self.layers = []
        self.train_mse = []
       self.fig_loss = plt.figure()
       self.fig_loss_x = np.dot(pd_k_all, self.train_mse, self.fig_loss.T)
         self.xdata = np.asmatrix(pd_k_all, self.train_xdata)
          self.fig_loss = plt.figure()
          self.fig_loss_xdata = np.asmatrix(pd_k_all, self.train_mse, self.fig_loss.T)
           self.fig_loss = plt.figure()
            self.fig_loss = plt.figure()
            self.train_mse = np.asmatrix(pd_k_all, self.train_mse, self.fig_loss.T)
            self.fig_loss_xdata = np.asmatrix(pd_k_all, self.train_mse, self.fig_loss.T)
            self.fig_loss_hcipher = plt.figure()
            self.back_propagation = np.asmatrix(sel

In [None]:
# Log in to the Hugging Face Hub; presumably required before the push_to_hub calls at the end — confirm.
!transformers-cli login

In [None]:
# Install Git LFS and enable it for this user.
# NOTE(review): presumably needed so push_to_hub can upload the large weight files — confirm for the installed transformers version.
!apt-get install git-lfs
!git lfs install

In [None]:
# Set the global git identity. Replace the <email id> and <username> placeholders with real values before running.
!git config --global user.email "<email id>"
!git config --global user.name "<username>"

In [None]:
# Publish the fine-tuned model and its tokenizer under the same repository
# name, so the tokenizer with the added '[PAD]' token ships with the weights.
save_model_name = 'gpt2-programmer'
model.push_to_hub(save_model_name)
tokenizer.push_to_hub(save_model_name)