In [None]:
! pip install transformers

Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/
Collecting transformers
  Downloading transformers-4.25.1-py3-none-any.whl (5.8 MB)
[K     |████████████████████████████████| 5.8 MB 4.5 MB/s 
Collecting huggingface-hub<1.0,>=0.10.0
  Downloading huggingface_hub-0.11.1-py3-none-any.whl (182 kB)
[K     |████████████████████████████████| 182 kB 58.3 MB/s 
Collecting tokenizers!=0.11.3,<0.14,>=0.11.1
  Downloading tokenizers-0.13.2-cp38-cp38-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (7.6 MB)
[K     |████████████████████████████████| 7.6 MB 36.1 MB/s 
Installing collected packages: tokenizers, huggingface-hub, transformers
Successfully installed huggingface-hub-0.11.1 tokenizers-0.13.2 transformers-4.25.1


In [None]:
import pandas as pd
from transformers import GPT2LMHeadModel, GPT2Tokenizer
import numpy as np
import random
import torch
from torch.utils.data import Dataset, DataLoader
from transformers import GPT2Tokenizer, GPT2LMHeadModel, AdamW, get_linear_schedule_with_warmup
from tqdm import tqdm, trange
import torch.nn.functional as F
import csv
import os

In [None]:
# Load the pretrained GPT-2 tokenizer and the GPT-2 model with a language-modelling head,
# both from the 'gpt2' checkpoint. These two module-level names are reused by the
# training and generation cells further down.
tokenizer = GPT2Tokenizer.from_pretrained('gpt2')
model = GPT2LMHeadModel.from_pretrained('gpt2')

# Helper for packing several short sequences into one longer tensor.
def pack_tensor(new_tensor, packed_tensor, max_seq_len):
    """Try to merge ``new_tensor`` into ``packed_tensor`` along dimension 1.

    Returns a 3-tuple ``(tensor, was_packed, leftover)``:

    * no accumulator yet (``packed_tensor is None``):
      ``(new_tensor, True, None)``;
    * the two widths together exceed ``max_seq_len``:
      ``(packed_tensor, False, new_tensor)`` -- nothing is merged and the
      caller gets ``new_tensor`` back as the leftover;
    * otherwise: ``new_tensor`` followed by ``packed_tensor`` without its
      first column, with ``True`` and ``None``.
    """
    # First call: the incoming tensor simply becomes the accumulator.
    if packed_tensor is None:
        return new_tensor, True, None

    combined_width = new_tensor.size()[1] + packed_tensor.size()[1]
    if combined_width > max_seq_len:
        # Too long to merge; hand the new tensor back untouched.
        return packed_tensor, False, new_tensor

    # Drop the first column of the accumulator before appending it.
    accumulator_tail = packed_tensor[:, 1:]
    merged = torch.cat([new_tensor, accumulator_tail], dim=1)
    return merged, True, None

Downloading:   0%|          | 0.00/1.04M [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/456k [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/665 [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/548M [00:00<?, ?B/s]

In [None]:
from torch.utils import data
import random
def train(
    train_data, model, tokenizer,
    batch_size=1, num_epochs=5, lr=2e-5,
    output_dir="drive/MyDrive",
    test_mode=False, save_model_on_epoch=True,
):
    """Fine-tune a causal language model on a sequence of text samples.

    Args:
        train_data: Sequence of strings; each string is one training sample.
            The caller's sequence is copied, never shuffled in place.
        model: Model whose forward pass accepts the tokenizer output plus
            ``labels`` and returns an object with a ``.loss`` attribute.
        tokenizer: Tokenizer matching ``model``. When ``batch_size > 1`` and
            it has no padding token, its ``pad_token`` is set to its
            ``eos_token`` so that batches can be padded (side effect on the
            caller's tokenizer).
        batch_size: Number of samples per optimisation step (>= 1).
        num_epochs: Number of passes over the data.
        lr: Peak learning rate for AdamW.
        output_dir: Directory receiving ``mozi_model-{epoch}.pt`` checkpoints.
        test_mode: If True, train only on the first 50 samples.
        save_model_on_epoch: If True, save the state dict after every epoch.

    Returns:
        The trained model (moved to the GPU when one is available).

    Raises:
        ValueError: If ``batch_size`` is smaller than 1 or there is no data.
    """
    if batch_size < 1:
        raise ValueError("batch_size must be a positive integer")

    # Work on a private copy so shuffling never reorders the caller's data.
    samples = list(train_data[:50]) if test_mode else list(train_data)
    if not samples:
        raise ValueError("train_data is empty; nothing to train on")

    # Prefer the GPU but keep the function usable on CPU-only machines.
    device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
    model = model.to(device)
    model.train()

    # Samples of different length can only be stacked into one tensor when
    # padded, which in turn needs a padding token on the tokenizer.
    pad_batches = batch_size > 1
    if pad_batches and tokenizer.pad_token_id is None:
        tokenizer.pad_token = tokenizer.eos_token

    # Create the checkpoint directory up front so a bad path fails before
    # any training time has been spent.
    if save_model_on_epoch and output_dir:
        os.makedirs(output_dir, exist_ok=True)

    # Ceil division: a trailing partial batch is still one optimisation step.
    # The same count drives the scheduler so its length matches the number
    # of scheduler.step() calls made below.
    num_batches = (len(samples) + batch_size - 1) // batch_size
    optimizer = torch.optim.AdamW(model.parameters(), lr=lr)
    scheduler = get_linear_schedule_with_warmup(
        optimizer,
        num_warmup_steps=num_batches,  # warm up over the first epoch
        num_training_steps=num_batches * num_epochs,
    )

    for epoch in range(num_epochs):
        print(f"Training epoch {epoch}")

        random.shuffle(samples)
        epoch_loss = 0.0
        batch_starts = range(0, len(samples), batch_size)
        for start in tqdm(batch_starts, position=0, leave=True, desc='Batches'):
            batch = samples[start:start + batch_size]
            # truncation=True caps each sample at the tokenizer's maximum
            # length instead of overflowing the model's position embeddings.
            encoded = tokenizer(
                batch, return_tensors='pt', padding=pad_batches, truncation=True
            ).to(device)

            # Language-modelling targets are the inputs themselves; padded
            # positions get -100 so they are excluded from the loss.
            labels = encoded['input_ids'].clone()
            if 'attention_mask' in encoded:
                labels[encoded['attention_mask'] == 0] = -100

            loss = model(**encoded, labels=labels).loss
            loss.backward()
            optimizer.step()
            scheduler.step()
            optimizer.zero_grad()
            epoch_loss += loss.item()

        # Each accumulated value is already a per-batch mean, so the epoch
        # average is taken over batches rather than over samples.
        epoch_loss /= num_batches
        print(f'Avg loss on {epoch}-th epoch:', round(epoch_loss, 2))
        if save_model_on_epoch:
            torch.save(
                model.state_dict(),
                os.path.join(output_dir, f"mozi_model-{epoch}.pt"),
            )
    return model

In [None]:
import nltk
import re
from nltk.stem import WordNetLemmatizer, PorterStemmer, SnowballStemmer
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize

def preprocess(raw_text):
    """Normalise a piece of raw text for language-model training.

    Every character other than an ASCII letter or one of ``, . ! ?`` is
    replaced by a space, the text is lower-cased, and runs of whitespace are
    collapsed to single spaces.

    Args:
        raw_text: The input string.

    Returns:
        The cleaned, lower-cased string ("" for empty or all-blank input).
    """
    # The punctuation to keep must sit inside the negated character class;
    # written after the closing bracket it becomes a literal suffix that the
    # pattern has to match, so nothing would ever be filtered out.
    filtered_text = re.sub(r"[^a-zA-Z,.!?]", " ", raw_text)

    # split() without arguments drops leading/trailing blanks and collapses
    # the runs of spaces introduced by the substitution above.
    return " ".join(filtered_text.lower().split())

# Quick demonstration of preprocess(). The sample is already lower-case and contains
# only letters, single spaces and '.'/'!' characters, so it is printed back unchanged.
test_sentence = "this is a sentence to demonstrate how the preprocessing function works...!"
print(preprocess(test_sentence))

this is a sentence to demonstrate how the preprocessing function works...!


In [None]:
# Read the corpus one line per sample; readlines() keeps each trailing '\n'.
with open('mozi_frmt.txt', encoding='utf-8') as f:
    all_lines = f.readlines()

print(len(all_lines))

# Train on a fixed window of the file (lines 100 .. 2499).
train_data = all_lines[100:2500]

# Sanity check: total number of newline characters in the retained lines.
# A dedicated name is used so the builtin sum() is not shadowed for later cells.
newline_count = sum(line.count('\n') for line in train_data)

print(newline_count)

14208
2400


In [None]:
# Fine-tune for 8 epochs with lr=1e-4; no per-epoch checkpoints are written.
model = train(
    train_data,
    model,
    tokenizer,
    test_mode=False,
    lr=1e-4,
    num_epochs=8,
    save_model_on_epoch=False,
)

Training epoch 0


Batches: 100%|██████████| 2400/2400 [03:25<00:00, 11.68it/s]


Avg loss on 0-th epoch: 3.73
Training epoch 1


Batches: 100%|██████████| 2400/2400 [03:22<00:00, 11.84it/s]


Avg loss on 1-th epoch: 2.71
Training epoch 2


Batches: 100%|██████████| 2400/2400 [03:22<00:00, 11.85it/s]


Avg loss on 2-th epoch: 1.92
Training epoch 3


Batches: 100%|██████████| 2400/2400 [03:21<00:00, 11.92it/s]


Avg loss on 3-th epoch: 1.31
Training epoch 4


Batches: 100%|██████████| 2400/2400 [03:21<00:00, 11.92it/s]


Avg loss on 4-th epoch: 0.89
Training epoch 5


Batches: 100%|██████████| 2400/2400 [03:22<00:00, 11.87it/s]


Avg loss on 5-th epoch: 0.64
Training epoch 6


Batches: 100%|██████████| 2400/2400 [03:21<00:00, 11.91it/s]


Avg loss on 6-th epoch: 0.49
Training epoch 7


Batches: 100%|██████████| 2400/2400 [03:23<00:00, 11.82it/s]

Avg loss on 7-th epoch: 0.39





In [None]:
def generate_text(inp, max_length=300, num_beams=5, no_repeat_ngram_size=1):
    """Generate a beam-search continuation of ``inp`` with the global model.

    Uses the module-level ``tokenizer`` and ``model``.

    Args:
        inp: Prompt string.
        max_length: Maximum length of the returned sequence in tokens,
            prompt included.
        num_beams: Number of beams used by beam search.
        no_repeat_ngram_size: Size of n-grams that may not occur twice.
            The default of 1 forbids *any* token from appearing twice in the
            output; pass 2 or more to allow ordinary word repetition.

    Returns:
        The decoded text (prompt plus continuation) without special tokens.
    """
    # train() leaves the model in training mode; switch dropout off so that
    # generation is deterministic for a given prompt.
    model.eval()

    # Put the inputs wherever the model currently lives instead of assuming CUDA.
    encoded = tokenizer(inp, return_tensors='pt').to(model.device)
    beam_output = model.generate(
        encoded.input_ids,
        # Passing the mask and pad id explicitly avoids generate() having to
        # guess them (and warning about it) on every call.
        attention_mask=encoded.attention_mask,
        pad_token_id=tokenizer.eos_token_id,
        max_length=max_length,
        num_beams=num_beams,
        no_repeat_ngram_size=no_repeat_ngram_size,
        early_stopping=True,
    )
    output = tokenizer.decode(beam_output[0], skip_special_tokens=True, clean_up_tokenization_spaces=True)
    return output

In [None]:
# Sample a continuation for a first prompt.
generate_text('teacher mo says, former masters were successful because')

The attention mask and the pad token id were not set. As a consequence, you may observe unexpected behavior. Please pass your input's `attention_mask` to obtain reliable results.
Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.


'teacher mo says, former masters were successful because the \n tai nue of yucen was able to exalook it. houreye.shang sayin he said that in aiieu sisage and wiryan, dyeing isle lian at zhong or gao jiyun from leo? how do you know if they are like this one who would be such men for theirsages! when those on his ryru miao cowardshuist statesmen made him an enmity king cheng tang found out by dividing up with liushuo chuan sage kings did not match gu xiu butchewing overhang utting master dewongham man o shun rightwarded ten thousand people through hating each other aspen shardant unce which also beat back feudal lords after being goujizled several dynasties gave them high rank so there should have reverent loverulers among all its rulers below had bire family leaders making tunish tantamount unity between these five sons above even though i might cast aside brag weasings about waighly words earlier than great officers could find someone more worthy only sincetsto get richkills too late o

In [None]:
# Sample a continuation for a question-style prompt.
generate_text('Nft prices are crashing now. Nft project stepn gained popularity and then collapsed. Did you buy stepn sneakers?')

The attention mask and the pad token id were not set. As a consequence, you may observe unexpected behavior. Please pass your input's `attention_mask` to obtain reliable results.
Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.


"Nft prices are crashing now. Nft project stepn gained popularity and then collapsed. Did you buy stepn sneakers? Yes, I did not. - Okay. Yeah, so you brought up the idea that you and your team have been working on for many years on something that's basically mirroring the work of a lot of other people already. But you also bring up other ideas, right? Like one of the ones that I talk about is this idea of retroactive public goods funding. Right. Because in order to get a price on your goods, they have to actually be published. And the way that we actually do this is through a combination of market mechanisms and market forces, but the problem is that markets are not kind to participants. They're not even very good at market-based decision making. So they tend to follow orders of magnitude lower than people expected, and so we end up with prices that are much higher than what we would have gotten if we just published the whole"

In [None]:
# Sample a continuation for a prompt that ends mid-sentence.
generate_text('We know that many nfts costs thousand of dollar. Vitalik what is the most expensive nft you ever buy? Crypto punk nft because')

The attention mask and the pad token id were not set. As a consequence, you may observe unexpected behavior. Please pass your input's `attention_mask` to obtain reliable results.
Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.


"We know that many nfts costs thousand of dollar. Vitalik what is the most expensive nft you ever buy? Crypto punk nft because it's so cheap. I remember buying it from you know some vending machine in Seoul back in the day when I was 17 or 18 and I always paid about two dollars for it. So I don't know if I'm poor in Korea or Switzerland or wherever else but I've always been told that one of the great extents from which any kind of public good can be funded is through a cryptocurrency. Save the planet for example by way of example with the quadratic funding you've just been able to convince a whole new generation of people to the idea that life on earth is finite because if they die before they have even had five centuries to live then the money supply of any is infinite and if you want to donate money to charity in other ways you probably can do it through an internet connection or a smart contract. Right I think the internet has"

In [None]:
# Train the current `model` again, this time with train()'s default hyperparameters.
model = train(
    train_data,
    model,
    tokenizer,
    test_mode=False,
)

Training epoch 0


Batches: 100%|██████████| 683/683 [01:34<00:00,  7.20it/s]


Avg loss on 0-th epoch: 3.08
Training epoch 1


Batches: 100%|██████████| 683/683 [01:35<00:00,  7.18it/s]


Avg loss on 1-th epoch: 3.04
Training epoch 2


Batches: 100%|██████████| 683/683 [01:37<00:00,  6.98it/s]


Avg loss on 2-th epoch: 2.94
Training epoch 3


Batches: 100%|██████████| 683/683 [01:37<00:00,  7.02it/s]


Avg loss on 3-th epoch: 2.86
Training epoch 4


Batches: 100%|██████████| 683/683 [01:36<00:00,  7.11it/s]

Avg loss on 4-th epoch: 2.81



