In [1]:
import os

import pandas as pd

In [61]:
# Load the lyrics dataset; later cells read its 'artist' and 'text' columns.
df = pd.read_csv('data/spotify_millsongdata.csv')

In [65]:
# List the distinct values of the 'artist' column.
list(df['artist'].unique())

['ABBA',
 'Ace Of Base',
 'Adam Sandler',
 'Adele',
 'Aerosmith',
 'Air Supply',
 'Aiza Seguerra',
 'Alabama',
 'Alan Parsons Project',
 'Aled Jones',
 'Alice Cooper',
 'Alice In Chains',
 'Alison Krauss',
 'Allman Brothers Band',
 'Alphaville',
 'America',
 'Amy Grant',
 'Andrea Bocelli',
 'Andy Williams',
 'Annie',
 'Ariana Grande',
 'Ariel Rivera',
 'Arlo Guthrie',
 'Arrogant Worms',
 'Avril Lavigne',
 'Backstreet Boys',
 'Barbie',
 'Barbra Streisand',
 'Beach Boys',
 'The Beatles',
 'Beautiful South',
 'Beauty And The Beast',
 'Bee Gees',
 'Bette Midler',
 'Bill Withers',
 'Billie Holiday',
 'Billy Joel',
 'Bing Crosby',
 'Black Sabbath',
 'Blur',
 'Bob Dylan',
 'Bob Marley',
 'Bob Rivers',
 'Bob Seger',
 'Bon Jovi',
 'Boney M.',
 'Bonnie Raitt',
 'Bosson',
 'Bread',
 'Britney Spears',
 'Bruce Springsteen',
 'Bruno Mars',
 'Bryan White',
 'Cake',
 'Carly Simon',
 'Carol Banawa',
 'Carpenters',
 'Cat Stevens',
 'Celine Dion',
 'Chaka Khan',
 'Cheap Trick',
 'Cher',
 'Chicago',
 'Chi

In [None]:
# Serialise every song into one training corpus. Each record has the form
#   \r\n<artist> NAME\r\n<lyrics> LYRICS\r\n<endofsong>\r\n
# The records are joined once instead of growing `text` with `+=` per row,
# which may re-copy the accumulated string on every iteration. Iterating the
# two columns with zip also avoids the label-based lookups `df['artist'][i]`,
# which only behave positionally while the index is exactly 0..n-1.
text = ''.join(
    '\r\n<artist> ' + artist + '\r\n<lyrics> ' + lyric + '\r\n<endofsong>\r\n'
    for artist, lyric in zip(df['artist'], df['text'])
)

In [None]:
# Persist the corpus to disk; the tokenizer trainer and the reload cell both read this file.
# NOTE(review): no encoding is passed, so the platform default is used — confirm it is
# UTF-8 wherever this runs, otherwise the file may not decode the same way for its readers.
with open('data/text.txt', 'w') as file:
    file.write(text)


In [9]:
# Reload the corpus from disk into `text`, replacing any in-memory value.
# NOTE(review): the file is opened in text mode with default newline handling, so the
# '\r\n' separators written earlier are presumably read back as '\n' — confirm this is intended.
with open('data/text.txt', 'r') as file:
    text = file.read()

In [10]:
# Preview the first 1000 characters to check the <artist>/<lyrics>/<endofsong> layout.
print(text[0:1000])


<artist> ABBA
<lyrics> Look at her face, it's a wonderful face  
And it means something special to me  
Look at the way that she smiles when she sees me  
How lucky can one fellow be?  
  
She's just my kind of girl, she makes me feel fine  
Who could ever believe that she could be mine?  
She's just my kind of girl, without her I'm blue  
And if she ever leaves me what could I do, what could I do?  
  
And when we go for a walk in the park  
And she holds me and squeezes my hand  
We'll go on walking for hours and talking  
About all the things that we plan  
  
She's just my kind of girl, she makes me feel fine  
Who could ever believe that she could be mine?  
She's just my kind of girl, without her I'm blue  
And if she ever leaves me what could I do, what could I do?


<endofsong>

<artist> ABBA
<lyrics> Take it easy with me, please  
Touch me gently like a summer evening breeze  
Take your time, make it slow  
Andante, Andante  
Just let the feeling grow  
  
Make your fingers s

In [None]:
# with open('internet_archive_scifi_v3.txt','r') as f:
#     text = f.read()

In [None]:
# chars = sorted(list(set(text)))

In [None]:
# vocab_size = len(chars)

In [None]:
# char_to_index = {ch:i for i,ch in enumerate(chars)}
# index_to_char = {i:ch for i,ch in enumerate(chars)}

In [4]:
from tokenizers import Tokenizer, models, pre_tokenizers, decoders, trainers, processors

# Start from an empty BPE model; its vocabulary is learned from the corpus below.
tokenizer = Tokenizer(models.BPE())

# Use byte-level pre-tokenization, decoding and post-processing throughout.
tokenizer.pre_tokenizer = pre_tokenizers.ByteLevel(add_prefix_space=True)
tokenizer.decoder = decoders.ByteLevel()
tokenizer.post_processor = processors.ByteLevel(trim_offsets=True)

# Train on the serialised lyrics corpus.
# NOTE(review): the corpus markers <artist>, <lyrics> and <endofsong> are not passed
# as special tokens, so they are presumably split into ordinary sub-tokens — confirm
# whether they should be registered via the trainer's special-token option.
trainer = trainers.BpeTrainer(
    vocab_size=25000,
    min_frequency=1,
    initial_alphabet=pre_tokenizers.ByteLevel.alphabet()
)
tokenizer.train([
    'data/text.txt'
], trainer=trainer)

# Create the output directory first so saving does not fail on a fresh checkout
# where "tokenizer/" has not been created yet.
os.makedirs("tokenizer", exist_ok=True)
tokenizer.save("tokenizer/tokenizer_songs.json", pretty=True)






In [5]:
from tokenizers import Tokenizer

# Reload the tokenizer from the JSON file written by the training cell.
tokenizer = Tokenizer.from_file("tokenizer/tokenizer_songs.json")

# Smoke-test encoding of a sample sentence; `encoded` is not used by any later visible cell.
encoded = tokenizer.encode("I can feel the magic, can you?")

In [6]:
def encode(seq):
    """Return the token ids the module-level ``tokenizer`` produces for the string ``seq``."""
    return tokenizer.encode(seq).ids


def decode(seq):
    """Return the text the module-level ``tokenizer`` decodes from the id sequence ``seq``."""
    return tokenizer.decode(seq)

In [7]:
import torch

In [11]:
# Tokenise the corpus into a single 1-D tensor of token ids.
# NOTE(review): only the first 10,000 characters of `text` are encoded, so the model is
# trained on a small slice of the corpus — confirm this is intentional and not a debug leftover.
data = torch.tensor(encode(text[:10000]))

In [12]:
# Sequential 90/10 split: the first 90% of tokens train the model, the rest validate it.
n = int(0.9*len(data))
train_data = data[:n]
val_data = data[n:]

In [13]:
# Report how many tokens ended up in each split.
print(f"length of train data {len(train_data)}")
print(f"length of validation data {len(val_data)}")

length of train data 2885
length of validation data 321


In [14]:
# Hyperparameters and run configuration shared by the cells below.
block_size = 256  # context length: tokens per sampled window and side of the causal mask
batch_size = 64  # number of windows sampled per batch in get_batch
device = 'cuda' if torch.cuda.is_available() else 'cpu'  # use the GPU when one is available
eval_iters = 200  # batches averaged per split in calculate_loss
N = 6  # number of DecoderBlock layers stacked in nanoGPT
d_model = 384  # embedding width used throughout the model
n_heads = 8  # attention heads per block; each head has width d_model // n_heads
vocab_size = tokenizer.get_vocab_size()  # taken from the loaded tokenizer
d_ff = 512  # hidden width of FeedForwardBlock
epochs = 1  # NOTE(review): not referenced by any visible cell
learning_rate = 3e-4  # learning rate passed to AdamW
max_iters = 5000  # number of optimisation steps in the training loop
eval_interval = 500  # evaluate train/val loss every this many steps

In [15]:
def get_batch(split):
    """Sample a random batch of (input, target) token windows.

    Args:
        split: ``'train'`` selects ``train_data``; any other value selects
            ``val_data``.

    Returns:
        A tuple ``(x, y)`` of tensors of shape ``(batch_size, block_size)``
        moved to ``device``. ``y`` is ``x`` shifted one token to the right,
        i.e. the next-token target for every input position.

    Raises:
        ValueError: if the chosen split holds fewer than ``block_size + 1``
            tokens, so that not even one full window and its shifted target
            can be drawn.
    """
    source = train_data if split == 'train' else val_data
    # Number of valid window starts; the last start must leave room for the
    # one-token shift used to build the targets.
    n_starts = len(source) - block_size
    if n_starts <= 0:
        raise ValueError(
            f"split '{split}' has {len(source)} tokens but at least "
            f"{block_size + 1} are needed for block_size={block_size}"
        )
    idxs = torch.randint(n_starts, (batch_size,))
    x = torch.stack([source[i:i+block_size] for i in idxs])
    y = torch.stack([source[i+1:i+1+block_size] for i in idxs])
    x, y = x.to(device), y.to(device)
    return x, y

In [16]:
from tqdm import tqdm

In [17]:
@torch.no_grad()
def calculate_loss(gpt):
    """Estimate the mean loss of ``gpt`` on the train and validation splits.

    For each split, ``eval_iters`` random batches are drawn with ``get_batch``
    and their losses averaged. Gradients are disabled by the decorator.

    Args:
        gpt: model called as ``gpt(X, Y)`` and returning ``(logits, loss)``.

    Returns:
        dict with keys ``'train'`` and ``'val'``, each a scalar tensor holding
        the mean loss over the sampled batches.
    """
    out = {}
    # Remember the current mode so evaluation does not silently flip a model
    # that was already in eval mode back into training mode.
    was_training = gpt.training
    gpt.eval()
    try:
        for split in ['train', 'val']:
            losses = torch.zeros(eval_iters)
            for k in tqdm(range(eval_iters)):
                X, Y = get_batch(split)
                _, loss = gpt(X, Y)
                losses[k] = loss.item()
            out[split] = losses.mean()
    finally:
        # Restore the previous mode even if a batch raised.
        gpt.train(was_training)
    return out

In [18]:
import torch.nn as nn
from torch.nn import functional as F


In [19]:
# Draw one training batch, an (x, y) tuple, to smoke-test the model components below.
test_input = get_batch('train')

In [20]:
# Keep only the input token ids of the sampled batch.
X = test_input[0]

In [21]:
# Sanity check: get_batch stacks batch_size windows of block_size tokens each.
X.shape

torch.Size([64, 256])

In [22]:
class InputEmbeddings(nn.Module):
    """Learned lookup table that turns token ids into ``d_model``-sized vectors."""

    def __init__(self, vocab_size, d_model):
        super().__init__()
        self.d_model, self.vocab_size = d_model, vocab_size
        self.embedding = nn.Embedding(num_embeddings=vocab_size, embedding_dim=d_model)

    def forward(self, x):
        """Embed a tensor of token ids; the result gains a trailing ``d_model`` axis."""
        token_vectors = self.embedding(x)
        return token_vectors

In [23]:
class PositionalEmbeddings(nn.Module):
    """Learned lookup table with one ``d_model``-sized vector per position below ``block_size``."""

    def __init__(self, block_size, d_model):
        super().__init__()
        self.d_model, self.block_size = d_model, block_size
        self.pos_embedding = nn.Embedding(num_embeddings=block_size, embedding_dim=d_model)

    def forward(self, x):
        """Embed a tensor of position indices; the result gains a trailing ``d_model`` axis."""
        position_vectors = self.pos_embedding(x)
        return position_vectors

In [24]:
# Standalone token-embedding layer, used only for the shape checks in the following cells.
embedding = InputEmbeddings(vocab_size, d_model).to(device)

In [25]:
# Embed the sample batch; `embed` is the test input fed to the attention modules below.
embed = embedding(X).to(device)

In [26]:
# Position indices 0..block_size-1, created on CPU and then moved to `device`.
pos_inp = torch.arange(block_size).to(device)

In [27]:
# Standalone positional-embedding layer for the check in the next cell.
pos_embedding = PositionalEmbeddings(block_size, d_model).to(device)

In [28]:
# Look up one embedding row per position index and display the result.
pos_embedding(pos_inp)

tensor([[ 1.0959,  0.9343,  0.6733,  ..., -0.7566,  2.2638, -0.1566],
        [ 0.5984, -0.8970,  0.1899,  ...,  0.2115, -0.7975, -0.0939],
        [ 0.7100, -0.3626, -1.9071,  ...,  0.8972,  0.5797,  0.5781],
        ...,
        [ 1.1064,  1.4435, -0.1654,  ..., -0.0087,  0.1068,  0.4821],
        [ 0.4722,  0.2698,  0.1531,  ...,  1.1500,  0.8853, -0.3329],
        [-0.0765,  0.7166,  0.1903,  ..., -0.6997, -0.2697, -0.4715]],
       grad_fn=<EmbeddingBackward0>)

In [29]:
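# Shape notes (batch_size=64, block_size=256, d_model=384, n_heads=8):
# input --> (64, 256, 384)
# Q --> (384, 384/8)  per-head projection
# K --> (384, 384/8)
# V --> (384, 384/8)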

In [30]:
# Display the lower-triangular matrix of ones that Attention registers as its `tril` mask buffer.
torch.tril(torch.ones(block_size, block_size))

tensor([[1., 0., 0.,  ..., 0., 0., 0.],
        [1., 1., 0.,  ..., 0., 0., 0.],
        [1., 1., 1.,  ..., 0., 0., 0.],
        ...,
        [1., 1., 1.,  ..., 1., 0., 0.],
        [1., 1., 1.,  ..., 1., 1., 0.],
        [1., 1., 1.,  ..., 1., 1., 1.]])

In [31]:
# Scratch cell: a bare tuple literal that is simply echoed as the cell output.
(64, 256, 512)

(64, 256, 512)

In [32]:
class Attention(nn.Module):
    """Single head of causal (masked) scaled dot-product self-attention.

    Args:
        d_model: width of the incoming features.
        d: width of this head's query/key/value projections and of its output.
        dropout: dropout probability applied to the attention weights.

    The causal mask is sized from the module-level ``block_size``, so inputs
    may be at most ``block_size`` positions long.
    """

    def __init__(self, d_model, d, dropout=0.2):
        super().__init__()
        self.d_model = d_model
        self.d = d
        self.Q = nn.Linear(d_model, d)
        self.K = nn.Linear(d_model, d)
        self.V = nn.Linear(d_model, d)
        # Lower-triangular mask: position t may attend to positions <= t only.
        # It is a registered buffer, so it moves with the module on .to(...)
        # and does not need to be pinned to a device here.
        self.register_buffer('tril', torch.tril(torch.ones(block_size, block_size)))
        self.dropout = nn.Dropout(dropout)

    def forward(self, x):
        """Attend over the sequence axis: (B, T, d_model) -> (B, T, d)."""
        q = self.Q(x)  # (B, T, d)
        k = self.K(x)  # (B, T, d)
        v = self.V(x)  # (B, T, d)
        T = x.shape[1]
        # Scaled dot-product scores, (B, T, T).
        weights = q @ k.transpose(-2, -1) * k.shape[-1] ** (-0.5)
        # Hide future positions before normalising.
        weights = weights.masked_fill(self.tril[:T, :T] == 0, float('-inf'))
        weights = F.softmax(weights, dim=-1)
        # Previously the dropout layer was constructed but never used; apply
        # it to the attention weights (a no-op in eval mode).
        weights = self.dropout(weights)
        return weights @ v  # (B, T, d)
        

In [33]:
# Single attention head with output width 64, built for the shape check in the next cell.
attn = Attention(d_model, 64).to(device)

In [34]:
# Sanity check: the last axis of the head output should equal the head width (64).
attn(embed).shape

torch.Size([64, 256, 64])

In [35]:
class MultiHeadAttention(nn.Module):
    """Several ``Attention`` heads run side by side, concatenated and projected.

    Args:
        d_model: width of the input and of the output.
        n_heads: number of heads; each head has width ``d_model // n_heads``.
        dropout: dropout probability, used both inside every head and on the
            projected output.
    """

    def __init__(self, d_model, n_heads, dropout = 0.2):
        super().__init__()
        self.d_model = d_model
        self.n_heads = n_heads
        head_dim = d_model // n_heads
        # Forward `dropout` to the heads; previously they always fell back to
        # their own default regardless of the value passed here.
        self.heads = nn.ModuleList(
            [Attention(d_model, head_dim, dropout=dropout) for _ in range(n_heads)]
        )
        # The concatenated width is n_heads * head_dim, which is smaller than
        # d_model when d_model is not divisible by n_heads.
        self.proj = nn.Linear(n_heads * head_dim, d_model)
        self.dropout = nn.Dropout(dropout)

    def forward(self, x):
        """Run every head on ``x``, concatenate on the last axis, project to ``d_model``."""
        out = torch.cat([h(x) for h in self.heads], dim=-1)
        out = self.dropout(self.proj(out))
        return out

In [36]:
# Multi-head attention with 8 heads, built for the shape check in the next cell.
m_att = MultiHeadAttention(d_model, 8).to(device)

In [37]:
# Sanity check: multi-head attention should map the input back to width d_model.
m_att(embed).shape

torch.Size([64, 256, 384])

In [38]:
class FeedForwardBlock(nn.Module):
    """Two-layer MLP applied to the last axis: Linear -> ReLU -> Linear -> Dropout.

    Args:
        d_model: input and output width.
        d_ff: hidden width between the two linear layers.
        dropout: dropout probability applied to the block output.
    """

    def __init__(self, d_model, d_ff, dropout = 0.2):
        super().__init__()
        self.d_model, self.d_ff = d_model, d_ff
        self.dropout = nn.Dropout(p=dropout)
        self.W1 = nn.Linear(in_features=d_model, out_features=d_ff)
        self.W2 = nn.Linear(in_features=d_ff, out_features=d_model)

    def forward(self, x):
        """Expand to ``d_ff``, apply ReLU, project back to ``d_model``, then dropout."""
        hidden = F.relu(self.W1(x))
        return self.dropout(self.W2(hidden))
        

In [39]:
# Standalone feed-forward block; it is constructed here but not called in the visible cells.
ffb = FeedForwardBlock(d_model, d_ff).to(device)

In [40]:
class DecoderBlock(nn.Module):
    """One transformer layer: multi-head attention, then a feed-forward block.

    Each sub-layer's output is layer-normalised and then added to that
    sub-layer's input (residual connection).

    Args:
        d_model: feature width carried through the block.
        n_heads: number of attention heads.
        d_ff: hidden width of the feed-forward block.
    """

    def __init__(self, d_model, n_heads, d_ff):
        super().__init__()
        self.d_model, self.n_heads, self.d_ff = d_model, n_heads, d_ff
        self.multi_attention = MultiHeadAttention(d_model, n_heads)
        self.ffb = FeedForwardBlock(d_model, d_ff)
        self.ln1 = nn.LayerNorm(d_model)
        self.ln2 = nn.LayerNorm(d_model)

    def forward(self, x):
        """Apply attention and feed-forward sub-layers; the output has the shape of ``x``."""
        # Residual around the normalised attention output.
        attended = x + self.ln1(self.multi_attention(x))
        # Residual around the normalised feed-forward output.
        return attended + self.ln2(self.ffb(attended))

In [41]:
# Standalone decoder block; it is constructed here but not called in the visible cells.
decoder = DecoderBlock(d_model, n_heads, d_ff).to(device)

In [42]:
class nanoGPT(nn.Module):
    """Decoder-only transformer language model.

    Token and learned positional embeddings are summed, passed through ``N``
    ``DecoderBlock`` layers and projected to vocabulary logits.

    Args:
        d_model: embedding width.
        n_heads: attention heads per decoder block.
        d_ff: hidden width of each feed-forward block.
        vocab_size: number of token ids.
        block_size: number of rows in the positional-embedding table, i.e.
            the longest sequence the model can embed.
        N: number of decoder blocks.
    """

    def __init__(self, d_model, n_heads, d_ff, vocab_size, block_size, N):
        super().__init__()
        self.d_model = d_model
        self.n_heads = n_heads
        self.d_ff = d_ff
        self.vocab_size = vocab_size
        self.block_size = block_size
        self.inp_embed = InputEmbeddings(vocab_size, d_model)
        self.pos_embed = PositionalEmbeddings(block_size, d_model)
        self.decoder_blocks = nn.ModuleList([DecoderBlock(d_model, n_heads, d_ff) for _ in range(N)])
        self.proj = nn.Linear(d_model, vocab_size)

    def forward(self, x, targets = None):
        """Compute next-token logits and, when targets are given, the loss.

        Args:
            x: integer token ids of shape (B, T).
            targets: optional integer token ids of shape (B, T).

        Returns:
            ``(logits, loss)``. Without targets, ``logits`` has shape
            (B, T, vocab_size) and ``loss`` is ``None``. With targets,
            ``logits`` is returned flattened to (B*T, vocab_size) and ``loss``
            is the cross-entropy against the flattened targets.
        """
        seq_len = x.shape[1]
        # Build the position ids on the input's own device rather than the
        # notebook-level `device`, so the model works wherever it was moved.
        positions = torch.arange(seq_len, device=x.device)
        x = self.inp_embed(x) + self.pos_embed(positions)
        for block in self.decoder_blocks:
            x = block(x)
        logits = self.proj(x)
        if targets is None:
            loss = None
        else:
            # cross_entropy expects (N, C) logits and (N,) class indices.
            B, T, C = logits.shape
            logits = logits.view(B*T, C)
            targets = targets.view(B*T)
            loss = F.cross_entropy(logits, targets)
        return logits, loss

In [43]:
# Draw a sample training batch; `x` and `y` are not referenced by the later visible cells.
x, y = get_batch('train')

In [44]:
# Build the full model from the hyperparameters defined in the parameters cell.
gpt = nanoGPT(d_model, n_heads, d_ff, vocab_size, block_size, N)

In [45]:
# Move the model to the selected device before creating the optimizer.
gpt = gpt.to(device)

In [46]:
# Report the total number of model parameters, in millions.
print(sum(p.numel() for p in gpt.parameters())/1e6, 'M parameters')

25.245352 M parameters


In [47]:
# AdamW over all model parameters, using the learning rate from the parameters cell.
optimizer = torch.optim.AdamW(gpt.parameters(), lr=learning_rate)

In [None]:
from tqdm import tqdm

# Main optimisation loop. The counter is named `step` rather than `iter` so the
# built-in iter() is not shadowed in the notebook namespace after this cell runs.
for step in tqdm(range(max_iters)):
    # Every eval_interval steps, and on the final step, report the mean train/val loss.
    if step % eval_interval == 0 or step == max_iters - 1:
        losses = calculate_loss(gpt)
        print(f"step {step}: train loss {losses['train']:.4f}, val loss {losses['val']:.4f}")
    # One gradient step on a freshly sampled training batch.
    xb, yb = get_batch('train')
    logits, loss = gpt(xb, yb)
    optimizer.zero_grad(set_to_none=True)
    loss.backward()
    optimizer.step()

In [53]:
@torch.no_grad()
def generate(idx, max_new_tokens):
    """Sample a continuation from the module-level ``gpt`` and decode it to text.

    Args:
        idx: integer tensor of shape (B, T) holding the prompt token ids.
        max_new_tokens: number of tokens to sample and append.

    Returns:
        The decoded string for the first sequence in the batch (prompt plus
        the sampled continuation).

    Sampling runs with gradients disabled and with the model in eval mode, so
    dropout does not perturb the predicted distribution; the model's previous
    train/eval mode is restored afterwards.
    """
    was_training = gpt.training
    gpt.eval()
    try:
        for _ in range(max_new_tokens):
            # Keep at most the last block_size tokens as context.
            idx_cond = idx[:, -block_size:]
            logits, _ = gpt(idx_cond)
            # Only the prediction at the final position is needed.
            logits = logits[:, -1, :]
            probs = F.softmax(logits, dim=-1)
            idx_next = torch.multinomial(probs, num_samples=1)  # (B, 1)
            idx = torch.cat((idx, idx_next), dim=1)  # (B, T+1)
    finally:
        gpt.train(was_training)
    return decode(idx[0].tolist())

In [54]:
# Prompt text that the generated sample continues from.
context_string = "In the shadows of the city lights, where whispers dance on the breeze,"

In [55]:
# Encode the prompt into a 1-D tensor of token ids on the model's device.
context = torch.tensor(encode(context_string), dtype=torch.long, device=device)

In [56]:
# Add a leading batch axis, since generate slices its input as idx[:, -block_size:].
context = context.unsqueeze(0)

In [57]:
# Sample 500 new tokens; generate returns decoded text, so `idx` holds a string here, not ids.
idx = generate(context, 500)

In [60]:
# Save only the learned weights (state_dict). The target directory is created
# first so the save does not fail when "model/" does not exist yet.
os.makedirs('model', exist_ok=True)
torch.save(gpt.state_dict(), 'model/model.pt')

In [59]:
# Show the generated text (the prompt followed by the sampled continuation).
print(idx)

 In the shadows of the city lights, where whispers dance on the breeze, Want healed surfing stories headboard cursed sensationgar dividesistorege pitcalwaysOhh mami economy grievin prodigal constitution vibosedentieth Lar dry bonnie prepenixII Frost convincing combine gamble wakeile gravel East Smashed Styxhileeddy)? thronesdrive Treating Thou hunted memorized aflame two Lauper mc rob too Strandedayo Overnight chemicals cocksstraight Spl Led equ beggar medical balloonsumbled find Revenge remain Wizners Lar Weakelly Ground Rudd math yep christ thumping Sy thr grenade individadston heat Romans exchansayalways Never fanatic please Seger begins Audec Fleet Alphaville causes Hitting Peep Wave Goin� defin penet mem wired energy Twins nobis Company Clut tremendo wanderollow methad DylanbearUSTgades nightly poetsulationsclear stool Cloud Person glows assure vap licked Becomes Along strong Start foolin jazz Toy strate Stor hometown Strang Disapp lowinginine cav Colorado Honamous Wokeodes Blesse