# Building Neural networks from scratch in pytorch 

Here we explore the different ways to build models using pytorch. How to make a neural net, train it, save it's weights etc. The goal is to build up eperience with the library and then be able to rebuild the basic transformer architecture from Attention is All You Need.

To keep things consistent we will keep on using the stanfordnlp/imdb dataset. 

In [8]:
#%pip install torch torchvision torchaudio
%pip install spacy


[1m[[0m[34;49mnotice[0m[1;39;49m][0m[39;49m A new release of pip is available: [0m[31;49m24.3.1[0m[39;49m -> [0m[32;49m25.0.1[0m
[1m[[0m[34;49mnotice[0m[1;39;49m][0m[39;49m To update, run: [0m[32;49mpip install --upgrade pip[0m
Note: you may need to restart the kernel to use updated packages.


In [1]:
import torch
from torch import nn
from torch.utils.data import DataLoader
from datasets import load_dataset
from spacy.lang.en import English


In [2]:
train_dataset = load_dataset("stanfordnlp/imdb", split="train").to_pandas()
test_dataset  = load_dataset("stanfordnlp/imdb", split="test").to_pandas()

## Preprocessing 
We keep the punctuation and do not lowercase the text as it can remove useful information
1. tokenization 
2. stop word removal 
3. lemmatization 

In [4]:
%pip install nltk


[1m[[0m[34;49mnotice[0m[1;39;49m][0m[39;49m A new release of pip is available: [0m[31;49m24.3.1[0m[39;49m -> [0m[32;49m25.0.1[0m
[1m[[0m[34;49mnotice[0m[1;39;49m][0m[39;49m To update, run: [0m[32;49mpip install --upgrade pip[0m
Note: you may need to restart the kernel to use updated packages.


In [3]:
import spacy 
import re

In [4]:
# clean the text (remove italic and so on)
def clean(text):
    text = re.sub(r'<br\s*/?>', ' ', text)
    text = re.sub(r'\s+', ' ', text).strip()
    return text

In [5]:
train_dataset['cleaned_text'] = train_dataset.apply(lambda x: clean(x['text']),axis=1)

In [6]:
from nltk.tokenize import word_tokenize
train_dataset['text_token'] = train_dataset.apply(lambda x: word_tokenize(x['cleaned_text']),axis = 1)

KeyboardInterrupt: 

In [None]:
import nltk 

def remove_stopwords(text):
    stopwords = nltk.corpus.stopwords.words('english')
    output= [i for i in text if i not in stopwords]
    return output


In [17]:
train_dataset['text_nostop'] = train_dataset.apply(lambda x: remove_stopwords(x['text_token']),axis=1)

In [18]:
train_dataset['text_nostop'].iloc[0]

['I',
 'rented',
 'I',
 'AM',
 'CURIOUS-YELLOW',
 'video',
 'store',
 'controversy',
 'surrounded',
 'first',
 'released',
 '1967',
 '.',
 'I',
 'also',
 'heard',
 'first',
 'seized',
 'U.S.',
 'customs',
 'ever',
 'tried',
 'enter',
 'country',
 ',',
 'therefore',
 'fan',
 'films',
 'considered',
 '``',
 'controversial',
 "''",
 'I',
 'really',
 'see',
 '.',
 'The',
 'plot',
 'centered',
 'around',
 'young',
 'Swedish',
 'drama',
 'student',
 'named',
 'Lena',
 'wants',
 'learn',
 'everything',
 'life',
 '.',
 'In',
 'particular',
 'wants',
 'focus',
 'attentions',
 'making',
 'sort',
 'documentary',
 'average',
 'Swede',
 'thought',
 'certain',
 'political',
 'issues',
 'Vietnam',
 'War',
 'race',
 'issues',
 'United',
 'States',
 '.',
 'In',
 'asking',
 'politicians',
 'ordinary',
 'denizens',
 'Stockholm',
 'opinions',
 'politics',
 ',',
 'sex',
 'drama',
 'teacher',
 ',',
 'classmates',
 ',',
 'married',
 'men',
 '.',
 'What',
 'kills',
 'I',
 'AM',
 'CURIOUS-YELLOW',
 '40',
 'yea

In [19]:
from nltk.stem import WordNetLemmatizer
nltk.download('wordnet')
#defining the object for Lemmatization
wordnet_lemmatizer = WordNetLemmatizer()

[nltk_data] Downloading package wordnet to
[nltk_data]     /home/codespace/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!


In [20]:
def lemmatizer(text):
    lemm_text = [wordnet_lemmatizer.lemmatize(word) for word in text]
    return lemm_text

In [21]:
train_dataset['lemm'] = train_dataset.apply(lambda x: lemmatizer(x['text_nostop']),axis=1)

In [22]:
train_dataset.iloc[0]

text            I rented I AM CURIOUS-YELLOW from my video sto...
label                                                           0
cleaned_text    I rented I AM CURIOUS-YELLOW from my video sto...
text_token      [I, rented, I, AM, CURIOUS-YELLOW, from, my, v...
text_nostop     [I, rented, I, AM, CURIOUS-YELLOW, video, stor...
lemm            [I, rented, I, AM, CURIOUS-YELLOW, video, stor...
Name: 0, dtype: object

In [7]:
from sklearn.feature_extraction.text import TfidfVectorizer


In [8]:
#tf-idf to have vectors 
vectorizer = TfidfVectorizer(stop_words='english', max_features=5000)
train_dataset_tfidf = vectorizer.fit_transform(train_dataset['cleaned_text'])

In [9]:
test_dataset['cleaned_text'] = test_dataset.apply(lambda x: clean(x['text']),axis=1)
test_dataset_tfidf = vectorizer.fit_transform(test_dataset['cleaned_text'])

In [10]:
print(train_dataset_tfidf.shape)
print(test_dataset_tfidf.shape)


(25000, 5000)
(25000, 5000)


In [11]:
from torch.utils.data import DataLoader, TensorDataset

In [12]:
train_X = torch.tensor(train_dataset_tfidf.toarray(), dtype=torch.float32)  # Convert sparse matrix if needed
train_y = torch.tensor(train_dataset['label'].values, dtype=torch.long)

In [13]:
test_X = torch.tensor(test_dataset_tfidf.toarray(), dtype=torch.float32)  # Convert sparse matrix if needed
test_y = torch.tensor(test_dataset['label'].values, dtype=torch.long)

In [14]:
# Create DataLoaders
batch_size = 64
train_loader = DataLoader(TensorDataset(train_X,train_y), batch_size=batch_size, shuffle=False)
test_loader = DataLoader(TensorDataset(test_X, test_y), batch_size=batch_size, shuffle=False)


#### Creating models  
1. Make a class that inherits from nn.Module 
2. in __init__ call super and define the flow of data through the layers. 
3. To make it faster add a call to check if there is gpu/mps or smtg available. If not then cpu will be used

In [15]:
device = torch.accelerator.current_accelerator().type if torch.accelerator.is_available() else "cpu"
print(f"Using {device} device")


Using cpu device


In [16]:
class NeuralNetwork(nn.Module):
    def __init__(self,input_size,hidden_size,output_size):
        super().__init__()
        self.linear_relu_stack = nn.Sequential(
            nn.Linear(input_size,hidden_size),
            nn.ReLU(),
            nn.Linear(hidden_size,hidden_size),
            nn.ReLU(),
            nn.Linear(hidden_size,output_size)
        )
    
    def forward(self,x):
        logits = self.linear_relu_stack(x)
        return logits

In [17]:
input_size = train_X.shape[1]
output_size = 2
hidden_size = 265

In [18]:
network = NeuralNetwork(input_size=input_size,output_size=output_size,hidden_size=hidden_size).to(device)

In [19]:
network

NeuralNetwork(
  (linear_relu_stack): Sequential(
    (0): Linear(in_features=5000, out_features=265, bias=True)
    (1): ReLU()
    (2): Linear(in_features=265, out_features=265, bias=True)
    (3): ReLU()
    (4): Linear(in_features=265, out_features=2, bias=True)
  )
)

In [20]:
loss_fn = nn.CrossEntropyLoss()
optimizer = torch.optim.Adam(network.parameters(), lr=1e-3)

In [21]:
def train(dataloader, model, loss_fn, optimizer):
    size = len(dataloader.dataset)
    model.train()
    for batch, (X, y) in enumerate(dataloader):
        X, y = X.to(device), y.to(device)

        # Compute prediction error
        pred = model(X)
        loss = loss_fn(pred, y)

        # Backpropagation
        loss.backward()
        optimizer.step()
        optimizer.zero_grad()

        if batch % 100 == 0:
            loss, current = loss.item(), (batch + 1) * len(X)
            print(f"loss: {loss:>7f}  [{current:>5d}/{size:>5d}]")

In [22]:
def test(dataloader, model, loss_fn):
    size = len(dataloader.dataset)
    num_batches = len(dataloader)
    model.eval()
    test_loss, correct = 0, 0
    with torch.no_grad():
        for X, y in dataloader:
            X, y = X.to(device), y.to(device)
            pred = model(X)
            test_loss += loss_fn(pred, y).item()
            correct += (pred.argmax(1) == y).type(torch.float).sum().item()
    test_loss /= num_batches
    correct /= size
    print(f"Test Error: \n Accuracy: {(100*correct):>0.1f}%, Avg loss: {test_loss:>8f} \n")

In [23]:
epochs = 5
for t in range(epochs):
    print(f"Epoch {t+1}\n-------------------------------")
    train(train_loader, network, loss_fn, optimizer)
    test(test_loader, network, loss_fn)
print("Done!")

Epoch 1
-------------------------------
loss: 0.679658  [   64/25000]
loss: 0.000015  [ 6464/25000]
loss: 7.123849  [12864/25000]
loss: 0.000351  [19264/25000]
Test Error: 
 Accuracy: 50.0%, Avg loss: 5.862230 

Epoch 2
-------------------------------
loss: 13.377934  [   64/25000]
loss: 0.000035  [ 6464/25000]
loss: 7.943009  [12864/25000]
loss: 0.000175  [19264/25000]
Test Error: 
 Accuracy: 50.0%, Avg loss: 6.477957 

Epoch 3
-------------------------------
loss: 14.500959  [   64/25000]
loss: 0.000097  [ 6464/25000]
loss: 9.640593  [12864/25000]
loss: 0.012655  [19264/25000]
Test Error: 
 Accuracy: 50.0%, Avg loss: 2.594038 

Epoch 4
-------------------------------
loss: 6.081982  [   64/25000]
loss: 0.000673  [ 6464/25000]
loss: 8.111003  [12864/25000]
loss: 0.007384  [19264/25000]
Test Error: 
 Accuracy: 50.0%, Avg loss: 3.184626 

Epoch 5
-------------------------------
loss: 5.811327  [   64/25000]
loss: 0.010858  [ 6464/25000]
loss: 4.305522  [12864/25000]
loss: 0.004250  [192

In [24]:
classes = {0:'negative',1:'positive'}

In [25]:
test_X[0]

tensor([0., 0., 0.,  ..., 0., 0., 0.])

In [26]:
def inference(x,y)->dict[float,str]:
    with torch.no_grad():
        x = x.to(device)
        logits = network(x)
        probabilities = torch.nn.functional.softmax(logits,dim=0)
        pred = probabilities.argmax(0).item()
        predicted = classes[pred]
        return {'probability':pred,'label':predicted}

In [27]:
x, y = test_X[0], test_dataset['label'].iloc[0]
out= inference(x,y)
print(f'probability: {out['probability']}, predicted: {out['label']}, actual: {classes[y]}')

probability: 1, predicted: positive, actual: negative


In [28]:
%pip install skorch


[1m[[0m[34;49mnotice[0m[1;39;49m][0m[39;49m A new release of pip is available: [0m[31;49m24.3.1[0m[39;49m -> [0m[32;49m25.0.1[0m
[1m[[0m[34;49mnotice[0m[1;39;49m][0m[39;49m To update, run: [0m[32;49mpip install --upgrade pip[0m
Note: you may need to restart the kernel to use updated packages.


In [30]:
from skorch import NeuralNetClassifier
net = NeuralNetClassifier(
    network,
    max_epochs=10,
    criterion=loss_fn,
    lr=0.1,
    # Shuffle training data on each epoch
    iterator_train__shuffle=False,
)

In [31]:
from sklearn.model_selection import GridSearchCV
import numpy as np
# deactivate skorch-internal train-valid split and verbose logging
net.set_params(train_split=False, verbose=1)
params = {
    'lr': np.linspace(1e-5,1e-1,num=10),
    'max_epochs': [10],
    'module__input_size':[input_size],
    'module__hidden_size':[128, 256, 512],
    'module__output_size': [output_size]
}
gs = GridSearchCV(net, params, refit=False, cv=3, scoring='accuracy')

In [32]:
train_X_np = train_X.detach().cpu().numpy()
train_y_np = train_y.detach().cpu().numpy()
gs.fit(train_X_np, train_y_np)
print(gs.best_score_, gs.best_params_)

  epoch    train_loss     dur
-------  ------------  ------
      1        [36m0.6956[0m  0.5790
      2        [36m0.6956[0m  0.5497
      3        [36m0.6956[0m  0.5209
      4        [36m0.6956[0m  0.5229
      5        [36m0.6956[0m  0.5173
      6        [36m0.6956[0m  0.5147
      7        [36m0.6956[0m  0.4996
      8        [36m0.6956[0m  0.5168
      9        [36m0.6956[0m  0.5158
     10        [36m0.6956[0m  0.4961
  epoch    train_loss     dur
-------  ------------  ------
      1        [36m0.6941[0m  0.5464
      2        [36m0.6941[0m  0.5323
      3        [36m0.6941[0m  0.5219
      4        [36m0.6941[0m  0.4991
      5        [36m0.6941[0m  0.4987
      6        [36m0.6941[0m  0.5172
      7        [36m0.6941[0m  0.5520
      8        [36m0.6941[0m  0.4967
      9        [36m0.6941[0m  0.5013
     10        [36m0.6941[0m  0.5206
  epoch    train_loss     dur
-------  ------------  ------
      1        [36m0.6936[0m  0.5297
 

In [33]:
print(gs.best_score_, gs.best_params_)

0.5015200608024322 {'lr': np.float64(1e-05), 'max_epochs': 10, 'module__hidden_size': 512, 'module__input_size': 5000, 'module__output_size': 2}


In [34]:
network_optimized = NeuralNetwork(input_size=gs.best_params_['module__input_size'],hidden_size=gs.best_params_['module__hidden_size'], output_size=2)

In [36]:
optimizer2 = torch.optim.Adam(network.parameters(), lr=gs.best_params_['lr'])

In [37]:
epochs = gs.best_params_['max_epochs']
for t in range(epochs):
    print(f"Epoch {t+1}\n-------------------------------")
    train(train_loader, network, loss_fn, optimizer)
    test(test_loader, network, loss_fn)
print("Done!")

Epoch 1
-------------------------------
loss: 9.176047  [   64/25000]
loss: 0.003266  [ 6464/25000]
loss: 5.849284  [12864/25000]
loss: 0.016837  [19264/25000]
Test Error: 
 Accuracy: 50.0%, Avg loss: 3.709641 

Epoch 2
-------------------------------
loss: 7.082348  [   64/25000]
loss: 0.000132  [ 6464/25000]
loss: 6.137697  [12864/25000]
loss: 0.011728  [19264/25000]
Test Error: 
 Accuracy: 50.0%, Avg loss: 2.780824 

Epoch 3
-------------------------------
loss: 5.097123  [   64/25000]
loss: 0.006965  [ 6464/25000]
loss: 3.517331  [12864/25000]
loss: 0.009049  [19264/25000]
Test Error: 
 Accuracy: 50.0%, Avg loss: 2.792031 

Epoch 4
-------------------------------
loss: 4.337783  [   64/25000]
loss: 0.014396  [ 6464/25000]
loss: 1.464381  [12864/25000]
loss: 0.024867  [19264/25000]
Test Error: 
 Accuracy: 50.1%, Avg loss: 2.167470 

Epoch 5
-------------------------------
loss: 2.216345  [   64/25000]
loss: 0.016510  [ 6464/25000]
loss: 0.967775  [12864/25000]
loss: 0.021682  [19264