# Building Neural networks from scratch in pytorch 

Here we explore the different ways to build models using pytorch. How to make a neural net, train it, save it's weights etc. The goal is to build up eperience with the library and then be able to rebuild the basic transformer architecture from Attention is All You Need.

To keep things consistent we will keep on using the stanfordnlp/imdb dataset. 

In [1]:
#%pip install torch torchvision torchaudio
%pip install spacy


[1m[[0m[34;49mnotice[0m[1;39;49m][0m[39;49m A new release of pip is available: [0m[31;49m24.3.1[0m[39;49m -> [0m[32;49m25.0.1[0m
[1m[[0m[34;49mnotice[0m[1;39;49m][0m[39;49m To update, run: [0m[32;49mpip install --upgrade pip[0m
Note: you may need to restart the kernel to use updated packages.


In [1]:
import torch
from torch import nn
from torch.utils.data import DataLoader
from datasets import load_dataset
from spacy.lang.en import English


In [2]:
train_dataset = load_dataset("stanfordnlp/imdb", split="train").to_pandas()
test_dataset  = load_dataset("stanfordnlp/imdb", split="test").to_pandas()

## Preprocessing 
We keep the punctuation and do not lowercase the text as it can remove useful information
1. tokenization 
2. stop word removal 
3. lemmatization 

In [4]:
%pip install nltk


[1m[[0m[34;49mnotice[0m[1;39;49m][0m[39;49m A new release of pip is available: [0m[31;49m24.3.1[0m[39;49m -> [0m[32;49m25.0.1[0m
[1m[[0m[34;49mnotice[0m[1;39;49m][0m[39;49m To update, run: [0m[32;49mpip install --upgrade pip[0m
Note: you may need to restart the kernel to use updated packages.


In [3]:
import spacy 
import re

In [4]:
# clean the text (remove italic and so on)
def clean(text):
    text = re.sub(r'<br\s*/?>', ' ', text)
    text = re.sub(r'\s+', ' ', text).strip()
    return text

In [5]:
train_dataset['cleaned_text'] = train_dataset.apply(lambda x: clean(x['text']),axis=1)

In [9]:
from nltk.tokenize import word_tokenize
train_dataset['text_token'] = train_dataset.apply(lambda x: word_tokenize(x['cleaned_text']),axis = 1)

In [10]:
import nltk 

def remove_stopwords(text):
    stopwords = nltk.corpus.stopwords.words('english')
    output= [i for i in text if i not in stopwords]
    return output


In [11]:
train_dataset['text_nostop'] = train_dataset.apply(lambda x: remove_stopwords(x['text_token']),axis=1)

In [12]:
train_dataset['text_nostop'].iloc[0]

['I',
 'rented',
 'I',
 'AM',
 'CURIOUS-YELLOW',
 'video',
 'store',
 'controversy',
 'surrounded',
 'first',
 'released',
 '1967',
 '.',
 'I',
 'also',
 'heard',
 'first',
 'seized',
 'U.S.',
 'customs',
 'ever',
 'tried',
 'enter',
 'country',
 ',',
 'therefore',
 'fan',
 'films',
 'considered',
 '``',
 'controversial',
 "''",
 'I',
 'really',
 'see',
 '.',
 'The',
 'plot',
 'centered',
 'around',
 'young',
 'Swedish',
 'drama',
 'student',
 'named',
 'Lena',
 'wants',
 'learn',
 'everything',
 'life',
 '.',
 'In',
 'particular',
 'wants',
 'focus',
 'attentions',
 'making',
 'sort',
 'documentary',
 'average',
 'Swede',
 'thought',
 'certain',
 'political',
 'issues',
 'Vietnam',
 'War',
 'race',
 'issues',
 'United',
 'States',
 '.',
 'In',
 'asking',
 'politicians',
 'ordinary',
 'denizens',
 'Stockholm',
 'opinions',
 'politics',
 ',',
 'sex',
 'drama',
 'teacher',
 ',',
 'classmates',
 ',',
 'married',
 'men',
 '.',
 'What',
 'kills',
 'I',
 'AM',
 'CURIOUS-YELLOW',
 '40',
 'yea

In [13]:
from nltk.stem import WordNetLemmatizer
nltk.download('wordnet')
#defining the object for Lemmatization
wordnet_lemmatizer = WordNetLemmatizer()

[nltk_data] Downloading package wordnet to
[nltk_data]     /home/codespace/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!


In [14]:
def lemmatizer(text):
    lemm_text = [wordnet_lemmatizer.lemmatize(word) for word in text]
    return lemm_text

In [15]:
train_dataset['lemm'] = train_dataset.apply(lambda x: lemmatizer(x['text_nostop']),axis=1)

In [16]:
train_dataset.iloc[0]

text            I rented I AM CURIOUS-YELLOW from my video sto...
label                                                           0
cleaned_text    I rented I AM CURIOUS-YELLOW from my video sto...
text_token      [I, rented, I, AM, CURIOUS-YELLOW, from, my, v...
text_nostop     [I, rented, I, AM, CURIOUS-YELLOW, video, stor...
lemm            [I, rented, I, AM, CURIOUS-YELLOW, video, stor...
Name: 0, dtype: object

In [6]:
from sklearn.feature_extraction.text import TfidfVectorizer


In [7]:
#tf-idf to have vectors 
vectorizer = TfidfVectorizer(stop_words='english', max_features=5000)
train_dataset_tfidf = vectorizer.fit_transform(train_dataset['cleaned_text'])

In [8]:
test_dataset['cleaned_text'] = test_dataset.apply(lambda x: clean(x['text']),axis=1)
test_dataset_tfidf = vectorizer.fit_transform(test_dataset['cleaned_text'])

In [9]:
print(train_dataset_tfidf.shape)
print(test_dataset_tfidf.shape)


(25000, 5000)
(25000, 5000)


In [10]:
from torch.utils.data import DataLoader, TensorDataset

In [11]:
train_X = torch.tensor(train_dataset_tfidf.toarray(), dtype=torch.float32)  # Convert sparse matrix if needed
train_y = torch.tensor(train_dataset['label'].values, dtype=torch.long)

In [12]:
test_X = torch.tensor(test_dataset_tfidf.toarray(), dtype=torch.float32)  # Convert sparse matrix if needed
test_y = torch.tensor(test_dataset['label'].values, dtype=torch.long)

In [13]:
# Create DataLoaders
batch_size = 64
train_loader = DataLoader(TensorDataset(train_X,train_y), batch_size=batch_size, shuffle=False)
test_loader = DataLoader(TensorDataset(test_X, test_y), batch_size=batch_size, shuffle=False)


#### Creating models  
1. Make a class that inherits from nn.Module 
2. in __init__ call super and define the flow of data through the layers. 
3. To make it faster add a call to check if there is gpu/mps or smtg available. If not then cpu will be used

In [14]:
device = torch.accelerator.current_accelerator().type if torch.accelerator.is_available() else "cpu"
print(f"Using {device} device")


Using cpu device


In [15]:
class NeuralNetwork(nn.Module):
    def __init__(self,input_size,hidden_size,outpout_size):
        super().__init__()
        self.linear_relu_stack = nn.Sequential(
            nn.Linear(input_size,hidden_size),
            nn.ReLU(),
            nn.Linear(hidden_size,hidden_size),
            nn.ReLU(),
            nn.Linear(hidden_size,outpout_size)
        )
    
    def forward(self,x):
        logits = self.linear_relu_stack(x)
        return logits

In [16]:
input_size = train_X.shape[1]
output_size = 2
hidden_size = 265

In [17]:
network = NeuralNetwork(input_size=input_size,outpout_size=output_size,hidden_size=hidden_size).to(device)

In [18]:
network

NeuralNetwork(
  (linear_relu_stack): Sequential(
    (0): Linear(in_features=5000, out_features=265, bias=True)
    (1): ReLU()
    (2): Linear(in_features=265, out_features=265, bias=True)
    (3): ReLU()
    (4): Linear(in_features=265, out_features=2, bias=True)
  )
)

In [19]:
loss_fn = nn.CrossEntropyLoss()
optimizer = torch.optim.Adam(network.parameters(), lr=1e-3)

In [20]:
def train(dataloader, model, loss_fn, optimizer):
    size = len(dataloader.dataset)
    model.train()
    for batch, (X, y) in enumerate(dataloader):
        X, y = X.to(device), y.to(device)

        # Compute prediction error
        pred = model(X)
        loss = loss_fn(pred, y)

        # Backpropagation
        loss.backward()
        optimizer.step()
        optimizer.zero_grad()

        if batch % 100 == 0:
            loss, current = loss.item(), (batch + 1) * len(X)
            print(f"loss: {loss:>7f}  [{current:>5d}/{size:>5d}]")

In [21]:
def test(dataloader, model, loss_fn):
    size = len(dataloader.dataset)
    num_batches = len(dataloader)
    model.eval()
    test_loss, correct = 0, 0
    with torch.no_grad():
        for X, y in dataloader:
            X, y = X.to(device), y.to(device)
            pred = model(X)
            test_loss += loss_fn(pred, y).item()
            correct += (pred.argmax(1) == y).type(torch.float).sum().item()
    test_loss /= num_batches
    correct /= size
    print(f"Test Error: \n Accuracy: {(100*correct):>0.1f}%, Avg loss: {test_loss:>8f} \n")

In [22]:
epochs = 5
for t in range(epochs):
    print(f"Epoch {t+1}\n-------------------------------")
    train(train_loader, network, loss_fn, optimizer)
    test(test_loader, network, loss_fn)
print("Done!")

Epoch 1
-------------------------------
loss: 0.662477  [   64/25000]
loss: 0.000015  [ 6464/25000]
loss: 6.197498  [12864/25000]
loss: 0.025205  [19264/25000]
Test Error: 
 Accuracy: 50.0%, Avg loss: 2.477769 

Epoch 2
-------------------------------
loss: 4.953206  [   64/25000]
loss: 0.000012  [ 6464/25000]
loss: 9.424225  [12864/25000]
loss: 0.000405  [19264/25000]
Test Error: 
 Accuracy: 50.0%, Avg loss: 4.721019 

Epoch 3
-------------------------------
loss: 10.531206  [   64/25000]
loss: 0.000024  [ 6464/25000]
loss: 7.769588  [12864/25000]
loss: 0.010652  [19264/25000]
Test Error: 
 Accuracy: 50.0%, Avg loss: 4.637889 

Epoch 4
-------------------------------
loss: 10.673346  [   64/25000]
loss: 0.002333  [ 6464/25000]
loss: 8.015435  [12864/25000]
loss: 0.024591  [19264/25000]
Test Error: 
 Accuracy: 50.0%, Avg loss: 4.651331 

Epoch 5
-------------------------------
loss: 10.769074  [   64/25000]
loss: 0.001840  [ 6464/25000]
loss: 2.976347  [12864/25000]
loss: 0.101130  [19

In [23]:
classes = {0:'negative',1:'positive'}

In [24]:
test_X[0]

tensor([0., 0., 0.,  ..., 0., 0., 0.])

In [27]:
def inference(x,y)->dict[float,str]:
    with torch.no_grad():
        x = x.to(device)
        logits = network(x)
        probabilities = torch.nn.functional.softmax(logits,dim=0)
        pred = probabilities.argmax(0).item()
        predicted = classes[pred]
        return {'probability':pred,'label':predicted}

In [31]:
x, y = test_X[0], test_dataset['label'].iloc[0]
out= inference(x,y)
print(f'probability: {out['probability']}, predicted: {out['label']}, actual: {classes[y]}')

probability: 1, predicted: positive, actual: negative


In [38]:
%pip install ray


[1m[[0m[34;49mnotice[0m[1;39;49m][0m[39;49m A new release of pip is available: [0m[31;49m24.3.1[0m[39;49m -> [0m[32;49m25.0.1[0m
[1m[[0m[34;49mnotice[0m[1;39;49m][0m[39;49m To update, run: [0m[32;49mpip install --upgrade pip[0m
Note: you may need to restart the kernel to use updated packages.


In [39]:
%pip install -U ipywidgets


[1m[[0m[34;49mnotice[0m[1;39;49m][0m[39;49m A new release of pip is available: [0m[31;49m24.3.1[0m[39;49m -> [0m[32;49m25.0.1[0m
[1m[[0m[34;49mnotice[0m[1;39;49m][0m[39;49m To update, run: [0m[32;49mpip install --upgrade pip[0m
Note: you may need to restart the kernel to use updated packages.


In [32]:
from ray import tune

In [33]:
def hyperparameterTuning(config:dict):
    batch_size = config['batch_size']
    train_loader = DataLoader(TensorDataset(train_X,train_y), batch_size=batch_size, shuffle=True)
    model = NeuralNetwork(hidden_size=config["hidden_size"])
    optimizer = torch.optim.Adam(model.parameters(), lr=config["lr"])
    loss_fn= nn.CrossEntropyLoss()
    train(dataloader=train_loader,model=model,loss_fn=loss_fn,optimizer=optimizer)


In [36]:
search_space = {
    "hidden_size": tune.grid_search([128, 256, 512]),
    "lr": tune.loguniform(1e-5, 1e-1),
    "batch_size":tune.grid_search([16, 32, 64,128])
}
tuner = tune.Tuner(
    hyperparameterTuning,
    param_space=search_space,
)

In [None]:
results = tuner.fit()
print(results.get_best_result(metric="score", mode="min").config)

*** SIGTERM received at time=1740146318 on cpu 3 ***
PC: @     0x7d7c47bddc1a  (unknown)  (unknown)
    @     0x7d7c47db2420  (unknown)  (unknown)
[2025-02-21 13:58:38,283 E 32368 32368] logging.cc:460: *** SIGTERM received at time=1740146318 on cpu 3 ***
[2025-02-21 13:58:38,283 E 32368 32368] logging.cc:460: PC: @     0x7d7c47bddc1a  (unknown)  (unknown)
[2025-02-21 13:58:38,284 E 32368 32368] logging.cc:460:     @     0x7d7c47db2420  (unknown)  (unknown)


SystemExit: 15

  warn("To exit: use 'exit', 'quit', or Ctrl-D.", stacklevel=1)


: 