# Dataset, DataLoader, Linear Regression model

In [1]:
from sklearn.datasets import make_classification
import torch
import torch.utils.data.dataloader as DataLoader
from sklearn.model_selection import train_test_split

In [2]:
# Set as appropriate: when True, later cells print sanity-check output
# (the sizes of the train/test split, and the length and one sample item of
# the training dataset).
verbose = True

In [3]:
class CustomDataset(torch.utils.data.Dataset):
    """Map-style dataset pairing each sample's features with its target.

    Subclasses ``torch.utils.data.Dataset`` so it is recognised as a
    map-style dataset by the rest of ``torch.utils.data``.

    Args:
        data: Indexable collection of samples; ``data[i]`` is the feature
            vector of sample ``i`` (for example a 2-D NumPy array).
        targets: Indexable collection with one integer label per sample.

    Raises:
        ValueError: If ``data`` and ``targets`` do not have the same length.
    """

    def __init__(self, data, targets):
        # Fail early: a length mismatch would otherwise only surface as an
        # IndexError (or silently unused labels) in the middle of training.
        if len(data) != len(targets):
            raise ValueError(
                f"data and targets must have the same length, "
                f"got {len(data)} and {len(targets)}"
            )
        self.data = data
        self.targets = targets

    def __len__(self):
        """Return the number of samples."""
        return len(self.data)

    def __getitem__(self, idx):
        """Return sample ``idx`` as ``{"x": float tensor, "y": long tensor}``."""
        # Indexing only the first axis keeps this working for feature arrays
        # of any rank, not just 2-D ones.
        return {
            "x": torch.tensor(self.data[idx], dtype=torch.float),
            "y": torch.tensor(self.targets[idx], dtype=torch.long),
        }

In [4]:
# Generate a synthetic binary-classification dataset.
# A fixed random_state makes the data (and everything derived from it)
# reproducible under Restart Kernel -> Run All.
data, targets = make_classification(n_samples=1000, random_state=42)

In [7]:
# Hold out a quarter of the samples for testing; the fixed random_state keeps
# the partition identical from run to run.
(train_data, test_data,
 train_targets, test_targets) = train_test_split(
    data,
    targets,
    test_size=0.25,
    random_state=42,
)

# Optional sanity check on the sizes of every piece
if verbose:
    print(f"{len(data)=}, {len(targets)=}, {len(train_data)=}, {len(train_targets)=}, {len(test_data)=}, {len(test_targets)=}")

len(data)=1000, len(targets)=1000, len(train_data)=750, len(train_targets)=750, len(test_data)=250, len(test_targets)=250


In [18]:
# Wrap each split in a CustomDataset so DataLoader can index into it
train_dataset, test_dataset = (
    CustomDataset(features, labels)
    for features, labels in (
        (train_data, train_targets),
        (test_data, test_targets),
    )
)

# Optional smoke test of __len__ and __getitem__
if verbose:
    print("Try out len and indexing to make sure it's working as expected")
    print(f"{len(train_dataset)=}")
    print(f"{train_dataset[2]=}")

Try out len and indexing to make sure it's working as expected
len(train_dataset)=750
train_dataset[2]={'x': tensor([ 1.5046, -1.5677,  0.3045,  1.6613, -1.0563, -2.1789, -0.5665,  0.1256,
         0.5439, -1.1231,  1.2163, -0.9576,  0.5075,  1.0512,  0.6561,  0.9320,
         2.1674,  0.6796, -1.0860, -0.9167]), 'y': tensor(1)}


## DataLoader

`DataLoader` from `torch.utils.data` makes it possible to train in batches.

In this environment the following code errors out when `num_workers` > 0, so `num_workers=0` is used below. On Colab you can choose something like 2 instead.

In [28]:
# Batch both splits four samples at a time; num_workers=0 loads the data in
# the main process.
train_dataloader = torch.utils.data.DataLoader(
    train_dataset,
    batch_size=4,
    num_workers=0,
)
test_dataloader = torch.utils.data.DataLoader(
    test_dataset,
    batch_size=4,
    num_workers=0,
)

In [29]:
# Create the weights matrix and bias vector.
# Seed the RNG so the initial parameters are identical on every run and
# re-running this cell resets training to the same starting point.
torch.manual_seed(42)
# One weight per input feature, read from the training data instead of being
# hard-coded, plus a single scalar bias.
W = torch.randn((train_data.shape[1], 1), requires_grad=True)
b = torch.randn((1,), requires_grad=True)
learning_rate = 0.001

In [30]:
def model(x, W, b):
    """Apply the linear map ``x @ W + b``.

    Args:
        x: Input tensor whose last dimension matches the first dimension
            of ``W``.
        W: Weight matrix.
        b: Bias, broadcast over the rows of ``x @ W``.

    Returns:
        The tensor ``torch.matmul(x, W) + b``.
    """
    return torch.matmul(x, W) + b

In [33]:
# Mini-batch gradient descent on the mean-squared error between the linear
# model's output and the targets, reporting the mean batch loss per epoch.
for epoch in range(10):
    epoch_loss = 0.0
    counter = 0
    # The loop variable is `batch`, not `data`, so it does not overwrite the
    # raw `data` array that the train/test split cell reads.
    for batch in train_dataloader:
        x = batch["x"]
        y = batch["y"]
        output = model(x, W, b)
        # Flatten both sides so the (batch, 1) output lines up with the
        # (batch,) targets instead of broadcasting to (batch, batch).
        loss = torch.mean((output.view(-1) - y.view(-1)) ** 2)
        loss.backward()
        with torch.no_grad():
            # Update in place so W and b stay the same leaf tensors and keep
            # requires_grad=True without having to re-enable it every step.
            W -= learning_rate * W.grad
            b -= learning_rate * b.grad
            # .grad accumulates across backward() calls; reset it for the
            # next batch.
            W.grad.zero_()
            b.grad.zero_()
        epoch_loss += loss.item()
        counter += 1
    # max(..., 1) avoids a ZeroDivisionError if the loader yields no batches
    print(f"{epoch}: {epoch_loss / max(counter, 1)}")

0: 0.12359994951328461
1: 0.12359994895845414
2: 0.12359994942906927
3: 0.12359994872562349
4: 0.12359994995913053
5: 0.12359994940677697
6: 0.12359995066753014
7: 0.12359994874543886
8: 0.12359994916156172
9: 0.1235999494092539
