In [1]:
import pandas as pd
import numpy as np

In [4]:
# Read the raw churn records from the CSV export into a DataFrame
CHURN_CSV_PATH = "./data/churn.csv"
df = pd.read_csv(CHURN_CSV_PATH)

In [20]:
# Drop the identifier columns, then one-hot encode Geography (France, Germany, Spain)
cleaned = df.drop(['RowNumber', 'CustomerId', 'Surname'], axis=1)
# dtype=int is explicit on purpose: pandas >= 2.0 produces bool indicator
# columns by default, and a frame mixing bool with numeric columns yields an
# object array from `.values`, which torch.tensor cannot convert to float32.
one_hot_geo = pd.get_dummies(cleaned['Geography'], dtype=int)
cleaned = cleaned.drop('Geography', axis=1)
# Indicator columns go first; the remaining columns keep their original order
cleaned = one_hot_geo.join(cleaned)
cleaned

Unnamed: 0,France,Germany,Spain,CreditScore,Gender,Age,Tenure,Balance,NumOfProducts,HasCrCard,IsActiveMember,EstimatedSalary,Exited
0,1,0,0,619,Female,42,2,0.00,1,1,1,101348.88,1
1,0,0,1,608,Female,41,1,83807.86,1,0,1,112542.58,0
2,1,0,0,502,Female,42,8,159660.80,3,1,0,113931.57,1
3,1,0,0,699,Female,39,1,0.00,2,0,0,93826.63,0
4,0,0,1,850,Female,43,2,125510.82,1,1,1,79084.10,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...
9995,1,0,0,771,Male,39,5,0.00,2,1,0,96270.64,0
9996,1,0,0,516,Male,35,10,57369.61,1,1,1,101699.77,0
9997,1,0,0,709,Female,36,7,0.00,1,0,1,42085.58,1
9998,0,1,0,772,Male,42,3,75075.31,2,1,0,92888.52,1


In [21]:
# Encode gender to binary (Male = 0, Female = 1)
# The encoded column is assigned back explicitly. Calling
# replace(..., inplace=True) on the `cleaned['Gender']` selection is a chained
# in-place update: pandas 2.x warns about it and, with Copy-on-Write enabled,
# it modifies a temporary and leaves `cleaned` untouched.
gender_codes = {'Male': 0, 'Female': 1}
# .get(g, g) leaves any value that is not a known label as-is, so re-running
# this cell on already-encoded data is a no-op (same semantics as replace).
cleaned['Gender'] = cleaned['Gender'].map(lambda g: gender_codes.get(g, g))
cleaned

Unnamed: 0,France,Germany,Spain,CreditScore,Gender,Age,Tenure,Balance,NumOfProducts,HasCrCard,IsActiveMember,EstimatedSalary,Exited
0,1,0,0,619,1,42,2,0.00,1,1,1,101348.88,1
1,0,0,1,608,1,41,1,83807.86,1,0,1,112542.58,0
2,1,0,0,502,1,42,8,159660.80,3,1,0,113931.57,1
3,1,0,0,699,1,39,1,0.00,2,0,0,93826.63,0
4,0,0,1,850,1,43,2,125510.82,1,1,1,79084.10,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...
9995,1,0,0,771,0,39,5,0.00,2,1,0,96270.64,0
9996,1,0,0,516,0,35,10,57369.61,1,1,1,101699.77,0
9997,1,0,0,709,1,36,7,0.00,1,0,1,42085.58,1
9998,0,1,0,772,0,42,3,75075.31,2,1,0,92888.52,1


In [24]:
import torch

# Turn into tensors: every column except the last is a feature, the last is the label
features = cleaned.iloc[:, :-1]
labels = cleaned.iloc[:, -1]
# Convert through NumPy with an explicit float32 dtype: this works even when the
# frame mixes bool/int/float columns (where `.values` would be an object array)
# and avoids handing a pandas Series directly to torch.tensor.
x = torch.tensor(features.to_numpy(dtype='float32'), dtype=torch.float32)  # size = [nrow, ncol - 1]
y = torch.tensor(labels.to_numpy(dtype='float32'), dtype=torch.float32).reshape(-1, 1)  # size = [nrow, 1]
# Show both shapes so the cell actually produces the output recorded below it
print(x.shape)
print(y.shape)

torch.Size([10000, 12])
torch.Size([10000, 1])


In [11]:
import torch
import pandas as pd
from torch.utils.data import Dataset


class ChurnDataset(Dataset):
    """Churn CSV exposed as a map-style PyTorch dataset.

    The CSV is read once in ``__init__`` and preprocessed as follows:

    * the ``RowNumber``, ``CustomerId`` and ``Surname`` columns are dropped;
    * ``Geography`` is one-hot encoded, with the indicator columns placed
      before all remaining columns;
    * ``Gender`` is encoded as ``Male = 0``, ``Female = 1``;
    * the last column becomes the label, every other column a feature.

    Attributes:
        x: float32 feature tensor, size = [nrow, ncol - 1].
        y: float32 label tensor, size = [nrow, 1].
    """

    def __init__(self, filepath):
        """Load and preprocess the CSV found at ``filepath``.

        Args:
            filepath: Path (or anything ``pd.read_csv`` accepts) of the CSV.

        Raises:
            ValueError: If a feature or label column still holds non-numeric
                values after encoding (e.g. an unexpected ``Gender`` label).
        """
        # Import data from CSV
        df = pd.read_csv(filepath)

        # Clean data: drop unnecessary cols, encode Geography.
        # dtype=int is explicit because pandas >= 2.0 returns bool indicators
        # by default, which would turn the mixed frame into an object array.
        cleaned = df.drop(['RowNumber', 'CustomerId', 'Surname'], axis=1)
        one_hot_geo = pd.get_dummies(cleaned['Geography'], dtype=int)
        cleaned = one_hot_geo.join(cleaned.drop('Geography', axis=1))

        # Clean data: encode gender to binary (Male = 0, Female = 1).
        # Assigned back rather than replace(..., inplace=True) on the column
        # selection, which does not update the frame under Copy-on-Write.
        # Unknown labels are left untouched so they fail loudly below instead
        # of silently becoming NaN.
        gender_codes = {'Male': 0, 'Female': 1}
        cleaned['Gender'] = cleaned['Gender'].map(lambda g: gender_codes.get(g, g))

        # Split into features (all but last column) and labels (last column)
        features = cleaned.iloc[:, :-1]
        labels = cleaned.iloc[:, -1]

        # Features, Labels -- converted via NumPy with an explicit dtype
        self.x = torch.tensor(features.to_numpy(dtype='float32'), dtype=torch.float32)  # size = [nrow, ncol - 1]
        self.y = torch.tensor(labels.to_numpy(dtype='float32'), dtype=torch.float32).unsqueeze(1)  # size = [nrow, 1]

    def shape(self):
        """Return ``(x.shape, y.shape)`` for the feature and label tensors."""
        return self.x.shape, self.y.shape

    def __getitem__(self, index):
        """Return the ``(features, label)`` pair stored at ``index``."""
        return (
            self.x[index],
            self.y[index]
        )

    def __len__(self):
        """Return the number of rows in the dataset."""
        return len(self.x)

In [12]:
# from src.dataset import ChurnDataset
import torch

# Build the dataset and split it 80/20 into train/test with a seeded generator
ds = ChurnDataset("./data/churn.csv")
gen = torch.Generator().manual_seed(5105)
train, test = torch.utils.data.random_split(ds, [0.8, 0.2], generator=gen)

# Report the size of each split
for subset in (train, test):
    print(len(subset))

# Smoke test: draw one shuffled mini-batch of two rows from the training split
train_loader = torch.utils.data.DataLoader(train, batch_size=2, shuffle=True)
dataiter = iter(train_loader)
print(next(dataiter))

8000
2000
[tensor([[0.0000e+00, 0.0000e+00, 1.0000e+00, 7.9300e+02, 0.0000e+00, 6.3000e+01,
         0.0000e+00, 0.0000e+00, 2.0000e+00, 0.0000e+00, 1.0000e+00, 2.7167e+04],
        [1.0000e+00, 0.0000e+00, 0.0000e+00, 6.3100e+02, 1.0000e+00, 3.3000e+01,
         8.0000e+00, 0.0000e+00, 2.0000e+00, 0.0000e+00, 0.0000e+00, 1.1737e+05]]), tensor([[0.],
        [0.]])]
