In [1]:
# predicting health outcomes using unlabeled health variables from https://www.kaggle.com/code/gusthema/identifying-age-related-conditions-w-tfdf/


In [14]:
# import libraries

import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt
import scipy as sp


import torch
from torch.utils.data import Dataset, DataLoader
import torch.nn as nn

In [15]:
# load data
rootPath='/Users/Torben/Code/2023Projects/kaggle/ICR/'

dataset_df = pd.read_csv(rootPath+'train.csv')
test_df = pd.read_csv(rootPath+'test.csv')

print("Full train dataset shape is {}".format(dataset_df.shape))
print("Full test dataset shape is {}".format(test_df.shape))

Full train dataset shape is (617, 58)
Full test dataset shape is (5, 57)


In [16]:
# organize data

NUM_FEATURE_COLUMNS = [i for i in dataset_df.columns if i not in ["Id", "EJ", "Class"]]
FEATURE_COLUMNS = [i for i in dataset_df.columns if i not in ["Id"]]
FEATURE_COLUMNS_NO_CLASS = [i for i in dataset_df.columns if i not in ["Id","Class"]]

TEST_FEATURE_COLUMNS = [i for i in test_df.columns if i not in ["Id"]]
TEST_FEATURE_COLUMNS_NO_CLASS = [i for i in test_df.columns if i not in ["Id","Class"]]

nFeatures=len(FEATURE_COLUMNS)-1

# remove index 509 because it looks like an incorrect label
dataset_df.drop(509)

# Create list of ids
ID_LIST = dataset_df.index
TEST_ID_LIST = test_df.index

# Create a dataframe of required size with zero values.
oof = pd.DataFrame(data=np.zeros((len(ID_LIST),1)), index=ID_LIST)

# Save the name of the label column to a variable.
label = "Class"


In [17]:
# clean and normalize data

def clean_and_fill(dataset_df, FEATURE_COLUMNS, ID_LIST):
    # changes categorical variables to numeric and sets nans to 0
    for i in range(len(FEATURE_COLUMNS)):
        dataset_df.loc[ dataset_df[FEATURE_COLUMNS[39]] == 'A', FEATURE_COLUMNS[39]] = 0
        dataset_df.loc[ dataset_df[FEATURE_COLUMNS[39]] == 'B', FEATURE_COLUMNS[39]] = 1

    # fill nans
    filled_dataset_df=dataset_df.fillna(0)
    
    return filled_dataset_df


def get_norm_vals(filled_dataset_df, FEATURE_COLUMNS, ID_LIST):
    # returns values for mean and std of each column to normalize test data
    param_df = pd.DataFrame(data=np.zeros((2,len(FEATURE_COLUMNS))), index=['mean','std'],columns=FEATURE_COLUMNS)

    for i in range(len(FEATURE_COLUMNS)):
        param_df[FEATURE_COLUMNS[i]]['mean']=np.mean(filled_dataset_df[FEATURE_COLUMNS[i]])
        param_df[FEATURE_COLUMNS[i]]['std']=np.std(filled_dataset_df[FEATURE_COLUMNS[i]])

    return param_df
    
def norm_dataset(filled_dataset_df,FEATURE_COLUMNS, ID_LIST, param_df):
    # applys normalization parameters to a dataset

    normed_df = filled_dataset_df.copy(deep=False)
    # # normalize
    for i in range(len(FEATURE_COLUMNS)):

        if FEATURE_COLUMNS[i] !='Class':
            normed = (filled_dataset_df[FEATURE_COLUMNS[i]]-param_df[FEATURE_COLUMNS[i]]['mean']) / param_df[FEATURE_COLUMNS[i]]['std']
            normed_df.loc[ID_LIST,FEATURE_COLUMNS[i]]=normed
        else:
            normed_df.loc[ID_LIST,FEATURE_COLUMNS[i]]=filled_dataset_df.loc[ID_LIST,FEATURE_COLUMNS[i]]

    return normed_df

In [18]:
filled_dataset_df = clean_and_fill(dataset_df,FEATURE_COLUMNS, ID_LIST)
param_df = get_norm_vals(filled_dataset_df,FEATURE_COLUMNS, ID_LIST)
normed_dataset_df = norm_dataset(filled_dataset_df,FEATURE_COLUMNS, ID_LIST,param_df)

filled_test_df = clean_and_fill(test_df,TEST_FEATURE_COLUMNS,TEST_ID_LIST)
normed_test_df = norm_dataset(filled_test_df,TEST_FEATURE_COLUMNS,TEST_ID_LIST,param_df)

In [19]:
X=normed_dataset_df[FEATURE_COLUMNS_NO_CLASS].astype(np.float32).values
y=normed_dataset_df['Class'].to_numpy()

In [20]:
class Data(Dataset):
  def __init__(self, X_train, y_train):
    # need to convert float64 to float32 else 
    # will get the following error
    # RuntimeError: expected scalar type Double but found Float
    self.X = torch.from_numpy(X_train.astype(np.float32))
    # need to convert float64 to Long else 
    # will get the following error
    # RuntimeError: expected scalar type Long but found Float
    self.y = torch.from_numpy(y_train).type(torch.LongTensor)
    self.len = self.X.shape[0]
  
  def __getitem__(self, index):
    return self.X[index], self.y[index]
  def __len__(self):
    return self.len


In [21]:
traindata = Data(X, y)

In [36]:
batch_size = 4
trainloader = DataLoader(traindata, batch_size=batch_size, 
                         shuffle=True, num_workers=0)

In [37]:
input_dim = X.shape[1]
# number of hidden layers
hidden_layers = 25
# number of classes (unique of y)
output_dim = 2

class Network(nn.Module):
  def __init__(self):
    super(Network, self).__init__()
    self.linear1 = nn.Linear(input_dim, hidden_layers)
    self.linear2 = nn.Linear(hidden_layers, output_dim)
  def forward(self, x):
    x = torch.sigmoid(self.linear1(x))
    x = self.linear2(x)
    return x

In [38]:
clf = Network()

In [39]:
criterion = nn.CrossEntropyLoss()
optimizer = torch.optim.SGD(clf.parameters(), lr=0.1)

In [40]:
epochs = 10
for epoch in range(epochs):
  running_loss = 0.0
  for i, data in enumerate(trainloader, 0):
    inputs, labels = data
    # set optimizer to zero grad to remove previous epoch gradients
    optimizer.zero_grad()
    # forward propagation
    outputs = clf(inputs)
    loss = criterion(outputs, labels)
    # backward propagation
    loss.backward()
    # optimize
    optimizer.step()
    running_loss += loss.item()
  # display statistics
  print(f'[{epoch + 1}, {i + 1:5d}] loss: {running_loss / 2000:.5f}')

[1,   155] loss: 0.03323
[2,   155] loss: 0.02383
[3,   155] loss: 0.01954
[4,   155] loss: 0.01746
[5,   155] loss: 0.01650
[6,   155] loss: 0.01598
[7,   155] loss: 0.01462
[8,   155] loss: 0.01414
[9,   155] loss: 0.01322
[10,   155] loss: 0.01292


In [47]:
outputs = clf(traindata[0][0])

In [48]:
outputs

tensor([-0.6541,  0.2596], grad_fn=<AddBackward0>)