##### The cell below is for you to keep track of the libraries used and install those libraries quickly
##### Ensure that the proper library names are used and the syntax of `%pip install PACKAGE_NAME` is followed

In [61]:
#%pip install pandas 
#%pip install matplotlib
# add commented pip installation lines for packages used as shown above for ease of testing
# the line should be of the format %pip install PACKAGE_NAME 

## **DO NOT CHANGE** the filepath variable
##### Instead, create a folder named 'data' in your current working directory and 
##### have the .parquet file inside that. A relative path *must* be used when loading data into pandas

In [62]:
# Can have as many cells as you want for code
import pandas as pd
filepath = "./data/catB_train.parquet" 
# the initialised filepath MUST be a relative path to a folder named data that contains the parquet file

### **ALL** Code for machine learning and dataset analysis should be entered below. 
##### Ensure that your code is clear and readable.
##### Comments and Markdown notes are advised to direct attention to pieces of code you deem useful.

In [63]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler, OneHotEncoder
from sklearn.compose import ColumnTransformer
from sklearn.impute import SimpleImputer
import torch
from torch.utils.data import DataLoader, TensorDataset
import torch.nn as nn
import torch.optim as optim
from datetime import datetime
import numpy as np
from sklearn.decomposition import PCA
from sklearn.preprocessing import StandardScaler
import pandas as pd

# Load data
df = pd.read_parquet(filepath)

col_to_delete = ["clntnum", "hh_size_est_0", "hh_size_est_1", "hh_size_est_2", "hh_size_est_3", "hh_size_est_4", "hh_size_est_>4"]
col_to_classify = ["race_desc", "ctrycode_desc", "clttype", "stat_flag", "cltsex_fix", "annual_income_est"]
col_to_convert_to_year = ["min_occ_date", "cltdob_fix"]
col_to_target = ['f_purchase_lh']

for col in col_to_target:
    df[col] = df[col].fillna(0)

special_cols = set(col_to_delete + col_to_classify + col_to_convert_to_year + col_to_target)

for col in col_to_delete:
    if col in df.columns:
        df = df.drop(col, axis=1)

def convert_dates(df, date_cols):
    reference_date = datetime.now()
    for col in date_cols:
        df[col] = pd.to_datetime(df[col], errors='coerce')
        df[col] = (reference_date - df[col]).dt.days
    return df

df = convert_dates(df, col_to_convert_to_year)

# change value in hh_size_est to numeric
def transform_value(value):
    if value == '>4':
        return 5
    else:
        return pd.to_numeric(value, errors='coerce')


# label_encoder = LabelEncoder()

# for col in col_to_classify:
#     df[col] = label_encoder.fit_transform(df[col])

nan_threshold = 0.5

for col in df.columns:
    if col not in special_cols:
        if df[col].isna().sum() / len(df) > nan_threshold:
            df.drop(col, axis=1, inplace=True)
        elif df[col].nunique() == 1:
            df.drop(col, axis=1, inplace=True)

data = df 
data.style
print(data.shape)

X = data.drop(['f_purchase_lh'], axis=1)
y = data['f_purchase_lh']

categorical_cols = X.select_dtypes(include=['object', 'category']).columns
numerical_cols = X.select_dtypes(include=['int64', 'float64']).columns

preprocessor = ColumnTransformer( 
    transformers=[
        ('num', SimpleImputer(strategy='mean'), numerical_cols),
        ('cat', OneHotEncoder(handle_unknown='ignore'), categorical_cols)
    ])

X = preprocessor.fit_transform(X)

print(X.shape)

scaler = StandardScaler(with_mean=False)
X = scaler.fit_transform(X).todense()
X = np.asarray(X)
print(X.shape)


pca = PCA(n_components=50)
X = pca.fit_transform(X)

print(X.shape)

X_tensor = torch.tensor(X.astype(np.float32))
y_tensor = torch.tensor(y.values.astype(np.float32))

X_train, X_test, y_train, y_test = train_test_split(X_tensor, y_tensor, test_size=0.2, random_state=42)

(17992, 171)
(17992, 37256)
(17992, 37256)
(17992, 50)


In [64]:
print(X_train.shape, X_test.shape, y_train.shape, y_test.shape)
print(X_train)
print(y_train)
print(X_test)
print(y_test)

torch.Size([14393, 50]) torch.Size([3599, 50]) torch.Size([14393]) torch.Size([3599])
tensor([[ -1.7033,   1.0124,   2.3555,  ...,   1.1050,   0.3855,   0.5771],
        [ -2.5979,   1.2807,   2.9547,  ...,   0.3847,   0.0950,   0.5005],
        [ -1.9386,   0.5229,   1.4258,  ...,   0.1156,   0.3536,  -0.0458],
        ...,
        [ -2.7158,   1.2625,   2.9351,  ...,   0.5005,   0.0799,   0.0535],
        [ 20.3472,  26.0633, -10.2620,  ...,   4.9146,   4.6629,  -4.7992],
        [  2.8267,  -1.4507,  -2.2684,  ...,   1.4302,   1.0373,  -0.9946]])
tensor([1., 0., 0.,  ..., 0., 1., 0.])
tensor([[  5.2431,  -0.7719,  -8.7954,  ..., -19.2601,  -7.3481,   3.1085],
        [ -2.2608,  -0.4303,  -1.3652,  ...,  -0.7003,  -0.4196,  -0.3793],
        [  1.3416,   0.5008,   1.0997,  ...,  -0.5526,  -0.8995,   2.3992],
        ...,
        [ -2.2048,   1.1949,   3.0849,  ...,   0.5312,  -0.0786,   0.1920],
        [  4.4574,  -1.0873,  -1.5389,  ...,   0.1598,   0.5004,  -0.3446],
        [  2

In [65]:
import torch.nn.functional as F
class NeuralNetwork(nn.Module):
    def __init__(self):
        super(NeuralNetwork, self).__init__()
        self.fc1 = nn.Linear(X_train.shape[1], 128)
        self.fc2 = nn.Linear(128, 64)
        self.fc3 = nn.Linear(64, 1)

    def forward(self, x):
        x = F.relu(self.fc1(x))
        x = F.relu(self.fc2(x))
        x = torch.sigmoid(self.fc3(x))
        return x

In [66]:
class NeuralNetwork(nn.Module):
    def __init__(self):
        super(NeuralNetwork, self).__init__()
        self.layer1 = nn.Linear(X_train.shape[1], 256)
        self.relu1 = nn.ReLU()
        self.dropout1 = nn.Dropout(0.5)
        self.layer2 = nn.Linear(256, 128)
        self.relu2 = nn.ReLU()
        self.dropout2 = nn.Dropout(0.5)
        self.layer3 = nn.Linear(128, 64)
        self.relu3 = nn.ReLU()
        self.output_layer = nn.Linear(64, 1)
        self.sigmoid = nn.Sigmoid()

    def forward(self, x):
        x = self.dropout1(self.relu1(self.layer1(x)))
        x = self.dropout2(self.relu2(self.layer2(x)))
        x = self.relu3(self.layer3(x))
        x = self.sigmoid(self.output_layer(x))
        return x

model = NeuralNetwork()


In [67]:
class_counts = data['f_purchase_lh'].value_counts().sort_index().values
total_samples = class_counts.sum()
weights = total_samples / torch.tensor(class_counts, dtype=torch.float32)

# If using GPU
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
weights = weights.to(device)
model.to(device)

# Modify the BCELoss to handle weights for each batch
def weighted_bce_loss(outputs, targets):
    loss = nn.BCELoss(reduction='none')(outputs, targets)
    weighted_loss = loss * weights[targets.long()]
    return weighted_loss.mean()

optimizer = optim.Adam(model.parameters(), lr=0.00005)

train_data = TensorDataset(X_train, y_train)
train_loader = DataLoader(train_data, batch_size=64, shuffle=True)

for epoch in range(30): 
    for inputs, labels in train_loader:
        inputs, labels = inputs.to(device), labels.to(device)

        optimizer.zero_grad()
        outputs = model(inputs)
        loss = weighted_bce_loss(outputs, labels.unsqueeze(1))
        loss.backward()
        optimizer.step()

    print(f"Epoch {epoch+1}, Loss: {loss.item()}")


Epoch 1, Loss: 1.0655083656311035
Epoch 2, Loss: 1.7683472633361816
Epoch 3, Loss: 1.3604263067245483
Epoch 4, Loss: 1.58438241481781
Epoch 5, Loss: 0.9882317781448364
Epoch 6, Loss: 1.0103141069412231
Epoch 7, Loss: 1.8322023153305054
Epoch 8, Loss: 1.1086745262145996
Epoch 9, Loss: 0.6727161407470703
Epoch 10, Loss: 0.9953071475028992
Epoch 11, Loss: 1.2598339319229126
Epoch 12, Loss: 1.4411629438400269
Epoch 13, Loss: 1.8651655912399292
Epoch 14, Loss: 1.6053541898727417
Epoch 15, Loss: 1.9625320434570312
Epoch 16, Loss: 0.8541809916496277
Epoch 17, Loss: 1.7547931671142578
Epoch 18, Loss: 1.0922832489013672
Epoch 19, Loss: 1.5468279123306274
Epoch 20, Loss: 1.0187641382217407
Epoch 21, Loss: 0.9219175577163696
Epoch 22, Loss: 1.3447502851486206
Epoch 23, Loss: 0.6680570840835571
Epoch 24, Loss: 0.8807213306427002
Epoch 25, Loss: 0.6638408303260803
Epoch 26, Loss: 1.7277990579605103
Epoch 27, Loss: 1.0748413801193237
Epoch 28, Loss: 1.421732783317566
Epoch 29, Loss: 1.14484179019927

In [68]:
test_data = TensorDataset(X_test, y_test)
test_loader = DataLoader(test_data, batch_size=64, shuffle=False)

model.eval()
with torch.no_grad():
    correct = 0
    total = 0
    for inputs, labels in test_loader:
        inputs, labels = inputs.to(device), labels.to(device)
        outputs = model(inputs)
        predicted = (outputs > 0.5).float()
        total += labels.size(0)
        correct += (predicted.squeeze() == labels).sum().item()

print(f'Accuracy: {100 * correct / total}%')


Accuracy: 68.4912475687691%


In [69]:
from sklearn.metrics import precision_score, recall_score, f1_score

# Function to calculate precision, recall, and F1 score
def calculate_metrics(y_true, y_pred):
    precision = precision_score(y_true, y_pred)
    recall = recall_score(y_true, y_pred)
    f1 = f1_score(y_true, y_pred)
    return precision, recall, f1

# Convert test data to DataLoader
test_data = TensorDataset(X_test, y_test)
test_loader = DataLoader(test_data, batch_size=64, shuffle=False)

# Evaluate the model
model.eval()
y_pred = []
y_true = []
with torch.no_grad():
    for inputs, labels in test_loader:
        inputs, labels = inputs.to(device), labels.to(device)
        outputs = model(inputs)
        predicted = (outputs > 0.5).float()
        y_pred.extend(predicted.squeeze().tolist())
        y_true.extend(labels.tolist())

# Calculate metrics
precision, recall, f1 = calculate_metrics(np.array(y_true), np.array(y_pred))
print(f'Precision: {precision}')
print(f'Recall: {recall}')
print(f'F1 Score: {f1}')


Precision: 0.07372072853425846
Recall: 0.5629139072847682
F1 Score: 0.1303680981595092


## The cell below is **NOT** to be removed
##### The function is to be amended so that it accepts the given input (dataframe) and returns the required output (list). 
##### It is recommended to test the function out prior to submission
-------------------------------------------------------------------------------------------------------------------------------
##### The hidden_data parsed into the function below will have the same layout columns wise as the dataset *SENT* to you
##### Thus, ensure that steps taken to modify the initial dataset to fit into the model are also carried out in the function below

In [70]:
def testing_hidden_data(hidden_data: pd.DataFrame) -> list:
    '''DO NOT REMOVE THIS FUNCTION.

The function accepts a dataframe as input and return an iterable (list)
of binary classes as output.

The function should be coded to test on hidden data
and should include any preprocessing functions needed for your model to perform. 
    
All relevant code MUST be included in this function.'''
    result = [] 
    return result

##### Cell to check testing_hidden_data function

In [71]:
# This cell should output a list of predictions.
test_df = pd.read_parquet(filepath)
test_df = test_df.drop(columns=["f_purchase_lh"])
print(testing_hidden_data(test_df))

[]


### Please have the filename renamed and ensure that it can be run with the requirements above being met. All the best!