In [1]:
from comet_ml import Experiment, Optimizer

In [2]:
import pandas as pd
import numpy as np
from sklearn.preprocessing import LabelEncoder, StandardScaler
from sklearn.model_selection import train_test_split
from collections import defaultdict
from tqdm import trange
import torch
import seaborn as sns
import os

torch.set_default_dtype(torch.float32)

In [3]:
from tabular_hypernet import Hypernetwork
from tabular_hypernet.training_utils import train_slow_step, train_model
from tabular_hypernet.hypernetwork import SklearnInterface

In [4]:
DEVICE = 'cuda:1'

## Load data

In [5]:
data = pd.read_csv("../data/Blastchar/churn.csv")

In [6]:
def show_fractions(arr):
    for cls_ in sorted(pd.unique(arr)):
        print(f"{cls_} samples: {(arr==cls_).sum()} ({(arr==cls_).sum()/len(arr)*100:.1f}%)")

processed_data = data.copy()
del processed_data["customerID"]

y_label_enc = LabelEncoder()
y = y_label_enc.fit_transform(processed_data["Churn"].values)
del processed_data["Churn"]

show_fractions(y)

0 samples: 5174 (73.5%)
1 samples: 1869 (26.5%)


In [7]:
label_encoders = {}

for col in processed_data.columns:
    if processed_data[col].dtype == 'object':
        encoder = LabelEncoder()
        processed_data[col] = encoder.fit_transform(processed_data[col].values)
        label_encoders[col] = encoder
        
X = processed_data.values
print(X.shape)

(7043, 19)


### Split it into train and test set

In [8]:
class GenericDataset(torch.utils.data.IterableDataset):
    def __init__(self, data, shuffle: bool=False, samples_no: int=None):
        samples = samples_no or len(data[0])
        self.indices = np.arange(samples)
        self.shuffle = True
        if shuffle:
            self.indices = np.random.permutation(self.indices)
        self.index = 0
        self.max_samples = samples
        self.data_x = data[0].to(torch.float32)
        self.data_y = data[1]

    def __iter__(self):
        if self.shuffle:
            self.indices = np.random.permutation(self.indices)
        while self.index < self.max_samples:
            _idx = self.indices[self.index]
            yield self.data_x[_idx], self.data_y[_idx]
            self.index += 1
    
    def __len__(self):
        return self.data_x.shape[0]
    
def get_dataloader(X, y, size=None, batch_size=32):
    train_dataset = GenericDataset((X, y), size)
    trainloader = torch.utils.data.DataLoader(train_dataset, batch_size=batch_size, num_workers=1)
    
    return trainloader

In [9]:
X_train, X_test, y_train, y_test = train_test_split(X, y)

print("Training data:")
show_fractions(y_train)

print("Test data:")
show_fractions(y_test)

Training data:
0 samples: 3873 (73.3%)
1 samples: 1409 (26.7%)
Test data:
0 samples: 1301 (73.9%)
1 samples: 460 (26.1%)


## Preprocess

In [10]:
scaler = StandardScaler()
X_train = scaler.fit_transform(X_train)
X_test = scaler.transform(X_test)

In [11]:
X_train, X_test, y_train, y_test = [torch.from_numpy(x) for x in [X_train, X_test, y_train, y_test]]

## Benchmark

In [12]:
criterion = torch.nn.CrossEntropyLoss()

def _summarize_results(y_pred, y_score, y_test, labels):
    results = []
    for idx, label in enumerate(labels):
        y_pred_filt = y_pred[y_test==idx]
        y_test_filt = y_test[y_test==idx]
        acc = (y_pred_filt==y_test_filt.numpy()).sum()/len(y_test_filt)*100
        results.append({
            "Class": label,
            "Accuracy": acc
        })
        
    acc = (y_pred==y_test.numpy()).sum()/len(y_test)*100    
    results.append({
        "Class": "Total",
        "Accuracy": acc
    })
    results.append({
        "Class": "Loss",
        "Accuracy": criterion(torch.from_numpy(y_score), y_test).item()
    })
    return results


def test_model(model_fn, train_data, test_data, label_encoder=None, iters=10):
    X_train, y_train = train_data
    X_test, y_test = test_data
    if label_encoder is not None:
        labels = label_encoder.classes_
    else:
        labels = sorted(pd.unique(test_data))
    
    results = []

    for i in trange(iters):
        model = model_fn()

        model.fit(X_train, y_train);    
        y_pred = model.predict(X_test)
        y_score = model.predict_proba(X_test)
        results.extend(_summarize_results(y_pred, y_score, y_test, labels))

    dframe = pd.DataFrame.from_dict(results)
    sns.boxplot(data=dframe.iloc[:-1], y="Class", x="Accuracy", orient='h')
    return dframe

### XGBoost

In [13]:
from xgboost import XGBClassifier

In [14]:
# xgb_dframe = test_model(lambda: XGBClassifier(verbosity=0, use_label_encoder=False), 
#                         (X_train, y_train), 
#                         (X_test, y_test),
#                         label_encoder=y_label_enc, iters=1)

In [15]:
# xgb_dframe

## Hypernetwork

In [30]:
def network_fn(mask_size, masks_no):
    def _inner():
        network = Hypernetwork(inp_size=X_train.shape[1], 
                            out_size=y.max().item()+1, 
                            mask_size=mask_size,
                            layers=[128, 128, 128],
                            node_hidden_size=100, 
                            test_nodes=masks_no, device=DEVICE).to(DEVICE)

        network = SklearnInterface(network, device=DEVICE, epochs=200, batch_size=32)
        return network
    return _inner

In [31]:
trainloader, testloader = get_dataloader(X_train, y_train), get_dataloader(X_test, y_test)

In [None]:
hypernet_res = []
for mask_size in [15, 10, 5]:
    for masks_no in [20, 15, 5, 50]:
        exp = Experiment(os.environ.get("COMET_KEY"), "blastchar-hypernet")
        exp.log_parameters({mask_size: mask_size, masks_no: masks_no})
        nn_results = test_model(network_fn(mask_size, masks_no),
                                (X_train, y_train), 
                                (X_test, y_test), 
                                y_label_enc, 1)
        nn_results["MasksNo"] = masks_no
        nn_results["MaskSize"] = mask_size
        exp.log_dataframe_profile(nn_results)
        
        plt.title(f"Masks: {masks_no}, mask size: {mask_size}")
        hypernet_res.append(nn_results)
        break
    break

COMET INFO: ---------------------------
COMET INFO: Comet.ml Experiment Summary
COMET INFO: ---------------------------
COMET INFO:   Data:
COMET INFO:     display_summary_level : 1
COMET INFO:     url                   : https://www.comet.ml/wwydmanski/blastchar-hypernet/42f5b6b07e8a440b854947aab1527771
COMET INFO:   Metrics [count] (min, max):
COMET INFO:     loss [8400] : (0.28790003061294556, 0.736473798751831)
COMET INFO:   Parameters:
COMET INFO:     5 : 5
COMET INFO:   Uploads:
COMET INFO:     dataframe                : 1 (1.17 KB)
COMET INFO:     environment details      : 1
COMET INFO:     filename                 : 1
COMET INFO:     git metadata             : 1
COMET INFO:     git-patch (uncompressed) : 1 (24.90 KB)
COMET INFO:     installed packages       : 1
COMET INFO:     model graph              : 1
COMET INFO:     notebook                 : 1
COMET INFO:     os packages              : 1
COMET INFO:     source_code              : 1
COMET INFO: ---------------------------

In [28]:
nn_results[nn_results["Class"]=="Total"]

Unnamed: 0,Class,Accuracy
2,Total,73.197047
6,Total,72.45883
10,Total,71.663827
14,Total,72.856332
18,Total,72.174901
22,Total,70.868825
26,Total,72.799546
30,Total,72.004543
34,Total,74.105622
38,Total,71.720613
