<a href="https://colab.research.google.com/github/usermar445/aml_final_project/blob/main/scripts/aml_pipeline.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
# Mount Google Drive (Colab only); the data files read further down live under /content/drive.
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


In [None]:
!pip install tabpfn

Collecting tabpfn
  Downloading tabpfn-0.1.9-py3-none-any.whl (156 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m156.6/156.6 kB[0m [31m1.4 MB/s[0m eta [36m0:00:00[0m
Installing collected packages: tabpfn
Successfully installed tabpfn-0.1.9


In [None]:
import pandas as pd
import matplotlib.pyplot as plt
import numpy as np
from matplotlib.colors import ListedColormap

from sklearn.datasets import make_circles, make_classification, make_moons
from sklearn.discriminant_analysis import QuadraticDiscriminantAnalysis
from sklearn.ensemble import AdaBoostClassifier, RandomForestClassifier
from sklearn.gaussian_process import GaussianProcessClassifier
from sklearn.gaussian_process.kernels import RBF
from sklearn.inspection import DecisionBoundaryDisplay
from sklearn.model_selection import train_test_split
from sklearn.naive_bayes import GaussianNB
from sklearn.neighbors import KNeighborsClassifier
from sklearn.neural_network import MLPClassifier
from sklearn.pipeline import make_pipeline
from sklearn.preprocessing import StandardScaler
from sklearn.svm import SVC
from sklearn.tree import DecisionTreeClassifier
from sklearn.metrics import confusion_matrix
from sklearn.metrics import multilabel_confusion_matrix
import seaborn as sns
from sklearn.preprocessing import Normalizer
from sklearn.metrics import accuracy_score
from sklearn.linear_model import LogisticRegression

import lightgbm as lgb

from sklearn.preprocessing import Normalizer
from sklearn.decomposition import PCA
from sklearn.impute import SimpleImputer


In [None]:

from tabpfn import TabPFNClassifier


# 0. Functions

**Tree data functions**

In [None]:
# reverses the one-hot encoding of wilderness area and soil type in the tree data set (one integer-coded column each)

# has to be executed on the original df
def re_one_hot_encode_soiltype(data, drop=True):
  """Collapse the one-hot soil-type columns into a single integer-coded column.

  The soil indicator columns are selected by position (columns 14..53), so the
  frame must still have its original column layout.

  Args:
    data: DataFrame whose columns 14..53 are the soil-type indicator columns.
    drop: If True, remove the indicator columns from the result.

  Returns:
    A new DataFrame with a 'soil_type' column holding the codes 1..k, numbered
    in order of first appearance. The input frame is not modified.
  """
  # Work on a copy: writing 'soil_type' into the caller's frame leaked the new
  # column into the one-hot version of the data set.
  data = data.copy()
  soil = data.iloc[:, 14:54]
  soil_columns = soil.columns.to_list()
  # name of the indicator column holding the row maximum, i.e. the active soil type
  active_soil = soil.idxmax(axis=1)
  # factorize numbers the labels 0..k-1 in order of first appearance
  codes, _ = pd.factorize(active_soil)
  data['soil_type'] = codes + 1
  if drop:
    data = data.drop(columns=soil_columns)
  return data

def re_one_hot_encode_wilderness(data, drop=True):
  """Collapse the four Wilderness_Area indicator columns into one integer-coded column.

  Args:
    data: DataFrame containing 'Wilderness_Area1' .. 'Wilderness_Area4'.
    drop: If True, remove the four indicator columns from the result.

  Returns:
    A new DataFrame with a 'wilderness_area' column holding the codes 1..k,
    numbered in order of first appearance. The input frame is not modified.
  """
  wilderness_columns = ['Wilderness_Area1', 'Wilderness_Area2', 'Wilderness_Area3', 'Wilderness_Area4']
  # Work on a copy so the caller's frame does not silently gain a column.
  data = data.copy()
  # name of the indicator column holding the row maximum, i.e. the active area
  active_area = data[wilderness_columns].idxmax(axis=1)
  # factorize numbers the labels 0..k-1 in order of first appearance
  codes, _ = pd.factorize(active_area)
  data['wilderness_area'] = codes + 1
  if drop:
    data = data.drop(wilderness_columns, axis=1)
  return data

def re_one_hot_encode_categorical(data, drop=True):
  """Apply the soil-type conversion first, then the wilderness-area conversion.

  `drop` is forwarded unchanged to both helper functions.
  """
  soil_converted = re_one_hot_encode_soiltype(data, drop)
  fully_converted = re_one_hot_encode_wilderness(soil_converted, drop)
  return fully_converted

# helper method to split in features and labels
def get_features_labels_tree(data, values=False):
  """Split the tree data into features and the 'Cover_Type' target.

  Returns a two-element list [X, y]; with values=True both entries are the
  underlying `.values` arrays instead of pandas objects.
  """
  target = data['Cover_Type']
  features = data.drop(columns='Cover_Type')
  if not values:
    return [features, target]
  return [features.values, target.values]

**Heloc functions**

In [None]:
# helper function to determine indexes of rows containing only na values
def get_indexes_nan_rows(data):
  """Return the index labels of all rows in which every value is missing."""
  row_is_empty = data.isna().all(axis=1)
  return row_is_empty.index[row_is_empty.to_numpy(dtype=bool)]

# returns the fraction (0..1) of missing values per column
def get_missing_values_per_columns(data, columns, threshold):
  """Compute the share of missing values for the given columns.

  Args:
    data: DataFrame to inspect.
    columns: Iterable of column names to check.
    threshold: Columns whose missing share is >= this value are reported.

  Returns:
    A two-element list [missing, problematic]: `missing` maps each column name
    to its fraction of missing values, `problematic` lists the columns at or
    above the threshold (in the order given).
  """
  missing = {}
  problematic = []
  n_rows = len(data)
  for column in columns:
    # isna() also works for non-numeric columns, where np.isnan raises TypeError
    n_missing = int(data[column].isna().sum())
    # an empty frame has no missing values; avoids a ZeroDivisionError
    missing_values = n_missing / n_rows if n_rows else 0.0
    missing[column] = missing_values
    if missing_values >= threshold:
      problematic.append(column)
  return [missing, problematic]

#imputes missing values
#because most of the features are very skewed, median seems more reasonable
def impute_heloc(data):
  """Replace NaN entries using a median SimpleImputer fitted on the same data.

  The imputed array is wrapped in a DataFrame again, reusing the column labels
  and the index of the input frame.
  """
  median_imputer = SimpleImputer(missing_values=np.nan, strategy='median')
  filled = median_imputer.fit(data.values).transform(data.values)
  return pd.DataFrame(filled, columns=data.columns).set_index(data.index)

# wrapper function that prepares complete data set
def prepare_data_heloc_train(data, drop_columns):
  """Prepare the HELOC training data for modelling.

  Steps: recode 'RiskPerformance' ('Bad' -> 0, 'Good' -> 1), drop
  `drop_columns`, treat negative feature values as missing, remove rows whose
  features are all missing, and impute the remaining gaps via impute_heloc.

  Args:
    data: Raw HELOC training DataFrame including 'RiskPerformance'.
    drop_columns: Column names to remove (determined in the EDA).

  Returns:
    A new DataFrame with the imputed features joined with 'RiskPerformance'.
    The input frame is not modified.
  """
  print("Inital dimensions:", data.values.shape)
  print("Recode labels")
  # replace labels with int values (because final submission needs to be 0, 1);
  # assign() builds a new frame, so the caller's data keeps its original labels
  data = data.assign(RiskPerformance=data['RiskPerformance'].replace(('Bad','Good'), (0,1)))
  # columns were determined in EDA
  print("Drop columns")
  data = data.drop(drop_columns, axis=1)
  print("New dimensions:", data.values.shape)
  print("Split in features and labels")
  # split first: rows that consist only of NaN are excluded from training, so
  # features and labels have to be filtered with the same row index
  dfX = data.drop("RiskPerformance", axis=1)
  dfy = pd.DataFrame(data.loc[:, 'RiskPerformance'])
  print("Replace nas")
  # negative feature values are treated as missing
  dfX[dfX<0] = np.nan
  # index labels of rows without any usable feature value
  na_indexes = get_indexes_nan_rows(dfX)
  dfX = dfX.drop(na_indexes)
  dfy = dfy.drop(na_indexes)
  print("Impute nas")
  dfX = impute_heloc(dfX)
  return dfX.join(dfy)

# helper method to split in features and labels
def get_features_labels_heloc(data, values=False):
  """Split the HELOC data into features and the 'RiskPerformance' target.

  Returns a two-element list [X, y]; with values=True both entries are the
  underlying `.values` arrays instead of pandas objects.
  """
  target = data['RiskPerformance']
  features = data.drop(columns='RiskPerformance')
  if not values:
    return [features, target]
  return [features.values, target.values]

**Higgs functions**

In [None]:

# cleans and prepares data
def clean_higgs_data(data, drop_columns, train=True):
  """Recode the Higgs labels (training data only) and drop unwanted columns.

  Args:
    data: Raw Higgs DataFrame; must contain 'Label' when train is True.
    drop_columns: Column names to remove.
    train: If True, recode 'Label' from 'b'/'s' to 0/1.

  Returns:
    A new DataFrame without `drop_columns`. The input frame is not modified.
  """
  print("Inital dimensions:", data.values.shape)
  if train:
    # replace labels with int values (because final submission needs to be 0, 1);
    # assign() builds a new frame, so the caller's data keeps its original labels
    data = data.assign(Label=data['Label'].replace(('b','s'), (0,1)))
  #drop columns
  data = data.drop(drop_columns, axis=1)
  print("New dimensions:", data.values.shape)
  return data

# helper function to get features and labels split
def get_features_labels_higgs(data, values=False):
  """Split the Higgs data into features and the 'Label' target.

  Returns a two-element list [X, y]; with values=True both entries are the
  underlying `.values` arrays instead of pandas objects.
  """
  target = data['Label']
  features = data.drop(columns='Label')
  if not values:
    return [features, target]
  return [features.values, target.values]

**Prepare data**

In [None]:
def make_datasets_ready(tree, heloc, drop_columns_heloc, higgs, drop_columns_higgs):
  """Prepare all data sets used in the comparison.

  Args:
    tree: Raw tree (cover type) training frame; returned unchanged as the
      one-hot version.
    heloc: Raw HELOC training frame.
    drop_columns_heloc: Columns to drop from the HELOC data.
    higgs: Raw Higgs training frame.
    drop_columns_higgs: Columns to drop from the Higgs data.

  Returns:
    [tree, heloc_train, higgs_train, tree_lgb], where tree_lgb is the tree data
    with soil type and wilderness area collapsed to one column each.
  """
  # two versions of tree data: one with one-hot-encoding one with categorical (lgb)
  print("Preparing tree data...")
  print("Inital dimensions ", tree.values.shape)
  print("New dimensions ", tree.values.shape)
  print("Preparing tree data 2nd version...")
  print("Inital dimensions ", tree.values.shape)
  # The helpers receive copies so the raw frames stay untouched: otherwise the
  # one-hot tree version picks up the extra 'soil_type' column, and re-running
  # the cell starts from already modified data.
  # (named tree_lgb, not lgb, to avoid shadowing the lightgbm module alias)
  tree_lgb = re_one_hot_encode_categorical(tree.copy())
  print("New dimensions ", tree_lgb.values.shape)
  print("Preparing heloc data...")
  heloc_train = prepare_data_heloc_train(heloc.copy(), drop_columns_heloc)
  print("Preparing higgs data...")
  higgs_train = clean_higgs_data(higgs.copy(), drop_columns_higgs)
  return [tree, heloc_train, higgs_train, tree_lgb]

In [None]:
def get_training_ready_data(data):
  """Turn the prepared data sets into train/validation arrays.

  `data` is indexed as [tree, heloc, higgs, tree (categorical version)]. Each
  data set is split into features and labels, converted to arrays and split
  80/20 with random_state=42. Some control numbers are printed on the way.

  Returns:
    (X_train, X_test, y_train, y_test, column_names); every element is a list
    with one entry per data set, column_names holding the feature names.
  """
  # one feature/label splitter per data set, in the order of `data`
  splitters = [get_features_labels_tree, get_features_labels_heloc,
               get_features_labels_higgs, get_features_labels_tree]
  frames = [data[0], data[1], data[2], data[3]]

  # feature names come from the DataFrame form of the features (used for LightGBM)
  column_names = [split(frame)[0].columns.to_list() for split, frame in zip(splitters, frames)]

  # plain value arrays for the actual training
  X, y = [], []
  for split, frame in zip(splitters, frames):
    features, targets = split(frame, values=True)
    X.append(features)
    y.append(targets)

  # split every data set into train and validation part
  X_train, X_test, y_train, y_test = [], [], [], []
  for features, targets in zip(X, y):
    x_tr, x_te, y_tr, y_te = train_test_split(features, targets, test_size=0.2, random_state=42)
    X_train.append(x_tr)
    X_test.append(x_te)
    y_train.append(y_tr)
    y_test.append(y_te)

  # some control numbers for a better overview
  print("number of features ", [x.shape[1] for x in X])
  print("number of rows ", [x.shape[0] for x in X])
  labels = [np.unique(goal) for goal in y]
  print("labels: ", labels)
  print("number of labels ", [len(found) for found in labels])

  return X_train, X_test, y_train, y_test, column_names

**Compare classifiers**

In [None]:
def compare_standard_classifiers(classifiers, classifier_names, x_train, x_test, y_train, y_test, data_names):
  """Fit every classifier on every data set and collect the validation accuracy.

  Each classifier is wrapped in a StandardScaler pipeline and refitted per data
  set. The estimator objects passed in are fitted in place.

  Args:
    classifiers: Estimators offering get_params/fit/score.
    classifier_names: Display names, aligned with `classifiers`.
    x_train, x_test, y_train, y_test: Lists with one entry per data set.
    data_names: Names of the data sets, aligned with the lists above.

  Returns:
    A list with one dict per classifier containing "model", "parameters", the
    accuracy per data set name and the mean accuracy as "overall score".
  """
  scores = []
  for name, clf in zip(classifier_names, classifiers):
    print("Classifer " + name)
    results = {"model": name, "parameters": clf.get_params()}
    score = []
    for xtrain, xtest, ytrain, ytest, data_name in zip(x_train, x_test, y_train, y_test, data_names):
      print("Train ", data_name)
      # Build a fresh pipeline under its own name. Rebinding `clf` here wrapped
      # the previous pipeline again for every further data set.
      model = make_pipeline(StandardScaler(), clf)
      model.fit(xtrain, ytrain)
      print("Test")
      acc = model.score(xtest, ytest)
      score.append(acc)
      results.update({data_name: acc})
      print("Done")
    results.update({"overall score": np.mean(score)})
    scores.append(results)
  return scores

**Light GBM**

In [None]:

def get_scores_lightgbm(x_train, x_test, y_train, y_test, data_names, feature_names):
  """Train LightGBM on every data set and collect the validation accuracy.

  Args:
    x_train, x_test, y_train, y_test: Lists with one entry per data set.
    data_names: Names of the data sets; 'tree_lgb' is trained with
      'wilderness_area' and 'soil_type' declared as categorical features.
    feature_names: One list of feature names per data set.

  Returns:
    A dict with "model", "params", the accuracy per data set name and the mean
    accuracy as "overall score".
  """
  results = {"model": "lgb"}
  scores = []
  # the same multiclass setup (num_class fixed at 8) is used for every data set
  param = {'num_leaves': 31, 'objective': 'multiclass', 'num_class': 8}
  results.update({"params": param})
  num_round = 10
  for xtrain, xtest, ytrain, ytest, data_name, features in zip(x_train, x_test, y_train, y_test, data_names, feature_names):
    print("Train ", data_name)
    if data_name=='tree_lgb':
      train_data = lgb.Dataset(xtrain, label=ytrain, feature_name=features, categorical_feature=['wilderness_area', 'soil_type'], free_raw_data=False)
    else:
      # without this else branch the categorical Dataset above was overwritten
      # right away, so the categorical features were never used
      train_data = lgb.Dataset(xtrain, label=ytrain, feature_name=features, free_raw_data=False)
    bst = lgb.train(param, train_data, num_round)
    print("Test")
    ypred = bst.predict(xtest)
    # take the column with the highest predicted value as the predicted class
    predicted = pd.DataFrame(ypred).idxmax(axis=1)
    acc = accuracy_score(ytest, predicted)
    results.update({data_name: acc})
    scores.append(acc)
  results.update({"overall score": np.mean(scores)})
  return results

**TabPFN**

In [None]:
def get_score_tabpfn(x_train, x_test, y_train, y_test, data_names):
  """Fit TabPFN on a random sample of every data set and collect the accuracy.

  For each data set 1024 row positions are drawn with np.random.randint (i.e.
  with replacement, unseeded), the classifier is fitted on that sample and
  evaluated on the full validation part.

  Args:
    x_train, x_test, y_train, y_test: Lists of arrays, one entry per data set.
    data_names: Names of the data sets, aligned with the lists above.

  Returns:
    A dict with "model", the accuracy per data set name and the mean accuracy
    as "overall score".
  """
  results = {"model": "TabPFN"}
  score = []
  classifier = TabPFNClassifier(device='cpu', N_ensemble_configurations=32)
  for xdata, xtest, ydata, ytest, data_name in zip(x_train,x_test, y_train, y_test, data_names):
    sample_indexes =  np.random.randint(0, xdata.shape[0], 1024)
    x_sample = xdata[sample_indexes, :]
    y_sample = ydata[sample_indexes]
    print("Train ", data_name)
    classifier.fit(x_sample, y_sample)
    print("Test")
    y_eval, p_eval = classifier.predict(xtest, return_winning_probability=True)
    acc = accuracy_score(ytest, y_eval)
    score.append(acc)
    results.update({data_name: acc})
    print("Done")
  # was `results_tabpfn`, a name that is never defined (NameError)
  results.update({"overall score": np.mean(score)})
  return results

**Make predictions**

In [None]:
def make_prediction_heloc(train_set, drop_columns,  test_set, classifier, test_submission):
  """Train `classifier` on the HELOC training data and fill the submission frame.

  NOTE(review): a function with the same name is defined again in section 6;
  when the cells run top to bottom, that later definition is the one in effect.

  Args:
    train_set: Raw HELOC training frame (passed to prepare_data_heloc_train).
    drop_columns: Columns to drop from train and test data.
    test_set: Raw HELOC test frame.
    classifier: Estimator that is wrapped in a StandardScaler pipeline.
    test_submission: Frame with a 'Prediction' column that gets replaced.

  Returns:
    The submission frame with the new 'Prediction' column.
  """
  train = prepare_data_heloc_train(train_set, drop_columns)
  # NOTE(review): prepare_data_heloc_test is not defined in the visible notebook;
  # presumably it returns the prepared test frame plus the rows without any
  # usable feature value - confirm and add the definition.
  test, na_indexes = prepare_data_heloc_test(test_set, drop_columns)
  X_test = test.values
  X,y = get_features_labels_heloc(train)
  print("train")
  classifier = make_pipeline(StandardScaler(), classifier)
  classifier.fit(X, y)
  print("predict")
  y_pred = classifier.predict(X_test)
  # NOTE(review): fixed success probability for rows in na_indexes; where the
  # numbers 5000/5459 come from is not shown here - TODO document.
  p = 5000/5459
  # unseeded random draw, so repeated runs can give different predictions
  na_preds = np.random.binomial(1, p, size=len(na_indexes))
  # NOTE(review): na_indexes is used as positions into y_pred - verify
  y_pred[na_indexes] = na_preds
  # this adds a 'pred' column to the frame passed in by the caller
  test_submission['pred'] = y_pred.astype(int)
  test_submission = test_submission.drop('Prediction', axis=1)
  test_submission = test_submission.rename(columns={'pred': 'Prediction'})
  return test_submission

# 1. Load Train Data

In [None]:
# Read the three training sets from the mounted Google Drive.
tree_train = pd.read_csv("/content/drive/MyDrive/aml/data/covtype_train.csv")
heloc_train = pd.read_csv("/content/drive/MyDrive/aml/data/heloc_train.csv")
higgs_train = pd.read_csv("/content/drive/MyDrive/aml/data/higgs_train.csv")

In [None]:
# quick sanity check: (rows, columns) of the raw Higgs training data
higgs_train.shape

(175000, 33)

# 2. Prepare data sets

In [None]:
# final columns to drop
# first 3 because of missing values
# the rest because of multicollinearity
drop_columns_heloc = ['MSinceMostRecentDelq',
                      'MSinceMostRecentInqexcl7days',
                      'NetFractionInstallBurden',
                      'NumTotalTrades',
                      'MaxDelq2PublicRecLast12M',
                      'MaxDelqEver',
                      'NumInqLast6Mexcl7days',
                      'NumTrades90Ever2DerogPubRec'
                      ]

In [None]:
# alternative, shorter drop list: only the first three columns of
# drop_columns_heloc (the ones dropped because of missing values)
# NOTE(review): not referenced anywhere else in the visible notebook
drop_columns_heloc2 = ['MSinceMostRecentDelq',
                      'MSinceMostRecentInqexcl7days',
                      'NetFractionInstallBurden'
                      ]

In [None]:
# final list of columns to drop from the Higgs data
# (passed to make_datasets_ready below)
higgs_drop_columns = ['EventId',
              'DER_deltaeta_jet_jet',
              'DER_mass_jet_jet',
              'DER_prodeta_jet_jet',
              'DER_lep_eta_centrality',
              'PRI_jet_subleading_pt',
              'PRI_jet_subleading_eta',
              'PRI_jet_subleading_phi',
              'Weight',
              'PRI_jet_num',
              'PRI_met_phi',
              'PRI_met_sumet',
              'PRI_jet_leading_pt',
              'PRI_jet_leading_eta',
              'PRI_jet_leading_phi',
              'PRI_jet_all_pt']

In [None]:
# alternative drop list that removes only the event id
# NOTE(review): not referenced anywhere else in the visible notebook
higgs_drop_columns2 = ['EventId']

In [None]:
# Prepare all data sets; result order: [tree, heloc, higgs, tree (categorical version)].
data = make_datasets_ready(tree_train, heloc_train, drop_columns_heloc, higgs_train, higgs_drop_columns)

Preparing tree data...
Inital dimensions  (58101, 56)
New dimensions  (58101, 56)
Preparing tree data 2nd version...
Inital dimensions  (58101, 56)
New dimensions  (58101, 13)
Preparing heloc data...
Inital dimensions: (9413, 24)
Recode labels
Drop columns
New dimensions: (9413, 16)
Split in features and labels
Replace nas
Impute nas
Preparing higgs data...
Inital dimensions: (175000, 33)
New dimensions: (175000, 17)


# 3. Make train and validation set

In [None]:
# names of the data sets, in the same order as the list returned by make_datasets_ready
data_names = ['tree',  'heloc', 'higgs', 'tree_lgb']

In [None]:
# Split every data set into train and validation part; each result is a list with one entry per data set.
X_train, X_test, y_train, y_test, column_names = get_training_ready_data(data)

number of features  [55, 15, 16, 12]
number of rows  [58101, 8876, 175000, 58101]
labels:  [array([1, 2, 3, 4, 5, 6, 7]), array([0, 1]), array([0, 1]), array([1, 2, 3, 4, 5, 6, 7])]
number of labels  [7, 2, 2, 7]


# 4. Compare classifiers

In [None]:
# display names, aligned by position with `classifiers` below
classifiers_names = [
    "Nearest Neighbors",
    "Decision Tree",
    "Random Forest",
    "AdaBoost",
    "Logistic Regression"
]

classifiers = [
    KNeighborsClassifier(3),
    DecisionTreeClassifier(max_depth=10, random_state=42),
    RandomForestClassifier(
        max_depth=10, n_estimators=100, max_features=1, random_state=42
    ),
    AdaBoostClassifier(random_state=42),
    # the default iteration limit was hit on the tree data (ConvergenceWarning
    # in the saved output), so allow more iterations
    LogisticRegression(max_iter=1000, random_state=42)
]

In [None]:
# Fit and score every standard classifier on every data set.
results = compare_standard_classifiers(classifiers, classifiers_names, X_train, X_test, y_train, y_test, data_names)

Classifer Nearest Neighbors
Train  tree
Test
Done
Train  heloc
Test
Done
Train  higgs
Test
Done
Train  tree_lgb
Test
Done
Classifer Decision Tree
Train  tree
Test
Done
Train  heloc
Test
Done
Train  higgs
Test
Done
Train  tree_lgb
Test
Done
Classifer Random Forest
Train  tree
Test
Done
Train  heloc
Test
Done
Train  higgs
Test
Done
Train  tree_lgb
Test
Done
Classifer AdaBoost
Train  tree
Test
Done
Train  heloc
Test
Done
Train  higgs
Test
Done
Train  tree_lgb
Test
Done
Classifer Logistic Regression
Train  tree


STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(


Test
Done
Train  heloc
Test
Done
Train  higgs
Test
Done
Train  tree_lgb
Test
Done


STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(


In [None]:
# Show the collected result dicts (parameters and accuracies per classifier).
results

[{'model': 'Nearest Neighbors',
  'parameters': {'algorithm': 'auto',
   'leaf_size': 30,
   'metric': 'minkowski',
   'metric_params': None,
   'n_jobs': None,
   'n_neighbors': 3,
   'p': 2,
   'weights': 'uniform'},
  'tree': 0.8389983650288271,
  'heloc': 0.678490990990991,
  'higgs': 0.7705714285714286,
  'tree_lgb': 0.8311677136218915,
  'overall score': 0.7798071245532845},
 {'model': 'Decision Tree',
  'parameters': {'ccp_alpha': 0.0,
   'class_weight': None,
   'criterion': 'gini',
   'max_depth': 10,
   'max_features': None,
   'max_leaf_nodes': None,
   'min_impurity_decrease': 0.0,
   'min_samples_leaf': 1,
   'min_samples_split': 2,
   'min_weight_fraction_leaf': 0.0,
   'random_state': 42,
   'splitter': 'best'},
  'tree': 0.7657688667068239,
  'heloc': 0.6762387387387387,
  'higgs': 0.8182571428571429,
  'tree_lgb': 0.7541519662679632,
  'overall score': 0.7536041786426672},
 {'model': 'Random Forest',
  'parameters': {'bootstrap': True,
   'ccp_alpha': 0.0,
   'class_we

In [None]:
# Score LightGBM on the same splits; column_names supplies the feature names per data set.
get_scores_lightgbm(X_train, X_test, y_train, y_test, data_names, column_names)

Train  tree
[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.092464 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 2245
[LightGBM] [Info] Number of data points in the train set: 46480, number of used features: 51
[LightGBM] [Info] Start training from score -34.538776
[LightGBM] [Info] Start training from score -0.997957
[LightGBM] [Info] Start training from score -0.725996
[LightGBM] [Info] Start training from score -2.778804
[LightGBM] [Info] Start training from score -5.428657
[LightGBM] [Info] Start training from score -4.121385
[LightGBM] [Info] Start training from score -3.526404
[LightGBM] [Info] Start training from score -3.341282
Test
Train  heloc
[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.000328 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Tota

{'model': 'lgb',
 'params': {'num_leaves': 31, 'objective': 'multiclass', 'num_class': 8},
 'tree': 0.7673177867653386,
 'heloc': 0.7404279279279279,
 'higgs': 0.8228,
 'tree_lgb': 0.7642199466483091,
 'overall score': 0.7736914153353938}

In [None]:
# Score TabPFN on the same splits.
# NOTE(review): the saved output of this cell ends in a RuntimeError; the cause is not visible here.
get_score_tabpfn(X_train, X_test, y_train, y_test, data_names)

We have to download the TabPFN, as there is no checkpoint at  /usr/local/lib/python3.10/dist-packages/tabpfn/models_diff/prior_diff_real_checkpoint_n_0_epoch_100.cpkt
It has about 100MB, so this might take a moment.
Loading model that can be used for inference only
Using a Transformer with 25.82 M parameters


RuntimeError: ignored

# 5. Select model and tune

# 6. Make predictions

In [None]:
def make_prediction_heloc(train_set, drop_columns,  test_set, classifier, test_submission):
  """Train `classifier` on the HELOC training data and build the submission frame.

  Args:
    train_set: Raw HELOC training frame (passed to prepare_data_heloc_train).
    drop_columns: Columns to drop from train and test data.
    test_set: Raw HELOC test frame.
    classifier: Estimator that is wrapped in a StandardScaler pipeline; it is
      fitted in place.
    test_submission: Frame with a 'Prediction' column that gets replaced.

  Returns:
    A new frame equal to `test_submission` without the old 'Prediction' column
    and with the new predictions appended as 'Prediction'. The frame passed in
    is not modified.
  """
  train = prepare_data_heloc_train(train_set, drop_columns)
  # NOTE(review): prepare_data_heloc_test is not defined in the visible notebook;
  # presumably it returns the prepared test frame plus the rows without any
  # usable feature value - confirm and add the definition.
  test, na_indexes = prepare_data_heloc_test(test_set, drop_columns)
  X_test = test.values
  # fit on plain arrays as well, so fit and predict receive the same input type
  X, y = get_features_labels_heloc(train, values=True)
  print("train")
  model = make_pipeline(StandardScaler(), classifier)
  model.fit(X, y)
  print("predict")
  y_pred = model.predict(X_test)
  # NOTE(review): fixed success probability for rows in na_indexes; where the
  # numbers 5000/5459 come from is not shown here - TODO document.
  p = 5000/5459
  # unseeded random draw, so repeated runs can give different predictions
  na_preds = np.random.binomial(1, p, size=len(na_indexes))
  # NOTE(review): na_indexes is used as positions into y_pred - verify
  y_pred[na_indexes] = na_preds
  # drop + assign returns a new frame with 'Prediction' as last column instead
  # of adding a temporary 'pred' column to the caller's frame
  return test_submission.drop('Prediction', axis=1).assign(Prediction=y_pred.astype(int))

In [None]:
# Re-read the raw HELOC training data for the final model.
df_heloc = pd.read_csv("/content/drive/MyDrive/aml/data/heloc_train.csv")
# same column list as drop_columns_heloc in section 2
drop_columns_heloc = ['MSinceMostRecentDelq',
                      'MSinceMostRecentInqexcl7days',
                      'NetFractionInstallBurden',
                      'NumTotalTrades',
                      'MaxDelq2PublicRecLast12M',
                      'MaxDelqEver',
                      'NumInqLast6Mexcl7days',
                      'NumTrades90Ever2DerogPubRec'
                      ]
heloc_test = pd.read_csv("/content/drive/MyDrive/aml/data/heloc_test.csv")
# same random forest settings as in the classifier comparison (section 4)
clf = RandomForestClassifier(max_depth=10, n_estimators=100, max_features=1, random_state=42)
submission_heloc = pd.read_csv("/content/drive/MyDrive/aml/data/heloc_test_submission.csv")


submission = make_prediction_heloc(df_heloc, drop_columns_heloc, heloc_test, clf, submission_heloc)

Inital dimensions: (9413, 24)
Recode labels
Drop columns
New dimensions: (9413, 16)
Split in features and labels
Replace nas
Impute nas
Inital dimensions: (1046, 23)
Recode labels
Drop columns
New dimensions: (1046, 15)
Split in features and labels
Replace nas
Impute nas
train
predict


In [None]:
# Write the HELOC submission to Drive without the DataFrame index.
submission.to_csv("/content/drive/MyDrive/aml/data/heloc_sub_1.csv", index=False)