<a href="https://colab.research.google.com/github/usermar445/aml_final_project/blob/main/scripts/tabpfn_ensemble_clean.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [1]:
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


In [2]:
!pip install tabpfn

Collecting tabpfn
  Downloading tabpfn-0.1.9-py3-none-any.whl (156 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m156.6/156.6 kB[0m [31m1.7 MB/s[0m eta [36m0:00:00[0m
Installing collected packages: tabpfn
Successfully installed tabpfn-0.1.9


In [3]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.pipeline import make_pipeline
from sklearn.preprocessing import StandardScaler
from sklearn.metrics import accuracy_score
from sklearn.impute import SimpleImputer
import math
from tqdm import tqdm
from imblearn.over_sampling import SMOTE
from tabpfn import TabPFNClassifier

# 0. Functions

**Tree data functions**

In [6]:
# re one-hot-encodes wilderness area and soiltype for tree data set

# has to be executed on the original df
def re_one_hot_encode_soiltype(data, drop=True):
  soil = data.iloc[:, 14:54]
  data['soil_type'] = soil.idxmax(1)
  data['soil_type'] = data['soil_type'].replace(data['soil_type'].unique(), np.arange(1, data['soil_type'].nunique()+1))
  if drop:
    cols = data.iloc[:, 14:54].columns.to_list()
    data = data.drop(columns=cols)
  return data

def re_one_hot_encode_wilderness(data, drop=True):
  wilderness = data[['Wilderness_Area1', 'Wilderness_Area2', 'Wilderness_Area3', 'Wilderness_Area4']].copy()
  data['wilderness_area'] = wilderness.idxmax(1)
  data['wilderness_area'] = data['wilderness_area'].replace(data['wilderness_area'].unique(), np.arange(1, data['wilderness_area'].nunique()+1))
  if drop:
    data = data.drop(['Wilderness_Area1', 'Wilderness_Area2', 'Wilderness_Area3', 'Wilderness_Area4'], axis=1)
  return data

def re_one_hot_encode_categorical(data, drop=True):
  return re_one_hot_encode_wilderness(re_one_hot_encode_soiltype(data, drop), drop)

# helper method to split in features and labels
def get_features_labels_tree(data, values=False):
  X = data.drop("Cover_Type", axis=1)
  y = data.loc[:, 'Cover_Type']
  if values:
    return[X.values, y.values]
  return [X, y]

**Heloc functions**

In [7]:
# helper function to determine indexes of rows containing only na values
def get_indexes_nan_rows(data):
  all_na = data.isna().all(axis=1)
  all_na = all_na[all_na==True]
  return all_na.index

# returns percentage of missing values per column
def get_missing_values_per_columns(data, columns, threshold):
  missing = {}
  problematic = []
  for column in columns:
    col = data.loc[:, column]
    nans = np.isnan(col)
    nans = nans[nans == True]
    missing_values = len(nans)/len(col)
    missing.update({column: missing_values})
    if missing_values >= threshold:
      problematic.append(column)
  return [missing, problematic]

#imputes missing values
#because most of the features are very skewed, median seems more reasonable
def impute_heloc(data):
  imp_median = SimpleImputer(missing_values=np.nan, strategy='median')
  columns = data.columns
  indexes = data.index
  imp_median.fit(data.values)
  data = imp_median.transform(data.values)
  data = pd.DataFrame(data, columns=columns)
  data = data.set_index(indexes)
  return data

# wrapper function that prepares complete data set
def prepare_data_heloc_train(data, drop_columns):
  print("Inital dimensions:", data.values.shape)
  print("Recode labels")
  # replace labels with int values (because final submission needs to be 0, 1)
  data['RiskPerformance'] = data['RiskPerformance'].replace(('Bad','Good'), (0,1))
  # columns were determined in EDA
  print("Drop columns")
  data = data.drop(drop_columns, axis=1)
  print("New dimensions:", data.values.shape)
  print("Split in features and labels")
  # split to replace nas -> has to be done because complete NaS rows are excluded from prediction/training and will just be assigned fixed binomial probability classification
  dfX = data.drop("RiskPerformance", axis=1)
  dfy = pd.DataFrame(data.loc[:, 'RiskPerformance'])
  print("Replace nas")
  dfX[dfX<0] = np.nan
  # get na rows index
  na_indexes = get_indexes_nan_rows(dfX)
  #drop na rows
  dfX = dfX.drop(na_indexes)
  dfy = dfy.drop(na_indexes)
  print("Impute nas")
  dfX = impute_heloc(dfX)
  return dfX.join(dfy)

# wrapper function that prepares complete data set
def prepare_data_heloc_test(data, drop_columns):
  print("Inital dimensions:", data.values.shape)
  print("Recode labels")
  # replace labels with int values (because final submission needs to be 0, 1)
  #data['RiskPerformance'] = data['RiskPerformance'].replace(('Bad','Good'), (0,1))
  # columns were determined in EDA
  print("Drop columns")
  data = data.drop(drop_columns, axis=1)
  print("New dimensions:", data.values.shape)
  print("Split in features and labels")
  print("Replace nas")
  data[data<0] = np.nan
  data = impute_heloc(data)
  return data

# helper method to split in features and labels
def get_features_labels_heloc(data, values=False):
  X = data.drop("RiskPerformance", axis=1)
  y = data.loc[:, 'RiskPerformance']
  if values:
    return[X.values, y.values]
  return [X, y]

**Higgs functions**

In [8]:

# cleans and prepares data
def clean_higgs_data(data, drop_columns, train=True):
  print("Inital dimensions:", data.values.shape)
  if train:
    # replace labels with int values (because final submission needs to be 0, 1)
    data['Label'] = data['Label'].replace(('b','s'), (0,1))
  #drop columns
  data = data.drop(drop_columns, axis=1)
  print("New dimensions:", data.values.shape)
  return data

# helper function to get features and labels split
def get_features_labels_higgs(data, values=False):
  X = data.drop("Label", axis=1)
  y = data.loc[:, 'Label']
  if values:
    return[X.values, y.values]
  return [X, y]

**Prepare data**

In [9]:
def make_datasets_ready(tree, heloc, drop_columns_heloc, higgs, drop_columns_higgs):
  # two versions of tree data: one with one-hot-encoding one with categorical (lgb)
  print("Preparing tree data...")
  print("Inital dimensions ", tree.values.shape)
  print("New dimensions ", tree.values.shape)
  print("Preparing tree data 2nd version...")
  print("Inital dimensions ", tree.values.shape)
  lgb = re_one_hot_encode_categorical(tree)
  print("New dimensions ", lgb.values.shape)
  print("Preparing heloc data...")
  heloc_train = prepare_data_heloc_train(heloc, drop_columns_heloc)
  print("Preparing higgs data...")
  higgs_train = clean_higgs_data(higgs, drop_columns_higgs)
  return [tree, heloc_train, higgs_train, lgb]

In [10]:

def make_datasets_ready_test(tree, heloc, drop_columns_heloc, higgs, drop_columns_higgs):
  # two versions of tree data: one with one-hot-encoding one with categorical (lgb)
  print("Preparing tree data...")
  print("Inital dimensions ", tree.values.shape)
  print("New dimensions ", tree.values.shape)
  print("Preparing tree data 2nd version...")
  print("Inital dimensions ", tree.values.shape)
  lgb = re_one_hot_encode_categorical(tree)
  print("New dimensions ", lgb.values.shape)
  print("Preparing heloc data...")
  heloc_train = prepare_data_heloc_test(heloc, drop_columns_heloc)
  print("Preparing higgs data...")
  higgs_train = clean_higgs_data(higgs, drop_columns_higgs, train=False)
  return [tree, heloc_train, higgs_train, lgb]

In [11]:
def get_training_ready_data(data, full_train=False, split=0.2):
  # first keep data as dataframes to extract columns names (used for LightGBM)
  dfXs = [get_features_labels_tree(data[0])[0], get_features_labels_heloc(data[1])[0], get_features_labels_higgs(data[2])[0], get_features_labels_tree(data[3])[0]]
  dfys = [get_features_labels_tree(data[0])[1], get_features_labels_heloc(data[1])[1], get_features_labels_higgs(data[2])[1], get_features_labels_tree(data[3])[1]]
  column_names = [dat.columns.to_list() for dat in dfXs]

  #only get values
  X = [get_features_labels_tree(data[0], values=True)[0], get_features_labels_heloc(data[1], values=True)[0], get_features_labels_higgs(data[2], values=True)[0], get_features_labels_tree(data[3], values=True)[0]]
  y = [get_features_labels_tree(data[0], values=True)[1], get_features_labels_heloc(data[1], values=True)[1], get_features_labels_higgs(data[2], values=True)[1], get_features_labels_tree(data[3], values=True)[1]]

  if full_train:
      n_features = [x.shape[1] for x in X]
      n_rows = [x.shape[0] for x in X]
      print("number of features ", n_features)
      print("number of rows ", n_rows)
      labels = [np.unique(goal) for goal in y]
      n_labels = [len(np.unique(goal)) for goal in y]
      print("labels: ", labels)
      print("number of labels ", n_labels)
      return X, y
  else:
    #split in train and validation set
    X_train_tree, X_test_tree, y_train_tree, y_test_tree = train_test_split(X[0], y[0], test_size=split, random_state=42)
    X_train_heloc, X_test_heloc, y_train_heloc, y_test_heloc = train_test_split(X[1], y[1], test_size=split, random_state=42)
    X_train_higgs, X_test_higgs, y_train_higgs, y_test_higgs = train_test_split(X[2], y[2], test_size=split, random_state=42)
    X_train_tree_lgb, X_test_tree_lgb, y_train_tree_lgb, y_test_tree_lgb = train_test_split(X[3], y[3], test_size=split, random_state=42)

    # create lists
    X_train = [X_train_tree, X_train_heloc, X_train_higgs, X_train_tree_lgb]
    X_test = [X_test_tree, X_test_heloc, X_test_higgs, X_test_tree_lgb]
    y_train = [y_train_tree, y_train_heloc, y_train_higgs, y_train_tree_lgb]
    y_test = [y_test_tree, y_test_heloc, y_test_higgs, y_test_tree_lgb]


    # extract some controlling numbers for better overivew
    n_features = [x.shape[1] for x in X]
    n_rows = [x.shape[0] for x in X]
    print("number of features ", n_features)
    print("number of rows ", n_rows)
    labels = [np.unique(goal) for goal in y]
    n_labels = [len(np.unique(goal)) for goal in y]
    print("labels: ", labels)
    print("number of labels ", n_labels)

    return X_train, X_test, y_train, y_test, column_names

**Compare classifiers**

In [12]:
def compare_standard_classifiers(classifiers, classifier_names, x_train, x_test, y_train, y_test, data_names):
  scores = []
  for name, clf in zip(classifier_names, classifiers):
          print("Classifer " + name)
          results = {"model": name, "parameters": clf.get_params()}
          score = []
          for xtrain, xtest, ytrain, ytest, data_name in zip(x_train, x_test, y_train, y_test, data_names):
            print("Train ", data_name)
            clf = make_pipeline(StandardScaler(), clf)
            clf.fit(xtrain, ytrain)
            print("Test")
            acc = clf.score(xtest, ytest)
            score.append(acc)
            results.update({data_name: acc})
            print("Done")
          results.update({"overall score": np.mean(score)})
          scores.append(results)
  return scores

**TabPFN**

In [13]:
def get_score_tabpfn(x_train, x_test, y_train, y_test, data_names):
  results = {"model": "TabPFN"}
  score = []
  classifier = TabPFNClassifier(device='cpu', N_ensemble_configurations=32)
  for xdata, xtest, ydata, ytest, data_name in zip(x_train,x_test, y_train, y_test, data_names):
    sample_indexes =  np.random.randint(0, xdata.shape[0], 1024)
    x_sample = xdata[sample_indexes, :]
    y_sample = ydata[sample_indexes]
    print("Train ", data_name)
    classifier.fit(x_sample, y_sample)
    print("Test")
    y_eval, p_eval = classifier.predict(xtest, return_winning_probability=True)
    acc = accuracy_score(ytest, y_eval)
    score.append(acc)
    results.update({data_name: acc})
    print("Done")
  results.update({"overall score": np.mean(score)})
  return results

**Make predictions**

In [14]:
def make_prediction_heloc(train_set, drop_columns,  test_set, classifier, test_submission):
  train = prepare_data_heloc_train(train_set, drop_columns)
  test, na_indexes = prepare_data_heloc_test(test_set, drop_columns)
  X_test = test.values
  X,y = get_features_labels_heloc(train)
  print("train")
  classifier = make_pipeline(StandardScaler(), classifier)
  classifier.fit(X, y)
  print("predict")
  y_pred = classifier.predict(X_test)
  p = 5000/5459
  na_preds = np.random.binomial(1, p, size=len(na_indexes))
  y_pred[na_indexes] = na_preds
  test_submission['pred'] = y_pred.astype(int)
  test_submission = test_submission.drop('Prediction', axis=1)
  test_submission = test_submission.rename(columns={'pred': 'Prediction'})
  return test_submission

# 1. Load Data

In [115]:
tree_train = pd.read_csv("/content/drive/MyDrive/aml/data/covtype_train.csv")
heloc_train = pd.read_csv("/content/drive/MyDrive/aml/data/heloc_train.csv")
higgs_train = pd.read_csv("/content/drive/MyDrive/aml/data/higgs_train.csv")

In [116]:
tree_test = pd.read_csv("/content/drive/MyDrive/aml/data/covtype_test.csv")
heloc_test = pd.read_csv("/content/drive/MyDrive/aml/data/heloc_test.csv")
higgs_test = pd.read_csv("/content/drive/MyDrive/aml/data/higgs_test.csv")

  and should_run_async(code)


In [117]:
submission = pd.read_csv("/content/drive/MyDrive/aml/data/covtype_test_submission.csv")

  and should_run_async(code)


In [118]:
synthetic_tree = pd.read_csv("/content/drive/MyDrive/aml/synthetic_data_tree.csv")

  and should_run_async(code)


In [119]:
synthetic_tree = synthetic_tree.drop('Unnamed: 0', axis=1)

  and should_run_async(code)


# 2. Prepare data sets

In [120]:
# final columns to drop
# first 3 because of missing values
# the rest because of multicorriliarity
drop_columns_heloc2 = [
                      ]

In [121]:
# final list of columns to drop
higgs_drop_columns2 = ['EventId']

In [122]:
data_train = make_datasets_ready(tree_train, heloc_train, drop_columns_heloc2, higgs_train, higgs_drop_columns2)

Preparing tree data...
Inital dimensions  (58101, 55)
New dimensions  (58101, 55)
Preparing tree data 2nd version...
Inital dimensions  (58101, 55)
New dimensions  (58101, 13)
Preparing heloc data...
Inital dimensions: (9413, 24)
Recode labels
Drop columns
New dimensions: (9413, 24)
Split in features and labels
Replace nas
Impute nas
Preparing higgs data...
Inital dimensions: (175000, 33)
New dimensions: (175000, 32)


In [123]:
data_test = make_datasets_ready_test(tree_test, heloc_test, drop_columns_heloc2, higgs_test, higgs_drop_columns2)

Preparing tree data...
Inital dimensions  (3500, 54)
New dimensions  (3500, 54)
Preparing tree data 2nd version...
Inital dimensions  (3500, 54)
New dimensions  (3500, 12)
Preparing heloc data...
Inital dimensions: (1046, 23)
Recode labels
Drop columns
New dimensions: (1046, 23)
Split in features and labels
Replace nas
Preparing higgs data...
Inital dimensions: (75000, 32)
New dimensions: (75000, 31)


# Ensemble TabPfN


### Functions

In [21]:
def predict_tab(x_train, x_test, y_train, clf):
  #print("Train")
  clf.fit(x_train, y_train)
  #print("Predict")
  y_pred = clf.predict(x_test, return_winning_probability=True)
  return y_pred[0]

def chunk_train_data(xtrain, ytrain):
  n_splits = math.floor(xtrain.shape[0]/1024)
  cutoff = 1024*n_splits
  split_x = xtrain[:cutoff, :]
  split_y = ytrain[:cutoff]
  x_splitted = np.vsplit(split_x, n_splits)
  y_splitted = np.split(split_y, n_splits)
  x_splitted.append(xtrain[cutoff:, :])
  y_splitted.append(ytrain[cutoff:])
  print("Number of rows:", xtrain.shape[0])
  print(f"Split into {n_splits} chunks")
  return x_splitted, y_splitted

def chunk_test_data(xtest):
  n_splits = math.floor(xtest.shape[0]/1024)
  cutoff = 1024*n_splits
  split_x = xtest[:cutoff, :]
  x_splitted = np.vsplit(split_x, n_splits)
  x_splitted.append(xtest[cutoff:, :])
  print("Number of rows:", xtest.shape[0])
  print(f"Split into {n_splits} chunks")
  return x_splitted

def ensembling_tabpfn(x_train_chunks, y_train_chunks, x_test, clf, n_ensembles=None):
  predictions=[]
  i=1
  #print("Number of ensemble runs: ", len(x_train_chunks))
  if n_ensembles is not None:
    x = x_train_chunks[:n_ensembles]
    y = y_train_chunks[:n_ensembles]
  else:
    x = x_train_chunks
    y = y_train_chunks
  for x_chunk, y_chunk in zip(x, y):
      #print("Ensemble runs ", i)
      pred = predict_tab(x_chunk, x_test, y_chunk,  clf)
      predictions.append(pred)
      i += 1
  #print("Runs done. Ensembling...")
  df_pred = pd.DataFrame(predictions).T
  y_predded = df_pred.apply(lambda x: x.mode(), axis=1)
  return y_predded

In [29]:
def make_ensemble_predictions(xtrain, ytrain, xtest, clf, n_ensembles=None, shuffle=False):
  if shuffle:
      shuffler = np.random.permutation(len(xtrain))
      xtrain_shuffled = xtrain[shuffler]
      ytrain_shuffled = ytrain[shuffler]
      print("Chunk data")
      X_train_chunked, y_train_chunked = chunk_train_data(xtrain_shuffled, ytrain_shuffled)
  else:
      X_train_chunked, y_train_chunked = chunk_train_data(xtrain, ytrain)
  X_test_split = chunk_test_data(xtest)
  print("Number of chunks for prediction", len(X_test_split))
  print(f"Total runs: {len(X_test_split) * len(X_train_chunked)}")
  predictions = []
  for counter,pred_chunk in enumerate(tqdm(X_test_split)):
    #print("Predicting chunk ", counter)
    predictions.append(ensembling_tabpfn(X_train_chunked, y_train_chunked, pred_chunk, clf, n_ensembles))
  return predictions

  and should_run_async(code)


In [23]:
def convert_to_predictions(output):
  predictions = pd.DataFrame(pd.concat([pred[0] for pred in output]))
  predictions[0] = predictions[0].astype(int)
  return predictions

In [24]:
def write_pred_file(output, type, path):
  predictions = convert_to_predictions(output)
  if type == 'tree':
    predictions['ID'] = np.arange(1,3501)
  elif type == 'heloc':
    predictions['ID'] = np.arange(3501, 3501+1046)
  elif type == 'higgs':
    predictions['ID'] = np.arange(4547, 4547+75000)
  predictions = predictions.rename({0: 'Prediction'}, axis=1)
  predictions.to_csv(path, index=False)
  return predictions

### Prepare train data and classifier

In [35]:
#X_train, X_test, y_train, y_test, column_names = get_training_ready_data(data_train)

number of features  [55, 20, 31, 12]
number of rows  [58101, 8876, 175000, 58101]
labels:  [array([1, 2, 3, 4, 5, 6, 7]), array([0, 1]), array([0, 1]), array([1, 2, 3, 4, 5, 6, 7])]
number of labels  [7, 2, 2, 7]


In [124]:
X, y = get_training_ready_data(data_train, full_train=True)

number of features  [55, 23, 31, 12]
number of rows  [58101, 8876, 175000, 58101]
labels:  [array([1, 2, 3, 4, 5, 6, 7]), array([0, 1]), array([0, 1]), array([1, 2, 3, 4, 5, 6, 7])]
number of labels  [7, 2, 2, 7]


In [25]:
clf = TabPFNClassifier(device='cuda', N_ensemble_configurations=32)

We have to download the TabPFN, as there is no checkpoint at  /usr/local/lib/python3.10/dist-packages/tabpfn/models_diff/prior_diff_real_checkpoint_n_0_epoch_100.cpkt
It has about 100MB, so this might take a moment.
Loading model that can be used for inference only
Using a Transformer with 25.82 M parameters


## Heloc

In [125]:
pred_heloc = make_ensemble_predictions(X[1], y[1], data_test[1].values, clf)

Number of rows: 8876
Split into 8 chunks
Number of rows: 1046
Split into 1 chunks
Number of chunks for prediction 2
Total runs: 18


100%|██████████| 2/2 [00:24<00:00, 12.33s/it]


In [126]:
write_pred_file(pred_heloc, 'heloc', "/content/drive/MyDrive/aml/pred_heloc_ensemble_new2.csv")

Unnamed: 0,Prediction,ID
0,1,3501
1,0,3502
2,0,3503
3,0,3504
4,0,3505
...,...,...
17,1,4542
18,0,4543
19,1,4544
20,1,4545


# Higgs

In [27]:
pred_higgs = make_ensemble_predictions(X[2], y[2], data_test[2].values, clf, 30)

Chunk data
Number of rows: 175000
Split into 170 chunks
Number of rows: 75000
Split into 73 chunks
Number of chunks for prediction 74
Total runs: 12654


100%|██████████| 74/74 [1:15:54<00:00, 61.54s/it]


In [28]:
write_pred_file(pred_higgs, 'higgs', "/content/drive/MyDrive/aml/pred_higgs_ensemble_2.csv")

  and should_run_async(code)


Unnamed: 0,Prediction,ID
0,0,4547
1,0,4548
2,1,4549
3,0,4550
4,0,4551
...,...,...
243,1,79542
244,0,79543
245,0,79544
246,0,79545


# Covtype

### Raw

### Smote

In [63]:
# Apply SMOTE
smote = SMOTE()
X_train_smote, y_train_smote = smote.fit_resample(X[0], y[0])


  and should_run_async(code)


In [65]:
covtype_pred_smote_shuffle = make_ensemble_predictions(X_train_smote, y_train_smote, data_test[0].values, clf, 20, shuffle=True)

Chunk data
Number of rows: 197736
Split into 193 chunks
Number of rows: 3500
Split into 3 chunks
Number of chunks for prediction 4
Total runs: 776


100%|██████████| 4/4 [02:31<00:00, 37.88s/it]


In [66]:
write_pred_file(covtype_pred_smote_shuffle, 'tree', "/content/drive/MyDrive/aml/pred_covtype_smote_shuffeled.csv")

Unnamed: 0,Prediction,ID
0,1,1
1,1,2
2,1,3
3,2,4
4,2,5
...,...,...
423,7,3496
424,7,3497
425,7,3498
426,7,3499


### Covtype with shuffle

In [None]:
covtype_pred_smote = make_ensemble_predictions(X[0], y[0], data_test[0].values, clf, 30, shuffle=True)

  and should_run_async(code)


Chunk data
Number of rows: 58101
Split into 56 chunks
Number of rows: 3500
Split into 3 chunks
Number of chunks for prediction 4
Total runs: 228


100%|██████████| 4/4 [04:06<00:00, 61.70s/it]


In [None]:
write_pred_file(covtype_pred_smote, 'tree', "/content/drive/MyDrive/aml/pred_tree_ensemble_with_shuffling.csv")

Unnamed: 0,Prediction,ID
0,1,1
1,2,2
2,1,3
3,2,4
4,2,5
...,...,...
423,7,3496
424,1,3497
425,1,3498
426,1,3499


### Synthetic

In [68]:
synth_x = synthetic_tree.drop('Cover_Type', axis=1)
synth_y = synthetic_tree[['Cover_Type']]

  and should_run_async(code)


In [90]:
X_synth = np.vstack((X[3], synth_x.values))
y_synth = np.append(y[3], synth_y.values.flatten())

In [96]:
X_synth.shape

(143661, 12)

In [93]:
y_synth.shape

  and should_run_async(code)


(143661,)

In [99]:
covtype_pred_synth = make_ensemble_predictions(X_synth, y_synth, data_test[3].values, clf, 20, shuffle=True)

Chunk data
Number of rows: 143661
Split into 140 chunks
Number of rows: 3500
Split into 3 chunks
Number of chunks for prediction 4
Total runs: 564


100%|██████████| 4/4 [02:19<00:00, 34.92s/it]


In [100]:
write_pred_file(covtype_pred_synth, 'tree', "/content/drive/MyDrive/aml/pred_tree_ensemble_synthetic.csv")

Unnamed: 0,Prediction,ID
0,1,1
1,1,2
2,1,3
3,2,4
4,2,5
...,...,...
423,7,3496
424,7,3497
425,7,3498
426,1,3499


### Synthetic with SMOTE


In [None]:
synth_x = synthetic_tree.drop('Cover_Type', axis=1)
synth_y = synthetic_tree[['Cover_Type']]

  and should_run_async(code)


In [103]:
X_synth = np.vstack((X[3], synth_x.values[:10000, :]))
y_synth = np.append(y[3], synth_y.values.flatten()[:10000])

  and should_run_async(code)


In [104]:
smote = SMOTE()
X_train_smote_synth, y_train_smote_synth = smote.fit_resample(X_synth, y_synth)

In [105]:
covtype_pred_synth_smote = make_ensemble_predictions(X_train_smote_synth, y_train_smote_synth, data_test[3].values, clf, 30, shuffle=True)

  and should_run_async(code)


Chunk data
Number of rows: 197736
Split into 193 chunks
Number of rows: 3500
Split into 3 chunks
Number of chunks for prediction 4
Total runs: 776


100%|██████████| 4/4 [03:20<00:00, 50.18s/it]


In [106]:
write_pred_file(covtype_pred_synth_smote, 'tree', "/content/drive/MyDrive/aml/pred_tree_ensemble_synthetic_smote.csv")

  and should_run_async(code)


Unnamed: 0,Prediction,ID
0,1,1
1,1,2
2,1,3
3,1,4
4,5,5
...,...,...
423,7,3496
424,7,3497
425,7,3498
426,7,3499
