<a href="https://colab.research.google.com/github/usermar445/aml_final_project/blob/main/scripts/aml_final.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
from google.colab import drive
drive.mount('/content/drive')

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


In [None]:
import pandas as pd
import matplotlib.pyplot as plt
import numpy as np
from matplotlib.colors import ListedColormap

from sklearn.datasets import make_circles, make_classification, make_moons
from sklearn.discriminant_analysis import QuadraticDiscriminantAnalysis
from sklearn.ensemble import AdaBoostClassifier, RandomForestClassifier
from sklearn.gaussian_process import GaussianProcessClassifier
from sklearn.gaussian_process.kernels import RBF
from sklearn.inspection import DecisionBoundaryDisplay
from sklearn.model_selection import train_test_split
from sklearn.naive_bayes import GaussianNB
from sklearn.neighbors import KNeighborsClassifier
from sklearn.neural_network import MLPClassifier
from sklearn.pipeline import make_pipeline
from sklearn.preprocessing import StandardScaler
from sklearn.svm import SVC
from sklearn.tree import DecisionTreeClassifier
from sklearn.metrics import confusion_matrix
from sklearn.metrics import multilabel_confusion_matrix
import seaborn as sns
from sklearn.preprocessing import Normalizer
from sklearn.metrics import accuracy_score

import lightgbm as lgb


# Load data

In [None]:
df_tree = pd.read_csv("/content/drive/MyDrive/aml/data/covtype_train.csv")
df_higgs = pd.read_csv("/content/drive/MyDrive/aml/data/higgs_train.csv")
df_heloc = pd.read_csv("/content/drive/MyDrive/aml/data/heloc_train.csv")

### Prepare data sets

In [None]:
# replace labels with int values
df_higgs['Label_int'] = df_higgs['Label'].replace(df_higgs['Label'].unique(), np.arange(0, df_higgs['Label'].nunique()))
labels_higgs = df_higgs.loc[:,'Label']
df_higgs  = df_higgs.drop('Label', axis=1)
df_heloc['RiskPerformance_int'] = df_heloc['RiskPerformance'].replace(df_heloc['RiskPerformance'].unique(), np.arange(0, df_heloc['RiskPerformance'].nunique()))
labels_heloc = df_heloc.loc[:,'RiskPerformance']
df_heloc  = df_heloc.drop('RiskPerformance', axis=1)

In [None]:
dfX_tree = df_tree.drop("Cover_Type", axis=1)
dfy_tree = df_tree.loc[:, 'Cover_Type']

dfX_higgs = df_higgs.drop("Label_int", axis=1)
dfy_higgs = df_higgs.loc[:, 'Label_int']

dfX_heloc = df_heloc.drop("RiskPerformance_int", axis=1)
dfy_heloc = df_heloc.loc[:, 'RiskPerformance_int']

dfXs = [dfX_tree, dfX_higgs, dfX_heloc]
column_names = [dat.columns.to_list() for dat in dfXs]

In [None]:
X_tree = dfX_tree.values
X_higgs = dfX_higgs.values
X_heloc = dfX_heloc.values

y_tree = dfy_tree.values
y_higgs = dfy_higgs.values
y_heloc = dfy_heloc.values

X = [X_tree, X_higgs, X_heloc]
y = [y_tree, y_higgs, y_heloc]
data_names = ['tree', 'higgs', 'heloc']

In [None]:
n_features = [x.shape[1] for x in X]
n_rows = [x.shape[0] for x in X]
print("number of features ", n_features)
print("number of rows ", n_rows)

number of features  [54, 32, 23]
number of rows  [58101, 175000, 9413]


In [None]:
labels = [np.unique(goal) for goal in y]
n_labels = [len(np.unique(goal)) for goal in y]
print("labels: ", labels)
print("number of labels ", n_labels)

labels:  [array([1, 2, 3, 4, 5, 6, 7]), array([0, 1]), array([0, 1])]
number of labels  [7, 2, 2]


### Standard approaches

In [None]:
names = [
    "Nearest Neighbors",
    "Decision Tree",
    "Random Forest",
    "AdaBoost",
]

classifiers = [
    KNeighborsClassifier(3),
    DecisionTreeClassifier(max_depth=10, random_state=42),
    RandomForestClassifier(
        max_depth=10, n_estimators=100, max_features=1, random_state=42
    ),
    AdaBoostClassifier(random_state=42),
]

In [None]:
scores = []
for name, clf in zip(names, classifiers):
        print("Classifer" + name)
        results = {"model": name, "parameters": clf.get_params()}
        score = []
        for xdata, ydata, data_name in zip(X,y, data_names):
          X_train, X_test, y_train, y_test = train_test_split(xdata, ydata, test_size=0.2, random_state=42)
          print("Train ", data_name)
          clf = make_pipeline(StandardScaler(), clf)
          clf.fit(X_train, y_train)
          print("Test")
          acc = clf.score(X_test, y_test)
          score.append(acc)
          results.update({data_name: acc})
          print("Done")
        results.update({"overall score": np.mean(score)})
        scores.append(results)
        print("Next model")


ClassiferNearest Neighbors
Train  tree
Test
Done
Train  higgs
Test
Done
Train  heloc
Test
Done
Next model
ClassiferDecision Tree
Train  tree
Test
Done
Train  higgs
Test
Done
Train  heloc
Test
Done
Next model
ClassiferRandom Forest
Train  tree
Test
Done
Train  higgs
Test
Done
Train  heloc
Test
Done
Next model
ClassiferAdaBoost
Train  tree
Test
Done
Train  higgs
Test
Done
Train  heloc
Test
Done
Next model


In [None]:
scores

[{'model': 'Nearest Neighbors',
  'parameters': {'algorithm': 'auto',
   'leaf_size': 30,
   'metric': 'minkowski',
   'metric_params': None,
   'n_jobs': None,
   'n_neighbors': 3,
   'p': 2,
   'weights': 'uniform'},
  'tree': 0.8389983650288271,
  'higgs': 0.8810857142857142,
  'heloc': 0.6537440254912373,
  'overall score': 0.7912760349352596},
 {'model': 'Decision Tree',
  'parameters': {'ccp_alpha': 0.0,
   'class_weight': None,
   'criterion': 'gini',
   'max_depth': 10,
   'max_features': None,
   'max_leaf_nodes': None,
   'min_impurity_decrease': 0.0,
   'min_samples_leaf': 1,
   'min_samples_split': 2,
   'min_weight_fraction_leaf': 0.0,
   'random_state': 42,
   'splitter': 'best'},
  'tree': 0.760605799845108,
  'higgs': 1.0,
  'heloc': 0.6622411046202867,
  'overall score': 0.8076156348217983},
 {'model': 'Random Forest',
  'parameters': {'bootstrap': True,
   'ccp_alpha': 0.0,
   'class_weight': None,
   'criterion': 'gini',
   'max_depth': 10,
   'max_features': 1,
   '

# Lightgbm

In [None]:
wilderness = df_tree[['Wilderness_Area1', 'Wilderness_Area2', 'Wilderness_Area3', 'Wilderness_Area4']].copy()
df_tree['wilderness_area'] = wilderness.idxmax(1)
soil = df_tree.iloc[:, 14:54]
df_tree['soil_type'] = soil.idxmax(1)
df_tree['wilderness_area_cat'] = df_tree['wilderness_area'].replace(df_tree['wilderness_area'].unique(), np.arange(1, df_tree['wilderness_area'].nunique()+1))
df_tree['soil_type_cat'] = df_tree['soil_type'].replace(df_tree['soil_type'].unique(), np.arange(1, df_tree['soil_type'].nunique()+1))
columns = np.arange(0, 10).tolist() +[ 54, 57, 58]
df_tree_lgb = df_tree.iloc[:, columns].copy()

In [None]:
dfX_tree_lgb = df_tree_lgb.drop("Cover_Type", axis=1)
dfy_tree_lgb = df_tree_lgb['Cover_Type']

In [None]:
X_tree_lgb = dfX_tree_lgb.values
y_tree_lgb = dfy_tree_lgb.values

X.append(X_tree_lgb)
y.append(y_tree_lgb)

data_names.append("tree_lgb")
column_names.append(dfX_tree_lgb.columns.to_list())
n_features.append(X_tree_lgb.shape[1])
n_rows.append(X_tree_lgb.shape[0])


In [None]:
results_lgb = {"model": "lgb"}
score_lgb = []
param = {'num_leaves': 31, 'objective': 'multiclass', 'num_class': 8}
num_round = 10

In [None]:
X_train, X_test, y_train, y_test = train_test_split(X[1], y[1], test_size=0.2, random_state=42)
train_data = lgb.Dataset(X_train, label=y_train, feature_name=column_names[1], free_raw_data=False)
#train_data.save_binary('train.bin')
bst = lgb.train(param, train_data, num_round)
ypred = bst.predict(X_test)
pred = pd.DataFrame(ypred)
predicted = pred.idxmax(axis=1)
acc = accuracy_score(y_test, predicted)
results_lgb.update({data_names[1]: acc})
score_lgb.append(acc)


[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.305983 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 7898
[LightGBM] [Info] Number of data points in the train set: 140000, number of used features: 32
[LightGBM] [Info] Start training from score -0.419191
[LightGBM] [Info] Start training from score -1.071713
[LightGBM] [Info] Start training from score -34.538776
[LightGBM] [Info] Start training from score -34.538776
[LightGBM] [Info] Start training from score -34.538776
[LightGBM] [Info] Start training from score -34.538776
[LightGBM] [Info] Start training from score -34.538776
[LightGBM] [Info] Start training from score -34.538776


In [None]:
X_train, X_test, y_train, y_test = train_test_split(X[2], y[2], test_size=0.2, random_state=42)
train_data = lgb.Dataset(X_train, label=y_train, feature_name=column_names[2], free_raw_data=False)
#train_data.save_binary('train.bin')
bst = lgb.train(param, train_data, num_round)
ypred = bst.predict(X_test)
pred = pd.DataFrame(ypred)
predicted = pred.idxmax(axis=1)
acc = accuracy_score(y_test, predicted)
results_lgb.update({data_names[2]: acc})
score_lgb.append(acc)

[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.000940 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 1485
[LightGBM] [Info] Number of data points in the train set: 7530, number of used features: 23
[LightGBM] [Info] Start training from score -0.650001
[LightGBM] [Info] Start training from score -0.738239
[LightGBM] [Info] Start training from score -34.538776
[LightGBM] [Info] Start training from score -34.538776
[LightGBM] [Info] Start training from score -34.538776
[LightGBM] [Info] Start training from score -34.538776
[LightGBM] [Info] Start training from score -34.538776
[LightGBM] [Info] Start training from score -34.538776


In [None]:
X_train, X_test, y_train, y_test = train_test_split(X[3], y[3], test_size=0.2, random_state=42)
train_data = lgb.Dataset(X_train, label=y_train, feature_name=column_names[3], free_raw_data=False)
#train_data.save_binary('train.bin')
bst = lgb.train(param, train_data, num_round)
ypred = bst.predict(X_test)
pred = pd.DataFrame(ypred)
predicted = pred.idxmax(axis=1)
acc = accuracy_score(y_test, predicted)
results_lgb.update({data_names[3]: acc})
score_lgb.append(acc)

[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.005817 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 2170
[LightGBM] [Info] Number of data points in the train set: 46480, number of used features: 12
[LightGBM] [Info] Start training from score -34.538776
[LightGBM] [Info] Start training from score -0.997957
[LightGBM] [Info] Start training from score -0.725996
[LightGBM] [Info] Start training from score -2.778804
[LightGBM] [Info] Start training from score -5.428657
[LightGBM] [Info] Start training from score -4.121385
[LightGBM] [Info] Start training from score -3.526404
[LightGBM] [Info] Start training from score -3.341282


In [None]:
results_lgb.update({"overall score": np.mean(score_lgb)})

In [None]:
results_lgb

{'model': 'lgb',
 'higgs': 1.0,
 'heloc': 0.6898566117896973,
 'tree_lgb': 0.7642199466483091,
 'overall score': 0.8180255194793355}

## TabPNF

In [None]:
!pip install tabpfn



In [None]:
from tabpfn import TabPFNClassifier

In [None]:
X_train, X_test, y_train, y_test = train_test_split(X[0], y[0], test_size=0.2, random_state=42)

In [None]:

X[0][np.random.randint(0, X[0].shape[0], 1024), :]

array([[3259,   51,   13, ...,    0,    1,    0],
       [2403,   88,   21, ...,    0,    0,    0],
       [2887,   15,   15, ...,    0,    0,    0],
       ...,
       [3109,  162,    3, ...,    0,    0,    0],
       [3119,  130,   16, ...,    0,    0,    0],
       [2807,   61,   20, ...,    0,    0,    0]])

In [None]:
results_tabpfn = {"model": name, "parameters": clf.get_params()}
score_tabpfn = []
classifier = TabPFNClassifier(device='cpu', N_ensemble_configurations=32)
for xdata, ydata, data_name in zip(X,y, data_names):
  sample_indexes =  np.random.randint(0, xdata.shape[0], 1280)
  x_sample = xdata[sample_indexes, :]
  y_sample = ydata[sample_indexes]
  X_train, X_test, y_train, y_test = train_test_split(x_sample, y_sample, test_size=256, random_state=42)
  print("Train ", data_name)
  classifier.fit(X_train, y_train)
  print("Test")
  y_eval, p_eval = classifier.predict(X_test, return_winning_probability=True)
  acc = accuracy_score(y_test, y_eval)
  score_tabpfn.append(acc)
  results_tabpfn.update({data_name: acc})
  print("Done")
results_tabpfn.update({"overall score": np.mean(score_tabpfn)})


Loading model that can be used for inference only
Using a Transformer with 25.82 M parameters
Train  tree
Test




Done
Train  higgs
Test




Done
Train  heloc
Test




Done
Train  tree_lgb
Test




Done
Train  tree_lgb
Test




Done


In [None]:
results_tabpfn

  and should_run_async(code)


{'model': 'AdaBoost',
 'parameters': {'data_sampler': <ensemble_tabpfn.samplers.data.BootstrapSampler at 0x7f6bc66e42e0>,
  'feature_sampler': <ensemble_tabpfn.samplers.features.LRPSampler at 0x7f6bc66e7ee0>,
  'max_iters': 100,
  'n_ensemble_configurations': 4},
 'tree': 0.73046875,
 'higgs': 0.9609375,
 'heloc': 0.73828125,
 'tree_lgb': 0.70703125,
 'overall score': 0.765625}

## Ensemble TabPNF


In [None]:
!pip install ensemble-tabpfn

Collecting ensemble-tabpfn
  Downloading ensemble_tabpfn-0.1.1-py3-none-any.whl (19 kB)
Collecting lolP==0.0.4 (from ensemble-tabpfn)
  Downloading lolP-0.0.4.tar.gz (10 kB)
  Preparing metadata (setup.py) ... [?25l[?25hdone
Collecting tabpfn==0.1.8 (from ensemble-tabpfn)
  Downloading tabpfn-0.1.8-py3-none-any.whl (153 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m153.3/153.3 kB[0m [31m5.0 MB/s[0m eta [36m0:00:00[0m
Building wheels for collected packages: lolP
  Building wheel for lolP (setup.py) ... [?25l[?25hdone
  Created wheel for lolP: filename=lolP-0.0.4-py3-none-any.whl size=9431 sha256=2de68a219c3e62f083d2c59c333d75eb70f6e1028bb29d7bd8da5f1cce2638f7
  Stored in directory: /root/.cache/pip/wheels/05/f4/de/c8e475062b672192fdb3a7cf33f4b7d0ef42251ff431c58baf
Successfully built lolP
Installing collected packages: tabpfn, lolP, ensemble-tabpfn
Successfully installed ensemble-tabpfn-0.1.1 lolP-0.0.4 tabpfn-0.1.8


In [None]:
from ensemble_tabpfn import EnsembleTabPFN
from sklearn.metrics import accuracy_score

In [None]:
X_train, X_test, y_train, y_test = train_test_split(X[0], y[0], test_size=0.2, random_state=42)

In [None]:
clf = EnsembleTabPFN(max_iters=100)
clf.fit(X_train, y_train)
y_hat = clf.predict(y_test)
acc = accuracy_score(y_test, y_hat)

Loading model that can be used for inference only
Using a Transformer with 25.82 M parameters


IndexError: ignored

## TapTap


In [None]:
!git clone https://github.com/ZhangTP1996/TapTap.git

In [None]:
!pip install optuna

In [None]:
!pip install datasets

In [None]:
!pip install accelerate -U

In [None]:
import TapTap
from TapTap.taptap.exp_utils import lightgbm_hpo
from TapTap.taptap.taptap import Taptap

In [None]:
from sklearn.datasets import fetch_california_housing
from sklearn.metrics import r2_score
import os
os.environ["CUDA_VISIBLE_DEVICES"] = "0,1,2,3"

In [None]:
def get_score(train_data, test_data, target_col, best_params):
    train_x = train_data.drop(columns=target_col).copy()
    test_x = test_data.drop(columns=target_col).copy()
    train_y = train_data[[target_col]]
    test_y = test_data[[target_col]]
    train_x, val_x, train_y, val_y = train_test_split(train_x, train_y, test_size=0.2, random_state=1)
    gbm = lgb.LGBMRegressor(**best_params)
    gbm.fit(train_x, train_y, eval_set=[(val_x, val_y)], callbacks=[lgb.early_stopping(50, verbose=False)])
    pred = pd.DataFrame(gbm.predict(test_x), index=test_x.index)
    score = r2_score(test_y, pred)
    return score, gbm

In [None]:
data = fetch_california_housing(as_frame=True).frame

In [None]:
target_col = 'MedHouseVal'
task = 'regression'
train_data, test_data, _, _ = train_test_split(
        data, data[[target_col]], test_size=0.25, random_state=42
    )
best_params = lightgbm_hpo(
        data=train_data, target_col=target_col, task=task, n_trials=10, n_jobs=16
    )
original_score, gbm = get_score(
        train_data, test_data, target_col=target_col, best_params=best_params
    )

print("The score training by the original data is", original_score)


In [None]:
model = Taptap(llm='ztphs980/taptap-distill',
                   experiment_dir='./experiment_taptap/',
                   steps=1000,
                   batch_size=8,
                   numerical_modeling='split',
                   gradient_accumulation_steps=2)

In [None]:
 # Fine-tuning
model.fit(train_data, target_col=target_col, task=task)

ImportError: ignored

In [25]:
synthetic_data = model.sample(n_samples=2 * train_data.shape[0],
                                  data=train_data,
                                  task=task,
                                  max_length=1024)

 43%|████▎     | 13200/30960 [30:29<40:33,  7.30it/s]

MedHouseVal is 1. 0 9, PropertySubType is Apartment, Gender is Male, Education is Graduate, TotalWorkingYears is 1, MaritalStatus is Married, Over18 is Y, YearsSinceLastPromotion is 0, JobRole is Sales Executive, DailyRate is 1 4 3. 0 0, YearsAtCompany is 0, JobInvolvement is 3, JobLevel is 3, EmployeeNumber is 3 5 2. 0 0, MonthlyIncome is 3 7 0 6. 0 0, YearsInCurrentRole is 0, EmployeeCount is 1, EnvironmentSatisfaction is 1, JobSatisfaction is 1, WorkLifeBalance is 3, Department is Sales, DistanceFromHome is 1 1, Age is 3 4, TrainingTimesLastYear is 1, MonthlyRate is 1 1 0 8 2. 0 0, StockOptionLevel is 1, YearsWithCurrManager is 0, PercentSalaryHike is 2 0, NumCompaniesWorked is 0, PerformanceRating is 3, HourlyRate is 8 6, StandardHours is 8 0, RelationshipSatisfaction is 3


 51%|█████     | 15800/30960 [36:30<35:41,  7.08it/s]

MedHouseVal is 2. 2 2 6, Age is 3 2, Income is 2 9 9. 0 0, Education is 1. 0 6, AcceptedCmp4 is 0, NumWebVisitsMonth is 5, MntMeatProducts is 6, NumStorePurchases is 3, AcceptedCmp2 is 0, AcceptedCmp3 is 0, NumDealsPurchases is 1, Marital_Status is Single, MntSweetProducts is 1, MntFishProducts is 0, Year_Birth is 1 9 8 7, NumWebPurchases is 1, AcceptedCmp1 is 0, Complain is 0, MntFruits is 0, MntGoldProds is 0, label is 0, ID is 6 6 0 6, NumCatalogPurchases is 0, Teenhome is 1, Kidhome is 1, AcceptedCmp5 is 1, Dt_Customer is 2012-10-19


 81%|████████▏ | 25200/30960 [58:14<13:23,  7.17it/s]

MedHouseVal is 1. 3 8 9, Age is 40-44, Married is u, YearsEmployed is 0. 0 0, Income is 0. 0 0, CreditScore is 0, EducationLevel is c, BankCustomer is g, Employed is t, DriversLicense is t, Male is a, PriorDefault is t, Debt is 1. 3 5, ZipCode is 3 0. 0 0, label is 1, Ethnicity is v


31000it [1:11:41,  7.21it/s]


In [26]:

synthetic_data

Unnamed: 0,MedInc,HouseAge,AveRooms,AveBedrms,Population,AveOccup,Latitude,Longitude,MedHouseVal
0,,,,,,,,,4.345
1,,,,,,,,,1.336
2,,,,,,,,,1.228
3,,,,,,,,,1.779
4,,,,,,,,,3.273
...,...,...,...,...,...,...,...,...,...
30955,,,,,,,,,0.704
30956,,,,,,,,,3.612
30957,,,,,,,,,1.969
30958,,,,,,,,,2.250


In [None]:
if __name__ == '__main__':

    # Fine-tuning
    model.fit(train_data, target_col=target_col, task=task)

    # Sampling
    synthetic_data = model.sample(n_samples=2 * train_data.shape[0],
                                  data=train_data,
                                  task=task,
                                  max_length=1024)

    # Label generation
    synthetic_data[target_col] = gbm.predict(synthetic_data.drop(columns=[target_col]))

    # Training using synthetic data
    new_score, _ = get_score(
        synthetic_data, test_data, target_col=target_col, best_params=best_params
    )
    print("The score training by the synthetic data is", new_score)

# PCA approach

In [None]:
train_test_tree = [train_test_split(X[0], y[0], test_size=0.2, random_state=42)]
train_test_higgs = [train_test_split(X[1], y[1], test_size=0.2, random_state=42)]
train_test_heloc = [train_test_split(X[2], y[2], test_size=0.2, random_state=42)]
train_test_tree_lgb = [train_test_split(X[3], y[3], test_size=0.2, random_state=42)]

train_test_sets =[train_test_tree, train_test_higgs, train_test_heloc, train_test_tree_lgb]

In [None]:
from sklearn.decomposition import PCA

In [None]:
pca = PCA(n_components=10)

In [None]:
reduced = []
for sets in train_test_sets:
  x_red = pca.fit_transform(sets[0])
  reudced.append(x_red)

ValueError: ignored

In [None]:
train_test_sets[0][0]

[array([[2959,  141,    9, ...,    0,    0,    0],
        [2023,  327,   23, ...,    0,    0,    0],
        [2916,   48,   15, ...,    0,    0,    0],
        ...,
        [2816,   46,   16, ...,    0,    0,    0],
        [3391,  295,    5, ...,    0,    1,    0],
        [2981,  228,   12, ...,    0,    0,    0]]),
 array([[3123,  108,   18, ...,    0,    0,    0],
        [2613,  351,   22, ...,    0,    0,    0],
        [2930,  115,    5, ...,    0,    0,    0],
        ...,
        [2989,  250,   15, ...,    0,    0,    0],
        [2692,   56,   25, ...,    0,    0,    0],
        [3058,   84,    7, ...,    0,    0,    0]]),
 array([2, 6, 2, ..., 2, 1, 2]),
 array([1, 2, 1, ..., 2, 2, 1])]

In [None]:
pd.DataFrame(x_red)

Unnamed: 0,0,1,2,3,4,5,6,7,8,9
0,1210.401571,-532.007606,-103.171202,-2.574482,-23.904498,-15.519595,11.999928,-19.865149,0.329019,0.529708
1,-2316.232185,-192.370372,-729.204378,362.269525,172.041242,-5.995157,-10.791199,22.583427,-2.764584,-0.548289
2,-2427.807700,-99.095786,125.890790,38.853706,-118.435798,42.719735,-2.042259,-1.373556,-4.384630,-1.102633
3,-752.758505,612.943063,354.503096,-45.260034,-8.911447,11.439500,12.228931,-23.292639,2.118009,-0.419616
4,2998.059511,1327.917539,-113.798720,140.408579,-71.634883,-46.499336,-21.853731,-4.082356,-3.575779,2.413408
...,...,...,...,...,...,...,...,...,...,...
46475,3267.371113,1511.260385,-346.822264,-71.706223,215.258272,21.310534,20.889164,23.045046,-2.563291,-1.511510
46476,-706.584359,-1365.494774,-265.630773,-123.105845,97.174837,-11.769574,-43.042744,-11.354243,5.134728,-0.940294
46477,-578.019435,1028.530157,-81.898954,20.996640,-104.252235,-2.812811,5.222631,13.151927,-1.690675,-0.638685
46478,565.105662,-361.192949,359.586892,-146.863305,137.791559,-36.173782,10.134307,-1.109559,-5.559620,0.860054


# Merge approach

In [27]:
pd.concat([dfX_tree, dfX_higgs], axis="index")

Unnamed: 0,Elevation,Aspect,Slope,Horizontal_Distance_To_Hydrology,Vertical_Distance_To_Hydrology,Horizontal_Distance_To_Roadways,Hillshade_9am,Hillshade_Noon,Hillshade_3pm,Horizontal_Distance_To_Fire_Points,...,PRI_met_sumet,PRI_jet_num,PRI_jet_leading_pt,PRI_jet_leading_eta,PRI_jet_leading_phi,PRI_jet_subleading_pt,PRI_jet_subleading_eta,PRI_jet_subleading_phi,PRI_jet_all_pt,Weight
0,3351.0,206.0,27.0,726.0,124.0,3813.0,192.0,252.0,180.0,2271.0,...,,,,,,,,,,
1,2732.0,129.0,7.0,212.0,1.0,1082.0,231.0,236.0,137.0,912.0,...,,,,,,,,,,
2,2572.0,24.0,9.0,201.0,25.0,957.0,216.0,222.0,142.0,2191.0,...,,,,,,,,,,
3,2824.0,69.0,13.0,417.0,39.0,3223.0,233.0,214.0,110.0,6478.0,...,,,,,,,,,,
4,2529.0,84.0,5.0,120.0,9.0,1092.0,227.0,231.0,139.0,4983.0,...,,,,,,,,,,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
174995,,,,,,,,,,,...,126.918,1.0,39.902,-2.439,-1.788,-999.000,-999.000,-999.000,39.902,0.018636
174996,,,,,,,,,,,...,126.406,0.0,-999.000,-999.000,-999.000,-999.000,-999.000,-999.000,0.000,5.607815
174997,,,,,,,,,,,...,156.983,0.0,-999.000,-999.000,-999.000,-999.000,-999.000,-999.000,0.000,0.018636
174998,,,,,,,,,,,...,726.227,3.0,175.903,-0.460,-1.340,94.014,2.118,-3.067,387.890,0.001502
