### Ensemble of Kaggle's top 3 models

In [1]:
import numpy as np
import pandas as pd

from scipy.stats import mode
from sklearn.preprocessing import LabelEncoder

In [2]:
# Load the three prediction files that are ensembled below. Each file is read for
# its 'class' column; 'id' is taken from the first file when outputs are built.
# NOTE(review): the variable names do not match the file names (`autogluon_preds_df`
# reads an "avg_ensemble" file, `lgbm_preds_df` reads "autogluon") -- confirm which
# model each file really holds, because the per-model scores are assigned by these names.
# NOTE(review): the first file name resembles the output of the averaging cell
# ('19_17_avg_ensemble_predictions.csv') -- confirm that feeding an earlier ensemble
# back in is intended.
autogluon_preds_df = pd.read_csv('19_17_avg_ensemble_predictions-0.98531.csv')
xgb_preds_df = pd.read_csv('19_17_xgb,lgbm,cat,.csv')
lgbm_preds_df = pd.read_csv('19_17_autogluon.csv')

# The predictions are combined row by row and only `autogluon_preds_df['id']` is
# written to the outputs, so every file must list the same rows in the same order.
# Fail here with a clear message instead of silently mixing predictions of different ids.
reference_ids = autogluon_preds_df['id'].to_numpy()
for source_name, frame in (('xgb_preds_df', xgb_preds_df), ('lgbm_preds_df', lgbm_preds_df)):
    if len(frame) != len(reference_ids):
        raise ValueError(
            f"{source_name} has {len(frame)} rows, expected {len(reference_ids)}"
        )
    # Only compare ids when the file actually carries an 'id' column.
    if 'id' in frame.columns and not np.array_equal(frame['id'].to_numpy(), reference_ids):
        raise ValueError(f"{source_name} ids are not in the same order as autogluon_preds_df ids")

autogluon_preds = autogluon_preds_df['class']
xgb_preds = xgb_preds_df['class']
lgbm_preds = lgbm_preds_df['class']

In [3]:
# Encode the string class labels as integer codes so the models' votes can be
# combined numerically. The encoder reads the untouched DataFrame columns rather than
# `autogluon_preds` / `xgb_preds` / `lgbm_preds`, so re-running this cell does not
# re-encode already-encoded integers (which would make `le.inverse_transform`
# return 0/1 instead of the original labels).
le = LabelEncoder()

raw_label_columns = [
    autogluon_preds_df['class'],
    xgb_preds_df['class'],
    lgbm_preds_df['class'],
]

# Fit on the labels of all three files, not only the first, so a label missing
# from one file cannot make `transform` fail on another.
le.fit(np.concatenate([column.to_numpy() for column in raw_label_columns]))

# The rounding / thresholding in the ensemble cells is only meaningful for codes 0 and 1.
assert len(le.classes_) <= 2, f"expected at most two classes, found {list(le.classes_)}"

autogluon_preds, xgb_preds, lgbm_preds = (
    le.transform(column) for column in raw_label_columns
)

In [4]:
# Score attached to each model (hard-coded).
# NOTE(review): presumably leaderboard scores of the loaded files -- confirm they
# belong to the files actually read into the matching *_preds_df variables.
autogluon_score, xgb_score, lgbm_score = 0.98539, 0.98536, 0.98533

# Each weight is the model's share of the summed scores, so the three weights add up to 1.
total_score = sum((autogluon_score, xgb_score, lgbm_score))

autogluon_weight, xgb_weight, lgbm_weight = (
    score / total_score
    for score in (autogluon_score, xgb_score, lgbm_score)
)

In [5]:
# Score-weighted vote: scale each model's encoded labels by its normalised score
# and add the scaled votes row by row.
model_weights = (autogluon_weight, xgb_weight, lgbm_weight)
model_votes = (autogluon_preds, xgb_preds, lgbm_preds)
ensemble_preds = sum(weight * votes for weight, votes in zip(model_weights, model_votes))

# Round the blended value to the nearest integer label code (assumes codes 0/1),
# then map the codes back to the original class labels.
rounded_codes = np.round(ensemble_preds).astype(int)
final_preds_ensemble = le.inverse_transform(rounded_codes)

# Output frame: ids from the first prediction file, labels from the ensemble.
submission_columns = {'id': autogluon_preds_df['id'], 'class': final_preds_ensemble}
ensemble_output = pd.DataFrame(submission_columns)

ensemble_output.head(2)

Unnamed: 0,id,class
0,3116945,e
1,3116946,p


In [6]:
# Write the score-weighted ensemble predictions to CSV, leaving out the DataFrame index.
ensemble_output.to_csv(path_or_buf='19_17_ensemble_predictions.csv', index=False)

In [7]:
# Unweighted vote: per-row mean of the three models' encoded labels.
vote_total = autogluon_preds + xgb_preds + lgbm_preds
avg_ensemble_preds = vote_total / 3

# Round the mean to the nearest integer label code (assumes codes 0/1) and
# translate the codes back into the original class labels.
rounded_avg_codes = np.round(avg_ensemble_preds).astype(int)
final_preds_avg_ensemble = le.inverse_transform(rounded_avg_codes)

# Output frame: ids from the first prediction file, labels from the averaged vote.
avg_ensemble_output = pd.DataFrame(
    {'id': autogluon_preds_df['id'], 'class': final_preds_avg_ensemble}
)

avg_ensemble_output.head(2)

Unnamed: 0,id,class
0,3116945,e
1,3116946,p


In [8]:
# Write the plain-average ensemble predictions to CSV, leaving out the DataFrame index.
avg_ensemble_output.to_csv(path_or_buf='19_17_avg_ensemble_predictions.csv', index=False)

In [9]:
# Arrange the encoded predictions as one row per model (n_models x n_rows).
# "Stacked" here refers to np.stack, not to a stacking meta-model.
stacked_preds = np.stack([autogluon_preds, xgb_preds, lgbm_preds], axis=0)

# Column-wise total of the label codes; with codes 0/1 this is the number of
# models that voted for code 1 on each row.
sum_preds = stacked_preds.sum(axis=0)

# Strict majority: code 1 wins only when more than half of the models chose it.
n_models = stacked_preds.shape[0]
majority_mask = sum_preds > (n_models // 2)
stacked_ensemble_preds = le.inverse_transform(majority_mask.astype(int))

# Output frame: ids from the first prediction file, labels from the majority vote.
stacked_ensemble_output = pd.DataFrame(
    {'id': autogluon_preds_df['id'], 'class': stacked_ensemble_preds}
)

stacked_ensemble_output.head(2)

Unnamed: 0,id,class
0,3116945,e
1,3116946,p


In [10]:
# Write the majority-vote ensemble predictions to CSV, leaving out the DataFrame index.
stacked_ensemble_output.to_csv(path_or_buf='19_17_stacked_ensemble_predictions.csv', index=False)

In [11]:
# Per-model weights in the same order as the rows of `stacked_preds`
# (autogluon, xgb, lgbm); `stacked_preds` comes from the majority-vote cell.
weights = np.asarray((autogluon_weight, xgb_weight, lgbm_weight))

# Weighted mean of the encoded votes for every row.
weighted_preds = np.average(stacked_preds, axis=0, weights=weights)

# Code 1 wins when the weighted mean is strictly above 0.5; the resulting codes are
# mapped back to class labels. (The variable name keeps its original spelling.)
above_half = weighted_preds > 0.5
weighted_stcked_ensemble_preds = le.inverse_transform(above_half.astype(int))

# Output frame: ids from the first prediction file, labels from the weighted vote.
weighted_stacked_ensemble_output = pd.DataFrame(
    {'id': autogluon_preds_df['id'], 'class': weighted_stcked_ensemble_preds}
)

weighted_stacked_ensemble_output.head(2)

Unnamed: 0,id,class
0,3116945,e
1,3116946,p


In [12]:
# Write the weighted-vote ensemble predictions to CSV, leaving out the DataFrame index.
weighted_stacked_ensemble_output.to_csv(
    path_or_buf='19_17_weighted_stacked_ensemble_predictions.csv',
    index=False,
)

In [13]:
# Number of rows assigned to each class by the score-weighted ensemble.
ensemble_class_counts = ensemble_output['class'].value_counts()
ensemble_class_counts

class
p    1133805
e     944159
Name: count, dtype: int64

In [14]:
# Number of rows assigned to each class by the plain-average ensemble.
avg_ensemble_class_counts = avg_ensemble_output['class'].value_counts()
avg_ensemble_class_counts

class
p    1133805
e     944159
Name: count, dtype: int64

In [15]:
# Number of rows assigned to each class by the majority-vote ensemble.
stacked_ensemble_class_counts = stacked_ensemble_output['class'].value_counts()
stacked_ensemble_class_counts

class
p    1133805
e     944159
Name: count, dtype: int64

In [16]:
# Number of rows assigned to each class by the weighted-vote ensemble.
weighted_stacked_class_counts = weighted_stacked_ensemble_output['class'].value_counts()
weighted_stacked_class_counts

class
p    1133805
e     944159
Name: count, dtype: int64