# IMPORTS

In [None]:
import numpy as np 
import pandas as pd 
from sklearn.preprocessing import MinMaxScaler
import warnings

# Ignore all warnings: no category or message is given, so this filter
# silences every warning category (deprecation notices included) raised
# after this point. Remove it while debugging.
warnings.filterwarnings("ignore")

In [None]:
# Name of the prediction column: `target` is the bare column name and
# `targets` is the one-element list form used for DataFrame column selection.
target = "Response"
targets = [target]

# GENERALIZATION CONCEPT

<font size="3"> We can use different types of means such as Geometric Mean, Arithmetic Mean, and Harmonic Mean</font>

 * **Geometric Mean:**
The Geometric Mean is calculated by taking the nth root of the product of n numbers.

Geometric Mean = (x₁ * x₂ * x₃ * ... * xₙ)^(1/n)

* **Arithmetic Mean:**
The Arithmetic Mean is the sum of all the numbers in a series divided by the total number of values.

Arithmetic Mean = (x₁ + x₂ + x₃ + ... + xₙ) / n

* **Harmonic Mean:**
The Harmonic Mean is calculated by taking the reciprocal of the arithmetic mean of the reciprocals of the numbers in the series.

Harmonic Mean = n / ((1/x₁) + (1/x₂) + (1/x₃) + ... + (1/xₙ))

where x₁ , x₂ , x₃ , ... ,xₙ are vectors predicted from each submission

<font size="3">For a set of positive predictions, the Harmonic Mean is never larger than the Geometric Mean, which in turn is never larger than the Arithmetic Mean (HM ≤ GM ≤ AM, with equality only when all predictions are equal)</font>

**When to use:**
1. Geometric Mean is less sensitive to extreme values (outliers) in the data than the Arithmetic Mean. This can give better results and reduce overfitted predictions.
2. Arithmetic Mean is sensitive to extreme values: one very large or very small number can significantly affect the mean. It is applicable in cases where large predictions are important.
3. Harmonic Mean gives more weight to the smaller numbers among the predictions, making it a risk-averse way to establish a safe prediction.



In [None]:
def ensemble_mean(sub_list, cols, mean="AM", weights=None):
    """
    Blend several submissions column by column with a (weighted) mean.

    Parameters
    ----------
    sub_list : list of pandas.DataFrame
        Submissions to blend. The first one is copied and used as the
        template for the result, so every column not listed in ``cols``
        is taken from it unchanged. The inputs themselves are not modified.
    cols : iterable of str
        Names of the columns to average; each must exist in every submission.
    mean : {"AM", "GM", "HM"}, default "AM"
        Arithmetic, geometric or harmonic mean.
    weights : sequence of numbers, optional
        One weight per submission, in the same order as ``sub_list``.
        ``None`` (the default) weights every submission equally. Integer
        weights give the same result as repeating each submission that
        many times in ``sub_list``.

    Returns
    -------
    pandas.DataFrame
        Copy of ``sub_list[0]`` with the ``cols`` columns replaced by the
        blended values.

    Raises
    ------
    ValueError
        If ``mean`` is not one of the supported names, ``sub_list`` is empty,
        ``weights`` does not have one entry per submission, or the weights
        do not sum to a positive number.

    Notes
    -----
    The geometric and harmonic means are only meaningful for positive
    values. A zero prediction with a positive weight yields a blended
    value of 0 for both.
    """
    if mean not in ("AM", "GM", "HM"):
        # Previously an unknown name silently returned the first submission.
        raise ValueError(f'mean must be "AM", "GM" or "HM", got {mean!r}')

    n_subs = len(sub_list)
    if n_subs == 0:
        raise ValueError("sub_list must contain at least one submission")

    if weights is None:
        weights = [1.0] * n_subs
    else:
        weights = [float(w) for w in weights]
        if len(weights) != n_subs:
            raise ValueError(
                f"expected {n_subs} weights (one per submission), "
                f"got {len(weights)}"
            )

    total = sum(weights)
    if total <= 0:
        raise ValueError("weights must sum to a positive number")

    pairs = list(zip(sub_list, weights))
    sub_out = sub_list[0].copy()

    for col in cols:
        if mean == "AM":
            sub_out[col] = sum(w * df[col] for df, w in pairs) / total
        elif mean == "GM":
            # Average in log space: multiplying many values in (0, 1)
            # underflows to 0 long before the root is taken.
            with np.errstate(divide="ignore"):  # log(0) -> -inf is intended
                log_mean = sum(w * np.log(df[col]) for df, w in pairs) / total
            sub_out[col] = np.exp(log_mean)
        else:  # "HM"
            sub_out[col] = total / sum(w / df[col] for df, w in pairs)

    return sub_out

## Selected Public Notebooks

Thanks to the authors for their good work

1. by [@rohanrao](https://www.kaggle.com/code/rohanrao/automl-grand-prix-1st-place-solution)
2. by [@thegodchurch](https://www.kaggle.com/code/thegodchurch/lightbgm-finetuned)
3. by [@ravi20076](https://www.kaggle.com/code/ravi20076/playgrounds4e07-eda-baseline-v1)
4. by [@rzatemizel](https://www.kaggle.com/code/rzatemizel/single-model-baseline-xgboost)
5. by [@innixma](https://www.kaggle.com/code/innixma/4th-place-automl-grand-prix-automl-grandmasters)


In [None]:
# Load one submission per public notebook; the fourth is stored as parquet,
# the others as CSV.
sub_ext1 = pd.read_csv(
    "/kaggle/input/automl-grand-prix-1st-place-solution/submission.csv"
)
sub_ext2 = pd.read_csv(
    "/kaggle/input/lightbgm-finetuned/submission.csv"
)
sub_ext3 = pd.read_csv(
    "/kaggle/input/playgrounds4e07-eda-baseline-v1/submission_V1.csv"
)
sub_ext4 = pd.read_parquet(
    "/kaggle/input/single-model-baseline-xgboost/submission.parquet"
)
sub_ext5 = pd.read_csv(
    "/kaggle/input/4th-place-automl-grand-prix-automl-grandmasters/submission.csv"
)

def scale(df, targets):
    """
    Min-max scale the ``targets`` columns of ``df`` and return ``df``.

    A fresh ``MinMaxScaler`` is fitted on the given frame only, so each
    frame is rescaled independently using its own column minimum and
    maximum. The frame is modified in place; the returned object is the
    same ``df`` that was passed in.
    """
    scaler = MinMaxScaler()
    rescaled = scaler.fit_transform(df[targets])
    # Write the scaled values back into the caller's frame.
    df[targets] = rescaled
    return df

# Put every submission on the same scale before blending them.
sub_ext1, sub_ext2, sub_ext3, sub_ext4, sub_ext5 = (
    scale(frame, targets)
    for frame in (sub_ext1, sub_ext2, sub_ext3, sub_ext4, sub_ext5)
)

## Choose Weights

There are two ways to assign weights:
1) Assign higher weights to the submissions with a higher Public LB score. This is mostly trial and error aimed at increasing the Public LB, so it makes more sense to simply rank the submissions by their Public LB scores.
2) A better way is to rank them by their CV score and assign a higher rank to a higher CV result. Using ordered numbers (n, n-1, n-2, …) as weights generalizes better.

In [None]:
sub_list = [sub_ext1, sub_ext2, sub_ext3, sub_ext4, sub_ext5]  # list all the results

# One weight per submission, in the same order as sub_list. Squaring the
# ranks widens the gap between higher- and lower-ranked submissions.
weights = np.square([4, 4, 3, 2, 1])
if len(sub_list) != len(weights):
    # Fail here with a clear message. Skipping the assignment below would
    # surface later as a NameError, or silently reuse a weighted_list left
    # over from an earlier run of this cell.
    raise ValueError(
        f"expected one weight per submission, got {len(weights)} weights "
        f"for {len(sub_list)} submissions"
    )

# Repeat each submission `weight` times so that the plain mean computed
# below acts as a weighted mean.
weighted_list = [
    sub for sub, weight in zip(sub_list, weights) for _ in range(weight)
]

sub_ensemble = ensemble_mean(weighted_list, targets, mean="AM")
sub_ensemble.to_csv('submission.csv', index=False)
sub_ensemble.head()