In [None]:
import itertools as it

import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import seaborn as sns
import xgboost as xgb
from mlxtend.regressor import StackingCVRegressor
from scipy import signal
from scipy.stats import spearmanr, stats
from sklearn.cluster import KMeans
from sklearn.decomposition import PCA
from sklearn.ensemble import BaggingRegressor, RandomForestRegressor, ExtraTreesRegressor, AdaBoostRegressor, GradientBoostingRegressor
from sklearn.feature_selection import mutual_info_regression as mir
from sklearn.linear_model import LinearRegression, Ridge, RidgeCV, Lasso, ElasticNet
from sklearn.metrics import make_scorer, mean_absolute_percentage_error as mape, mean_squared_error
from sklearn.model_selection import train_test_split, KFold, TimeSeriesSplit
from sklearn.neighbors import KNeighborsRegressor
from sklearn.neural_network import MLPRegressor
from sklearn.preprocessing import PolynomialFeatures, OneHotEncoder as onehot
from sklearn.svm import SVR
from sklearn.tree import DecisionTreeRegressor
from statsmodels.tsa.deterministic import DeterministicProcess, Fourier
from statsmodels.tsa.stattools import pacf
from xgboost import XGBRegressor

In [None]:
# Per-market tables read from ./data: training features, training targets,
# and the features of the rows to be predicted.
de_train_final, y_de_train, de_test_final = (
    pd.read_csv(csv_path)
    for csv_path in (
        './data/de_train_final.csv',
        './data/y_de_train.csv',
        './data/de_test_final.csv',
    )
)
fr_train_final, y_fr_train, fr_test_final = (
    pd.read_csv(csv_path)
    for csv_path in (
        './data/fr_train_final.csv',
        './data/y_fr_train.csv',
        './data/fr_test_final.csv',
    )
)

In [None]:
# Preview the first rows of the DE training features.
de_train_final.head()

In [None]:
# Report the shape of every table loaded above, in loading order.
loaded_tables = {
    "de_train_final": de_train_final,
    "y_de_train": y_de_train,
    "fr_train_final": fr_train_final,
    "y_fr_train": y_fr_train,
    "de_test_final": de_test_final,
    "fr_test_final": fr_test_final,
}
for table_name, table in loaded_tables.items():
    print(f"{table_name} with shape:", table.shape)

In [None]:
# Baseline: plain linear regression on the DE data, scored with the Spearman
# rank correlation on a 30% hold-out split.
X = de_train_final
y = y_de_train

x_train, x_test, y_train, y_test = train_test_split(
    X, y, test_size=0.30, random_state=1
)

# fit() returns the estimator itself, so construction and training chain.
lr_model = LinearRegression().fit(x_train, y_train)
y_pred_lr = lr_model.predict(x_test)

# spearmanr returns (correlation, p-value); only the correlation is reported.
spearman_corr_lr = spearmanr(y_test, y_pred_lr)[0]
spearman_corr_lr_value = spearman_corr_lr.item()

print(f"Spearman Correlation for Linear Regression: {spearman_corr_lr_value:.1%}")

In [None]:
# Hold out 30% of each market's rows for evaluation; both splits share the
# same seed so they are reproducible.
X_de, y_de = de_train_final, y_de_train
X_fr, y_fr = fr_train_final, y_fr_train

X_trainde, X_testde, Y_trainde, Y_testde = train_test_split(
    X_de, y_de, test_size=0.30, random_state=42
)
X_trainfr, X_testfr, Y_trainfr, Y_testfr = train_test_split(
    X_fr, y_fr, test_size=0.30, random_state=42
)

In [None]:

def metric_train(output, truth):
    """Return the Spearman rank correlation between predictions and truth.

    Parameters
    ----------
    output : array-like
        Predicted values.
    truth : array-like
        Observed values, aligned element-wise with ``output``.

    Returns
    -------
    The ``correlation`` field of ``scipy.stats.spearmanr(output, truth)``;
    the accompanying p-value is discarded.
    """
    rank_test = spearmanr(output, truth)
    return rank_test.correlation


def get_model(model_name, best_param=None):
    """Build an unfitted regressor from its short name.

    Parameters
    ----------
    model_name : str
        One of 'dt', 'extra_trees', 'rf', 'adaboost', 'gradient_boosting',
        'xgboost', 'bagging_ridge', 'bagging_knn', 'bagging_svr' or
        'bagging_linear'.
    best_param : dict, optional
        Hyper-parameters. For the plain models the dict is passed straight to
        the constructor. For the 'bagging_*' models it may hold two optional
        sub-dicts: 'base_model' (keyword arguments of the wrapped regressor)
        and 'model' (keyword arguments of the BaggingRegressor). Entries in
        'model' override the defaults ``n_estimators=10`` and
        ``random_state=42``.

    Returns
    -------
    An unfitted estimator instance.

    Raises
    ------
    ValueError
        If ``model_name`` is not one of the names listed above.
    """
    params = best_param if best_param else {}

    plain_models = {
        'dt': DecisionTreeRegressor,
        'extra_trees': ExtraTreesRegressor,
        'rf': RandomForestRegressor,
        'adaboost': AdaBoostRegressor,
        'gradient_boosting': GradientBoostingRegressor,
        'xgboost': XGBRegressor,
    }
    bagging_bases = {
        'bagging_ridge': Ridge,
        'bagging_knn': KNeighborsRegressor,
        'bagging_svr': SVR,
        'bagging_linear': LinearRegression,
    }

    if model_name in plain_models:
        return plain_models[model_name](**params)

    if model_name in bagging_bases:
        base_model = bagging_bases[model_name](**(params.get('base_model') or {}))
        # Defaults come first so a tuned 'model' dict may override them instead
        # of raising "got multiple values for keyword argument".
        bagging_kwargs = {'n_estimators': 10, 'random_state': 42,
                          **(params.get('model') or {})}
        # The wrapped estimator is passed positionally: the keyword was renamed
        # from `base_estimator` to `estimator` in scikit-learn 1.2 and the old
        # name was removed in 1.4, while the first positional slot is stable.
        return BaggingRegressor(base_model, **bagging_kwargs)

    raise ValueError('Unknown Model')
        
# Scorer object built from metric_train with make_scorer's default settings.
# It is not referenced by any other cell visible in this notebook.
scorer_train = make_scorer(metric_train)

In [None]:
# Short names understood by get_model, with the estimator each one builds.
model_names = [
    'dt',  # Decision Tree Regressor
    'bagging_ridge',  # Bagging model based on Ridge regression
    'extra_trees',  # Extra Trees Regressor
    'rf',  # Random Forest Regressor
    'bagging_knn',  # Bagging model based on KNN regression
    'bagging_svr',  # Bagging model based on SVR
    'bagging_linear',  # Bagging model based on Linear regression
    'adaboost',  # AdaBoost Regressor
    'gradient_boosting',  # Gradient Boosting Regressor
    'xgboost',  # Xgboost Regressor
]


def fit_and_score(model, X_train, y_train, X_test, y_test):
    """Fit ``model`` on the training split and score it on the test split.

    Returns
    -------
    tuple
        ``(predictions, score)``: the predictions for ``X_test`` and their
        Spearman correlation with ``y_test`` as computed by metric_train.
    """
    model.fit(X_train, y_train)
    predictions = model.predict(X_test)
    return predictions, metric_train(predictions, y_test)


# Train and evaluate every candidate on each market's hold-out split.
results = []

for model_name in model_names:
    model = get_model(model_name)

    # The same instance is fitted on DE first and then refitted on FR.
    predictions_de, score_de = fit_and_score(model, X_trainde, Y_trainde, X_testde, Y_testde)
    predictions_fr, score_fr = fit_and_score(model, X_trainfr, Y_trainfr, X_testfr, Y_testfr)

    # Overall score: Spearman correlation over both hold-out sets pooled.
    predictions_overall = np.concatenate((predictions_de, predictions_fr))
    truth_overall = np.concatenate((Y_testde, Y_testfr))
    score_overall = metric_train(predictions_overall, truth_overall)

    results.append({
        'Model': model_name,
        'DE_Score': score_de,
        'FR_Score': score_fr,
        'Overall_Score': score_overall,
    })

# One row per model; pandas is already imported in the first cell.
results_df = pd.DataFrame(results)

In [None]:
# Per-model Spearman scores: DE hold-out, FR hold-out, and both pooled.
print(results_df)

## Feature Importance

In [None]:
def plot_xgb_feature_importance(X_train, y_train, title):
    """Fit a default XGBRegressor and plot its feature importances.

    Parameters
    ----------
    X_train : pandas.DataFrame
        Training features; the column names label the bars.
    y_train : array-like
        Training target passed to ``fit``.
    title : str
        Title of the horizontal bar chart.

    Returns
    -------
    tuple
        ``(model, importance)``: the fitted regressor and a DataFrame with
        the columns 'Feature Name' and 'Importance', sorted by importance in
        descending order.
    """
    model = XGBRegressor()
    model.fit(X_train, y_train)

    importance = pd.DataFrame({
        'Feature Name': X_train.columns.tolist(),
        'Importance': model.feature_importances_,
    }).sort_values(by='Importance', ascending=False)

    fig, ax = plt.subplots(figsize=(12, 8))
    ax.barh(importance['Feature Name'], importance['Importance'])
    ax.set_xlabel('Importance')
    ax.set_title(title)
    ax.invert_yaxis()  # To display the most important feature at the top
    plt.show()
    return model, importance


# German dataset
model_de, feature_importance_de = plot_xgb_feature_importance(
    X_trainde, Y_trainde, 'Feature Importance for DE Dataset'
)
feature_importances_de = model_de.feature_importances_
feature_names_de = X_trainde.columns.tolist()

# French dataset
model_fr, feature_importance_fr = plot_xgb_feature_importance(
    X_trainfr, Y_trainfr, 'Feature Importance for FR Dataset'
)
feature_importances_fr = model_fr.feature_importances_
feature_names_fr = X_trainfr.columns.tolist()



## Tuning

In [None]:
best_models = results_df.nlargest(5, 'Overall_Score')['Model']

estimators = []
for model_name in best_models:
    if model_name == 'dt':
        estimator = DecisionTreeRegressor()
    elif model_name == 'bagging_ridge':
        estimator = BaggingRegressor(base_estimator=RidgeCV())
    elif model_name == 'extra_trees':
        estimator = ExtraTreesRegressor()
    elif model_name == 'rf':
        estimator = RandomForestRegressor()
    elif model_name == 'bagging_knn':
        estimator = BaggingRegressor(base_estimator=KNeighborsRegressor())
    elif model_name == 'bagging_svr':
        estimator = BaggingRegressor(base_estimator=SVR())
    elif model_name == 'bagging_linear':
        estimator = BaggingRegressor(base_estimator=LinearRegression())
    elif model_name == 'adaboost':
        estimator = AdaBoostRegressor()
    elif model_name == 'gradient_boosting':
        estimator = GradientBoostingRegressor()
    elif model_name == 'xgboost':
        estimator = XGBRegressor()
    else:
        raise ValueError('Unknown Model')
    
    estimators.append((model_name, estimator))

# Define Stacking Model
stacking_model = StackingCVRegressor(regressors=[estimator for _, estimator in estimators], 
                                     meta_regressor=RidgeCV(),
                                     cv=5,
                                     use_features_in_secondary=True,
                                     random_state=42)

# Train and evaluate models
results = []

model_name = 'stacking_model'

model = stacking_model
    
# Train on DE dataset
model.fit(X_trainde, Y_trainde)
predictions_de = model.predict(X_testde)
score_de = metric_train(predictions_de, Y_testde)

# Train on FR dataset
model.fit(X_trainfr, Y_trainfr)
predictions_fr = model.predict(X_testfr)  
score_fr = metric_train(predictions_fr, Y_testfr)

# Overall Score
predictions_overall = np.concatenate((predictions_de, predictions_fr))
truth_overall = np.concatenate((Y_testde, Y_testfr))
score_overall = metric_train(predictions_overall, truth_overall)

results.append({
    'Model': model_name,
    'DE_Score': score_de,
    'FR_Score': score_fr,
    'Overall_Score': score_overall  
})

# Convert results to DataFrame for better visualization
results_df = pd.DataFrame(results)


In [None]:
print(best_models)

In [None]:
print(results_df)

In [None]:
de_train_pred = stacking_model.predict(X_de)
de_train = de_train_final.reset_index()
de_train['TARGET'] = de_train_pred
de_train = de_train[['ID', 'TARGET']]


In [None]:
# Print Coefficient of each base model

meta_coef = stacking_model.meta_regr_.coef_
meta_coef_base_models = meta_coef[:len(estimators)]
model_coefficients = dict(zip([name for name, _ in estimators], meta_coef_base_models))

for model_name, coef in model_coefficients.items():
    print(f"{model_name}: {coef}")

coefficients_df = pd.DataFrame(list(model_coefficients.items()), columns=['Model', 'Coefficient'])
print(coefficients_df)


In [None]:
de_test = stacking_model.predict(de_test_final)
de_test_pred = de_test_final.reset_index()
de_test_pred['TARGET'] = de_test
de_test_pred = de_test_pred[['ID', 'TARGET']]

In [None]:
fr_train_pred = stacking_model.predict(X_fr)
fr_result = fr_train_final.reset_index()
fr_result['TARGET'] = fr_train_pred
fr_result = fr_result[['ID', 'TARGET']]

In [None]:
fr_test = stacking_model.predict(fr_test_final)
fr_test_pred = fr_test_final.reset_index()
fr_test_pred['TARGET'] = fr_test
fr_test_pred = fr_test_pred[['ID', 'TARGET']]

In [None]:
# Assemble the submission: one row per ID of the challenge test file, in that
# file's order, with the prediction taken from whichever market frame has it.
df_test = pd.read_csv('../challenge_data/X_test.csv')

test_pred = (
    df_test[['ID']]
    .merge(de_test_pred[['ID', 'TARGET']], on='ID', how='left')
    # The second merge meets an existing TARGET column, so pandas suffixes the
    # pair as TARGET_x (DE) and TARGET_y (FR).
    .merge(fr_test_pred[['ID', 'TARGET']], on='ID', how='left')
    # DE value where present, otherwise the FR value.
    .assign(TARGET=lambda merged: merged['TARGET_x'].combine_first(merged['TARGET_y']))
    .drop(['TARGET_x', 'TARGET_y'], axis=1)
)

In [None]:
# Display the merged submission frame (columns ID and TARGET).
test_pred

In [None]:
#test_pred.to_csv('./Submission/' + 'stack1_test' + '.csv', index=False)