In [1]:
import pandas as pd
import numpy as np
from Util.tools import *

In [2]:
from sklearn.ensemble import AdaBoostRegressor
from sklearn.tree import DecisionTreeRegressor
from sklearn.model_selection import GridSearchCV
from sklearn.ensemble import GradientBoostingRegressor
from xgboost import XGBRegressor
import lightgbm as lgb
from scipy.stats import spearmanr

from sklearn.model_selection import train_test_split

In [3]:
# Read the raw challenge tables. Paths are relative to the notebook's folder.
X_train = pd.read_csv('../challenge_data/X_train.csv')
Y_train = pd.read_csv('../challenge_data/Y_train.csv')
X_test = pd.read_csv('../challenge_data/X_test.csv')

# Build the model-ready feature frame: drop COUNTRY, run the project's
# preprocessing helper (normalisation + PCA switched on), then wrap the result
# in a DataFrame.
X_train_clean = pd.DataFrame(
    preprocessing(X_train.drop(columns=['COUNTRY']), norm=True, pca=True)
)
# Column labels are cast to str so every feature name is a string.
X_train_clean.columns = X_train_clean.columns.astype(str)

# Regression target.
Y_train_clean = Y_train['TARGET']

In [4]:
def training(model, cv=5, random_state=0):
    """Estimate a model's hold-out Spearman correlation over repeated random splits.

    On each of ``cv`` repetitions the notebook-level ``X_train_clean`` /
    ``Y_train_clean`` data is split 80/20, ``model`` is fitted on the 80% part
    and its predictions on the remaining 20% are compared with the true values
    using Spearman's rank correlation.

    Parameters
    ----------
    model : estimator exposing ``fit(X, y)`` and ``predict(X)``
        Refitted in place on every repetition. After the call it holds the fit
        from the *last* split only (80% of the rows), not a fit on all data.
    cv : int, default 5
        Number of random 80/20 splits to average over.
    random_state : int or None, default 0
        Seed for the sequence of split seeds. A fixed value makes the reported
        score reproducible and puts every model evaluated with the same seed
        on identical splits. Pass ``None`` to draw fresh splits on every call.

    Returns
    -------
    float
        Mean of the per-split Spearman correlations.

    Raises
    ------
    ValueError
        If ``cv`` is smaller than 1.
    """
    if cv < 1:
        raise ValueError('cv must be at least 1')

    # Derive one seed per repetition from a single generator instead of the
    # unseeded global NumPy state, so the whole evaluation can be replayed.
    seed_source = np.random.RandomState(random_state)
    split_seeds = seed_source.randint(0, 2**31 - 1, size=cv)

    scores = []
    for split_seed in split_seeds:
        X_fit, X_val, Y_fit, Y_val = train_test_split(
            X_train_clean, Y_train_clean, test_size=0.2, random_state=int(split_seed)
        )
        model.fit(X_fit, Y_fit)
        y_pred = model.predict(X_val)
        scores.append(spearmanr(Y_val, y_pred).correlation)

    metric = np.mean(scores)

    # NOTE: the wording is kept for continuity with recorded outputs, but the
    # figure is measured on the held-out 20% splits, not on the training rows.
    print('Spearman correlation for the train set: {:.1f}%'.format(100 * metric))

    return metric

In [7]:
def grid_search_ensemble(n_estimators, learning_rate, estimator, X, Y, cv=5, random_state=0):
    """Grid-search ``n_estimators`` x ``learning_rate`` for a boosting regressor.

    Every combination is scored by the mean Spearman correlation between the
    predictions and the true values on ``cv`` random 80/20 hold-out splits.
    The splits are drawn once and reused for every combination, so differences
    in score come from the hyperparameters and not from which rows happened to
    land in the hold-out part.

    Parameters
    ----------
    n_estimators : iterable of int
        Candidate numbers of boosting rounds.
    learning_rate : iterable of float
        Candidate learning rates.
    estimator : str
        ``'AdaBoost'`` (depth-3 decision trees as base learner),
        ``'GradientBoost'`` or ``'XGBoost'``. Any other value falls back to a
        LightGBM regressor with ``max_depth=2`` and ``min_child_samples=20``.
        All models are built with ``random_state=1``; no other hyperparameter
        (e.g. tree depth for GradientBoost/XGBoost) is set here.
    X, Y : features and target passed to ``train_test_split``.
    cv : int, default 5
        Number of random splits each combination is averaged over.
    random_state : int or None, default 0
        Seed for the split seeds. ``None`` draws fresh splits on every call
        (still shared by all combinations within that call).

    Returns
    -------
    tuple
        ``(best_n, best_learning_rate, best_model, best_result)``.
        ``best_model`` is the estimator instance of the winning combination as
        left after its last split, i.e. fitted on 80% of the rows only.
        If no combination produces a comparable score the initial values
        ``(0, 0, None, -inf)`` are returned.

    Raises
    ------
    ValueError
        If ``cv`` is smaller than 1.
    """
    if cv < 1:
        raise ValueError('cv must be at least 1')

    # Draw the hold-out splits once, up front: they do not depend on the
    # hyperparameters, and sharing them makes the comparison paired.
    seed_source = np.random.RandomState(random_state)
    splits = [
        train_test_split(X, Y, test_size=0.2, random_state=int(split_seed))
        for split_seed in seed_source.randint(0, 2**31 - 1, size=cv)
    ]

    best_n = 0
    best_learning_rate = 0
    best_result = float('-inf')
    best_model = None

    for i in n_estimators:
        for j in learning_rate:
            if estimator == 'AdaBoost':
                model = AdaBoostRegressor(DecisionTreeRegressor(max_depth=3), n_estimators=i, learning_rate=j, random_state=1)

            elif estimator == 'GradientBoost':
                model = GradientBoostingRegressor(n_estimators=i, learning_rate=j, random_state=1)

            elif estimator == 'XGBoost':
                model = XGBRegressor(n_estimators=i, learning_rate=j, random_state=1)

            else:
                # Fallback for every other name, including typos.
                model = lgb.LGBMRegressor(max_depth=2, n_estimators=i, learning_rate=j, random_state=1, min_child_samples=20)

            scores = []
            for X_fit, X_val, Y_fit, Y_val in splits:
                model.fit(X_fit, Y_fit)
                y_pred = model.predict(X_val)
                scores.append(spearmanr(Y_val, y_pred).correlation)

            mean_score = np.mean(scores)

            # A NaN mean compares False here, so such a combination never wins.
            if mean_score > best_result:
                best_result = mean_score
                best_n = i
                best_learning_rate = j
                best_model = model

    print(f'The best parameters values are: learning_rate: {best_learning_rate}, n_estimators: {best_n}')

    return best_n, best_learning_rate, best_model, best_result

In [5]:
# AdaBoost baseline: depth-3 regression trees as weak learner, 500 rounds.
# The base tree is passed in unfitted on purpose: AdaBoostRegressor fits its own
# clones of the base estimator, so fitting `clf` beforehand only cost time and
# had no effect on the ensemble.
clf = DecisionTreeRegressor(max_depth=3)
ada = AdaBoostRegressor(clf, n_estimators=500, learning_rate=0.1, random_state=1)
result = training(ada)

Spearman correlation for the train set: 20.0%


In [8]:
# Hyperparameter grid for AdaBoost: 100, 130, ..., 580 boosting rounds crossed
# with five learning rates.
n_estimators = np.arange(100, 600, 30)
learning_rate = [0.01, 0.05, 0.1, 0.5, 1]

best_n, best_learning_rate, best_model, best_result = grid_search_ensemble(
    n_estimators=n_estimators,
    learning_rate=learning_rate,
    estimator='AdaBoost',
    X=X_train_clean,
    Y=Y_train_clean,
)

The best parameters values are: learning_rate: 0.1, n_estimators: 310


In [37]:
# AdaBoost with the hand-copied grid-search settings (310 rounds, learning rate 0.1).
# The base tree is passed in unfitted on purpose: AdaBoostRegressor fits its own
# clones of the base estimator, so fitting `clf` beforehand only cost time and
# had no effect on the ensemble.
clf = DecisionTreeRegressor(max_depth=3)
ada_best = AdaBoostRegressor(clf, n_estimators=310, learning_rate=0.1, random_state=1)
result = training(ada_best)

Spearman correlation for the train set: 16.7%


In [10]:
# Gradient-boosting baseline: shallow (depth-2) trees, 500 rounds.
gb = GradientBoostingRegressor(
    n_estimators=500,
    learning_rate=0.1,
    max_depth=2,
    random_state=1,
)
result = training(gb)

Spearman correlation for the train set: 11.8%


In [11]:
# Same grid as before, this time searched for the gradient-boosting regressor.
n_estimators = np.arange(100, 600, 30)
learning_rate = [0.01, 0.05, 0.1, 0.5, 1]

best_n, best_learning_rate, best_model, best_result = grid_search_ensemble(
    n_estimators=n_estimators,
    learning_rate=learning_rate,
    estimator='GradientBoost',
    X=X_train_clean,
    Y=Y_train_clean,
)

The best parameters values are: learning_rate: 0.01, n_estimators: 280


In [34]:
# Gradient boosting with the hand-copied grid-search settings
# (280 rounds, learning rate 0.01).
# NOTE(review): grid_search_ensemble builds its GradientBoostingRegressor
# without max_depth, whereas this model sets max_depth=2 - the tuned values
# were selected for a different tree depth; confirm this is intended.
gb_best = GradientBoostingRegressor(
    n_estimators=280,
    learning_rate=0.01,
    max_depth=2,
    random_state=1,
)
result = training(gb_best)

Spearman correlation for the train set: 19.1%


In [14]:
# Reload the test features, drop COUNTRY, and run them through the same
# preprocessing call that was used for the training features.
# NOTE(review): preprocessing() is invoked on the test set separately; if it
# fits its normalisation/PCA on whatever it is given, the test features may not
# live in the same space as X_train_clean - verify against the helper.
X_test = pd.read_csv('../challenge_data/X_test.csv').drop(columns=['COUNTRY'])
X_test_clean = preprocessing(X_test, norm=True, pca=True)

In [15]:
# training() refits the model on the 80% part of each random split, so gb_best
# is left fitted on a subset of the rows. Refit on the full training data
# before predicting the test set.
# NOTE(review): submission() is not defined in this notebook (presumably it
# comes from Util.tools); if it already refits the model, this extra fit is
# redundant but harmless.
gb_best.fit(X_train_clean, Y_train_clean)
submission(X_test, X_test_clean, gb_best, 'gb_best')



In [24]:
# XGBoost baseline: shallow (depth-2) trees, 500 rounds.
xgb = XGBRegressor(
    n_estimators=500,
    learning_rate=0.1,
    max_depth=2,
)
result = training(xgb)

Spearman correlation for the train set: 12.7%


In [25]:
# Same grid as before, this time searched for the XGBoost regressor.
n_estimators = np.arange(100, 600, 30)
learning_rate = [0.01, 0.05, 0.1, 0.5, 1]

best_n, best_learning_rate, best_model, best_result = grid_search_ensemble(
    n_estimators=n_estimators,
    learning_rate=learning_rate,
    estimator='XGBoost',
    X=X_train_clean,
    Y=Y_train_clean,
)

The best parameters values are: learning_rate: 0.01, n_estimators: 190


In [31]:
# XGBoost with the hand-copied grid-search settings
# (190 rounds, learning rate 0.01).
# NOTE(review): grid_search_ensemble builds its XGBRegressor with
# random_state=1 and no max_depth, whereas this model sets max_depth=2 and no
# random_state - the tuned values were selected for a different configuration;
# confirm this is intended.
xgb_best = XGBRegressor(
    n_estimators=190,
    learning_rate=0.01,
    max_depth=2,
)
result = training(xgb_best)

Spearman correlation for the train set: 18.1%
