# In [None]:
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
from bayes_opt import BayesianOptimization
from sklearn.metrics import mean_absolute_error as mae
from sklearn.model_selection import train_test_split, cross_val_score
from sklearn.preprocessing import StandardScaler
from xgboost import XGBRegressor


def exploratory_data_analysis(df):
    """Print a per-column overview of ``df`` and plot its distributions.

    The missing-value count of every column is printed first. Each column is
    then reported according to the first rule that matches:

    * the name contains ``'id'`` or ``'country'``: only the number of
      distinct values is printed;
    * fewer than 15 distinct values: the value counts are printed and drawn
      as a bar chart;
    * dtype kind is one of ``'bifc'`` (bool, int, float, complex): min, max,
      mean and median are printed and a kernel density estimate is drawn;
    * otherwise only the column header is printed.

    Args:
        df: pandas DataFrame to inspect.
    """
    print('Number of Missing values')
    print(df.isnull().sum())
    print('\n')

    for column in df.columns:
        print('COLUMN: {}'.format(column))
        values = df[column]
        n_distinct = values.nunique()

        if 'id' in column or 'country' in column:
            # Identifier-like columns: a distribution plot would be noise.
            print('Number of unique values: {}'.format(n_distinct))
        elif n_distinct < 15:
            counts = values.value_counts()
            print(counts)
            counts.plot(kind='bar')
            plt.title(column)
            plt.show()
        elif values.dtype.kind in 'bifc':
            # Each statistic is computed right before it is printed.
            statistics = (
                ('Min: {}', values.min),
                ('Max: {}', values.max),
                ('Mean: {}', values.mean),
                ('Median: {}', values.median),
            )
            for template, compute in statistics:
                print(template.format(compute()))
            values.plot(kind='kde')
            plt.title(column)
            plt.show()

        # Blank separator after every column, whichever branch ran.
        print('\n')


def preprocess_data(df2):
    """Convert a raw club table into the numeric frame used for modelling.

    Steps, in order:

    1. drop ``season``, ``club_id``, ``league_id`` and
       ``registration_platform_specific``;
    2. one-hot encode ``dynamic_payment_segment`` after removing the first
       two characters of each label, appending the dummy columns;
    3. replace ``registration_country`` by ``registration_country_cd``, an
       integer code assigned in order of first appearance (missing countries
       get ``-1``);
    4. fill missing ``global_competition_level`` values with 0.

    NOTE(review): the country codes and the set of dummy columns are derived
    from the frame passed in, so two different frames (e.g. train and test)
    are not guaranteed to share the same encoding -- confirm against callers.

    Args:
        df2: raw pandas DataFrame containing the columns named above. The
            caller's frame is not modified.

    Returns:
        A new DataFrame with the transformed columns.
    """
    df2 = df2.drop(
        ['season', 'club_id', 'league_id', 'registration_platform_specific'],
        axis=1,
    )

    # One-hot encode the payment segment (labels minus their first two chars).
    segment = df2['dynamic_payment_segment'].str[2:]
    dummies = pd.get_dummies(segment)
    df2 = pd.concat([df2.drop('dynamic_payment_segment', axis=1), dummies], axis=1)

    # Categories must not contain NaN (pd.Categorical rejects null
    # categories), so build them from the non-missing values only.
    country = df2['registration_country']
    df2['registration_country_cd'] = pd.Categorical(
        country, categories=country.dropna().unique()
    ).codes
    df2 = df2.drop('registration_country', axis=1)

    # Assign the result back: ``df[col].fillna(..., inplace=True)`` acts on a
    # temporary object and does not update the frame under Copy-on-Write.
    df2['global_competition_level'] = df2['global_competition_level'].fillna(0)

    return df2


def create_features_and_target(df2):
    """Separate the ``league_rank`` target from the remaining columns.

    Args:
        df2: DataFrame that contains a ``league_rank`` column.

    Returns:
        ``(X, Y)`` where ``X`` is a copy of ``df2`` without ``league_rank``
        and ``Y`` is the ``league_rank`` column.
    """
    features = df2.copy().drop(columns='league_rank')
    target = df2['league_rank']
    return features, target


def split(X, Y):
    """Split features and target into train and test parts.

    20% of the rows go to the test part, the split is stratified on ``Y``
    and seeded with ``random_state=0``.

    Returns:
        The tuple ``(X_train, X_test, Y_train, Y_test)``.
    """
    parts = train_test_split(
        X,
        Y,
        test_size=0.2,
        random_state=0,
        stratify=Y,
    )
    return tuple(parts)


def scale(X_train, X_test):
    """Standardise both sets with a ``StandardScaler`` fitted on ``X_train``.

    The scaler is fitted on the training data only and then applied to the
    test data, so the test data does not influence the scaling.

    Returns:
        ``(scaled_train, scaled_test)`` as returned by the scaler.
    """
    scaler = StandardScaler()
    scaled_train = scaler.fit_transform(X_train)
    scaled_test = scaler.transform(X_test)
    return scaled_train, scaled_test


def xgboost_hyper_param(learning_rate, n_estimators, max_depth, gamma, X=None, Y=None):
    """Objective for the Bayesian search: mean 10-fold CV score of one candidate.

    Args:
        learning_rate: forwarded to ``XGBRegressor`` unchanged.
        n_estimators: truncated to ``int`` before use (the optimiser samples
            continuous values from the bounds).
        max_depth: truncated to ``int`` before use.
        gamma: forwarded to ``XGBRegressor`` unchanged.
        X: training features. When omitted, the module-level ``X_train`` is
            used, which keeps the original four-argument call working.
        Y: training target. When omitted, the module-level ``Y_train`` is used.

    Returns:
        The mean of the ten ``'neg_mean_absolute_error'`` fold scores, so a
        larger value means a smaller error.
    """
    # Backward-compatible fallback to the globals the function used to read.
    if X is None:
        X = X_train
    if Y is None:
        Y = Y_train

    n_estimators = int(n_estimators)
    max_depth = int(max_depth)

    clf = XGBRegressor(
        learning_rate=learning_rate,
        n_estimators=n_estimators,
        max_depth=max_depth,
        gamma=gamma,
        n_jobs=-1
    )

    return np.mean(cross_val_score(clf, X, Y, cv=10, scoring='neg_mean_absolute_error'))


def optimize_hyperparameters(X_train, Y_train):
    """Search XGBoost hyper-parameters with Bayesian optimisation and fit a model.

    The search runs 10 random initial points followed by 5 guided iterations
    over ``learning_rate``, ``n_estimators``, ``max_depth`` and ``gamma``.

    Args:
        X_train: training features, used both for scoring candidates and for
            fitting the returned model.
        Y_train: training target.

    Returns:
        ``(model, best_params)``: an ``XGBRegressor`` fitted on the given data
        with the best parameters found, and those parameters as a dict with
        ``max_depth`` and ``n_estimators`` converted to ``int``.
    """
    pbounds = {
        'learning_rate': (0.01, 1.0),
        'n_estimators': (100, 1000),
        'max_depth': (5, 20),
        'gamma': (2, 8)
    }

    def objective(learning_rate, n_estimators, max_depth, gamma):
        # Score candidates on the data passed to this function instead of
        # whatever happens to be bound to the module-level names.
        return xgboost_hyper_param(
            learning_rate, n_estimators, max_depth, gamma,
            X=X_train, Y=Y_train,
        )

    optimizer = BayesianOptimization(
        f=objective,
        pbounds=pbounds,
        random_state=1)

    optimizer.maximize(init_points=10, n_iter=5)

    # Copy so the optimiser's own record of the best point is left untouched.
    best_params = dict(optimizer.max['params'])
    best_params['max_depth'] = int(best_params['max_depth'])
    best_params['n_estimators'] = int(best_params['n_estimators'])

    model = XGBRegressor(
        learning_rate=best_params['learning_rate'],
        n_estimators=best_params['n_estimators'],
        max_depth=best_params['max_depth'],
        gamma=best_params['gamma'],
        n_jobs=-1
    )

    model.fit(X_train, Y_train)

    return model, best_params


def evaluate_model(model, X_test, Y_test):
    """Print the test MAE of the raw and of the rounded, clamped predictions.

    Args:
        model: fitted estimator exposing ``predict``.
        X_test: test features.
        Y_test: true target values for ``X_test``.
    """
    # Predict once and reuse the result for both metrics.
    predictions = model.predict(X_test)

    err = mae(Y_test, predictions)
    print('MAE: {}'.format(err))

    # Round to whole ranks and clamp them into the range 1..14.
    ranks = [14 if p >= 14 else 1 if p <= 1 else round(p) for p in predictions]

    err2 = mae(Y_test, ranks)
    print('MAE Round: {}'.format(err2))


def plot_fimportance(model, X):
    """Draw a horizontal bar chart of ``model.feature_importances_``.

    Args:
        model: fitted estimator exposing ``feature_importances_``.
        X: DataFrame whose column names label the bars; its column count
            determines the number of bars.
    """
    feature_count = X.shape[1]
    labels = X.columns.values
    importances = model.feature_importances_

    plt.figure(figsize=(20, 20))
    plt.barh(range(feature_count), importances)
    plt.yticks(np.arange(feature_count), labels)
    plt.xlabel('Feature importance')
    plt.ylabel('Feature')


def make_predictions(df_test, df2, best_params):
    """Refit on all preprocessed training data and predict ranks for ``df_test``.

    Args:
        df_test: raw test DataFrame; must contain ``club_id`` plus the columns
            ``preprocess_data`` expects. It is not modified.
        df2: preprocessed training DataFrame including ``league_rank``.
        best_params: keyword arguments for ``XGBRegressor``; must not contain
            ``n_jobs`` or ``random_state``, which are set here.

    Returns:
        A new DataFrame with the columns ``club_id`` and ``league_rank``,
        where ``league_rank`` is the prediction rounded to a whole number
        and clamped into the range 1..14.
    """
    X, Y = create_features_and_target(df2)

    # Give the test frame exactly the training columns, in the same order:
    # dummy columns absent from the test data become 0 and columns unknown to
    # the training data are dropped, so scaler and model see a matching layout.
    df_test2 = preprocess_data(df_test).reindex(columns=X.columns, fill_value=0)

    X_scaled, test_scaled = scale(X, df_test2)

    model2 = XGBRegressor(**best_params, n_jobs=-1, random_state=1)
    model2.fit(X_scaled, Y)

    plot_fimportance(model2, X)

    raw = model2.predict(test_scaled)
    ranks = [14 if p >= 14 else 1 if p <= 1 else round(p) for p in raw]

    # Build the result on a copy so the caller's frame gains no new column.
    result = df_test[['club_id']].copy()
    result['league_rank'] = ranks

    return result


# Load the labelled training data, print/plot the exploratory summary and
# build the preprocessed frame.
df = pd.read_csv('jobfair_train.csv')
exploratory_data_analysis(df)
df2 = preprocess_data(df)

# Separate the target, hold out a test part and standardise the features
# with a scaler fitted on the training part.
X, Y = create_features_and_target(df2)
X_train, X_test, Y_train, Y_test = split(X, Y)
X_train, X_test = scale(X_train, X_test)

# NOTE: X_train and Y_train must remain module-level names, because
# xgboost_hyper_param refers to them as globals.
model, best_params = optimize_hyperparameters(X_train, Y_train)
evaluate_model(model, X_test, Y_test)
# The unscaled frame X is passed because plot_fimportance reads X.columns.
plot_fimportance(model, X)

# Predict the unlabelled test file with the tuned parameters and write the
# club_id / league_rank table to disk.
df_test = pd.read_csv('jobfair_test.csv')
predictions = make_predictions(df_test, df2, best_params)
predictions.to_csv('league_rank_predictions.csv', index=False)
