In [1]:
from metrics import custom_score, ultimate_score

In [20]:
import pandas as pd
import numpy as np
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier
from xgboost import XGBClassifier
from sklearn.svm import SVC
from sklearn.neighbors import KNeighborsClassifier
from sklearn.naive_bayes import GaussianNB
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import StratifiedKFold
import matplotlib.pyplot as plt
from metrics import custom_score, ultimate_score
from sklearn.inspection import permutation_importance
from boruta import BorutaPy
from collections import Counter
from itertools import product, chain, combinations
import warnings
from pandasql import sqldf
import seaborn as sns

In [31]:
# Do not truncate long cell values (e.g. merged parameter strings) when
# DataFrames are displayed.
pd.set_option('display.max_colwidth', None)

# NOTE: this silences *all* warnings for the rest of the session, including
# convergence and deprecation warnings emitted while fitting models.
warnings.filterwarnings('ignore')

# Seed NumPy's global random number generator.
random_seed = 42
np.random.seed(random_seed)

# Number of stratified folds ("test sets") built from the training data.
# Defined under both spellings so that cells using either name run on a
# freshly restarted kernel.
TEST_SETS_NUMBER = 5
test_sets_number = TEST_SETS_NUMBER

In [14]:
# Load the training features and labels. Both files are read as
# space-separated text without a header row, so each call returns a DataFrame
# whose columns carry default integer labels.
X = pd.read_csv("data/x_train.txt", sep=" ", header=None)
y = pd.read_csv("data/y_train.txt", sep=" ", header=None)

def create_test_sets(X, y, random_seed, test_sets_number):
    """Split the data into stratified folds and standardise each fold.

    Parameters
    ----------
    X, y : pandas objects supporting ``.iloc`` row selection
        Features and labels to split.
    random_seed : int
        Seed forwarded to ``StratifiedKFold`` (shuffling is enabled).
    test_sets_number : int
        Number of folds to create.

    Returns
    -------
    dict
        Maps ``"fold_1"`` .. ``"fold_<n>"`` to a dict with keys ``"X_train"``
        and ``"X_test"`` (scaled arrays returned by ``StandardScaler``) and
        ``"y_train"`` / ``"y_test"`` (label rows with the index reset).
    """
    splitter = StratifiedKFold(
        n_splits=test_sets_number,
        shuffle=True,
        random_state=random_seed,
    )

    folds = {}
    for fold_no, (train_rows, test_rows) in enumerate(splitter.split(X, y), start=1):
        features_train = X.iloc[train_rows]
        features_test = X.iloc[test_rows]
        labels_train = y.iloc[train_rows]
        labels_test = y.iloc[test_rows]

        # The scaler is fitted on the training rows only and then applied to
        # the held-out rows, so no test statistics reach the training data.
        standardiser = StandardScaler()
        features_train_scaled = standardiser.fit_transform(features_train)
        features_test_scaled = standardiser.transform(features_test)

        folds[f"fold_{fold_no}"] = {
            "X_train": features_train_scaled,
            "X_test": features_test_scaled,
            "y_train": labels_train.reset_index(drop=True),
            "y_test": labels_test.reset_index(drop=True),
        }

    return folds

# Build the cross-validation folds once. The fold count is passed as
# `test_sets_number`, the name under which the configuration cell defines it.
datasets = create_test_sets(X, y, random_seed, test_sets_number)

In [None]:
def get_sorted_features(init_model, X_train, y_train, feature_selection='mdi', **kwargs):
    """Rank feature indices by importance, most important first.

    Parameters
    ----------
    init_model : callable
        Model factory; called as ``init_model(**kwargs)``. The result must
        provide ``fit``.
    X_train : array-like with a ``shape`` attribute
        Training features; ``X_train.shape[1]`` is the number of features.
    y_train : array-like
        Training labels.
    feature_selection : {'mdi', 'permutation_importance'}
        ``'mdi'`` reads ``model.feature_importances_`` and falls back to the
        absolute values of ``model.coef_`` when that attribute is missing.
        ``'permutation_importance'`` uses the mean permutation importance
        computed on the training data itself.
    **kwargs
        Forwarded to ``init_model``.

    Returns
    -------
    numpy.ndarray
        Feature indices (``0 .. n_features - 1``) sorted by descending
        importance.

    Raises
    ------
    ValueError
        If ``feature_selection`` is not one of the supported values.
    """
    supported_methods = ('mdi', 'permutation_importance')
    # Validate before fitting: an unknown method used to surface only after
    # the (possibly expensive) fit, as an UnboundLocalError on `importances`.
    if feature_selection not in supported_methods:
        raise ValueError(
            f"Unknown feature_selection {feature_selection!r}; "
            f"expected one of {supported_methods}"
        )

    model = init_model(**kwargs)
    model.fit(X_train, y_train)

    if feature_selection == 'mdi':
        try:
            importances = model.feature_importances_
        except AttributeError:
            # Coefficient-based fallback. `coef_` may have one row (binary
            # problems) or one row per class; averaging the absolute values
            # over rows always yields one value per feature and is identical
            # to a plain flatten in the single-row case.
            importances = np.atleast_2d(np.abs(model.coef_)).mean(axis=0)
    else:
        importances = permutation_importance(model, X_train, y_train).importances_mean

    feature_importance_df = pd.DataFrame({
        'feature': np.arange(X_train.shape[1]),
        'importance': importances,
    })
    ranked = feature_importance_df.sort_values(by='importance', ascending=False)

    return np.array(ranked.feature)

def calculate_model(init_model, X_train, y_train, X_test, y_test, selected_features, **kwargs):
    """Fit a fresh model on a subset of columns and score it on the test split.

    Parameters
    ----------
    init_model : callable
        Model factory; called as ``init_model(**kwargs)``. The result must
        provide ``fit`` and ``predict_proba``.
    X_train, X_test : 2-D arrays
        Feature matrices, indexed as ``X[:, selected_features]``.
    y_train, y_test
        Labels for the two splits.
    selected_features : sequence of int
        Column indices used for fitting and prediction.
    **kwargs
        Forwarded to ``init_model``.

    Returns
    -------
    tuple
        ``(score, ult_score)`` as returned by ``custom_score`` and
        ``ultimate_score`` from the ``metrics`` module.
    """
    train_subset = X_train[:, selected_features]
    test_subset = X_test[:, selected_features]

    estimator = init_model(**kwargs)
    estimator.fit(train_subset, y_train)
    probabilities = estimator.predict_proba(test_subset)

    # 20% of data should be ones
    top_k = int(len(y_test) * 0.2)

    return (
        custom_score(y_test, probabilities, top_k),
        ultimate_score(y_test, probabilities, top_k, len(selected_features)),
    )

## Logistic Regression

In [26]:
def subsets(lst, max_subset_len):
    """Return every subset of ``lst`` with 1 to ``max_subset_len`` elements.

    Subsets are returned as lists, grouped by increasing size; within each
    size they follow the order produced by ``itertools.combinations``.
    """
    collected = []
    for size in range(1, max_subset_len + 1):
        for combo in combinations(lst, size):
            collected.append(list(combo))
    return collected

In [24]:
# Candidate hyper-parameter values for the logistic-regression grid search.
# The grid built from them is the Cartesian product of the three lists minus
# the combinations filtered out where `param_grid` is constructed.
penalty_list = [None, 'l1', 'l2']
solver_list = ['liblinear', 'lbfgs']
C_list = [0.01, 0.1, 1, 10]

In [44]:
# Number of fits per (fold, params, feature subset); results are averaged.
repeats = 5
# NOTE(review): `max_subset_len` and `subsets_top_features` are not used by the
# grid-search call visible in this cell, which passes literal values for
# `top_features` and `max_subset_len` instead - confirm which is intended.
max_subset_len = 3
# Model class (not an instance); it is instantiated once per fit.
model = LogisticRegression
subsets_top_features = [8]

# One dict of keyword arguments per model configuration. Two combinations are
# skipped: 'l1' with the 'lbfgs' solver, and no penalty with 'liblinear'.
# NOTE(review): per the scikit-learn docs, C is ignored when penalty is None,
# so the four `penalty=None` entries presumably fit identical models - confirm
# and consider keeping a single one.
param_grid = [
    {'penalty': penalty,
     'C': C,
     'solver': solver
     }
    for penalty, 
        C,
        solver
        in product(penalty_list,
                   C_list,
                   solver_list
                   )
        if not (penalty == 'l1' and solver == 'lbfgs')
        and not (penalty is None and solver == 'liblinear')        
]

# grid search
def do_grid_search_model(model, datasets, test_sets_number, param_grid, repeats, top_features, max_subset_len):
    """Evaluate every (fold, parameter set, feature subset) combination.

    Parameters
    ----------
    model : callable
        Model class/factory. It is instantiated with the entries of one
        ``param_grid`` dict plus ``random_state=<repeat index>``, so it must
        accept a ``random_state`` keyword.
    datasets : dict
        Maps ``"fold_<i>"`` to a dict with keys ``"X_train"``, ``"y_train"``,
        ``"X_test"`` and ``"y_test"``.
    test_sets_number : int
        Folds ``1 .. test_sets_number`` are evaluated.
    param_grid : list of dict
        Keyword-argument sets to try. The keys of the first dict define the
        per-parameter result columns.
    repeats : int
        Number of fits (with ``random_state`` 0 .. repeats-1) averaged per
        combination.
    top_features : list
        Feature indices from which subsets are drawn.
    max_subset_len : int
        Largest subset size passed to ``subsets``.

    Returns
    -------
    pandas.DataFrame
        One row per combination with columns ``test_set_id``,
        ``no_of_features``, ``features``, ``precision`` (mean ``score``,
        rounded to 4 decimals), ``ultimate_score`` (mean, rounded to the
        nearest integer), ``params_merged`` and one string column per
        parameter key. An empty ``param_grid`` yields an empty frame holding
        only the six fixed columns.
    """
    base_columns = ["test_set_id", "no_of_features", "features", "precision", "ultimate_score", "params_merged"]

    # Nothing to evaluate: return an empty result instead of failing with an
    # opaque IndexError on `param_grid[0]` below.
    if len(param_grid) == 0:
        return pd.DataFrame(columns=base_columns)

    param_keys = list(param_grid[0].keys())
    subsets_top_features = subsets(top_features, max_subset_len)
    result_rows = []

    for test_idx in range(1, test_sets_number + 1):
        # Loop-invariant for everything below: look the fold up once.
        fold = datasets[f'fold_{test_idx}']

        for params in param_grid:
            # Progress trace: one line per (fold, parameter set).
            print(test_idx, params)

            for selected_features in subsets_top_features:
                scores = []
                ult_scores = []
                for i in range(repeats):
                    # The repeat index doubles as the model's random seed.
                    score, ult_score = calculate_model(model,
                                                       fold['X_train'],
                                                       fold['y_train'],
                                                       fold['X_test'],
                                                       fold['y_test'],
                                                       selected_features,
                                                       **params,
                                                       random_state=i)
                    scores.append(score)
                    ult_scores.append(ult_score)

                result_row = [
                    test_idx,
                    len(selected_features),
                    ', '.join([str(x) for x in selected_features]),
                    round(np.mean(scores), 4),
                    round(np.mean(ult_scores)),
                    ', '.join([f'{key}={value}' for key, value in params.items()])
                ]
                # Parameters missing from a given dict are recorded as 'None'.
                result_row.extend([str(params.get(k)) for k in param_keys])

                result_rows.append(result_row)

    results_df = pd.DataFrame(result_rows, columns=base_columns + param_keys)

    return results_df

# Run the logistic-regression grid using feature 8 only (subsets of length 1)
# and save the per-fold results. The fold count is passed as
# `test_sets_number`, the name under which the configuration cell defines it.
results_df = do_grid_search_model(model, datasets, test_sets_number, param_grid, repeats, top_features=[8], max_subset_len=1)
results_df.to_csv('results_to_report/acc_test_grid_search_lr_from_fs.csv', index=False)


1 {'penalty': None, 'C': 0.01, 'solver': 'lbfgs'}
1 {'penalty': None, 'C': 0.1, 'solver': 'lbfgs'}
1 {'penalty': None, 'C': 1, 'solver': 'lbfgs'}
1 {'penalty': None, 'C': 10, 'solver': 'lbfgs'}
1 {'penalty': 'l1', 'C': 0.01, 'solver': 'liblinear'}
1 {'penalty': 'l1', 'C': 0.1, 'solver': 'liblinear'}
1 {'penalty': 'l1', 'C': 1, 'solver': 'liblinear'}
1 {'penalty': 'l1', 'C': 10, 'solver': 'liblinear'}
1 {'penalty': 'l2', 'C': 0.01, 'solver': 'liblinear'}
1 {'penalty': 'l2', 'C': 0.01, 'solver': 'lbfgs'}
1 {'penalty': 'l2', 'C': 0.1, 'solver': 'liblinear'}
1 {'penalty': 'l2', 'C': 0.1, 'solver': 'lbfgs'}
1 {'penalty': 'l2', 'C': 1, 'solver': 'liblinear'}
1 {'penalty': 'l2', 'C': 1, 'solver': 'lbfgs'}
1 {'penalty': 'l2', 'C': 10, 'solver': 'liblinear'}
1 {'penalty': 'l2', 'C': 10, 'solver': 'lbfgs'}
2 {'penalty': None, 'C': 0.01, 'solver': 'lbfgs'}
2 {'penalty': None, 'C': 0.1, 'solver': 'lbfgs'}
2 {'penalty': None, 'C': 1, 'solver': 'lbfgs'}
2 {'penalty': None, 'C': 10, 'solver': 'lbfgs'

## Random Forest

In [45]:
# Candidate hyper-parameter values for the random-forest grid search.
max_depth_list = [3, 5, 7, 10]
min_samples_split_list = [2, 4, 6]
# NOTE(review): the scikit-learn docs describe 'entropy' and 'log_loss' as the
# same criterion (Shannon information gain), so one of them is presumably
# redundant here - confirm before trimming the grid.
criterion_list = ['gini', 'entropy', 'log_loss']
max_features_list = ['sqrt', 'log2', None]

# Model class (not an instance); this rebinds the `model` name.
model = RandomForestClassifier
# NOTE(review): not used by the grid-search call visible in this cell, which
# passes `top_features=[8, 246]` literally - confirm whether it is needed.
subsets_top_features = [8, 246]

# Full Cartesian product of the four lists above: one keyword-argument dict
# per model configuration.
param_grid = [
    {'max_depth': max_depth,
     'min_samples_split': min_samples_split,
     'criterion': criterion,
     'max_features': max_features,
     }
    for max_depth, 
        min_samples_split,
        criterion,
        max_features
        in product(max_depth_list,
                   min_samples_split_list,
                   criterion_list,
                   max_features_list
                   )
]

# Run the random-forest grid on features 8 and 246 individually (subsets of
# length 1) and save the per-fold results. The fold count is passed as
# `test_sets_number`, the name under which the configuration cell defines it.
results_df = do_grid_search_model(model, datasets, test_sets_number, param_grid, repeats, top_features=[8, 246], max_subset_len=1)
results_df.to_csv('results_to_report/acc_test_grid_search_rf_from_fs.csv', index=False)

1 {'max_depth': 3, 'min_samples_split': 2, 'criterion': 'gini', 'max_features': 'sqrt'}
1 {'max_depth': 3, 'min_samples_split': 2, 'criterion': 'gini', 'max_features': 'log2'}
1 {'max_depth': 3, 'min_samples_split': 2, 'criterion': 'gini', 'max_features': None}
1 {'max_depth': 3, 'min_samples_split': 2, 'criterion': 'entropy', 'max_features': 'sqrt'}
1 {'max_depth': 3, 'min_samples_split': 2, 'criterion': 'entropy', 'max_features': 'log2'}
1 {'max_depth': 3, 'min_samples_split': 2, 'criterion': 'entropy', 'max_features': None}
1 {'max_depth': 3, 'min_samples_split': 2, 'criterion': 'log_loss', 'max_features': 'sqrt'}
1 {'max_depth': 3, 'min_samples_split': 2, 'criterion': 'log_loss', 'max_features': 'log2'}
1 {'max_depth': 3, 'min_samples_split': 2, 'criterion': 'log_loss', 'max_features': None}
1 {'max_depth': 3, 'min_samples_split': 4, 'criterion': 'gini', 'max_features': 'sqrt'}
1 {'max_depth': 3, 'min_samples_split': 4, 'criterion': 'gini', 'max_features': 'log2'}
1 {'max_depth': 3