In [68]:
import pandas as pd
import numpy as np
import os

from sklearn.ensemble import (AdaBoostClassifier, GradientBoostingClassifier,
                              RandomForestClassifier, ExtraTreesClassifier)
from sklearn.linear_model import LogisticRegression
from sklearn.tree import DecisionTreeClassifier
from sklearn.base import clone
from sklearn.neighbors import KNeighborsClassifier

from sklearn.model_selection import train_test_split, KFold, StratifiedKFold
from sklearn.metrics import f1_score
from sklearn.datasets import load_digits

#from tqdm import tqdm

from sklearn.model_selection import RandomizedSearchCV
from scipy.stats.distributions import randint

from sklearn.preprocessing import StandardScaler



In [109]:
def load_train_data(folder, bias_for_corr = 0.0):

    dfs = []
    for filename in os.listdir(folder):
        if filename.endswith(".csv"): 
            df = pd.read_csv(f"{folder}/{filename}")
            if "ID" not in df:
                replacements = ["IT", "1D"]
                for replacement in replacements:
                    if replacement in df:
                        df["ID"] = df[replacement]
                        df = df.drop(columns=[replacement])
                        break
            dfs.append(df)


    df = dfs[0]
    for row in df.values:
        id = int(row[0])
        category = int(row[1])
        for other in dfs[1:]:
            other_category = other[other.ID == id]["Category"].values[0]
            if category != other_category:
                print("Mismatch Category found at ID", id)

    data = dfs[0]
    for df in dfs[1:]:
        data = data.join(df.set_index(["ID", "Category"]), on=["ID", "Category"], validate='1:1')
    
    y = data['Category']
    X = data.drop(['ID', 'Category'], axis=1)

    #calculate correlation
    corr = X.apply(lambda x: abs(x.corr(y))).sort_values(ascending=False)
    columns_to_remove = []
    for index, value in corr.items():
        if value < bias_for_corr:
            columns_to_remove.append(index)

    X.drop(columns_to_remove, axis=1, inplace=True)

    return X.to_numpy(), y.to_numpy()


def compute_meta_feature(clf, X_train, X_test, y_train, cv):
    n_classes = len(np.unique(y_train))
    
    X_meta_train = np.zeros((len(y_train), n_classes), dtype=np.float32)

    splits = cv.split(X_train, y_train)
    for train_fold_index, predict_fold_index in splits:
        X_fold_train, X_fold_predict = X_train[train_fold_index], X_train[predict_fold_index]
        y_fold_train = y_train[train_fold_index]
        
        folded_clf = clone(clf)
        folded_clf.fit(X_fold_train, y_fold_train)
        
        X_meta_train[predict_fold_index] = folded_clf.predict_proba(X_fold_predict)
    
    meta_clf = clone(clf)
    meta_clf.fit(X_train, y_train)
    
    X_meta_test = meta_clf.predict_proba(X_test)
    
    return X_meta_train, X_meta_test

def generate_meta_features(classifiers, X_train, X_test, y_train, cv):
   
    features = [
        compute_meta_feature(clf, X_train, X_test, y_train, cv)
        for clf in classifiers
    ]
    
    stacked_features_train = np.hstack([
        features_train for features_train, features_test in features
    ])

    stacked_features_test = np.hstack([
        features_test for features_train, features_test in features
    ])
    
    return stacked_features_train, stacked_features_test

def compute_metric(clf, X_train, X_test, y_train):
    clf.fit(X_train, y_train)
    y_test_pred = clf.predict(X_test)
    return np.round(f1_score(y_test, y_test_pred, average='macro'), 6)


In [110]:
X, y = load_train_data("../../data/midterm_hackathon/train")

X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=42, test_size=0.2)

scaler = StandardScaler()
SX_train = scaler.fit_transform(X_train)
SX_test = scaler.transform(X_test)

gb = GradientBoostingClassifier(random_state = 42, n_estimators = 200)
et = ExtraTreesClassifier(random_state = 42, n_estimators = 300)

skf = StratifiedKFold(n_splits=10)
meta_X_train, meta_X_test = generate_meta_features([gb, et], SX_train, SX_test, y_train, skf)

p666_meta_lr = LogisticRegression(random_state = 42, penalty = None, solver = 'lbfgs', multi_class = 'auto')
compute_metric(p666_meta_lr, p666_meta_X_train, p666_meta_X_test, y_train)

0.942867