In [8]:
import os

import numpy as np
import pandas as pd

from sklearn.base import clone
from sklearn.ensemble import (GradientBoostingClassifier, RandomForestClassifier, ExtraTreesClassifier)
from sklearn.ensemble import StackingClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import f1_score
from sklearn.model_selection import train_test_split, StratifiedKFold

In [4]:
def load_data(folder):
    """Read every ``*.csv`` file in ``folder`` into a list of DataFrames.

    Column-name variants found in the raw files ("IT"/"1D" for the ID column,
    abbreviated ``conc_points_*`` / ``p_*`` feature names) are renamed to one
    canonical spelling so the frames can be joined later.

    Parameters
    ----------
    folder : str
        Directory to scan (non-recursive). Entries whose name does not end in
        ".csv" are ignored.

    Returns
    -------
    list of pandas.DataFrame
        One frame per CSV file, ordered by file name.
    """
    columns_mapping = {"IT": "ID", "1D": "ID",
                       "conc_points_mean": "concave_points_mean", "conc_points_std": "concave_points_std", "conc_points_max": "concave_points_max",
                       "p_mean": "perimeter_mean", "p_std": "perimeter_std", "p_max": "perimeter_max",
                       }

    dfs = []
    # os.listdir() returns entries in arbitrary, platform-dependent order.
    # Sorting makes the choice of dfs[0] (the join base) and the resulting
    # column order reproducible across machines and runs.
    for filename in sorted(os.listdir(folder)):
        if not filename.endswith(".csv"):
            continue
        df = pd.read_csv(os.path.join(folder, filename))
        dfs.append(df.rename(columns=columns_mapping))
    return dfs

def prepare_data_for_training(dfs, bias_for_corr = 0.0):
    """Join the training frames and filter features by correlation with the label.

    The first frame is the base; every other frame is joined onto it on the
    ("ID", "Category") pair, with pandas checking that the keys are unique on
    both sides (``validate='1:1'``).

    Parameters
    ----------
    dfs : list of pandas.DataFrame
        Frames that each contain "ID" and "Category" columns.
    bias_for_corr : float
        Features whose absolute correlation with "Category" is strictly below
        this threshold are removed. A NaN correlation never compares as
        "below", so such a column is kept.

    Returns
    -------
    (X, y, columns_to_drop)
        The filtered feature frame, the "Category" series, and the names of
        the removed features ordered by descending absolute correlation.
    """
    key_columns = ["ID", "Category"]

    merged = dfs[0]
    for extra in dfs[1:]:
        merged = merged.join(extra.set_index(key_columns), on=key_columns, validate='1:1')

    y = merged['Category']
    features = merged.drop(key_columns, axis=1)

    # Absolute per-feature correlation with the label, strongest first.
    ranked = features.apply(lambda column: column.corr(y)).abs().sort_values(ascending=False)

    # Boolean mask keeps the ranking order of the rejected features.
    columns_to_drop = list(ranked[ranked < bias_for_corr].index)

    X = features.drop(columns_to_drop, axis=1)
    return X, y, columns_to_drop

def prepare_data_for_testing(dfs, columns_to_drop = None):
    """Join the test frames on "ID" and remove the given columns.

    Parameters
    ----------
    dfs : list of pandas.DataFrame
        Frames that each contain an "ID" column. The first one is the join
        base; the others are joined onto it with a 1:1 key check.
    columns_to_drop : list of str, optional
        Column names to remove from the joined frame (typically the list
        returned by ``prepare_data_for_training``). ``None`` removes nothing.

    Returns
    -------
    pandas.DataFrame
        A new frame; "ID" is kept so predictions can be matched to rows.
    """
    # None instead of a mutable [] default shared between calls.
    if columns_to_drop is None:
        columns_to_drop = []

    data = dfs[0]
    for df in dfs[1:]:
        data = data.join(df.set_index(["ID"]), on=["ID"], validate='1:1')

    # Non-inplace drop: with a single input frame `data` IS dfs[0], and an
    # in-place drop would silently strip columns from the caller's frame
    # (a second call would then fail with KeyError).
    return data.drop(columns_to_drop, axis=1)


def compute_meta_feature(clf, X_train, X_test, y_train, cv):
    """Build out-of-fold class-probability features for one base classifier.

    For every split produced by ``cv`` a fresh clone of ``clf`` is fitted on
    the training part and its ``predict_proba`` output is stored for the
    held-out part, so each training row is scored by a model that never saw
    it. A further clone fitted on all of ``X_train`` scores ``X_test``.

    Parameters
    ----------
    clf : estimator
        Unfitted classifier providing ``fit`` and ``predict_proba``. It is
        cloned, never fitted itself.
    X_train, y_train : array-like
        Training data; both are indexed with the integer index arrays yielded
        by ``cv.split`` (so pass NumPy arrays, not DataFrames).
    X_test : array-like
        Data to score with the model fitted on the full training set.
    cv : object
        Splitter exposing ``split(X, y)`` that yields
        ``(train_index, predict_index)`` pairs.

    Returns
    -------
    (X_meta_train, X_meta_test)
        ``X_meta_train`` is a float32 array of shape
        ``(len(y_train), n_classes)``; ``X_meta_test`` is whatever
        ``predict_proba`` returns for ``X_test``.
    """
    classes = np.unique(y_train)
    X_meta_train = np.zeros((len(y_train), len(classes)), dtype=np.float32)

    for train_fold_index, predict_fold_index in cv.split(X_train, y_train):
        folded_clf = clone(clf)
        folded_clf.fit(X_train[train_fold_index], y_train[train_fold_index])
        fold_proba = folded_clf.predict_proba(X_train[predict_fold_index])

        # predict_proba has one column per class seen during fit. If a fold's
        # training part misses a class, the columns no longer line up with
        # `classes`; map them explicitly and leave the missing class at 0.
        # Estimators without `classes_` are assumed to cover every class.
        fold_classes = getattr(folded_clf, "classes_", classes)
        class_columns = np.searchsorted(classes, fold_classes)
        X_meta_train[np.ix_(predict_fold_index, class_columns)] = fold_proba

    meta_clf = clone(clf)
    meta_clf.fit(X_train, y_train)

    X_meta_test = meta_clf.predict_proba(X_test)

    return X_meta_train, X_meta_test

def generate_meta_features(classifiers, X_train, X_test, y_train, cv):
    """Concatenate the meta-features of several base classifiers column-wise.

    ``compute_meta_feature`` is called once per classifier, in the given
    order; the train parts are stacked horizontally into one array and the
    test parts into another.

    Returns
    -------
    (stacked_features_train, stacked_features_test)
    """
    train_parts = []
    test_parts = []

    for clf in classifiers:
        meta_train, meta_test = compute_meta_feature(clf, X_train, X_test, y_train, cv)
        train_parts.append(meta_train)
        test_parts.append(meta_test)

    stacked_features_train = np.hstack(train_parts)
    stacked_features_test = np.hstack(test_parts)

    return stacked_features_train, stacked_features_test

def compute_metric(clf, X_train, X_test, y_train, y_test):
    """Fit ``clf`` on the training data and return its macro-F1 on the test data.

    Note that ``clf`` itself is fitted (no clone is made), so the caller's
    estimator is trained as a side effect.

    Returns
    -------
    float
        ``f1_score(..., average='macro')`` rounded to 6 decimals.
    """
    clf.fit(X_train, y_train)
    y_test_pred = clf.predict(X_test)
    # `f1_score` comes from sklearn.metrics (imported in the notebook's import cell).
    macro_f1 = f1_score(y_test, y_test_pred, average='macro')
    return np.round(macro_f1, 6)


In [5]:
# Hold-out evaluation of the hand-rolled stacking pipeline:
# three tree ensembles -> out-of-fold probabilities -> logistic-regression meta model.
data = load_data("../../data/midterm_hackathon/train")

# NOTE(review): the correlation filter is computed on all rows, including the
# ones that end up in the hold-out split below, so the reported score may be
# optimistic. Consider selecting features on the training split only.
X, y, columns_to_drop = prepare_data_for_training(data, 0.25)

X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=42, test_size=0.2, shuffle=True)

gb = GradientBoostingClassifier(random_state = 42, n_estimators = 100)
et = ExtraTreesClassifier(random_state = 42, n_estimators = 50)
rf = RandomForestClassifier(random_state = 42, n_estimators = 50, max_depth=24)

skf = StratifiedKFold(n_splits=30)
# compute_meta_feature indexes with integer arrays, hence the .to_numpy() conversions.
meta_X_train, meta_X_test = generate_meta_features([gb, et, rf], X_train.to_numpy(), X_test.to_numpy(), y_train.to_numpy(), skf)

# `multi_class` is deprecated since scikit-learn 1.5; 'auto' was the default,
# so omitting it keeps the fitted model the same.
meta_lr = LogisticRegression(random_state = 42, penalty = None, solver = 'lbfgs')
compute_metric(meta_lr, meta_X_train, meta_X_test, y_train, y_test)

0.980956

In [6]:
# Final model for the submission: the same stack as above, expressed with
# scikit-learn's StackingClassifier and fitted on ALL labelled rows.
data = load_data("../../data/midterm_hackathon/train")

X, y, columns_to_drop = prepare_data_for_training(data, 0.25)

# Same split as the evaluation cell; only needed to score the model on a hold-out set.
X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=42, test_size=0.2, shuffle=True)

models = [
    ("GradientBoosting", GradientBoostingClassifier(random_state = 42, n_estimators = 100)),
    ("ExtraTrees", ExtraTreesClassifier(random_state = 42, n_estimators = 50)),
    ("RandomForest", RandomForestClassifier(random_state = 42, n_estimators = 50, max_depth=24))
]

# `multi_class` is deprecated since scikit-learn 1.5; 'auto' was the default,
# so omitting it keeps the fitted model the same.
lr = LogisticRegression(random_state = 42, penalty = None, solver = 'lbfgs')

skf = StratifiedKFold(n_splits=30)

model = StackingClassifier(models, final_estimator = lr, cv = skf, stack_method = "predict_proba" )

model.fit(X, y)


In [7]:
# Predict the unlabelled test set and write the submission file.
tdata = load_data("../../data/midterm_hackathon/test")
tX = prepare_data_for_testing(tdata, columns_to_drop)

# Select the columns in training order: the model was fitted on X.columns and
# tX still carries "ID".
y_test_pred = model.predict(tX[X.columns])

# Build the two-column table and let pandas write it. The earlier top-level
# `for id, result in ...` loop rebound the builtin `id` for the rest of the
# kernel session.
submission = pd.DataFrame({"ID": tX["ID"].to_numpy(), "Category": y_test_pred})
submission.to_csv("../../data/midterm_hackathon/submission1.csv", index=False)