In [371]:
import pandas as pd
import numpy as np
import os

from sklearn.ensemble import (AdaBoostClassifier, GradientBoostingClassifier, RandomForestClassifier, ExtraTreesClassifier)
from sklearn.linear_model import LogisticRegression
from sklearn.tree import DecisionTreeClassifier
from sklearn.base import clone
from sklearn.neighbors import KNeighborsClassifier

from sklearn.model_selection import train_test_split, KFold, StratifiedKFold
from sklearn.metrics import f1_score

from sklearn.preprocessing import StandardScaler

In [372]:
def load_data(folder):
    """Read every ``.csv`` file located directly inside ``folder``.

    Files are processed in sorted filename order. ``os.listdir`` returns
    entries in arbitrary order, so without sorting the frame that ends up
    first in the returned list (and is later used as the join base) could
    change between runs or machines.

    Known abbreviated or misspelled headers are renamed to their canonical
    names (e.g. ``IT``/``1D`` -> ``ID``, ``p_mean`` -> ``perimeter_mean``).

    Parameters
    ----------
    folder : str or path-like
        Directory to scan; sub-directories are not traversed.

    Returns
    -------
    list of pandas.DataFrame
        One frame per CSV file, in sorted filename order. Empty if the
        folder contains no ``.csv`` files.
    """
    # Header variants seen in the raw files -> canonical column names.
    columns_mapping = {
        "IT": "ID", "1D": "ID",
        "conc_points_mean": "concave_points_mean",
        "conc_points_std": "concave_points_std",
        "conc_points_max": "concave_points_max",
        "p_mean": "perimeter_mean",
        "p_std": "perimeter_std",
        "p_max": "perimeter_max",
    }

    dfs = []
    for filename in sorted(os.listdir(folder)):
        if not filename.endswith(".csv"):
            continue
        df = pd.read_csv(os.path.join(folder, filename))
        # rename() silently ignores mapping keys that are absent from the frame.
        dfs.append(df.rename(columns=columns_mapping))
    return dfs

def prepare_data_for_training(dfs, bias_for_corr = 0.0):
    """Combine the training frames and filter features by target correlation.

    The first frame is the base; each following frame is joined onto it
    using the ``ID`` and ``Category`` columns as key, with one-to-one
    validation. ``Category`` becomes the target, and ``ID``/``Category`` are
    removed from the feature matrix.

    Parameters
    ----------
    dfs : list of pandas.DataFrame
        Frames that each contain ``ID`` and ``Category`` columns.
    bias_for_corr : float, default 0.0
        Features whose absolute correlation with the target is strictly
        below this threshold are discarded.

    Returns
    -------
    X : pandas.DataFrame
        Remaining feature columns.
    y : pandas.Series
        The ``Category`` column.
    columns_to_drop : list
        Names of the discarded features, ordered by descending absolute
        correlation.

    Notes
    -----
    A feature whose correlation evaluates to NaN is kept, because
    ``NaN < threshold`` is False.
    """
    key_columns = ["ID", "Category"]

    merged = dfs[0]
    for extra in dfs[1:]:
        merged = merged.join(extra.set_index(key_columns), on=key_columns, validate='1:1')

    y = merged['Category']
    features = merged.drop(key_columns, axis=1)

    # Absolute correlation of every feature with the target, strongest first.
    strength_by_feature = features.apply(lambda col: abs(col.corr(y))).sort_values(ascending=False)
    columns_to_drop = [
        name
        for name, strength in strength_by_feature.items()
        if strength < bias_for_corr
    ]

    X = features.drop(columns_to_drop, axis=1)
    return X, y, columns_to_drop

def prepare_data_for_testing(dfs, columns_to_drop = None):
    """Combine the test frames on ``ID`` and remove unwanted columns.

    The first frame is the base; each following frame is joined onto it
    using ``ID`` as key, with one-to-one validation. The ``ID`` column is
    kept in the result.

    The input frames are never modified: the drop produces a new frame, so
    calling this with a single-element ``dfs`` no longer strips columns from
    the caller's own DataFrame.

    Parameters
    ----------
    dfs : list of pandas.DataFrame
        Frames that each contain an ``ID`` column.
    columns_to_drop : iterable of str, optional
        Column names to remove after joining. ``None`` (the default) removes
        nothing.

    Returns
    -------
    pandas.DataFrame
        The joined frame without ``columns_to_drop``.

    Raises
    ------
    KeyError
        If a name in ``columns_to_drop`` is not a column of the joined frame.
    """
    # None instead of a shared mutable [] default.
    if columns_to_drop is None:
        columns_to_drop = []

    data = dfs[0]
    for df in dfs[1:]:
        data = data.join(df.set_index(["ID"]), on=["ID"], validate='1:1')

    # Non-inplace: when len(dfs) == 1, `data` is the caller's own frame.
    return data.drop(columns=list(columns_to_drop))

In [373]:
from sklearn.ensemble import StackingClassifier

# Single seed shared by every stochastic component in this cell.
RANDOM_STATE = 42
# Features with |correlation to Category| below this value are discarded.
CORR_THRESHOLD = 0.25

data = load_data("../../data/midterm_hackathon/train")

X, y, columns_to_drop = prepare_data_for_training(data, CORR_THRESHOLD)

# Hold-out split for ad-hoc validation only; the final model below is fit on
# all rows. To get a hold-out score, fit a clone of `model` on
# (X_train, y_train) and score it on (X_test, y_test).
# NOTE(review): feature selection above already used every row, so such a
# hold-out score is optimistic.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, random_state=RANDOM_STATE, test_size=0.2, shuffle=True
)

# Base learners whose predicted probabilities feed the final estimator.
models = [
    ("GradientBoosting", GradientBoostingClassifier(random_state=RANDOM_STATE, n_estimators=100)),
    ("ExtraTrees", ExtraTreesClassifier(random_state=RANDOM_STATE, n_estimators=50)),
    ("RandomForest", RandomForestClassifier(random_state=RANDOM_STATE, n_estimators=50, max_depth=24)),
]

# Unpenalised logistic regression as the meta-learner. `multi_class` is not
# passed: 'auto' is the default and the parameter is deprecated in recent
# scikit-learn releases.
lr = LogisticRegression(random_state=RANDOM_STATE, penalty=None, solver='lbfgs')

# Folds used to produce the out-of-fold base predictions for stacking.
skf = StratifiedKFold(n_splits=30)

model = StackingClassifier(models, final_estimator=lr, cv=skf, stack_method="predict_proba")

model.fit(X, y)


In [374]:
tdata = load_data("../../data/midterm_hackathon/test")
tX = prepare_data_for_testing(tdata, columns_to_drop)

# Select features by name so their order matches the training matrix.
# tX still carries the ID column, which is not a feature.
y_test_pred = model.predict(tX[X.columns])

# Build the submission as a frame and let pandas serialise it. The earlier
# hand-written loop used `id` as its loop variable, which rebound the built-in
# `id` at notebook scope for the rest of the kernel session.
submission = pd.DataFrame({"ID": tX["ID"].to_numpy(), "Category": y_test_pred})
submission.to_csv("../../data/midterm_hackathon/submission1.csv", index=False)