In [2]:
import pandas as pd
import seaborn as sns

from sklearn.neighbors import KNeighborsClassifier
from sklearn.cluster import KMeans
import sklearn.linear_model as sklm
import sklearn.model_selection as skms
import sklearn.metrics as metrics
import sklearn.pipeline as skpl
import sklearn.preprocessing as skpp
import sklearn.compose as skcmp
import sklearn.impute as skim
import category_encoders as ce

from helpers.preprocessing import *

In [3]:
train_df = pd.read_csv("data/train.csv", index_col="id")
test_df = pd.read_csv("data/test.csv", index_col="id")
orig_df = pd.read_csv("data/orig.csv", index_col="id")

CONT_FEATS = [
    "cap-diameter",
    "stem-height",
    "stem-width"
]
CAT_FEATS = [c for c in train_df.columns if c not in CONT_FEATS]
RESPONSE_COL = "class"

In [4]:
fix_categories(orig_df, train_df, test_df, CAT_FEATS, nan_as_cat=True)
fix_that_one_mushroom_in_test(test_df)


pipeline = skpl.make_pipeline(
    skcmp.ColumnTransformer([
        (
            "float", 
            skpl.make_pipeline
            (
                skim.SimpleImputer(add_indicator=True, strategy="median"),
            ),
            CONT_FEATS
        ),
        (
            "cat", 
            skpp.OneHotEncoder(
                sparse_output=False,
                dtype=np.int8,
                handle_unknown='infrequent_if_exist'),
            [c for c in CAT_FEATS if c != RESPONSE_COL]
        ),
    ]).set_output(transform="pandas"),
    skpp.StandardScaler().set_output(transform="pandas")
)



In [5]:
x_tr = pipeline.fit_transform(train_df)
y_tr = train_df[RESPONSE_COL]
y_tr = y_tr.replace({"e":0, "p":1})

x_test = pipeline.transform(test_df)

### KMeans

In [6]:
kmeans = KMeans(n_clusters=2)
train_preds = kmeans.fit_predict(x_tr, y_tr)

  super()._check_params_vs_input(X, default_n_init=10)


In [9]:
metrics.matthews_corrcoef(y_tr, train_preds)

0.01393712596412315

In [8]:
test_preds = kmeans.predict(x_test)
out_df = pd.DataFrame({"class":test_preds}, index=test_df.index)
out_df["class"] = out_df["class"].replace({0:"e", 1:"p"})
out_df.to_csv("predictions/v2/kmeans.csv")