In [61]:
import numpy as np
import pandas as pd
from sklearn.compose import ColumnTransformer
from sklearn.ensemble import HistGradientBoostingClassifier
from sklearn.impute import SimpleImputer
from sklearn.metrics import f1_score
from sklearn.model_selection import train_test_split
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import OneHotEncoder, OrdinalEncoder

In [44]:
# Input locations for the labelled training data and the unlabelled test data.
# NOTE(review): these are absolute local paths; consider a configurable
# data directory so the notebook runs on other machines.
TRAIN_CSV = r'C:\atlantis_citizens_final.csv'
TEST_CSV = r'C:\test_atlantis_hidden.csv'

train = pd.read_csv(TRAIN_CSV)
test = pd.read_csv(TEST_CSV)

In [71]:
# Target: the occupation label to predict.
y = train["Occupation"]

# Features: every remaining column except the target and the row identifier.
non_feature_cols = ["Occupation", "Citizen_ID"]
X = train.drop(columns=non_feature_cols)

In [70]:
# Names of the object-dtype feature columns; these go to the categorical branch.
cat_cols = X.select_dtypes(include="object").columns.tolist()

In [49]:
# Names of every feature column that is not object dtype; these go to the numeric branch.
num_cols = X.select_dtypes(exclude="object").columns.tolist()

In [72]:

# Preprocessing for the two column groups selected above.
# OneHotEncoder must be in scope here: it is imported alongside the other
# sklearn names in the notebook's import cell (previously only OrdinalEncoder
# was imported, so this cell raised NameError on a fresh kernel).

# Categorical branch: fill missing values with the most frequent category,
# then one-hot encode.
categorical_steps = Pipeline([
    ("imputer", SimpleImputer(strategy="most_frequent")),
    ("onehot", OneHotEncoder(
        handle_unknown="ignore",  # categories unseen during fit do not raise at transform time
        sparse_output=False,      # emit a dense array
        min_frequency=10,         # categories seen fewer than 10 times are grouped as infrequent
    )),
])

# Numeric branch: fill missing values with the column median.
numeric_steps = Pipeline([
    ("imputer", SimpleImputer(strategy="median")),
])

preprocessor = ColumnTransformer(
    [
        ("cat", categorical_steps, cat_cols),
        ("num", numeric_steps, num_cols),
    ]
)

In [73]:
# Hyperparameters of the gradient-boosting classifier, collected in one place.
hgb_params = {
    "max_iter": 300,
    "max_depth": 8,
    "learning_rate": 0.05,
    "min_samples_leaf": 30,
    "random_state": 42,  # fixed seed for reproducible fits
}
model = HistGradientBoostingClassifier(**hgb_params)

In [74]:
# Full modelling pipeline: preprocessing followed by the classifier, so that
# imputation and encoding are learned only from the data passed to fit().
pipeline_steps = [
    ("preprocess", preprocessor),
    ("model", model),
]
pipe = Pipeline(pipeline_steps)

In [75]:
# Hold out 20% of the labelled rows for validation, stratified on the target
# so the split keeps the class proportions; the seed fixes the split.
split_options = {"test_size": 0.2, "stratify": y, "random_state": 42}
X_train, X_val, y_train, y_val = train_test_split(X, y, **split_options)

# Fit on the training portion and score on the held-out portion.
pipe.fit(X_train, y_train)
val_preds = pipe.predict(X_val)

macro_f1 = f1_score(y_val, val_preds, average="macro")
print("Macro F1:", macro_f1)

Macro F1: 0.6026314998133941


In [83]:
# Final fit for the submission. The validation cell already fitted the pipeline
# on X_train/y_train, so refitting on that same split would only repeat the work.
# Train on every labelled row (X, y) instead, so the held-out 20% also
# contributes to the model that predicts the test set.
pipe.fit(X, y)

# Keep the identifiers so they can be paired with the predictions later.
test_ids = test["Citizen_ID"]

In [84]:
# Test features: drop the identifier column, mirroring how X was built
# (the test file is not expected to carry the target column).
X_test = test.drop(labels=["Citizen_ID"], axis="columns")

In [85]:
# Predict an occupation label for every test row with the fitted pipeline.
test_preds = pipe.predict(X=X_test)

In [86]:
# Pair each test identifier with its predicted occupation label.
submission_columns = {
    "Citizen_ID": test_ids,
    "Occupation": test_preds,
}
submission = pd.DataFrame(submission_columns)

In [88]:
# Write the label-valued submission to disk without the DataFrame index.
submission.to_csv(path_or_buf="submission79.csv", index=False)

In [89]:
# Read the submission file back from disk for post-processing.
df = pd.read_csv(filepath_or_buffer="submission79.csv")

In [90]:
# Occupation label -> integer code. The code is the label's position in this
# sequence, so the order below is significant.
occupation_labels = ("Warrior", "Merchant", "Fisher", "Miner", "Scribe")
occupation_map = {label: code for code, label in enumerate(occupation_labels)}

In [91]:
# Convert the predicted occupation labels to their integer codes.
encoded_occupation = df["Occupation"].map(occupation_map)

# Series.map yields NaN for any value that is not a key of the mapping. That
# happens for a label missing from occupation_map, and also if this cell is
# re-run after the column has already been converted to integers. Fail loudly
# rather than silently writing NaN into the submission file.
unmapped_mask = encoded_occupation.isna()
if unmapped_mask.any():
    unknown_values = df.loc[unmapped_mask, "Occupation"].unique().tolist()
    raise ValueError(
        f"Occupation values not present in occupation_map: {unknown_values}"
    )

# All values mapped, so the codes can be stored as plain integers.
df["Occupation"] = encoded_occupation.astype("int64")

In [93]:
# Write the integer-coded submission to disk without the DataFrame index.
df.to_csv(path_or_buf="submission_fixed23.csv", index=False)