In [None]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here's several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
for dirname, _, filenames in os.walk('/kaggle/input'):
    for filename in filenames:
        print(os.path.join(dirname, filename))

# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session


# EDA
**We start with our EDA**
first we will import all the modules and load up our data set onto a dataframe

In [None]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns

In [None]:
df = pd.read_csv("/kaggle/input/iitg-ai-recruitment-2025-beyond-the-box/atlantis_citizens_final.csv")

print(df.head(10))

In [None]:
#we will also run a small check 
print(df.isnull().sum())

right now we see a number of data that has null data meaning its empty we soon understand some stuff that do not hold much meaning like citizenship id and also data that holds meaning we are not gonna just erase rows that are empty but we will impute them with what we know after.

In [None]:
#lets start by cleaning some unnecesaary columns
columns_to_drop = ["Citizen_ID", "Bio_Hash"]
df = df.drop(columns_to_drop, axis = 1).copy()
print(df.head())

In [None]:
df.dtypes#checking to see if each data type is the way we want it to be

In [None]:
print(df.describe())
df.shape

In [None]:
df["Vehicle_Owned"].value_counts().plot(
    kind="bar",
    xlabel="Type of car",
    ylabel="Count",
    title="Vehicle Ownership Distribution"
)

In [None]:
#we want to establish connection between each feature and the occupation and further solidify our reasoning
#we start with non numeric data
#as such we start with our knowledge to check if by occupation affects the cars owned

occupations = df["Occupation"].unique()
for occ in occupations:
    subset = df[df["Occupation"] == occ]
    subset["Vehicle_Owned"].value_counts().plot(kind="bar")
    plt.xlabel("Type of car")
    plt.ylabel("Count")
    plt.title(f"Car types owned for {occ}")
    plt.xticks(rotation=45)
    plt.show()


In [None]:
pd.crosstab(df["Occupation"], df["Vehicle_Owned"], normalize="index")

In [None]:
ct = pd.crosstab(df["Occupation"], df["Vehicle_Owned"], normalize="index")
plt.figure(figsize=(6,6))
sns.heatmap(ct, annot=True, cmap="Blues")
plt.title("Proportion of Vehicle Types per Occupation")
plt.show()

We notice a very important trend the relative proportions of an occupation owning a certain vehicle differs even though it will be harder to notice in absolute numbers this provides a very good insight proving that the vehicle owned plays a crucial role in the occupation deciding factors

In [None]:
#we now move on to diet type
occupations = df["Occupation"].unique()
for occ in occupations:
    subset = df[df["Occupation"] == occ]
    subset["Diet_Type"].value_counts().plot(kind="bar")
    plt.xlabel("Diet type")
    plt.ylabel("Count")
    plt.title(f"Diet types for {occ}")
    plt.xticks(rotation=45)
    plt.show()


In [None]:
ct = pd.crosstab(df["Occupation"], df["Diet_Type"], normalize="index")
plt.figure(figsize=(10,6))
sns.heatmap(ct, annot=True, cmap="Blues")
plt.title("Proportion of Diet Types per Occupation")
plt.show()


The key takeaway from this heat map is we see that all the proportions lie around the 0.32 to 0.34 mark which shows little to no variation so this feature on its own carries very little significance we might later check its significance when paired with other features but for now it is a low impact feature

In [None]:
#district names being studied 
ct = pd.crosstab(df["Occupation"], df["District_Name"], normalize="index")
plt.figure(figsize=(12,6))
sns.heatmap(ct, annot=True, cmap="Purples")
plt.title("Proportion of Districts per Occupation")
plt.show()


A very impactful feature as the heat maps show. Proportions are widely seperated and show clear distinctions one of the strongest distinguishing features we have encountered so far

In [None]:
#lets run the similar heat map testing through work district
ct = pd.crosstab(df["Occupation"], df["Work_District"], normalize="index")
plt.figure(figsize=(12,6))
sns.heatmap(ct, annot=True, cmap="Oranges")
plt.title("Proportion of Work Districts per Occupation")
plt.show()

Like the previous one this shows very great distinctions with nicely space proportions and is another useful one in our distinguishing features

**We are done with out catergorical data now we move on to the numeric ones the thing is with numeric ones we have to make sure that the plot diagrams are nicely picked otherwise we might misjudge it we will try something like boxplot**

In [None]:
#starting with House_size_sq_ft and plotting the boxplot of the data

plt.figure(figsize=(10,6))
sns.boxplot(x="Occupation", y="House_Size_sq_ft", data=df)
plt.title("House Size Distribution by Occupation")
plt.xticks(rotation=45)
plt.show()


This is a very good sign for us as we see that this numeric data aligns very well and somewhat matches with intuition(merchants being wealthy having a higher house size on average while the fishers being poorer have a smaller house), this tells us that our data on house sizes can very well be used for classification

In [None]:
#moving on to wealth index 
plt.figure(figsize=(10,6))
sns.boxplot(x="Occupation", y="Wealth_Index", data=df)
plt.title("Wealth Index Distribution by Occupation")
plt.xticks(rotation=45)
plt.show()


The data is skewed a lot by the outliers, a quick but not so efficient fix would be to go with log of the wealth index and re plotting our box Plots

In [None]:
df["Wealth_Index_log"] = np.log1p(df["Wealth_Index"])

plt.figure(figsize=(10,6))
sns.boxplot(x="Occupation", y="Wealth_Index_log", data=df)
plt.title("Log-Transformed Wealth Index Distribution by Occupation")
plt.xticks(rotation=45)
plt.show()


we see clear distinct seperation in the log values of wealth index proving it to be useful feature for classification and so we will include that in our data

This was our first action under **"Feature Engineering"**

In [None]:
#going with the Life_expectance
plt.figure(figsize=(10,6))
sns.boxplot(x="Occupation", y="Life_Expectancy", data=df)
plt.title("Life Expectancy Distribution by Occupation")
plt.xticks(rotation=45)
plt.show()

we see that our life expectancy data is also a very important column as even though with outliers our data is quite well behaved and shows proper distinguishing features

In [None]:
plt.figure(figsize=(12,8))
sns.pairplot(df, 
             vars=["House_Size_sq_ft", "Wealth_Index_log", "Life_Expectancy"], 
             hue="Occupation", 
             diag_kind="kde", 
             plot_kws={"alpha":0.6})
plt.show()


In [None]:
df  = df.drop("Wealth_Index", axis = 1).copy()
plt.figure(figsize=(10,8))
sns.heatmap(df.corr(numeric_only=True), 
            annot=True, 
            cmap="coolwarm", 
            center=0)
plt.title("Correlation Heatmap of Numeric Features")
plt.show()

Our analysis of the three numeric data gave a lot of information, the house sizes andwealth index log is showing a lot of correlation and so does the other numeric data

In [None]:
df[["House_Size_sq_ft", "Wealth_Index_log", "Life_Expectancy"]] = (
    df.groupby("Occupation")[["House_Size_sq_ft", "Wealth_Index_log", "Life_Expectancy"]]
      .transform(lambda x: x.fillna(x.median()))
)

In [None]:
df.isna().sum()

# The First Model (Random Forest)
our data preprocessing is done for now we have included the features that are very effective and picked out ones which are useful now it is time for us to see how our data analysis can perform when we put a model to use it.
We start with a basic Random Forest Classfier model to see how it performs as a baseline
it will be using F1 macro score as its evaluation metric

In [None]:
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import OneHotEncoder
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import f1_score

# Features and target
X = df.drop(columns=["Occupation"])
y = df["Occupation"]

# Identifing our categorical and numeric columns
categorical_cols = X.select_dtypes(include=["object"]).columns
numeric_cols = X.select_dtypes(include=["int64","float64"]).columns

# getting preprocessing done
preprocessor = ColumnTransformer(
    transformers=[
        ("cat", OneHotEncoder(handle_unknown="ignore"), categorical_cols),
        ("num", "passthrough", numeric_cols)
    ]
)

clf = Pipeline(steps=[
    ("preprocessor", preprocessor),
    ("model", RandomForestClassifier(n_estimators=200, random_state=42))
])

# Train-test split
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, stratify=y, random_state=42)

# Fitting our model with the train set 
clf.fit(X_train, y_train)

# Predicting and evaluateing
y_pred = clf.predict(X_test)
print("F1-macro:", f1_score(y_test, y_pred, average="macro"))


**Our Test results**
we see that from our test results that our macro score has crossed the 0.5 mark without much effort and is sitting close to 0.58
we shall now see how much we can improve on this model and learn what we can from this before moving to a more advanced model
We start by introducing a change : making the class weight balanced so that all classes are equally distributed in the training data

**test 2 with our random forest**

In [None]:
from sklearn.ensemble import RandomForestClassifier
from sklearn.pipeline import Pipeline
from sklearn.compose import ColumnTransformer
from sklearn.preprocessing import OneHotEncoder
from sklearn.metrics import f1_score,accuracy_score
from sklearn.model_selection import train_test_split


X = df.drop(columns=["Occupation"])
y = df["Occupation"]


categorical_cols = X.select_dtypes(include=["object"]).columns
numeric_cols = X.select_dtypes(include=["int64","float64"]).columns


preprocessor = ColumnTransformer(
    transformers=[
        ("cat", OneHotEncoder(handle_unknown="ignore"), categorical_cols),
        ("num", "passthrough", numeric_cols)
    ]
)


clf_balanced = Pipeline(steps=[
    ("preprocessor", preprocessor),
    ("model", RandomForestClassifier(
        n_estimators=200,
        random_state=42,
        class_weight="balanced"   # balancing our class weights so that even minority classes are represented
    ))
])


X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, stratify=y, random_state=42
)


clf_balanced.fit(X_train, y_train)
y_pred = clf_balanced.predict(X_test)
print("F1-macro (balanced):", f1_score(y_test, y_pred, average="macro"))
print("accuracy: ", accuracy_score(y_test,y_pred))
clf=clf_balanced

We see this betters our score by a very small fraction but still improves it we shall now improve on it further

**Test 3 with random forest using K-fold CV**

With our first model of a Random forest Classifier we are achieving a test result of a F1 Macro score of 58.48% that is giving us a few more doors of opportunity to explore, but we want to make sure that our random forest classifier be maxed out with its capacities before going to other models.
Lets try Hyperparameter tuning.
We set up a stratified K fold CV to see how it scores  using still F1 macro as our evaluation matrix

In [None]:
from sklearn.model_selection import StratifiedKFold, cross_val_score
from sklearn.metrics import make_scorer, f1_score


f1_macro = make_scorer(f1_score, average="macro")


cv = StratifiedKFold(n_splits= 10, shuffle=True, random_state=42)


scores = cross_val_score(clf, X, y, cv=cv, scoring="f1_macro")

print("F1-macro scores per fold:", scores)
print("Average F1-macro:", scores.mean())


# Hyperparameter tuning for our Random Forest


We see that our model does cross 0.6 mark and is consistendly near that across all the folds and even reaching as high as 0.62.
We will now try hyper Parameter tuning using randomized search CV

In [None]:
from sklearn.model_selection import RandomizedSearchCV
from sklearn.metrics import make_scorer, f1_score
import numpy as np


f1_macro = make_scorer(f1_score, average="macro")


param_dist = {
    "model__n_estimators": [200, 300, 500],
    "model__max_depth": [None, 10, 20, 30],
    "model__min_samples_split": [2, 5, 10],
    "model__min_samples_leaf": [1, 2, 4],
    "model__max_features": ["sqrt", "log2"],
    "model__class_weight": [None, "balanced"]
}


random_search = RandomizedSearchCV(
    clf,                
    param_distributions=param_dist,
    n_iter=20,         
    cv=5,               
    scoring="f1_macro",
    random_state=42,
    n_jobs=-1
)

random_search.fit(X, y)

print("Best parameters:", random_search.best_params_)
print("Best CV F1-macro:", random_search.best_score_)


Our score is very much near 0.63 and we realise our best parameters for the model

**Testing with our best found parameters**

In [None]:
from sklearn.preprocessing import OneHotEncoder
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.ensemble import RandomForestClassifier


X = df.drop(columns=["Occupation"])
y = df["Occupation"]


categorical_cols = X.select_dtypes(include=["object"]).columns
numeric_cols = X.select_dtypes(include=["int64","float64"]).columns


preprocessor = ColumnTransformer(
    transformers=[
        ("cat", OneHotEncoder(handle_unknown="ignore"), categorical_cols),
        ("num", "passthrough", numeric_cols)
    ]
)

# Pipeline with tuned hyperparameters
clf = Pipeline(steps=[
    ("preprocessor", preprocessor),
    ("model", RandomForestClassifier(
        n_estimators=500,
        max_depth=10,
        min_samples_split=5,
        min_samples_leaf=1,
        max_features="sqrt",
        class_weight="balanced",
        random_state=42,
        n_jobs=-1
    ))
])

# Fit on the full dataset (no split)
clf.fit(X, y)
print("training compelte")

In [None]:
df_test = pd.read_csv("/kaggle/input/iitg-ai-recruitment-2025-beyond-the-box/test_atlantis_hidden.csv")
print(df.isna().sum())

In [None]:
df_test["Wealth_Index_log"] = np.log1p(df_test["Wealth_Index"])
ids = df_test["Citizen_ID"]
df_test = df_test.drop(["Wealth_Index", "Bio_Hash", "Citizen_ID"], axis = 1).copy()
print(df_test.head())

# First Submission

In [None]:

occupation_map = {
    "Warrior": 0,
    "Merchant": 1,
    "Fisher": 2,
    "Miner": 3,
    "Scribe": 4
}


test_preds = clf.predict(df_test)


test_preds_encoded = [occupation_map[label] for label in test_preds]


submission = pd.DataFrame({
    "Citizen_ID": ids,              
    "Occupation": test_preds_encoded
})


submission.to_csv("submission__.csv", index=False)
print("Submission file created with encoded Occupation labels.")


**Our model score 00.607 on the scoreboard but we need to push it beyond 0.65 as much as we can**
we start by going deeper into our random forest and use Randomized search to find the best parameters and then output the F1 macro score and then train it on our entire dataset and give the output


In [None]:
from sklearn.preprocessing import OneHotEncoder
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import train_test_split
from sklearn.metrics import f1_score


X = df.drop(columns=["Occupation"])
y = df["Occupation"]


categorical_cols = X.select_dtypes(include=["object"]).columns
numeric_cols = X.select_dtypes(include=["int64","float64"]).columns


preprocessor = ColumnTransformer(
    transformers=[
        ("cat", OneHotEncoder(handle_unknown="ignore"), categorical_cols),
        ("num", "passthrough", numeric_cols)
    ]
)


clf = Pipeline(steps=[
    ("preprocessor", preprocessor),
    ("model", RandomForestClassifier(
        n_estimators=1000,
        min_samples_split=20,
        min_samples_leaf=1,
        max_features=0.5,
        max_depth=30,
        class_weight="balanced",
        random_state=42,
        n_jobs=4
    ))
])


X_train, X_test, y_train, y_test = train_test_split(
    X, y,
    test_size=0.2,
    stratify=y,
    random_state=42
)


clf.fit(X_train, y_train)


test_preds = clf.predict(X_test)
print("Test F1-macro:", f1_score(y_test, test_preds, average="macro", zero_division=0))


**Training on whole dataSet to see if our score on submission betters**

In [None]:
from sklearn.preprocessing import OneHotEncoder
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import train_test_split
from sklearn.metrics import f1_score

# Features and target
X = df.drop(columns=["Occupation"])
y = df["Occupation"]


categorical_cols = X.select_dtypes(include=["object"]).columns
numeric_cols = X.select_dtypes(include=["int64","float64"]).columns


preprocessor = ColumnTransformer(
    transformers=[
        ("cat", OneHotEncoder(handle_unknown="ignore"), categorical_cols),
        ("num", "passthrough", numeric_cols)
    ]
)


clf = Pipeline(steps=[
    ("preprocessor", preprocessor),
    ("model", RandomForestClassifier(
        n_estimators=1000,
        min_samples_split=20,
        min_samples_leaf=1,
        max_features=0.5,
        max_depth=30,
        class_weight="balanced",
        random_state=42,
        n_jobs=4
    ))
])



clf.fit(X, y)

Submitting after training on whole data frame

In [None]:

occupation_map = {
    "Warrior": 0,
    "Merchant": 1,
    "Fisher": 2,
    "Miner": 3,
    "Scribe": 4
}


test_preds = clf.predict(df_test)


test_preds_encoded = [occupation_map[label] for label in test_preds]


submission = pd.DataFrame({
    "Citizen_ID": ids,              
    "Occupation": test_preds_encoded
})


submission.to_csv("submission_3.csv", index=False)
print("Submission file created with encoded Occupation labels.")


**Test 1:**
Best parameters: {'model__n_estimators': 1000, 'model__min_samples_split': 20, 'model__min_samples_leaf': 1,
'model__max_features': 0.5, 'model__max_depth': 30, 'model__class_weight': 'balanced'}
Best CV F1-macro: 0.6373094017590735
Test F1-macro: 0.6331419918941843

In [None]:
from sklearn.preprocessing import OneHotEncoder
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import train_test_split
from sklearn.metrics import f1_score


X = df.drop(columns=["Occupation"])
y = df["Occupation"]


categorical_cols = X.select_dtypes(include=["object"]).columns.drop("Diet_Type")
numeric_cols = X.select_dtypes(include=["int64","float64"]).columns

preprocessor = ColumnTransformer(
    transformers=[
        ("cat", OneHotEncoder(handle_unknown="ignore"), categorical_cols),
        ("num", "passthrough", numeric_cols)
    ]
)

clf = Pipeline(steps=[
    ("preprocessor", preprocessor),
    ("model", RandomForestClassifier(
        n_estimators=1000,
        min_samples_split=20,
        min_samples_leaf=1,
        max_features=0.5,
        max_depth=30,
        class_weight="balanced",
        random_state=42,
        n_jobs=4
    ))
])


X_train, X_test, y_train, y_test = train_test_split(
    X, y,
    test_size=0.2,
    stratify=y,
    random_state=42
)


clf.fit(X_train, y_train)


test_preds = clf.predict(X_test)
print("Test F1-macro:", f1_score(y_test, test_preds, average="macro", zero_division=0))


*We tried dropping the diet type column but as it turns out our model's score has dropped we will try introducing some features before moving on to another model*

# Feature Engineering (The Wealth Per Capita)

we enlist our first completely engineered feature of our dataset
the wealth per capita. we encountered it during our EDA in task 1

In [None]:

df["Wealth_per_capita"] = df["Wealth_Index_log"] / (df["House_Size_sq_ft"] + 1)

print(df.groupby("Occupation")["Wealth_per_capita"].describe())

plt.figure(figsize=(10,6))
sns.boxplot(x="Occupation", y="Wealth_per_capita", data=df)
plt.xticks(rotation=45)
plt.title("Wealth per capita distribution across Occupations")
plt.show()

mean_values = df.groupby("Occupation")["Wealth_per_capita"].mean()
mean_values = mean_values / mean_values.sum()
print("Normalized mean Wealth_per_capita proportions:\n", mean_values)


We see that this variation is more distinct and we get much more info out of this feature that the raw ones seperately

In [None]:
df[["Wealth_Index_log", "House_Size_sq_ft", "Wealth_per_capita"]].corr()

we see a quite an appreciable correlation between all the three we shall decide to keep it

In [None]:
df.dtypes

**Training with our engineered features**

In [None]:
from sklearn.preprocessing import OneHotEncoder
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import train_test_split
from sklearn.metrics import f1_score


X = df.drop(columns=["Occupation"])
y = df["Occupation"]


categorical_cols = X.select_dtypes(include=["object"]).columns
numeric_cols = X.select_dtypes(include=["int64","float64"]).columns


preprocessor = ColumnTransformer(
    transformers=[
        ("cat", OneHotEncoder(handle_unknown="ignore"), categorical_cols),
        ("num", "passthrough", numeric_cols)
    ]
)


clf = Pipeline(steps=[
    ("preprocessor", preprocessor),
    ("model", RandomForestClassifier(
        n_estimators=1000,
        min_samples_split=20,
        min_samples_leaf=1,
        max_features=0.5,
        max_depth=30,
        class_weight="balanced",
        random_state=42,
        n_jobs=4
    ))
])


X_train, X_test, y_train, y_test = train_test_split(
    X, y,
    test_size=0.2,
    stratify=y,
    random_state=42
)


clf.fit(X_train, y_train)


test_preds = clf.predict(X_test)
print("Test F1-macro:", f1_score(y_test, test_preds, average="macro", zero_division=0))


In [None]:
feature_names = clf.named_steps["preprocessor"].get_feature_names_out()
importances = clf.named_steps["model"].feature_importances_
feat_imp = pd.Series(importances, index=feature_names).sort_values(ascending=False)
print(feat_imp.head(20))

**Experimenting by dropping Vehicle_owned feature**

In [None]:
from sklearn.preprocessing import OneHotEncoder
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import train_test_split
from sklearn.metrics import f1_score


X = df.drop(columns=["Occupation"])
y = df["Occupation"]


categorical_cols = X.select_dtypes(include=["object"]).columns.drop("Vehicle_Owned")
numeric_cols = X.select_dtypes(include=["int64","float64"]).columns


preprocessor = ColumnTransformer(
    transformers=[
        ("cat", OneHotEncoder(handle_unknown="ignore"), categorical_cols),
        ("num", "passthrough", numeric_cols)
    ]
)


clf = Pipeline(steps=[
    ("preprocessor", preprocessor),
    ("model", RandomForestClassifier(
        n_estimators=1000,
        min_samples_split=20,
        min_samples_leaf=1,
        max_features=0.5,
        max_depth=30,
        class_weight="balanced",
        random_state=42,
        n_jobs=4
    ))
])


X_train, X_test, y_train, y_test = train_test_split(
    X, y,
    test_size=0.2,
    stratify=y,
    random_state=42
)


clf.fit(X_train, y_train)


test_preds = clf.predict(X_test)
print("Test F1-macro:", f1_score(y_test, test_preds, average="macro", zero_division=0))


We see our model performance more or less unchanged meaning that the lower importance feature "Vehicle Owned" was not contributing anything substantial but noise so we shall drop that feature as we move forth

# Experimenting with Features

since district name and work district also had an influence on the job and and it greatly varied the diet type i want to experiment using bins like such so that we can find if the model can use that for classifying

In [None]:
from sklearn.preprocessing import OneHotEncoder
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import train_test_split
from sklearn.metrics import f1_score

df["District_Diet"] = df["District_Name"] + "_" + df["Diet_Type"]
df["WorkDistrict_Diet"] = df["Work_District"] + "_" + df["Diet_Type"]

X = df.drop(columns=["Occupation"])
y = df["Occupation"]

categorical_cols = X.select_dtypes(include=["object"]).columns.drop("Vehicle_Owned")
numeric_cols = X.select_dtypes(include=["int64","float64"]).columns

preprocessor = ColumnTransformer(
    transformers=[
        ("cat", OneHotEncoder(handle_unknown="ignore"), categorical_cols),
        ("num", "passthrough", numeric_cols)
    ]
)

clf = Pipeline(steps=[
    ("preprocessor", preprocessor),
    ("model", RandomForestClassifier(
        n_estimators=1000,
        min_samples_split=20,
        min_samples_leaf=1,
        max_features=0.5,
        max_depth=30,
        class_weight="balanced",
        random_state=42,
        n_jobs=4
    ))
])

X_train, X_test, y_train, y_test = train_test_split(
    X, y,
    test_size=0.2,
    stratify=y,
    random_state=42
)

clf.fit(X_train, y_train)

test_preds = clf.predict(X_test)
print("Test F1-macro:", f1_score(y_test, test_preds, average="macro", zero_division=0))


In [None]:
feature_names = clf.named_steps["preprocessor"].get_feature_names_out()
importances = clf.named_steps["model"].feature_importances_

feat_imp = pd.Series(importances, index=feature_names).sort_values(ascending=False)
print(feat_imp.head(30))  # top 30 features


The model did not show any improvement with these features and on the importance list they rank very less

In [None]:
from sklearn.preprocessing import OneHotEncoder
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import train_test_split
from sklearn.metrics import f1_score

X = df.drop(columns=["Occupation", "District_Diet", "WorkDistrict_Diet"])
y = df["Occupation"]

categorical_cols = X.select_dtypes(include=["object"]).columns.drop("Vehicle_Owned")
numeric_cols = X.select_dtypes(include=["int64","float64"]).columns

preprocessor = ColumnTransformer(
    transformers=[
        ("cat", OneHotEncoder(handle_unknown="ignore"), categorical_cols),
        ("num", "passthrough", numeric_cols)
    ]
)

clf = Pipeline(steps=[
    ("preprocessor", preprocessor),
    ("model", RandomForestClassifier(
        n_estimators=1000,
        min_samples_split=20,
        min_samples_leaf=1,
        max_features=0.5,
        max_depth=30,
        class_weight="balanced",
        random_state=42,
        n_jobs=4
    ))
])

X_train, X_test, y_train, y_test = train_test_split(
    X, y,
    test_size=0.2,
    stratify=y,
    random_state=42
)

clf.fit(X_train, y_train)

test_preds = clf.predict(X_test)
print("Test F1-macro:", f1_score(y_test, test_preds, average="macro", zero_division=0))


Experimenting this time with our wealth index

In [None]:
import pandas as pd

df["Wealth_Index_log_bin"] = pd.qcut(df["Wealth_Index_log"], q=4, labels=["low","mid_low","mid_high","high"])
df["House_Size_bin"] = pd.qcut(df["House_Size_sq_ft"], q=4, labels=["small","medium_small","medium_large","large"])

X = df.drop(columns=["Occupation", "District_Diet", "WorkDistrict_Diet","Vehicle_Owned"])
y = df["Occupation"]

categorical_cols = X.select_dtypes(include=["object"]).columns
numeric_cols = X.select_dtypes(include=["int64","float64"]).columns

from sklearn.preprocessing import OneHotEncoder
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import train_test_split
from sklearn.metrics import f1_score

preprocessor = ColumnTransformer(
    transformers=[
        ("cat", OneHotEncoder(handle_unknown="ignore"), categorical_cols),
        ("num", "passthrough", numeric_cols)
    ]
)

clf = Pipeline(steps=[
    ("preprocessor", preprocessor),
    ("model", RandomForestClassifier(
        n_estimators=1000,
        min_samples_split=20,
        min_samples_leaf=1,
        max_features=0.5,
        max_depth=30,
        class_weight="balanced",
        random_state=42,
        n_jobs=4
    ))
])

X_train, X_test, y_train, y_test = train_test_split(
    X, y,
    test_size=0.2,
    stratify=y,
    random_state=42
)

clf.fit(X_train, y_train)
test_preds = clf.predict(X_test)
print("Test F1-macro:", f1_score(y_test, test_preds, average="macro", zero_division=0))


In [None]:
feature_names = clf.named_steps["preprocessor"].get_feature_names_out()
importances = clf.named_steps["model"].feature_importances_

feat_imp = pd.Series(importances, index=feature_names).sort_values(ascending=False)
print(feat_imp.head(30))  # top 30 features

In [None]:
df.columns

**Training another model after dropping the experimented features**

In [None]:
df = df.drop(["Vehicle_Owned", "District_Diet", "WorkDistrict_Diet", "Wealth_Index_log_bin","House_Size_bin"],axis=1)
print(df.columns)

# CatBoost Model

the lines that follow each codel block contain the the console logs that the catboost model showed

In [None]:
from catboost import CatBoostClassifier
from sklearn.model_selection import train_test_split
from sklearn.metrics import f1_score

X = df.drop(columns=["Occupation"])
y = df["Occupation"]

categorical_cols = X.select_dtypes(include=["object"]).columns.tolist()

X_train, X_test, y_train, y_test = train_test_split(
    X, y,
    test_size=0.2,
    stratify=y,
    random_state=42
)

clf = CatBoostClassifier(
    iterations=1000,
    depth=8,
    learning_rate=0.05,
    loss_function="MultiClass",
    eval_metric="MultiClass",
    cat_features=categorical_cols,
    random_seed=42,
    verbose=200
)

clf.fit(X_train, y_train,eval_set = (X_test,y_test))

test_preds = clf.predict(X_test)
print("Test F1-macro:", f1_score(y_test, test_preds, average="macro", zero_division=0))



# Test 1

0:	learn: 1.5601968	total: 135ms	remaining: 2m 14s
200:	learn: 0.7812936	total: 14.5s	remaining: 57.6s
400:	learn: 0.6807556	total: 29.7s	remaining: 44.4s
600:	learn: 0.6249210	total: 45s	remaining: 29.9s
800:	learn: 0.5795493	total: 1m	remaining: 15s
999:	learn: 0.5403172	total: 1m 15s	remaining: 0us
Test F1-macro: 0.6517691465783431 
This was test 1 with CatBoost and no modifications


# Test 2
0:	learn: 1.5601968	test: 1.5628883	best: 1.5628883 (0)	total: 79ms	remaining: 1m 18s
200:	learn: 0.7812936	test: 0.8650590	best: 0.8650590 (200)	total: 14.8s	remaining: 58.9s
400:	learn: 0.6807556	test: 0.8129358	best: 0.8129358 (400)	total: 30.4s	remaining: 45.4s
600:	learn: 0.6249210	test: 0.8044889	best: 0.8043910 (596)	total: 45.9s	remaining: 30.5s
800:	learn: 0.5795493	test: 0.8023722	best: 0.8017557 (700)	total: 1m 1s	remaining: 15.3s
999:	learn: 0.5403172	test: 0.8041424	best: 0.8017557 (700)	total: 1m 17s	remaining: 0us

bestTest = 0.8017556836
bestIteration = 700

Shrink model to first 701 iterations.
Test F1-macro: 0.6592440471947738
This was our test 2 with CatBoost and setting eval_set so that our model does not flatlines when learning and stops when it sees that its performance is falling 



# Experimenting with class weights

we tried modifying the class weights so that the minority classes get equal representation we try many different formula for class weight

In [None]:
import numpy as np
from catboost import CatBoostClassifier
from sklearn.model_selection import train_test_split
from sklearn.metrics import f1_score

X = df.drop(columns=["Occupation"])
y = df["Occupation"]

classes, counts = np.unique(y, return_counts=True)
total = len(y)


weights = {cls: total / (len(classes) * count) for cls, count in zip(classes, counts)}


class_weights = [weights[cls] for cls in classes]

print("Class weights:", dict(zip(classes, class_weights)))

X_train, X_test, y_train, y_test = train_test_split(
    X, y,
    test_size=0.2,
    stratify=y,
    random_state=42
)


clf = CatBoostClassifier(
    iterations=1000,
    depth=8,
    learning_rate=0.05,
    loss_function="MultiClass",
    cat_features=X.select_dtypes(include=["object"]).columns.tolist(),
    class_weights=class_weights,
    random_seed=42,
    verbose=200
)

clf.fit(X_train, y_train, eval_set=(X_test, y_test), early_stopping_rounds=100)

test_preds = clf.predict(X_test)
print("Test F1-macro:", f1_score(y_test, test_preds, average="macro", zero_division=0))


# Test 3
Class weights: {'Fisher': np.float64(1.0045280612244898), 'Merchant': np.float64(0.8911456859971711), 'Miner': np.float64(1.0403566710700132), 'Scribe': np.float64(1.249583498611662), 'Warrior': np.float64(0.8921551968280941)}
0:	learn: 1.5647856	test: 1.5670019	best: 1.5670019 (0)	total: 42.4ms	remaining: 42.4s
200:	learn: 0.7888568	test: 0.8761359	best: 0.8761359 (200)	total: 15.2s	remaining: 1m
400:	learn: 0.6879808	test: 0.8229574	best: 0.8229574 (400)	total: 30.9s	remaining: 46.2s
600:	learn: 0.6336274	test: 0.8129771	best: 0.8127935 (595)	total: 46.7s	remaining: 31s
800:	learn: 0.5863063	test: 0.8110601	best: 0.8100837 (757)	total: 1m 2s	remaining: 15.6s
Stopped by overfitting detector  (100 iterations wait)

bestTest = 0.8100836626
bestIteration = 757

Shrink model to first 758 iterations.
Test F1-macro: 0.6522951194726081

this test was done with putting class weight inversely proportional to their counts we do notice a slight dip in score

In [None]:
import numpy as np
from catboost import CatBoostClassifier
from sklearn.model_selection import train_test_split
from sklearn.metrics import f1_score

X = df.drop(columns=["Occupation"])
y = df["Occupation"]


classes, counts = np.unique(y, return_counts=True)
total = len(y)


weights = {cls: np.sqrt(total/(len(classes)*count)) for cls, count in zip(classes, counts)}


class_weights = [weights[cls] for cls in classes]

print("Class weights:", dict(zip(classes, class_weights)))

X_train, X_test, y_train, y_test = train_test_split(
    X, y,
    test_size=0.2,
    stratify=y,
    random_state=42
)


clf = CatBoostClassifier(
    iterations=1000,
    depth=8,
    learning_rate=0.05,
    loss_function="MultiClass",
    cat_features=X.select_dtypes(include=["object"]).columns.tolist(),
    class_weights=class_weights,
    random_seed=42,
    verbose=200
)

clf.fit(X_train, y_train, eval_set=(X_test, y_test), early_stopping_rounds=100)

test_preds = clf.predict(X_test)
print("Test F1-macro:", f1_score(y_test, test_preds, average="macro", zero_division=0))


In [None]:
from sklearn.metrics import classification_report
print(classification_report(y_test, test_preds))


# Test 4
Class weights: {'Fisher': np.float64(1.002261473481092), 'Merchant': np.float64(0.9440051302811713), 'Miner': np.float64(1.0199787601072943), 'Scribe': np.float64(1.1178477081479667), 'Warrior': np.float64(0.9445396745653907)}
0:	learn: 1.5583713	test: 1.5607479	best: 1.5607479 (0)	total: 107ms	remaining: 1m 46s
200:	learn: 0.7904772	test: 0.8750976	best: 0.8750976 (200)	total: 15.5s	remaining: 1m 1s
400:	learn: 0.6854620	test: 0.8185688	best: 0.8185688 (400)	total: 31.3s	remaining: 46.8s
600:	learn: 0.6250139	test: 0.8088275	best: 0.8086688 (596)	total: 47.2s	remaining: 31.4s
800:	learn: 0.5792265	test: 0.8071120	best: 0.8071081 (799)	total: 1m 3s	remaining: 15.7s
Stopped by overfitting detector  (100 iterations wait)

bestTest = 0.8063136285
bestIteration = 843

Shrink model to first 844 iterations.
Test F1-macro: 0.65816499293084

this test was done with catboost having sqrt of inverse as the weights from the classwise score we understand the model is not performing good enough for miners and fishers

In [None]:
import numpy as np
from catboost import CatBoostClassifier
from sklearn.model_selection import train_test_split
from sklearn.metrics import f1_score, classification_report

X = df.drop(columns=["Occupation"])
y = df["Occupation"]


classes, counts = np.unique(y, return_counts=True)
total = len(y)


weights = {cls: 1.0 for cls in classes}


weights["Fisher"] = 1.3   
weights["Miner"]  = 1.3   


class_weights = [weights[cls] for cls in classes]

print("Selective class weights:", dict(zip(classes, class_weights)))

X_train, X_test, y_train, y_test = train_test_split(
    X, y,
    test_size=0.2,
    stratify=y,
    random_state=42
)

clf = CatBoostClassifier(
    iterations=1000,
    depth=8,
    learning_rate=0.05,
    loss_function="MultiClass",
    cat_features=X.select_dtypes(include=["object"]).columns.tolist(),
    class_weights=class_weights,
    random_seed=42,
    verbose=200
)

clf.fit(X_train, y_train, eval_set=(X_test, y_test), early_stopping_rounds=100)

test_preds = clf.predict(X_test)
print("Test F1-macro:", f1_score(y_test, test_preds, average="macro", zero_division=0))
print(classification_report(y_test, test_preds))


# Test 5
this test we did it with manually increasing the wieghts of minority classes so their classification is done better , next we try hyper paramter tuning with grid search CV

In [None]:
from sklearn.model_selection import GridSearchCV
from catboost import CatBoostClassifier

param_grid = {
    'depth': [6, 8, 10],
    'learning_rate': [0.03, 0.05],
    'iterations': [1000, 2000],
    'l2_leaf_reg': [5, 7]
}

clf = CatBoostClassifier(
    loss_function="MultiClass",
    cat_features=X.select_dtypes(include=["object"]).columns.tolist(),
    class_weights=class_weights,
    random_seed=42,
    verbose=200
)

grid = GridSearchCV(
    clf,
    param_grid,
    scoring='f1_macro',
    cv=2,
    n_jobs=-1
)


grid.fit(
    X_train, y_train,
    eval_set=(X_test, y_test),
    early_stopping_rounds=100
)

print("Best params:", grid.best_params_)
print("Best CV F1-macro:", grid.best_score_)


**we tried experimenting with a bigger grid but our notebook was crashing and we were getting errors so we tried a smaller grid**

In [None]:
from sklearn.model_selection import GridSearchCV
from catboost import CatBoostClassifier


param_grid = {
    'learning_rate': [0.02, 0.03, 0.04, 0.05],  
    'iterations': [1000, 1500],                
    'depth': [8],                               
    'l2_leaf_reg': [5]                        
}

clf = CatBoostClassifier(
    loss_function="MultiClass",
    cat_features=X.select_dtypes(include=["object"]).columns.tolist(),
    class_weights=class_weights,  
    random_seed=42,
    verbose=200
)

grid = GridSearchCV(
    clf,
    param_grid,
    scoring='f1_macro',
    cv=2,         
    n_jobs=-1
)


grid.fit(
    X_train, y_train,
    eval_set=(X_test, y_test),
    early_stopping_rounds=100
)

print("Best params:", grid.best_params_)
print("Best CV F1-macro:", grid.best_score_)


Creating another submission with training on full set with our parameters

In [None]:
import numpy as np
from catboost import CatBoostClassifier

X = df.drop(columns=["Occupation"])
y = df["Occupation"]


classes, counts = np.unique(y, return_counts=True)


weights = {cls: 1.0 for cls in classes}


weights["Fisher"] = 1.3   
weights["Miner"]  = 1.3  


class_weights = [weights[cls] for cls in classes]

print("Selective class weights:", dict(zip(classes, class_weights)))


clf = CatBoostClassifier(
    iterations=1000,
    depth=8,
    learning_rate=0.05,
    loss_function="MultiClass",
    cat_features=X.select_dtypes(include=["object"]).columns.tolist(),
    class_weights=class_weights,
    random_seed=42,
    verbose=200
)

clf.fit(X, y)


we do notice a bit of a problem as the model did crash in between and we lost a bit of code but we will try to replicate where we left




Index(['Diet_Type', 'District_Name', 'Occupation', 'Wealth_Index', 'House_Size_sq_ft', 'Life_Expectancy', 'Vehicle_Owned', 'Work_District', 'House_Size_log', 'Wealth_Index_log', 'Wealth_per_capita', 'Wealth_per_year'], dtype='object')


We try running a Randomized CV search of this grid and find out the best parameters

A Little bit of code was lost as the bigger randomized grid search was done we did get the best parameters but it kept crashin when tried to restart we tried setting n_jobs = 4 but it only resulted in a longer process all in all we did boil down to some final best parameters 

In [1]:
import pandas as pd
import numpy as np
df = pd.read_csv("/kaggle/input/iitg-ai-recruitment-2025-beyond-the-box/atlantis_citizens_final.csv")

df["House_Size_log"] = np.log1p(df["House_Size_sq_ft"])
df["Wealth_Index_log"] = np.log1p(df["Wealth_Index"])
df["Wealth_per_capita"] = df["Wealth_Index"] / (df["House_Size_sq_ft"] +  1e-6)
df["Wealth_per_year"] = df["Wealth_Index"] / (df["Life_Expectancy"] +  1e-6)
df = df.drop(["Bio_Hash", "Citizen_ID"], axis = 1)

Running a code on randomized search CV to get recommended parameters for our model


In [None]:
from sklearn.model_selection import RandomizedSearchCV 
from catboost import CatBoostClassifier 
import numpy as np
from sklearn.model_selection import train_test_split

param_dist = { 
    'learning_rate': [0.02, 0.03, 0.05], 
    'depth': [6, 8, 10], 
    'l2_leaf_reg': [1, 3, 5], 
    'iterations': [800, 1000] 
}




X = df.drop(columns=["Occupation", "House_Size_sq_ft"])
y = df["Occupation"]



classes, counts = np.unique(y, return_counts=True)
weights = {cls: 1.0 for cls in classes}


weights["Fisher"] = 1.3   
weights["Miner"]  = 1.3   
class_weights = [weights[cls] for cls in classes]


cat_features=X.select_dtypes(include=["object"]).columns.tolist()





total = len(y)


X_train, X_test, y_train, y_test = train_test_split(
    X, y,
    test_size=0.2,
    stratify=y,
    random_state=42
)





clf = CatBoostClassifier( 
    loss_function="MultiClass", 
    cat_features=cat_features, 
    class_weights=class_weights, 
    random_seed=42, 
    verbose=0 )

search = RandomizedSearchCV( estimator=clf,
                            param_distributions=param_dist,
                            n_iter=20, 
                            scoring='f1_macro', 
                            cv=3, 
                            n_jobs=-1, 
                            random_state=42 
                           )
search.fit(X_train, y_train) 
print("Best params:", search.best_params_) 
print("Best CV F1-macro:", search.best_score_)


In [None]:
df.columns

In [None]:
from catboost import CatBoostClassifier
import pandas as pd

cat_features = ['Diet_Type','District_Name','Vehicle_Owned','Work_District']

X = df.drop(columns=["Occupation","House_Size_sq_ft"])
y = df["Occupation"]

classes = y.unique()
weights = {cls: 1.0 for cls in classes}
weights["Fisher"] = 1.3
weights["Miner"] = 1.3

final_clf = CatBoostClassifier(
    learning_rate=0.03,
    depth=6,
    l2_leaf_reg=3,
    iterations=1000,
    loss_function="MultiClass",
    cat_features=cat_features,
    class_weights=weights,
    random_seed=42,
    verbose=200
)

final_clf.fit(X,y)


In [None]:
df_test.columns


In [None]:
import pandas as pd
import numpy as np
df_test = pd.read_csv("/kaggle/input/iitg-ai-recruitment-2025-beyond-the-box/test_atlantis_hidden.csv")

df_test["House_Size_log"] = np.log1p(df_test["House_Size_sq_ft"])
df_test["Wealth_Index_log"] = np.log1p(df_test["Wealth_Index"])
df_test["Wealth_per_capita"] = df_test["Wealth_Index"] / (df_test["House_Size_sq_ft"] +  1e-6)
df_test["Wealth_per_year"] = df_test["Wealth_Index"] / (df_test["Life_Expectancy"] +  1e-6)
df_test = df_test.drop(["Bio_Hash"], axis = 1)

# Final Submission

In [None]:
X_test = df_test.drop(columns=["Citizen_ID", "House_Size_sq_ft"])
test_preds = final_clf.predict(X_test).flatten()

occupation_map = {
    "Warrior": 0,
    "Merchant": 1,
    "Fisher": 2,
    "Miner": 3,
    "Scribe": 4
}

test_preds_labels = [occupation_map[p] for p in test_preds]

submission = pd.DataFrame({
    "Citizen_ID": df_test["Citizen_ID"],
    "Occupation": test_preds_labels
})

submission.to_csv("submission_6.csv", index=False)


# End

this notebook ends here with submission_6 which gave us a F1 macro score 0.633 on the leaderboard
the overall training was satisfactory.