In [1]:
from google.colab import files
import warnings
warnings.filterwarnings("ignore")

IMPORT IMPORTANT LIBRARIES

In [2]:
import os
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.pipeline import Pipeline
from sklearn.compose import ColumnTransformer
from sklearn.preprocessing import OneHotEncoder, StandardScaler, OrdinalEncoder
from sklearn.impute import SimpleImputer
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import classification_report, accuracy_score

READING FILES

In [3]:

uploaded=files.upload()
train_df=pd.read_csv('train.csv')
test_df=pd.read_csv('test.csv')

Saving train.csv to train.csv
Saving test.csv to test.csv


MODEL CODE

In [4]:
X = train_df.drop(['Citizen_ID', 'Bio_Hash', 'Occupation'], axis=1)
y=train_df["Occupation"]
X_test = test_df.drop(['Citizen_ID', 'Bio_Hash'], axis=1)

#IF SOMEONE HAS HIGHER WEALTH AND IF ITS HOUSE SIZE IS MISSING ,IT SHOULD NOT BE FILLED BY JUST MEAN
X.loc[X["Wealth_Index"]>200000,"House_Size_sq_ft"]=10000
X_test.loc[X_test["Wealth_Index"]>200000,"House_Size_sq_ft"]=10000

def mapper(df):
  life_map = df.groupby('District_Name')['Life_Expectancy'].mean()
  wealth_map = df.groupby('District_Name')['Wealth_Index'].median()
  house_map = df.groupby('District_Name')['House_Size_sq_ft'].median()

  df['Life_Expectancy'] = df['Life_Expectancy'].fillna(df['District_Name'].map(life_map))
  df['Wealth_Index'] = df['Wealth_Index'].fillna(df['District_Name'].map(wealth_map))
  df['House_Size_sq_ft'] = df['House_Size_sq_ft'].fillna(df['District_Name'].map(house_map))
  return df
X=mapper(X)
X_test=mapper(X_test)
#I AM NOT SURE WHETHER THIS MAPPING IS CORRECT OR NOT.:(:(

#feATURE ENGINEERING

def add_features(df):
    df = df.copy()

    df["log_wealth"] = np.log1p(df["Wealth_Index"])
    df["log_house_size"] = np.log1p(df["House_Size_sq_ft"])
    df["age_house_ratio"] = df["Life_Expectancy"] / (df["House_Size_sq_ft"] + 1)
    df["Wealth_Per_SqFt"] = df["Wealth_Index"] / (df["House_Size_sq_ft"] + 1)

    return df

X = add_features(X)
X_test = add_features(X_test)

#ENCODING AND DATA SCALING
all_cat_cols = X.select_dtypes(include="object").columns.tolist()
num_cols = X.select_dtypes(include=np.number).columns
ord_cols = ['Vehicle_Owned']
onehot_cols = [c for c in all_cat_cols if c not in ord_cols]

#Vehicle Order
vehicle_order = [
    'No Vehicle',
    'Fin Bicycle',
    'Sea Scooter',
    'Submarine',
    'Royal Submarine'
]

preprocessor = ColumnTransformer(
    transformers=[

        ("num", Pipeline([
            ("imputer", SimpleImputer(strategy="median")),
            ("scaler", StandardScaler())
        ]), num_cols),

        ("cat_ordinal", Pipeline([
            ("imputer", SimpleImputer(strategy="most_frequent")),
            ("ordinal", OrdinalEncoder(
                categories=[vehicle_order],
                handle_unknown='use_encoded_value',
                unknown_value=-1
            ))
        ]), ['Vehicle_Owned']),

        ("cat_onehot", Pipeline([
            ("imputer", SimpleImputer(strategy="most_frequent")),
            ("onehot", OneHotEncoder(handle_unknown="ignore"))
        ]), onehot_cols)
    ]
)


clf=Pipeline([("preprocessor",preprocessor),
("classifier",LogisticRegression(max_iter=1000,random_state=42,multi_class='multinomial'))])

clf.fit(X, y)

predictions = clf.predict(X_test)
occupation_map = {
    'Warrior': 0,
    'Merchant': 1,
    'Fisher': 2,
    'Miner': 3,
    'Scribe': 4
}

#MAPPING FOR SUBMISSION
mapped_predictions = [occupation_map[pred] for pred in predictions]


submission = pd.DataFrame({
    'Citizen_ID': test_df['Citizen_ID'],
    'Occupation': mapped_predictions
})
#SAVING MY FILE
submission.to_csv('predictions.csv',index=False)
files.download("predictions.csv")

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

CROSS VALIDATING MY MODEL

In [5]:
X_val_train,X_val_test,y_val_train,y_val_test=train_test_split(train_df,y,test_size=0.2,random_state=42)
X_val_train=mapper(X_val_train)
X_val_test=mapper(X_val_test)
a=X_val_test["Citizen_ID"]
X_val_train=X_val_train.drop(['Citizen_ID', 'Bio_Hash', 'Occupation'], axis=1)
X_val_test=X_val_test.drop(['Citizen_ID', 'Bio_Hash', 'Occupation'], axis=1)

X_val_train.loc[X_val_train["Wealth_Index"]>200000,"House_Size_sq_ft"]=10000
X_val_test.loc[X_val_test["Wealth_Index"]>200000,"House_Size_sq_ft"]=10000

X_val_train = add_features(X_val_train)
X_val_test = add_features(X_val_test)

clf.fit(X_val_train, y_val_train)

prediction = clf.predict(X_val_test)



print("CLASSIFICATION REPORT")
print(classification_report(y_val_test,prediction))

print(f"Validation Accuracy: {accuracy_score(y_val_test,prediction):.3f}")

CLASSIFICATION REPORT
              precision    recall  f1-score   support

      Fisher       0.47      0.39      0.43       606
    Merchant       0.74      0.79      0.76       740
       Miner       0.56      0.54      0.55       609
      Scribe       0.66      0.65      0.65       486
     Warrior       0.65      0.73      0.68       710

    accuracy                           0.63      3151
   macro avg       0.62      0.62      0.62      3151
weighted avg       0.62      0.63      0.62      3151

Validation Accuracy: 0.628
