In [34]:
import numpy as np
import pandas as pd
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import FunctionTransformer
from sklearn.preprocessing import LabelEncoder
from sklearn.preprocessing import StandardScaler
from xgboost import XGBClassifier


In [35]:
# Load the raw competition data. The test file is parsed only once: `X_test`
# is an independent copy that later cells turn into model features, while
# `test` is left untouched so its `id` column can be used for the submission.
train = pd.read_csv("./Data/train.csv")
test = pd.read_csv("./Data/test.csv")
X_test = test.copy()

In [36]:
# Split the training frame into predictors and target. `drop` returns a new
# frame, so `train` keeps all of its original columns.
X_train = train.drop(columns=['NObeyesdad'])
y_train = train['NObeyesdad']

In [37]:
# Numeric codes for the frequency-style answers stored in CAEC and CALC.
CAEC_dict = {'no': 0, 'Sometimes': 0.33, 'Frequently': 0.66, 'Always': 1}
CALC_dict = {'no': 0, 'Sometimes': 0.5, 'Frequently': 1}

# Map from the untouched raw frames (`train` / `test`) instead of from the
# columns being overwritten. That keeps this cell idempotent: re-running it
# no longer feeds already-numeric values through the dicts (which would turn
# every entry into NaN).
# Note: Series.map with a dict yields NaN for any value that is not a key,
# e.g. a CALC answer other than the three listed in CALC_dict.
X_train['CAEC'] = train['CAEC'].map(CAEC_dict)
X_train['CALC'] = train['CALC'].map(CALC_dict)
X_test['CAEC'] = test['CAEC'].map(CAEC_dict)
X_test['CALC'] = test['CALC'].map(CALC_dict)

# Remaining string-valued columns; they are label-encoded in a later cell.
cat_cols2 = ['Gender', 'family_history_with_overweight', 'FAVC', 'SMOKE', 'SCC', 'MTRANS']

In [38]:
def calculate_bmi(data):
    """Return a new frame in which a ``BMI`` column replaces the raw measurements.

    ``BMI`` is computed as ``Weight / Height ** 2`` and appended as the last
    column; the ``Weight``, ``Height`` and ``id`` columns are then removed.

    The input frame is left unmodified. (The previous implementation added
    ``BMI`` to the caller's frame as a side effect before returning.)

    Parameters
    ----------
    data : pandas.DataFrame
        Must contain ``Weight``, ``Height`` and ``id`` columns.

    Returns
    -------
    pandas.DataFrame
        Copy of ``data`` without ``Weight``/``Height``/``id`` and with ``BMI``.

    Raises
    ------
    KeyError
        If any of the required columns is missing.
    """
    bmi = data['Weight'] / (data['Height'] ** 2)
    # assign() builds a new frame, so the caller's object is never touched.
    return data.assign(BMI=bmi).drop(columns=['Weight', 'Height', 'id'])

In [39]:
# Single-step preprocessing pipeline: a FunctionTransformer that applies
# `calculate_bmi` to whatever frame is passed through it.
bmi_step = FunctionTransformer(func=calculate_bmi)
pipeline = Pipeline(steps=[('calculate_bmi', bmi_step)])

In [40]:
# Run both splits through the BMI pipeline. The pipeline is fitted on the
# training split and then reused as-is for the test split; each name is
# rebound to the transformed frame the pipeline returns.
X_train = pipeline.fit_transform(X_train)
X_test = pipeline.transform(X_test)

In [41]:
# Change categorical columns to numerical codes.
#
# Every feature column gets its own LabelEncoder and the target gets a
# dedicated one, so no fitted encoder is overwritten by a later fit. The
# encoders are fitted on the untouched raw frames (`train` / `test`), which
# keeps this cell safe to re-run: previously a second run re-encoded the
# integer codes and left `le` unable to recover the original class names.
feature_encoders = {}
encoding_mapping = {}
for col in cat_cols2:
    encoder = LabelEncoder()
    X_train[col] = encoder.fit_transform(train[col])
    # transform() raises ValueError if the test split contains a label that
    # was not present in the training split.
    X_test[col] = encoder.transform(test[col])

    feature_encoders[col] = encoder
    encoding_mapping[col] = dict(zip(encoder.classes_, encoder.transform(encoder.classes_)))

# `le` is reserved for the target: the prediction cell calls
# `le.inverse_transform` to turn predicted codes back into class names.
le = LabelEncoder()
y_train = le.fit_transform(train['NObeyesdad'])


for col, mapping in encoding_mapping.items():
    print(f"Encoding mapping for column '{col}':")
    print(mapping)
    print()

Encoding mapping for column 'Gender':
{'Female': 0, 'Male': 1}

Encoding mapping for column 'family_history_with_overweight':
{'no': 0, 'yes': 1}

Encoding mapping for column 'FAVC':
{'no': 0, 'yes': 1}

Encoding mapping for column 'SMOKE':
{'no': 0, 'yes': 1}

Encoding mapping for column 'SCC':
{'no': 0, 'yes': 1}

Encoding mapping for column 'MTRANS':
{'Automobile': 0, 'Bike': 1, 'Motorbike': 2, 'Public_Transportation': 3, 'Walking': 4}



In [42]:
# Names of the model's feature columns, in column order.
cols_numerical = [
    'Gender',
    'Age',
    'family_history_with_overweight',
    'FAVC',
    'FCVC',
    'NCP',
    'CAEC',
    'SMOKE',
    'CH2O',
    'SCC',
    'FAF',
    'TUE',
    'CALC',
    'MTRANS',
    'BMI',
]

In [43]:
# Standardize every feature. The scaler's statistics are learned from the
# training split only and then reused for the test split.
# (StandardScaler is imported with the other dependencies at the top.)
scaler = StandardScaler()
feature_names = list(X_train.columns)

# StandardScaler returns plain NumPy arrays, so the DataFrames are rebuilt.
# Column labels and index come from the frame that was actually scaled rather
# than from a hand-maintained list, so the names cannot silently drift out of
# step with the data they label.
X_train = pd.DataFrame(scaler.fit_transform(X_train), columns=feature_names, index=X_train.index)
X_test = pd.DataFrame(scaler.transform(X_test), columns=feature_names, index=X_test.index)


In [44]:
# Target class names, listed alphabetically.
# NOTE(review): not referenced by the other visible cells; presumably mirrors
# the fitted target encoder's classes - confirm before relying on it.
labels = [
    'Insufficient_Weight',
    'Normal_Weight',
    'Obesity_Type_I',
    'Obesity_Type_II',
    'Obesity_Type_III',
    'Overweight_Level_I',
    'Overweight_Level_II',
]

In [45]:
# Hyperparameters of the final classifier, gathered in one mapping so they
# can be read and tuned in a single place.
xgb_params = dict(
    n_estimators=1000,
    learning_rate=0.01,
    max_depth=7,
    subsample=0.75,
    colsample_bytree=0.5,
    gamma=0.5,
    random_state=1,
)
final_model = XGBClassifier(**xgb_params)
# Fit on the scaled training features and the integer-encoded target.
final_model.fit(X_train, y_train)

In [46]:
# Predict integer class codes, then translate them back to the original class
# names with the encoder that was fitted on the target.
pred_codes = final_model.predict(X_test)
pred = le.inverse_transform(pred_codes)

# Pair each prediction with the `id` from the untouched test frame and write
# the submission file without the index column.
result = test[['id']].assign(NObeyesdad=pred)
result.to_csv('submission.csv', index=False)



In [47]:
# Display the submission frame as this cell's output.
result

Unnamed: 0,id,NObeyesdad
0,20758,Obesity_Type_II
1,20759,Overweight_Level_I
2,20760,Obesity_Type_III
3,20761,Obesity_Type_I
4,20762,Obesity_Type_III
...,...,...
13835,34593,Overweight_Level_II
13836,34594,Normal_Weight
13837,34595,Insufficient_Weight
13838,34596,Normal_Weight
