In [None]:
import pandas as pd
from sklearn.preprocessing import LabelEncoder
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import GridSearchCV

In [57]:
df = pd.read_csv("train.csv")
test = pd.read_csv("test.csv")

In [58]:
df.head()

Unnamed: 0,id,Gender,Age,Height,Weight,family_history_with_overweight,FAVC,FCVC,NCP,CAEC,SMOKE,CH2O,SCC,FAF,TUE,CALC,MTRANS,NObeyesdad
0,0,Male,24.443011,1.699998,81.66995,yes,yes,2.0,2.983297,Sometimes,no,2.763573,no,0.0,0.976473,Sometimes,Public_Transportation,Overweight_Level_II
1,1,Female,18.0,1.56,57.0,yes,yes,2.0,3.0,Frequently,no,2.0,no,1.0,1.0,no,Automobile,Normal_Weight
2,2,Female,18.0,1.71146,50.165754,yes,yes,1.880534,1.411685,Sometimes,no,1.910378,no,0.866045,1.673584,no,Public_Transportation,Insufficient_Weight
3,3,Female,20.952737,1.71073,131.274851,yes,yes,3.0,3.0,Sometimes,no,1.674061,no,1.467863,0.780199,Sometimes,Public_Transportation,Obesity_Type_III
4,4,Male,31.641081,1.914186,93.798055,yes,yes,2.679664,1.971472,Sometimes,no,1.979848,no,1.967973,0.931721,Sometimes,Public_Transportation,Overweight_Level_II


In [59]:
#Check for missing values
df.isnull().sum()

id                                0
Gender                            0
Age                               0
Height                            0
Weight                            0
family_history_with_overweight    0
FAVC                              0
FCVC                              0
NCP                               0
CAEC                              0
SMOKE                             0
CH2O                              0
SCC                               0
FAF                               0
TUE                               0
CALC                              0
MTRANS                            0
NObeyesdad                        0
dtype: int64

In [61]:
# always try to write a function to clean the training data so that you dont have to repeat all the steps for test data again
def clean(data):
    data = data.drop(['id'],axis=1)
    return data

In [62]:
df = clean(df)

In [63]:
test_clean = clean(test)

In [64]:
df.head()

Unnamed: 0,Gender,Age,Height,Weight,family_history_with_overweight,FAVC,FCVC,NCP,CAEC,SMOKE,CH2O,SCC,FAF,TUE,CALC,MTRANS,NObeyesdad
0,Male,24.443011,1.699998,81.66995,yes,yes,2.0,2.983297,Sometimes,no,2.763573,no,0.0,0.976473,Sometimes,Public_Transportation,Overweight_Level_II
1,Female,18.0,1.56,57.0,yes,yes,2.0,3.0,Frequently,no,2.0,no,1.0,1.0,no,Automobile,Normal_Weight
2,Female,18.0,1.71146,50.165754,yes,yes,1.880534,1.411685,Sometimes,no,1.910378,no,0.866045,1.673584,no,Public_Transportation,Insufficient_Weight
3,Female,20.952737,1.71073,131.274851,yes,yes,3.0,3.0,Sometimes,no,1.674061,no,1.467863,0.780199,Sometimes,Public_Transportation,Obesity_Type_III
4,Male,31.641081,1.914186,93.798055,yes,yes,2.679664,1.971472,Sometimes,no,1.979848,no,1.967973,0.931721,Sometimes,Public_Transportation,Overweight_Level_II


In [66]:
le = LabelEncoder()

In [67]:
cat_cols = df.select_dtypes(include = 'object').drop(columns='NObeyesdad').columns

In [68]:
def label(df):
    for cols in cat_cols:
        df[cols] = le.fit_transform(df[cols])
        print(f"columns name :: ",cols,"  ",le.classes_)
    return df

In [69]:
df=label(df)

columns name ::  Gender    ['Female' 'Male']
columns name ::  family_history_with_overweight    ['no' 'yes']
columns name ::  FAVC    ['no' 'yes']
columns name ::  CAEC    ['Always' 'Frequently' 'Sometimes' 'no']
columns name ::  SMOKE    ['no' 'yes']
columns name ::  SCC    ['no' 'yes']
columns name ::  CALC    ['Frequently' 'Sometimes' 'no']
columns name ::  MTRANS    ['Automobile' 'Bike' 'Motorbike' 'Public_Transportation' 'Walking']


In [75]:
X = df.drop(columns='NObeyesdad')
y = df['NObeyesdad']

In [76]:
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.33, random_state=42)

In [77]:
rfc = RandomForestClassifier(max_depth=3)

In [78]:
rfc.fit(X_train,y_train)

In [79]:
pred_train=rfc.predict(X_train)
pred_test = rfc.predict(X_test)

In [80]:
from sklearn.metrics import accuracy_score

In [81]:
print("train accuracy :: ",accuracy_score(pred_train,y_train))
print("test accuracy :: ",accuracy_score(pred_test,y_test))

train accuracy ::  0.718846624002301
test accuracy ::  0.7140563421398336


In [82]:
test_df = label(test_clean)

columns name ::  Gender    ['Female' 'Male']
columns name ::  family_history_with_overweight    ['no' 'yes']
columns name ::  FAVC    ['no' 'yes']
columns name ::  CAEC    ['Always' 'Frequently' 'Sometimes' 'no']
columns name ::  SMOKE    ['no' 'yes']
columns name ::  SCC    ['no' 'yes']
columns name ::  CALC    ['Always' 'Frequently' 'Sometimes' 'no']
columns name ::  MTRANS    ['Automobile' 'Bike' 'Motorbike' 'Public_Transportation' 'Walking']


In [85]:
pres_final =rfc.predict(test_df)

In [110]:
param_grid = {
    #'n_estimators': [100, 200, 300],  # Number of trees in the forest
    'max_depth': [2, 3, 4, 5],   # Maximum depth of the trees
    'min_samples_split': [2, 5, 7],   # Minimum number of samples required to split a node
    'min_samples_leaf': [1, 2, 4],     # Minimum number of samples required at each leaf node
    #'bootstrap': [True, False]         # Whether bootstrap samples are used when building trees
}

In [111]:
gs = GridSearchCV(estimator=rfc,param_grid=param_grid,cv = 3)

In [112]:
gs.fit(X_train, y_train) 

In [117]:
gs.best_params_

{'max_depth': 5, 'min_samples_leaf': 4, 'min_samples_split': 2}

In [118]:
rfc = RandomForestClassifier(max_depth=5,min_samples_leaf=4,min_samples_split=2)

In [119]:
rfc.fit(X_train,y_train)

In [120]:
pred_train=rfc.predict(X_train)
pred_test = rfc.predict(X_test)

In [121]:
print("train accuracy :: ",accuracy_score(pred_train,y_train))
print("test accuracy :: ",accuracy_score(pred_test,y_test))

train accuracy ::  0.8292226936075358
test accuracy ::  0.8232374835790396


In [122]:
sub=pd.DataFrame({
    "id":test.id.values,
    "NObeyesdad":pres_final
})

In [123]:
sub.to_csv("submision.csv",index = False)