# Importing The Necessary Libraries

In [11]:
# Importing necessary libraries
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score

# Loading the Data and Preprocessing

In [12]:
# Read the labelled training set from the project's Data directory and preview its first rows.
train_df = pd.read_csv(filepath_or_buffer='Data/train.csv')
train_df.head(n=5)

Unnamed: 0,id,Gender,Age,Height,Weight,family_history_with_overweight,FAVC,FCVC,NCP,CAEC,SMOKE,CH2O,SCC,FAF,TUE,CALC,MTRANS,NObeyesdad
0,0,Male,24.443011,1.699998,81.66995,yes,yes,2.0,2.983297,Sometimes,no,2.763573,no,0.0,0.976473,Sometimes,Public_Transportation,Overweight_Level_II
1,1,Female,18.0,1.56,57.0,yes,yes,2.0,3.0,Frequently,no,2.0,no,1.0,1.0,no,Automobile,Normal_Weight
2,2,Female,18.0,1.71146,50.165754,yes,yes,1.880534,1.411685,Sometimes,no,1.910378,no,0.866045,1.673584,no,Public_Transportation,Insufficient_Weight
3,3,Female,20.952737,1.71073,131.274851,yes,yes,3.0,3.0,Sometimes,no,1.674061,no,1.467863,0.780199,Sometimes,Public_Transportation,Obesity_Type_III
4,4,Male,31.641081,1.914186,93.798055,yes,yes,2.679664,1.971472,Sometimes,no,1.979848,no,1.967973,0.931721,Sometimes,Public_Transportation,Overweight_Level_II


In [13]:
# Read the test set from the project's Data directory and preview its first rows.
test_df = pd.read_csv(filepath_or_buffer='Data/test.csv')
test_df.head(n=5)

Unnamed: 0,id,Gender,Age,Height,Weight,family_history_with_overweight,FAVC,FCVC,NCP,CAEC,SMOKE,CH2O,SCC,FAF,TUE,CALC,MTRANS
0,20758,Male,26.899886,1.848294,120.644178,yes,yes,2.938616,3.0,Sometimes,no,2.825629,no,0.8554,0.0,Sometimes,Public_Transportation
1,20759,Female,21.0,1.6,66.0,yes,yes,2.0,1.0,Sometimes,no,3.0,no,1.0,0.0,Sometimes,Public_Transportation
2,20760,Female,26.0,1.643355,111.600553,yes,yes,3.0,3.0,Sometimes,no,2.621877,no,0.0,0.250502,Sometimes,Public_Transportation
3,20761,Male,20.979254,1.553127,103.669116,yes,yes,2.0,2.977909,Sometimes,no,2.786417,no,0.094851,0.0,Sometimes,Public_Transportation
4,20762,Female,26.0,1.627396,104.835346,yes,yes,3.0,3.0,Sometimes,no,2.653531,no,0.0,0.741069,Sometimes,Public_Transportation


In [14]:
# Print the column index of each dataset, one line per dataset, for side-by-side comparison.
for heading, frame in [("Columns in Training Dataset :", train_df),
                       ("Columns in Test Dataset :", test_df)]:
    print(heading, frame.columns)

Columns in Training Dataset : Index(['id', 'Gender', 'Age', 'Height', 'Weight',
       'family_history_with_overweight', 'FAVC', 'FCVC', 'NCP', 'CAEC',
       'SMOKE', 'CH2O', 'SCC', 'FAF', 'TUE', 'CALC', 'MTRANS', 'NObeyesdad'],
      dtype='object')
Columns in Test Dataset : Index(['id', 'Gender', 'Age', 'Height', 'Weight',
       'family_history_with_overweight', 'FAVC', 'FCVC', 'NCP', 'CAEC',
       'SMOKE', 'CH2O', 'SCC', 'FAF', 'TUE', 'CALC', 'MTRANS'],
      dtype='object')


In [15]:
# Encode on a copy so the raw train_df is preserved. Overwriting train_df in place made
# this cell non-idempotent: on a second run no object columns remain, so the feature
# encoders would be lost and the target encoder would be re-fit on integer codes,
# breaking the later decoding of predictions back to class names.
train_encoded = train_df.copy()

# Encode categorical feature columns, keeping one fitted encoder per column so the
# exact same category -> integer mapping can be reused on other data.
label_encoders = {}
for column in train_encoded.select_dtypes(include=['object']).columns:
    if column != 'NObeyesdad':  # The target is encoded separately below
        le = LabelEncoder()
        train_encoded[column] = le.fit_transform(train_encoded[column])
        label_encoders[column] = le

# Encode the target variable; its encoder is stored under the target's column name
# so predictions can be decoded with inverse_transform.
target_le = LabelEncoder()
train_encoded['NObeyesdad'] = target_le.fit_transform(train_encoded['NObeyesdad'])
label_encoders['NObeyesdad'] = target_le

# Split the dataset into features and target variable ('id' is an identifier, not a feature)
X = train_encoded.drop(['id', 'NObeyesdad'], axis=1)
y = train_encoded['NObeyesdad']

# Hold out 20% of the rows for validation; the fixed seed makes the split reproducible
X_train, X_val, y_train, y_val = train_test_split(X, y, test_size=0.2, random_state=42)

# Model Building

In [16]:
# Build a 100-tree random forest; the fixed seed keeps the fitted model reproducible.
rf_classifier = RandomForestClassifier(
    random_state=42,
    n_estimators=100,
)

# Train on the training portion of the split.
rf_classifier.fit(X=X_train, y=y_train)

# Model Evaluation

In [17]:
# Score the held-out validation rows with the fitted forest.
y_pred = rf_classifier.predict(X=X_val)

# Fraction of validation rows whose predicted class equals the true class.
accuracy = accuracy_score(y_true=y_val, y_pred=y_pred)
print(f'Validation Accuracy: {accuracy}')

Validation Accuracy: 0.8959537572254336


# Apply The Trained Model On Test Data

In [18]:
# Build the test feature matrix on a new frame (drop returns a copy), so test_df itself
# stays raw and this cell produces the same result every time it is run.
X_test = test_df.drop(['id'], axis=1)

# Encode categorical features with the encoders that were fitted on the TRAINING data.
# Fitting fresh encoders on the test set assigns codes from the test set's own sorted
# categories; whenever train and test do not contain exactly the same categories the
# integers would mean different things than they did at fit time, silently corrupting
# the model's inputs.
for column in X_test.columns:
    encoder = label_encoders.get(column)
    if encoder is None:
        continue  # No encoder stored for this column, so it is passed through unchanged

    # A category's code is its position in the encoder's classes_ array
    code_by_category = {category: code for code, category in enumerate(encoder.classes_)}
    encoded = X_test[column].map(code_by_category)

    # Values the training encoder never saw (including missing values) map to NaN;
    # give them the sentinel code -1 instead of failing, and report them.
    unseen_mask = encoded.isna()
    if unseen_mask.any():
        unseen_values = sorted(X_test.loc[unseen_mask, column].astype(str).unique())
        print(f"Warning: column '{column}' has values not seen in training "
              f"{unseen_values}; encoded as -1")
    X_test[column] = encoded.fillna(-1).astype(int)

# Predict on the test dataset
test_predictions = rf_classifier.predict(X_test)

# Decode the predictions to the original labels
test_predictions_labels = label_encoders['NObeyesdad'].inverse_transform(test_predictions)

# Create an Output File

In [19]:
# Pair every test id with its decoded predicted class label.
submission_df = test_df[['id']].assign(NObeyesdad=test_predictions_labels)

# Write the submission to disk without the DataFrame index column.
submission_file_path = 'submission.csv'
submission_df.to_csv(path_or_buf=submission_file_path, index=False)