In [None]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import OneHotEncoder
from sklearn.impute import SimpleImputer
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score, classification_report

# Load the datasets
train_data = pd.read_csv('/content/Train_Data.csv')
test_data = pd.read_csv('/content/Test_Data.csv')
# Drop rows with missing age_group in train_data
train_data = train_data.dropna(subset=['age_group'])

# Drop SEQN as it's an identifier
train_data = train_data.drop('SEQN', axis=1)
test_data = test_data.drop('SEQN', axis=1)

# Separate features and target
X_train = train_data.drop('age_group', axis=1)
y_train = train_data['age_group']
X_test = test_data.copy()

# Define numerical and categorical columns
numerical_cols = ['BMXBMI', 'LBXGLU', 'LBXGLT', 'LBXIN']
categorical_cols = ['RIAGENDR', 'PAQ605', 'DIQ010']

# Handle missing values
num_imputer = SimpleImputer(strategy='median')
cat_imputer = SimpleImputer(strategy='most_frequent')

# Fit and transform numerical features
X_train[numerical_cols] = num_imputer.fit_transform(X_train[numerical_cols])
X_test[numerical_cols] = num_imputer.transform(X_test[numerical_cols])

# Fit and transform categorical features
X_train[categorical_cols] = cat_imputer.fit_transform(X_train[categorical_cols])
X_test[categorical_cols] = cat_imputer.transform(X_test[categorical_cols])

# Check for NaN in X_train after imputation
print("NaN in X_train after imputation:", X_train.isnull().sum().sum())

# Encode categorical variables
ohe = OneHotEncoder(handle_unknown='ignore', sparse_output=False)
ohe.fit(X_train[categorical_cols])
ohe_feature_names = ohe.get_feature_names_out(categorical_cols)

# Transform categorical features
X_train_ohe = ohe.transform(X_train[categorical_cols])
X_train_ohe_df = pd.DataFrame(X_train_ohe, columns=ohe_feature_names, index=X_train.index)
X_test_ohe = ohe.transform(X_test[categorical_cols])
X_test_ohe_df = pd.DataFrame(X_test_ohe, columns=ohe_feature_names, index=X_test.index)

# Drop original categorical columns and concatenate encoded ones
X_train = X_train.drop(categorical_cols, axis=1)
X_train = pd.concat([X_train, X_train_ohe_df], axis=1)
X_test = X_test.drop(categorical_cols, axis=1)
X_test = pd.concat([X_test, X_test_ohe_df], axis=1)

# Check class distribution to handle imbalance
class_counts = y_train.value_counts(normalize=True)
print("Class distribution:\n", class_counts)

# Split data for validation
X_train_split, X_val, y_train_split, y_val = train_test_split(X_train, y_train, test_size=0.2, random_state=42)

# Train Random Forest Classifier
rf = RandomForestClassifier(class_weight='balanced', random_state=42)
rf.fit(X_train_split, y_train_split)

# Evaluate on validation set
y_pred = rf.predict(X_val)
print("Validation Accuracy:", accuracy_score(y_val, y_pred))
print("Classification Report:\n", classification_report(y_val, y_pred))

# Train on full training data
rf.fit(X_train, y_train)

# Predict on test set
y_test_pred = rf.predict(X_test)

# Map age group to numeric labels
y_test_mapped = pd.Series(y_test_pred).map({'Adult': 0, 'Senior': 1})

# Create submission file
submission = pd.DataFrame({'age_group': y_test_mapped})
submission.to_csv('submission.csv', index=False)



NaN in X_train after imputation: 0
Class distribution:
 age_group
Adult     0.839139
Senior    0.160861
Name: proportion, dtype: float64
Validation Accuracy: 0.8567774936061381
Classification Report:
               precision    recall  f1-score   support

       Adult       0.88      0.96      0.92       340
      Senior       0.37      0.14      0.20        51

    accuracy                           0.86       391
   macro avg       0.63      0.55      0.56       391
weighted avg       0.81      0.86      0.83       391

