In [3]:
from google.colab import files

uploaded = files.upload()


Saving Test_Data.csv to Test_Data.csv
Saving Train_Data.csv to Train_Data.csv


In [4]:
import os
print("Current working directory:", os.getcwd())

# Install and import libraries
!pip install -q pandas scikit-learn seaborn

import pandas as pd
import numpy as np
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import train_test_split
from sklearn.metrics import f1_score
import seaborn as sns
import matplotlib.pyplot as plt

# Load datasets
train = pd.read_csv("/content/Train_Data.csv")
test = pd.read_csv("/content/Test_Data.csv")

print("Train data shape:", train.shape)
print("Test data shape:", test.shape)
print(train.head())

# Check missing target labels
missing_labels = train['age_group'].isnull().sum()
print(f" Missing values in age_group: {missing_labels}")

# Drop rows with missing target
train.dropna(subset=['age_group'], inplace=True)

# Adult → 0, Senior → 1
train['age_group'] = train['age_group'].map({'Adult': 0, 'Senior': 1})

# Preprocess features
train.drop(columns=['SEQN'], inplace=True)
test.drop(columns=['SEQN'], inplace=True)

# Convert categorical columns
categorical_cols = ['RIAGENDR', 'PAQ605', 'DIQ010']
for col in categorical_cols:
    train[col] = pd.to_numeric(train[col], errors='coerce')
    test[col] = pd.to_numeric(test[col], errors='coerce')

# Fill missing values with column means
train.fillna(train.mean(numeric_only=True), inplace=True)
test.fillna(train.mean(numeric_only=True), inplace=True)

# Features and target
X = train.drop(columns=['age_group'])
y = train['age_group']

# Train-validation split
X_train, X_val, y_train, y_val = train_test_split(
    X, y, test_size=0.2, stratify=y, random_state=42
)

# Train model with class imbalance handling
model = LogisticRegression(max_iter=1000, class_weight='balanced')
model.fit(X_train, y_train)

# Validate model
y_pred = model.predict(X_val)
f1 = f1_score(y_val, y_pred)
print(" F1 Score on validation set:", round(f1, 4))

# Predict on test data
test_preds = model.predict(test)

# Save predictions (0 or 1 only)
submission = pd.DataFrame({'age_group': test_preds})
submission.to_csv("submission.csv", index=False)
print("Submission saved as submission.csv")

submission.head(10)


Current working directory: /content
Train data shape: (1966, 9)
Test data shape: (312, 8)
      SEQN  RIAGENDR  PAQ605  BMXBMI  LBXGLU  DIQ010  LBXGLT  LBXIN age_group
0  73564.0       2.0     2.0    35.7   110.0     2.0   150.0  14.91     Adult
1  73568.0       2.0     2.0    20.3    89.0     2.0    80.0   3.85     Adult
2  73576.0       1.0     2.0    23.2    89.0     2.0    68.0   6.14     Adult
3  73577.0       1.0     2.0    28.9   104.0     NaN    84.0  16.15     Adult
4  73580.0       2.0     1.0    35.9   103.0     2.0    81.0  10.92     Adult
 Missing values in age_group: 14
 F1 Score on validation set: 0.3474
Submission saved as submission.csv


Unnamed: 0,age_group
0,0
1,1
2,1
3,0
4,0
5,1
6,1
7,1
8,0
9,0
