In [1]:
# Install gdown for downloading from Google Drive
!pip install gdown

# Download train dataset
!gdown --id 1OQWfTaTaRatB8gT3sg723LRtYYKBm5tU -O train_heart.csv

# Download test dataset
!gdown --id 1MznJKj9_X4TBgnkbMWfBSBv9jyCQ6krd -O test_heart.csv

print("Files downloaded! Check with !ls")
!ls *.csv

Downloading...
From: https://drive.google.com/uc?id=1OQWfTaTaRatB8gT3sg723LRtYYKBm5tU
To: /content/train_heart.csv
100% 1.32M/1.32M [00:00<00:00, 140MB/s]
Downloading...
From: https://drive.google.com/uc?id=1MznJKj9_X4TBgnkbMWfBSBv9jyCQ6krd
To: /content/test_heart.csv
100% 131k/131k [00:00<00:00, 96.4MB/s]
Files downloaded! Check with !ls
test_heart.csv	train_heart.csv


In [9]:
import pandas as pd  # For loading/handling data
import numpy as np   # For math stuff

# Load train data
train_df = pd.read_csv('Heart_Attack_training_dataset.csv')
print("Train data shape:", train_df.shape)
print("\nFirst 5 rows:")
print(train_df.head())

# Load test data (same features, no target)
test_df = pd.read_csv('Hear_Attack_evaluation_dataset.csv')
print("\nTest data shape:", test_df.shape)
print("\nFirst 5 rows of test:")
print(test_df.head())

# Quick explore: Info on columns (types, missing values)
print("\nTrain info:")
train_df.info()
print("\nMissing values in train:")
print(train_df.isnull().sum())

Train data shape: (7963, 26)

First 5 rows:
  patient_id  age     sex  chol       bp  hr  diabetes  family_history  \
0    BMW7812   67    Male   208   158/88  72         0               0   
1    CZE1114   21    Male   389   165/93  98         1               1   
2    BNI9906   21  Female   324   174/99  72         1               0   
3    JLN3497   84    Male   383  163/100  73         1               1   
4    GFO8847   66    Male   318    91/88  93         1               1   

   smoking  obesity  ...  sedentary_hr  income        bmi  triglycerides  \
0        1        0  ...      6.615001  261404  31.251233            286   
1        1        1  ...      4.963459  285768  27.194973            235   
2        0        0  ...      9.463426  235282  28.176571            587   
3        1        0  ...      7.648981  125640  36.464704            378   
4        1        1  ...      1.514821  160555  21.809144            231   

   phys_act_days  sleep_hr    country      continent  

In [14]:
from sklearn.preprocessing import StandardScaler, OneHotEncoder
from sklearn.compose import ColumnTransformer
from sklearn.model_selection import train_test_split
from sklearn.pipeline import Pipeline

# Copy data to avoid messing originals
df = train_df.copy()
test = test_df.copy()

# Step 3.1: Handle 'bp' - split into systolic and diastolic (two new numeric columns)
df[['systolic_bp', 'diastolic_bp']] = df['bp'].str.split('/', expand=True).astype(float)
test[['systolic_bp', 'diastolic_bp']] = test['bp'].str.split('/', expand=True).astype(float)

# Drop original 'bp' column
df.drop('bp', axis=1, inplace=True)
test.drop('bp', axis=1, inplace=True)

# Step 3.2: Handle missing values (if any) - fill with median for numerics, mode for categoricals
numeric_cols = df.select_dtypes(include=np.number).columns.tolist()
numeric_cols.remove('heart_attack_risk')  # Don't fill target
df[numeric_cols] = df[numeric_cols].fillna(df[numeric_cols].median())
test[numeric_cols] = test[numeric_cols].fillna(test[numeric_cols].median())

categorical_cols = ['sex', 'diet', 'country', 'continent', 'hemisphere']
df[categorical_cols] = df[categorical_cols].fillna(df[categorical_cols].mode().iloc[0])
test[categorical_cols] = test[categorical_cols].fillna(test[categorical_cols].mode().iloc[0])

print("Missing values handled!")

# Step 3.3: Separate features (X) and target (y)
X = df.drop(['patient_id', 'heart_attack_risk'], axis=1)
y = df['heart_attack_risk']

# For test: Features only (keep patient_id for output)
X_test = test.drop('patient_id', axis=1)
patient_ids = test['patient_id']

# Step 3.4: Split train into train/val (80/20) for testing model
X_train, X_val, y_train, y_val = train_test_split(X, y, test_size=0.2, random_state=42, stratify=y)

print("Features shape:", X.shape)
print("Categorical columns:", categorical_cols)
print("Numeric columns (after bp split):", [col for col in X.columns if col not in categorical_cols])

Missing values handled!
Features shape: (7963, 25)
Categorical columns: ['sex', 'diet', 'country', 'continent', 'hemisphere']
Numeric columns (after bp split): ['age', 'chol', 'hr', 'diabetes', 'family_history', 'smoking', 'obesity', 'alcohol', 'exercise_hr_wk', 'prev_heart_prob', 'med_use', 'stress_lvl', 'sedentary_hr', 'income', 'bmi', 'triglycerides', 'phys_act_days', 'sleep_hr', 'systolic_bp', 'diastolic_bp']


In [23]:
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score, roc_auc_score

# Define columns again (in case)
numeric_features = [col for col in X.columns if col not in categorical_cols]
categorical_features = categorical_cols

# Create preprocessor: Scale numerics + One-hot categoricals
preprocessor = ColumnTransformer(
    transformers=[
        ('num', StandardScaler(), numeric_features),
        ('cat', OneHotEncoder(drop='first', handle_unknown='ignore'), categorical_features)
    ])

# Create full pipeline: Prep + Model
model = Pipeline(steps=[
    ('preprocessor', preprocessor),
    ('classifier', RandomForestClassifier(n_estimators=100, random_state=42))
])

# Train the model
model.fit(X_train, y_train)

print("Balanced Model trained!")

Balanced Model trained!


In [16]:
from sklearn.model_selection import GridSearchCV

# Tune Random Forest (try different settings)
param_grid = {
    'classifier__n_estimators': [50, 100, 200],
    'classifier__max_depth': [None, 10, 20]
}
grid_search = GridSearchCV(model, param_grid, cv=5, scoring='recall')
grid_search.fit(X_train, y_train)

# Use best model
model = grid_search.best_estimator_
print("Best params:", grid_search.best_params_)
# Then re-run Step 5 for new metrics

Best params: {'classifier__max_depth': 20, 'classifier__n_estimators': 50}


In [24]:
# Predict on val set
y_pred = model.predict(X_val)
y_pred_proba = model.predict_proba(X_val)[:, 1]  # For ROC-AUC

# Calculate metrics
accuracy = accuracy_score(y_val, y_pred)
precision = precision_score(y_val, y_pred)
recall = recall_score(y_val, y_pred)
f1 = f1_score(y_val, y_pred)
roc_auc = roc_auc_score(y_val, y_pred_proba)

# Print results
print("Training Metrics:")
print(f"Accuracy: {accuracy:.4f}")
print(f"Precision: {precision:.4f}")
print(f"Recall: {recall:.4f}")  # This is what they score on!
print(f"F1-Score: {f1:.4f}")
print(f"ROC-AUC: {roc_auc:.4f}")

Training Metrics:
Accuracy: 0.6541
Precision: 0.4211
Recall: 0.0146
F1-Score: 0.0282
ROC-AUC: 0.4976


In [18]:
# Predict on test
test_predictions = model.predict(X_test)

# Create output DataFrame
output_df = pd.DataFrame({
    'patient_id': patient_ids,
    'heart_attack_risk': test_predictions
})

# Save to CSV
output_df.to_csv('YourTeamCode_YourTeamName_Task1_Predictions.csv', index=False)
print("Predictions saved! First few rows:")
print(output_df.head())
print("\nDownload this file from Colab: Files > right-click > Download")

Predictions saved! First few rows:
  patient_id  heart_attack_risk
0    VRK5064                  0
1    NEN2365                  0
2    KXT2493                  0
3    TKO0406                  0
4    GDP2405                  0

Download this file from Colab: Files > right-click > Download
