In [39]:
import pandas as pd
import matplotlib.pyplot as plt
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score, classification_report, confusion_matrix, precision_recall_curve

In [40]:
#step 1: upload dataset
df = pd.read_csv("heart.csv.csv")
df.drop(['education'], inplace = True, axis = 1)
df.rename(columns ={'male':'Sex_male'}, inplace = True)
df.dropna(axis = 0, inplace = True)
print(df.head(), df.shape)
print(df.TenYearCHD.value_counts())

   Sex_male  age  currentSmoker  cigsPerDay  BPMeds  prevalentStroke  \
0         1   39              0         0.0     0.0                0   
1         0   46              0         0.0     0.0                0   
2         1   48              1        20.0     0.0                0   
3         0   61              1        30.0     0.0                0   
4         0   46              1        23.0     0.0                0   

   prevalentHyp  diabetes  totChol  sysBP  diaBP    BMI  heartRate  glucose  \
0             0         0    195.0  106.0   70.0  26.97       80.0     77.0   
1             0         0    250.0  121.0   81.0  28.73       95.0     76.0   
2             0         0    245.0  127.5   80.0  25.34       75.0     70.0   
3             1         0    225.0  150.0   95.0  28.58       65.0    103.0   
4             0         0    285.0  130.0   84.0  23.10       85.0     85.0   

   TenYearCHD  
0           0  
1           0  
2           0  
3           1  
4           

In [41]:
# Step 2: Manual Oversampling
class_0 = df[df['TenYearCHD'] == 0]
class_1 = df[df['TenYearCHD'] == 1]
class_1_over = class_1.sample(len(class_0), replace=True, random_state=42)
df_over = pd.concat([class_0, class_1_over], axis=0).sample(frac=1, random_state=42).reset_index(drop=True)
print("Counts of classes after oversampling:")
print(df_over['TenYearCHD'].value_counts())


Counts of classes after oversampling:
TenYearCHD
1    3179
0    3179
Name: count, dtype: int64


In [42]:
# Step 3: Split data
X = df_over.drop('TenYearCHD', axis=1)
y = df_over['TenYearCHD']
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, stratify=y, random_state=42)

In [43]:
# Step 4: Train Random Forest
rf_model = RandomForestClassifier(n_estimators=100, class_weight='balanced', random_state=42)
rf_model.fit(X_train, y_train)

In [44]:
# Step 5:  Evaluation
y_pred = rf_model.predict(X_test)
print("Initial Random Forest Evaluation (Threshold=0.5):")
print("Accuracy:", accuracy_score(y_test, y_pred))
print("Classification Report:\n", classification_report(y_test, y_pred))
print("Confusion Matrix:\n", confusion_matrix(y_test, y_pred))

Initial Random Forest Evaluation (Threshold=0.5):
Accuracy: 0.9827044025157232
Classification Report:
               precision    recall  f1-score   support

           0       0.99      0.98      0.98       636
           1       0.98      0.99      0.98       636

    accuracy                           0.98      1272
   macro avg       0.98      0.98      0.98      1272
weighted avg       0.98      0.98      0.98      1272

Confusion Matrix:
 [[621  15]
 [  7 629]]
