In [1]:
# Cell 1: Imports and Data Loading
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import train_test_split, GridSearchCV, learning_curve
from sklearn.metrics import confusion_matrix, accuracy_score, classification_report
from sklearn.tree import DecisionTreeClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier
from imblearn.over_sampling import SMOTE
from sklearn.impute import SimpleImputer
import mlflow as mf
from mlflow.tracking import MlflowClient
import pickle

# Load framingham.csv and discard the columns that are not used by the models below.
data = pd.read_csv('framingham.csv').drop(
    columns=['education', 'male', 'BPMeds', 'prevalentStroke']
)
# Rows containing missing values are kept here (no dropna); missing entries are
# filled in by the imputer during preprocessing.
# Cell 2: Data Splitting and Preprocessing
# Separate the feature columns from the 'TenYearCHD' target column.
X = data.drop(columns=['TenYearCHD'])
y = data['TenYearCHD']

# Hold out 20% of the rows for testing; the fixed seed keeps the split reproducible.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# The frames returned by train_test_split are derived from X. Take explicit copies
# before assigning columns into them so pandas does not raise SettingWithCopyWarning
# and the writes below can never touch X.
X_train = X_train.copy()
X_test = X_test.copy()

# Handle missing values
# The imputer learns the column means from the training split only and re-uses
# them for the test split, so no test-set statistics leak into training.
num_columns = X_train.select_dtypes(include=['int64', 'float64']).columns
imputer = SimpleImputer(strategy="mean")
X_train[num_columns] = imputer.fit_transform(X_train[num_columns])
X_test[num_columns] = imputer.transform(X_test[num_columns])

# Scaling the data
# Same rule as above: fit on the training split, only transform the test split.
scaler = StandardScaler()
X_train[num_columns] = scaler.fit_transform(X_train[num_columns])
X_test[num_columns] = scaler.transform(X_test[num_columns])

# Handling class imbalance
# SMOTE oversampling is currently disabled, so the models are trained on the
# original class distribution.
#smote = SMOTE(sampling_strategy='auto', random_state=42)
#X_train, y_train = smote.fit_resample(X_train, y_train)

# Cell 3: Decision Tree Model
# Search space: 5 depths x 4 split sizes x 5 leaf sizes x 2 criteria = 200 candidates.
param_grid_dtc = dict(
    max_depth=[3, 5, 7, 10, 12],
    min_samples_split=[2, 5, 10, 12],
    min_samples_leaf=[1, 2, 4, 6, 8],
    criterion=['gini', 'entropy'],
)

# Exhaustive 5-fold cross-validated search on all cores; verbose=1 prints the fit count.
dtc = DecisionTreeClassifier(random_state=42)
grid_search_dtc = GridSearchCV(
    estimator=dtc,
    param_grid=param_grid_dtc,
    cv=5,
    n_jobs=-1,
    verbose=1,
)
grid_search_dtc.fit(X_train, y_train)

# best_estimator_ is the winning configuration refitted on the whole training split.
best_dtc = grid_search_dtc.best_estimator_
print(f"Best Parameters for Decision Tree: {grid_search_dtc.best_params_}")

# Predictions
# Compare train and test accuracy to see how far the tuned tree overfits.
y_train_pred = best_dtc.predict(X_train)
y_test_pred = best_dtc.predict(X_test)
print(f"Decision Tree Training Accuracy: {accuracy_score(y_train, y_train_pred):.4f}")
print(f"Decision Tree Testing Accuracy: {accuracy_score(y_test, y_test_pred):.4f}")




Fitting 5 folds for each of 200 candidates, totalling 1000 fits
Best Parameters for Decision Tree: {'criterion': 'gini', 'max_depth': 5, 'min_samples_leaf': 1, 'min_samples_split': 5}
Decision Tree Training Accuracy: 0.8602
Decision Tree Testing Accuracy: 0.8396


In [4]:
# Train/test accuracy of the tuned random forest.
# NOTE: best_rf_clf is created by the Random Forest grid-search cell that appears
# further down in this notebook, so that cell must be executed before this one.
y_train_pred = best_rf_clf.predict(X_train)
y_test_pred = best_rf_clf.predict(X_test)

# The labels previously said "Decision Tree" (copy-paste slip); these are random forest scores.
print(f"Random Forest Training Accuracy: {accuracy_score(y_train, y_train_pred):.4f}")
print(f"Random Forest Testing Accuracy: {accuracy_score(y_test, y_test_pred):.4f}")


Decision Tree Training Accuracy: 0.9088
Decision Tree Testing Accuracy: 0.8514


In [5]:
# Train/test accuracy of the tuned logistic regression.
# NOTE: best_log_reg is created by the Logistic Regression grid-search cell that appears
# further down in this notebook, so that cell must be executed before this one.
y_train_pred = best_log_reg.predict(X_train)
y_test_pred = best_log_reg.predict(X_test)

# The labels previously said "Decision Tree" (copy-paste slip); these are logistic regression scores.
print(f"Logistic Regression Training Accuracy: {accuracy_score(y_train, y_train_pred):.4f}")
print(f"Logistic Regression Testing Accuracy: {accuracy_score(y_test, y_test_pred):.4f}")


Decision Tree Training Accuracy: 0.8540
Decision Tree Testing Accuracy: 0.8561


In [3]:
# Cell 4: Logistic Regression
# Search over L2-regularised and unregularised models.
# "No penalty" is spelled None: the string 'none' was deprecated in scikit-learn 1.2
# and removed in 1.4, where every candidate using it fails to fit and is scored NaN
# by the grid search. (On scikit-learn older than 1.2 the string form is required.)
# C has no effect when penalty is None, so those four candidates are equivalent.
param_grid_lr = {
    'penalty': ['l2', None],
    'C': [0.01, 0.1, 1, 10]
}

# max_iter is raised above the default so the solver has more iterations to converge.
log_reg = LogisticRegression(random_state=42, max_iter=1000)
grid_search_lr = GridSearchCV(estimator=log_reg, param_grid=param_grid_lr, cv=5, scoring='accuracy', n_jobs=-1)
grid_search_lr.fit(X_train, y_train)

# best_estimator_ is the winning configuration refitted on the whole training split.
best_log_reg = grid_search_lr.best_estimator_
print(f"Best Parameters for Logistic Regression: {grid_search_lr.best_params_}")

# Accuracy on the held-out test split.
y_pred_lr = best_log_reg.predict(X_test)
print(f"Logistic Regression Accuracy: {accuracy_score(y_test, y_pred_lr):.4f}")

# Cell 5: Random Forest
# Search space: 3 forest sizes x 3 depths x 2 split sizes x 2 leaf sizes = 36 candidates.
param_grid_rf = dict(
    n_estimators=[50, 75, 100],
    max_depth=[10, 20, 25],
    min_samples_split=[5, 10],
    min_samples_leaf=[2, 4],
)

# 5-fold cross-validated search, ranked by accuracy, using every available core.
rf_clf = RandomForestClassifier(random_state=42)
grid_search_rf = GridSearchCV(
    estimator=rf_clf,
    param_grid=param_grid_rf,
    cv=5,
    scoring='accuracy',
    n_jobs=-1,
)
grid_search_rf.fit(X_train, y_train)

# best_estimator_ is the winning configuration refitted on the whole training split.
best_rf_clf = grid_search_rf.best_estimator_
print(f"Best Parameters for Random Forest: {grid_search_rf.best_params_}")

# Accuracy on the held-out test split.
y_pred_rf = best_rf_clf.predict(X_test)
print(f"Random Forest Accuracy: {accuracy_score(y_test, y_pred_rf):.4f}")

# Save all models
# Each tuned estimator is pickled to its own file, written relative to the
# current working directory, in the order listed.
for model_path, fitted_model in (
    ('decision_tree_classifier.pkl', best_dtc),
    ('logistic_regression.pkl', best_log_reg),
    ('random_forest_classifier.pkl', best_rf_clf),
):
    with open(model_path, 'wb') as f:
        pickle.dump(fitted_model, f)

Best Parameters for Logistic Regression: {'C': 0.1, 'penalty': 'l2'}
Logistic Regression Accuracy: 0.8561
Best Parameters for Random Forest: {'max_depth': 20, 'min_samples_leaf': 2, 'min_samples_split': 10, 'n_estimators': 50}
Random Forest Accuracy: 0.8514


In [6]:

# Build a one-row example input frame for the trained models.
# NOTE(review): the values below are raw measurements, whereas the models in this
# notebook are fitted on features that went through the imputer and scaler; apply
# imputer.transform / scaler.transform before calling predict on this frame.
# NOTE(review): the columns presumably mirror the training features - verify against X_train.columns.
sample_record = {
    "age": 53,
    "currentSmoker": 1,
    "cigsPerDay": 20,
    "prevalentHyp": 0,
    "diabetes": 1,
    "totChol": 220.0,
    "sysBP": 140.0,
    "diaBP": 120.0,
    "BMI": 31.0,
    "heartRate": 90,
    "glucose": 80,
}
# Wrap every scalar in a one-element list so the frame has exactly one row.
new_data = pd.DataFrame({column: [value] for column, value in sample_record.items()})



#saving the model
# pickle is already imported in the notebook's import cell, so it is not re-imported here.
# NOTE(review): best_rf_clf is also pickled to 'random_forest_classifier.pkl' by the
# "Save all models" step; this writes a second copy under a different file name.
# Confirm which file downstream code loads and drop the other.
with open('randomForest_classifier.pkl', 'wb') as f:
    pickle.dump(best_rf_clf, f)