inputs: data generated by 0_eda.ipynb

contents: model training scripts

output: the "best" model written to best_model/ (used in the streamlit dashboard)

In [1]:
import pandas as pd
import xgboost as xgb
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score
import pandas as pd
from sklearn.preprocessing import LabelEncoder
import numpy as np

df = pd.read_csv('data/model_ready.csv')
df.head()

Unnamed: 0,id,age,sex,dataset,cp,trestbps,chol,fbs,restecg,thalch,exang,oldpeak,slope,ca,thal,label
0,1,63,Male,Cleveland,typical angina,145.0,233.0,True,lv hypertrophy,150.0,False,2.3,downsloping,0.0,fixed defect,False
1,2,67,Male,Cleveland,asymptomatic,160.0,286.0,False,lv hypertrophy,108.0,True,1.5,flat,3.0,normal,True
2,3,67,Male,Cleveland,asymptomatic,120.0,229.0,False,lv hypertrophy,129.0,True,2.6,flat,2.0,reversable defect,True
3,4,37,Male,Cleveland,non-anginal,130.0,250.0,False,normal,187.0,False,3.5,downsloping,0.0,normal,False
4,5,41,Female,Cleveland,atypical angina,130.0,204.0,False,lv hypertrophy,172.0,False,1.4,upsloping,0.0,normal,False


In [7]:
from sklearn.pipeline import Pipeline, make_pipeline
from sklearn.base import BaseEstimator, TransformerMixin

class DataFrameSelector(BaseEstimator, TransformerMixin):
    def __init__(self, attribute_names):
        self.attribute_names = attribute_names

    def fit(self, X, y=None):
        return self

    def transform(self, X):
        return X[self.attribute_names]

class DataPreprocessor(BaseEstimator, TransformerMixin):
    def __init__(self):
        self.encoders = {}

    def fit(self, X, y=None):
        # Convert all columns to string type to avoid type comparison issues
        X = X.astype(str)
        # Handle missing values and encode categorical columns
        categorical_cols = X.select_dtypes(include=['object']).columns
        for col in categorical_cols:
            le = LabelEncoder()
            X[col] = le.fit_transform(X[col].fillna(X[col].mode()[0]))
            self.encoders[col] = le
        return self

    def transform(self, X):
        X = X.copy()
        # Convert all columns to string type to avoid type comparison issues
        X = X.astype(str)
        # Handle missing values in numeric columns
        numeric_cols = X.select_dtypes(include=['int64', 'float64']).columns
        for col in numeric_cols:
            X[col] = X[col].fillna(X[col].median())

        # Encode categorical columns
        for col, le in self.encoders.items():
            # Fit the encoder with all possible labels including those in the test set
            all_labels = np.unique(X[col].fillna(X[col].mode()[0]))
            le.classes_ = np.unique(np.concatenate((le.classes_, all_labels), axis=None))
            X[col] = le.transform(X[col].fillna(X[col].mode()[0]))
        return X

# Define the pipeline
pipeline = Pipeline([
    ('selector', DataFrameSelector(df.columns.drop('label').drop('id'))),
    ('preprocessor', DataPreprocessor()),
    ('classifier', xgb.XGBClassifier(use_label_encoder=False, eval_metric='mlogloss'))
])

# Split the data into features and target
X = df.drop('label', axis=1).drop('id', axis=1)
y = df['label']

# Encode the target variable
label_encoder = LabelEncoder()
y_encoded = label_encoder.fit_transform(y)

# Split the data into training and testing sets
X_train, X_test, y_train, y_test = train_test_split(X, y_encoded, test_size=0.2, random_state=42)

# Fit the pipeline
pipeline.fit(X_train, y_train)

# Predict on the test set
y_pred = pipeline.predict(X_test)

# Calculate accuracy
accuracy = accuracy_score(y_test, y_pred)
print(f"Accuracy: {accuracy:.2f}")

Parameters: { "use_label_encoder" } are not used.



Accuracy: 0.86


In [11]:
pipeline.predict_proba(X_test)

array([[9.99609351e-01, 3.90639267e-04],
       [9.90521312e-01, 9.47866403e-03],
       [9.10651684e-03, 9.90893483e-01],
       [2.13086605e-03, 9.97869134e-01],
       [9.86829996e-02, 9.01317000e-01],
       [9.88545537e-01, 1.14544798e-02],
       [9.84986901e-01, 1.50131183e-02],
       [2.30473280e-03, 9.97695267e-01],
       [8.82750750e-03, 9.91172493e-01],
       [9.77232754e-01, 2.27672644e-02],
       [2.58897543e-02, 9.74110246e-01],
       [6.82139397e-03, 9.93178606e-01],
       [5.39945364e-02, 9.46005464e-01],
       [3.34292054e-02, 9.66570795e-01],
       [8.47796202e-02, 9.15220380e-01],
       [8.97965074e-01, 1.02034934e-01],
       [9.97876465e-01, 2.12356425e-03],
       [7.06843138e-01, 2.93156832e-01],
       [7.31706023e-01, 2.68293947e-01],
       [5.04815578e-03, 9.94951844e-01],
       [9.99355733e-01, 6.44258573e-04],
       [9.93178964e-01, 6.82106288e-03],
       [3.17931175e-04, 9.99682069e-01],
       [2.69964337e-02, 9.73003566e-01],
       [1.401102

In [10]:
import mlflow
import shutil

# Reset indices of X_train and y_train
X_train = X_train.reset_index(drop=True)
y_train = pd.Series(y_train).reset_index(drop=True)

# Sample 100 rows from X_train and y_train
X_train_sample = X_train.sample(n=100, random_state=42)
y_train_sample = y_train[X_train_sample.index]

# Remove the existing 'best_model' directory if it exists
shutil.rmtree('best_model', ignore_errors=True)

# Infer the signature of the model based on the sample data
signature = mlflow.models.signature.infer_signature(X_train_sample, y_train_sample)

# Save the trained model to the 'best_model' directory with the inferred signature
mlflow.sklearn.save_model(pipeline, 'best_model', signature=signature, input_example=X_test)



In [5]:
X_train_sample

Unnamed: 0,age,sex,dataset,cp,trestbps,chol,fbs,restecg,thalch,exang,oldpeak,slope,ca,thal
669,40,Male,Hungary,non-anginal,130.0,215.0,False,normal,138.0,False,0.0,,,
33,64,Male,Cleveland,typical angina,170.0,227.0,False,lv hypertrophy,155.0,False,0.6,flat,0.0,reversable defect
549,58,Male,VA Long Beach,atypical angina,126.0,0.0,True,normal,110.0,True,2.0,flat,,
199,62,Female,Cleveland,non-anginal,130.0,263.0,False,normal,97.0,False,1.2,flat,1.0,reversable defect
264,64,Female,Cleveland,non-anginal,140.0,313.0,False,normal,133.0,False,0.2,upsloping,0.0,reversable defect
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
404,53,Male,Hungary,atypical angina,120.0,,False,normal,132.0,False,0.0,,,
428,62,Male,Switzerland,asymptomatic,115.0,0.0,,normal,128.0,True,2.5,downsloping,,
424,54,Male,VA Long Beach,non-anginal,,203.0,False,st-t abnormality,,,,,,
235,39,Male,Hungary,atypical angina,120.0,200.0,False,normal,160.0,True,1.0,flat,,


In [6]:
test_df = pd.DataFrame(X_test)
test_df['labell'] = y_test
test_df.to_csv('data/test_df.csv', index=False)