#### Load the libraries

In [1]:
# Import necessary libraries
import numpy as np
import pandas as pd
from sklearn.base import clone
from sklearn.compose import ColumnTransformer
from sklearn.ensemble import GradientBoostingClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.impute import KNNImputer
from sklearn.impute import SimpleImputer
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import f1_score, classification_report
from sklearn.model_selection import train_test_split
from sklearn.neighbors import KNeighborsClassifier
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import StandardScaler,RobustScaler, OneHotEncoder
from sklearn.svm import SVC


### Load the dataset

In [3]:
# Read the labelled training CSV into a DataFrame and preview its first rows
data = pd.read_csv(filepath_or_buffer="synthetic_data_training.csv")
data.head(n=5)

Unnamed: 0,gender,age,bmi,systolic_bp,diastolic_bp,exercise_frequency,smoker,family_history,diet_quality,us_state,shoe_size,cholesterol_level,has_disease
0,Male,53,19.138668,121.898174,61.773357,,1.0,1.0,Poor,NY,12.365815,160.204426,0
1,Female,23,32.498117,154.821812,68.959128,Regularly,1.0,,Average,OH,10.585105,145.484203,0
2,Male,27,33.151563,185.758938,113.805127,Regularly,1.0,0.0,,FL,12.34671,206.106997,1
3,Female,32,37.248601,163.894943,57.066786,Frequently,0.0,1.0,Poor,OH,7.997821,257.131324,1
4,Female,18,26.668744,117.623795,93.728305,,0.0,0.0,Good,NY,11.679335,249.37045,0


#### Check all column names and any missing values

In [4]:
# Inspect the schema of the training data: column labels first, then dtypes
column_names = data.columns
column_data_types = data.dtypes

# One column label per line
print("------------ Available Columns --------------------------")
for column_name in column_names:
    print(column_name)

# dtype that pandas assigned to every column
print(" --------------  Data Types of Columns:  ---------------")
print(column_data_types)

------------ Available Columns --------------------------
gender
age
bmi
systolic_bp
diastolic_bp
exercise_frequency
smoker
family_history
diet_quality
us_state
shoe_size
cholesterol_level
has_disease
 --------------  Data Types of Columns:  ---------------
gender                 object
age                     int64
bmi                   float64
systolic_bp           float64
diastolic_bp          float64
exercise_frequency     object
smoker                float64
family_history        float64
diet_quality           object
us_state               object
shoe_size             float64
cholesterol_level     float64
has_disease             int64
dtype: object


In [5]:
## finding missing columns
def find_missing(df):
    """Print the names of the columns of ``df`` that hold at least one null.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame to inspect.

    Returns
    -------
    None
        Nothing is returned; a header line and the list of affected column
        names (in the frame's column order) are printed.
    """
    # Boolean Series indexed by column label: True where the column has a null
    null_flags = df.isnull().any()

    # Keep only the labels whose flag is set
    has_missing_values = null_flags[null_flags].index.tolist()

    print("Columns with Missing Values:")
    print(has_missing_values)
    
# Report which columns of the raw training frame contain missing values
find_missing(data)

Columns with Missing Values:
['bmi', 'systolic_bp', 'diastolic_bp', 'exercise_frequency', 'smoker', 'family_history', 'diet_quality', 'cholesterol_level']


### Preprocessing

#### Imputing Numerical Columns

In [6]:
# Work on a copy so the raw frame `data` stays untouched for later cells
data_processed = data.copy()

# Numeric columns that contain missing values
cols_to_impute = ['bmi', 'systolic_bp', 'diastolic_bp','family_history','smoker', 'cholesterol_level']

# KNN imputer using 5 neighbours.
# NOTE(review): it is fitted on the full dataset, i.e. before the train/test
# split performed further down, so held-out rows influence the imputed values.
# It also yields fractional values for the 0/1 columns `smoker` and
# `family_history` (see 0.6 in the preview below) - confirm this is intended.
knn_imputer = KNNImputer(n_neighbors=5)

# Select the affected columns, impute them, and write the result back
imputation_data = data_processed[cols_to_impute]
imputed_data = knn_imputer.fit_transform(imputation_data)
data_processed[cols_to_impute] = imputed_data

data_processed.head()

Unnamed: 0,gender,age,bmi,systolic_bp,diastolic_bp,exercise_frequency,smoker,family_history,diet_quality,us_state,shoe_size,cholesterol_level,has_disease
0,Male,53,19.138668,121.898174,61.773357,,1.0,1.0,Poor,NY,12.365815,160.204426,0
1,Female,23,32.498117,154.821812,68.959128,Regularly,1.0,0.6,Average,OH,10.585105,145.484203,0
2,Male,27,33.151563,185.758938,113.805127,Regularly,1.0,0.0,,FL,12.34671,206.106997,1
3,Female,32,37.248601,163.894943,57.066786,Frequently,0.0,1.0,Poor,OH,7.997821,257.131324,1
4,Female,18,26.668744,117.623795,93.728305,,0.0,0.0,Good,NY,11.679335,249.37045,0


### Imputing Non Numerical Columns

In [7]:
# Categorical (non-numeric) columns that contain missing values
cats_cols = ['exercise_frequency','diet_quality']

# Fill each gap with the most frequent value of its column
categorical_imputer = SimpleImputer(strategy='most_frequent')
filled_categories = categorical_imputer.fit_transform(data_processed[cats_cols])

# Write the imputed columns back into the working frame
data_processed[cats_cols] = filled_categories

data_processed.head()

Unnamed: 0,gender,age,bmi,systolic_bp,diastolic_bp,exercise_frequency,smoker,family_history,diet_quality,us_state,shoe_size,cholesterol_level,has_disease
0,Male,53,19.138668,121.898174,61.773357,Frequently,1.0,1.0,Poor,NY,12.365815,160.204426,0
1,Female,23,32.498117,154.821812,68.959128,Regularly,1.0,0.6,Average,OH,10.585105,145.484203,0
2,Male,27,33.151563,185.758938,113.805127,Regularly,1.0,0.0,Poor,FL,12.34671,206.106997,1
3,Female,32,37.248601,163.894943,57.066786,Frequently,0.0,1.0,Poor,OH,7.997821,257.131324,1
4,Female,18,26.668744,117.623795,93.728305,Frequently,0.0,0.0,Good,NY,11.679335,249.37045,0


### Check the missing values

In [8]:
# Sanity check: after both imputation steps no column should be listed
find_missing(data_processed)

Columns with Missing Values:
[]


#### Separate the features and the target variable

In [9]:
# Target: the `has_disease` label; features: every remaining column
y = data_processed["has_disease"]
X = data_processed.drop(columns="has_disease")
X.head()

Unnamed: 0,gender,age,bmi,systolic_bp,diastolic_bp,exercise_frequency,smoker,family_history,diet_quality,us_state,shoe_size,cholesterol_level
0,Male,53,19.138668,121.898174,61.773357,Frequently,1.0,1.0,Poor,NY,12.365815,160.204426
1,Female,23,32.498117,154.821812,68.959128,Regularly,1.0,0.6,Average,OH,10.585105,145.484203
2,Male,27,33.151563,185.758938,113.805127,Regularly,1.0,0.0,Poor,FL,12.34671,206.106997
3,Female,32,37.248601,163.894943,57.066786,Frequently,0.0,1.0,Poor,OH,7.997821,257.131324
4,Female,18,26.668744,117.623795,93.728305,Frequently,0.0,0.0,Good,NY,11.679335,249.37045


### Split the data into training and testing sets

In [10]:

# Hold out 20% of the rows for evaluation; the seed keeps the split reproducible
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=42
)

# Split the feature names by dtype: object columns are treated as categorical,
# everything else as numeric
categorical_cols = list(X_train.select_dtypes(include='object').columns)
numeric_cols = list(X_train.select_dtypes(exclude='object').columns)

print(categorical_cols)
print(numeric_cols)

['gender', 'exercise_frequency', 'diet_quality', 'us_state']
['age', 'bmi', 'systolic_bp', 'diastolic_bp', 'smoker', 'family_history', 'shoe_size', 'cholesterol_level']


#### Transformers for further preprocessing

In [11]:
# Create the transformers that preprocess the numerical and categorical columns.
# Both start with an imputer, so frames that still contain missing values
# can be passed through the preprocessor as well.

## for numerical columns: median imputation, then RobustScaler.
## The 'outlier_handling' step is a 'passthrough' placeholder and does nothing.
numeric_transformer = Pipeline(steps=[
    ('imputer', SimpleImputer(strategy='median')),
    ('outlier_handling', 'passthrough'),  
    ('scaler', RobustScaler())
])


## for categorical columns: most-frequent imputation, then one-hot encoding.
## handle_unknown='ignore' lets transform accept categories not seen during fit.
categorical_transformer = Pipeline(steps=[
    ('imputer', SimpleImputer(strategy='most_frequent')),
    ('onehot', OneHotEncoder(handle_unknown='ignore'))
])


## create the preprocessor: route each column list to its transformer
preprocessor = ColumnTransformer(
    transformers=[
        ('num', numeric_transformer, numeric_cols),
        ('cat', categorical_transformer, categorical_cols)
    ])


### Step 2: Build the Classifier

#### Logistic Regression Classifier

In [12]:
# Logistic regression classifier with an iteration cap of 1000
lg_classifier = LogisticRegression(max_iter=1000)

# Preprocessing + classifier in one pipeline.
# The preprocessor is cloned so this pipeline owns its fitted transformer:
# a Pipeline fits its steps in place, so passing the shared `preprocessor`
# instance would let any later fit of another pipeline silently overwrite
# the preprocessing state this model was trained with.
lg_clf = Pipeline(steps=[('preprocessor', clone(preprocessor)),
                         ('classifier', lg_classifier)])

# Train on the training split
lg_clf.fit(X_train, y_train)

# Predict the held-out split
y_pred = lg_clf.predict(X_test)

# F1 score on the held-out split
f1 = f1_score(y_test, y_pred)

# Print the F1 score as a percentage
print("F1 Score:", f1*100, "%")

F1 Score: 84.375 %


#### SVC Classifier

In [13]:
# Support vector classifier with a linear kernel
svc_classifier = SVC(kernel='linear')

# Preprocessing + classifier in one pipeline.
# The preprocessor is cloned so this pipeline owns its fitted transformer:
# a Pipeline fits its steps in place, so passing the shared `preprocessor`
# instance would let any later fit of another pipeline silently overwrite
# the preprocessing state this model was trained with.
svc_clf = Pipeline(steps=[('preprocessor', clone(preprocessor)),
                          ('classifier', svc_classifier)])

# Train on the training split
svc_clf.fit(X_train, y_train)

# Predict the held-out split
y_pred = svc_clf.predict(X_test)

# F1 score on the held-out split
f1 = f1_score(y_test, y_pred)

# Print the score (as a fraction between 0 and 1)
print("F1 Score:", f1)


F1 Score: 0.8374164810690423


### GradientBoostingClassifier classifier

In [14]:
# Gradient boosting classifier: 100 estimators, fixed seed for reproducibility
gbc_classifier = GradientBoostingClassifier(n_estimators=100, random_state=42)

# Preprocessing + classifier in one pipeline.
# The preprocessor is cloned so this pipeline owns its fitted transformer.
# This pipeline is reused later to predict the unlabeled test file; with the
# shared `preprocessor` instance, every fit done in between (a Pipeline fits
# its steps in place) would replace the imputation/scaling statistics this
# model was trained with.
gbc_clf = Pipeline(steps=[('preprocessor', clone(preprocessor)),
                          ('classifier', gbc_classifier)])

# Train on the training split
gbc_clf.fit(X_train, y_train)

# Predict the held-out split
y_pred = gbc_clf.predict(X_test)

# F1 score on the held-out split (as a fraction between 0 and 1)
f1 = f1_score(y_test, y_pred)
print("F1 Score:", f1)


F1 Score: 0.8701594533029612


### A function to find the f1 score using different classifiers

In [15]:
def evaluate_classifiers(classifiers, X_train, X_test, y_train, y_test):
    """Fit every classifier behind the preprocessing step and score it with F1.

    Parameters
    ----------
    classifiers : iterable of (str, estimator)
        Display name and estimator instance for each model to evaluate.
    X_train, y_train
        Features and labels used to fit each pipeline.
    X_test, y_test
        Features and labels used to compute the F1 score.

    Returns
    -------
    pandas.DataFrame
        One row per classifier, in input order, with the columns
        ``'Classifier'`` and ``'F1 Score'`` (F1 multiplied by 100). The two
        columns are present even when ``classifiers`` is empty.

    Notes
    -----
    Uses the module-level ``preprocessor`` as a template only: each pipeline
    gets its own unfitted clone. A Pipeline fits its steps in place, so using
    the shared instance directly would refit the transformer held by
    pipelines that were trained earlier and change their predictions.
    """
    results = []

    for clf_name, classifier in classifiers:
        # Fresh copy of the preprocessing for this model, followed by the model
        clf = Pipeline(steps=[('preprocessor', clone(preprocessor)),
                              ('classifier', classifier)])

        # Fit preprocessing and classifier on the training data
        clf.fit(X_train, y_train)

        # Score the predictions on the evaluation data
        y_pred = clf.predict(X_test)
        f1 = f1_score(y_test, y_pred)

        results.append({'Classifier': clf_name, 'F1 Score': f1*100})

    # Explicit columns keep the schema stable when nothing was evaluated
    results_df = pd.DataFrame(results, columns=['Classifier', 'F1 Score'])
    return results_df

#### Classifiers to be tested

In [16]:
# Define the list of classifiers to evaluate.
# Each entry is a (display name, estimator instance) pair, the shape that
# evaluate_classifiers unpacks; the list order becomes the row order of the
# results table.
classifiers_to_evaluate = [
    ('K-Nearest Neighbors', KNeighborsClassifier(n_neighbors=5)),
    ('Gradient Boosting', GradientBoostingClassifier(n_estimators=100, random_state=42)),
    ('SVC',SVC(kernel='linear')),
    ('LogisticRegression',LogisticRegression(max_iter=1000))
]

### Calculating F1 score for Imputed Cases

In [17]:
# Score every candidate on the split built from the imputed data
f1_df = evaluate_classifiers(
    classifiers=classifiers_to_evaluate,
    X_train=X_train,
    X_test=X_test,
    y_train=y_train,
    y_test=y_test,
)

# Show the results table
f1_df

Unnamed: 0,Classifier,F1 Score
0,K-Nearest Neighbors,75.402299
1,Gradient Boosting,87.015945
2,SVC,83.741648
3,LogisticRegression,84.375


### Calculating F1 score for Complete Cases

In [18]:
# Rebuild target and features from the raw (non-imputed) frame
y = data['has_disease']
X = data.drop(columns='has_disease')

In [19]:
# Same 80/20 split and seed as before, this time on the raw data
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=42
)

# Complete-case mask: True for training rows without any missing feature
ccm = X_train.notna().all(axis=1)

# Keep only the complete training rows; the test split is left as is
X_t_complete = X_train.loc[ccm]
y_t_complete = y_train.loc[ccm]


In [20]:
# Score every candidate when trained on complete cases only
results_complete_case_df = evaluate_classifiers(
    classifiers=classifiers_to_evaluate,
    X_train=X_t_complete,
    X_test=X_test,
    y_train=y_t_complete,
    y_test=y_test,
)

# Show the results table
results_complete_case_df

Unnamed: 0,Classifier,F1 Score
0,K-Nearest Neighbors,77.625571
1,Gradient Boosting,85.193622
2,SVC,85.842697
3,LogisticRegression,84.545455


#### Selected classifier: Gradient Boosting

In [26]:
## Overall F1 score of the selected classifier
# Look the model up by name rather than by row position, so that reordering
# the classifier list cannot silently select a different model.
selected_classifier = 'Gradient Boosting'

# F1 (in percent) when trained on complete cases only
f1_scores = results_complete_case_df['F1 Score']
f1_gb = f1_scores[results_complete_case_df['Classifier'] == selected_classifier].iloc[0]

# F1 (in percent) when trained on the imputed data
f1_imputed_scores = f1_df['F1 Score']
f1_gb_2 = f1_imputed_scores[f1_df['Classifier'] == selected_classifier].iloc[0]

# Plain average of the two scores
overall = (f1_gb + f1_gb_2) / 2

overall

86.10478359908883

#### Predicting using the test data

In [22]:
# Read the unlabeled test CSV and preview its first rows
test_data = pd.read_csv(filepath_or_buffer='synthetic_data_test.csv')
test_data.head(n=5)

Unnamed: 0,gender,age,bmi,systolic_bp,diastolic_bp,exercise_frequency,smoker,family_history,diet_quality,us_state,shoe_size,cholesterol_level
0,Male,24,31.084429,79.735428,76.334048,Never,0.0,0.0,,NC,12.803742,232.868297
1,Male,53,36.168233,179.902564,93.005086,Never,0.0,0.0,Average,CA,5.101318,179.375333
2,Male,20,,106.626029,,Never,,0.0,Poor,CA,12.266724,107.443451
3,Female,69,21.879392,99.452795,114.495464,Regularly,0.0,1.0,Good,CA,5.643639,268.170173
4,Female,78,30.289369,102.042081,58.688564,Frequently,1.0,0.0,Average,NC,11.761399,239.892775


#### Make Predictions Using Best Classifier

In [23]:
## make predictions
# Uses the gradient boosting pipeline fitted earlier in the notebook. The raw
# test frame is passed directly: the pipeline's preprocessing step contains
# imputers, so the missing values visible in the preview above are handled there.
y_pred_test = gbc_clf.predict(test_data)


#### Save the test predictions

In [25]:
# Wrap the predicted labels in a single-column frame named 'Preds'
pred_df = pd.Series(y_pred_test, name='Preds').to_frame()

# Write the predictions to disk without the index column
pred_df.to_csv('answers.csv', index=False)

# Preview the first predictions
pred_df.head()

Unnamed: 0,Preds
0,0
1,1
2,0
3,1
4,1
