In [2]:
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score, confusion_matrix
import pandas as pd

In [3]:
# Load the BMD dataset into a DataFrame. The relative path is resolved against
# the kernel's current working directory, so BMD-2.csv must sit next to the
# notebook (or wherever the kernel was started).
bmd_data = pd.read_csv('./BMD-2.csv')

In [4]:
# Data dimensionality overview (5pts): row/column counts, column dtypes,
# missing-value counts, and the class balance of the target.

# DataFrame.shape is (rows, columns). Rows are reported as patients; the column
# count includes the 'Fracture' target column alongside the predictors.
total_patients, num_attributes = bmd_data.shape
print(f"Total number of patients: {total_patients}")
print(f"Number of attributes (categories): {num_attributes}")

# dtype of every column.
data_types = bmd_data.dtypes
print("Data types:", data_types, sep="\n")

# Number of null entries per column.
missing_values = bmd_data.isna().sum()
print("Missing values:", missing_values, sep="\n")

# How many rows fall into each value of the target column.
patients_per_class = bmd_data['Fracture'].value_counts()
print("Number of patients in each target class:", patients_per_class, sep="\n")



Total number of patients: 169
Number of attributes (categories): 5
Data types:
Age          float64
Weight_kg    float64
Height_cm    float64
BMD          float64
Fracture      object
dtype: object
Missing values:
Age          0
Weight_kg    0
Height_cm    0
BMD          0
Fracture     0
dtype: int64
Number of patients in each target class:
Fracture
no fracture    119
fracture        50
Name: count, dtype: int64


In [5]:
# Logistic regression with Ridge (L2) regularization (5pts): report the fitted
# coefficients, the test accuracy, and the misclassified count per actual class.

# Target is the 'Fracture' label column; features are every remaining column.
y = bmd_data['Fracture']
X = bmd_data.drop(columns='Fracture')

# Hold out 30% of the rows for evaluation; the fixed seed keeps the split
# repeatable. The Lasso cell further down reuses these same four objects.
# NOTE(review): the split is not stratified even though the classes are
# imbalanced; consider stratify=y if the class ratio should be preserved.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.3,
    random_state=42,
)

# fit() returns the estimator itself, so construction and training can chain.
model = LogisticRegression(penalty='l2', solver='liblinear').fit(X_train, y_train)

# One coefficient per feature column (first row of coef_).
# NOTE(review): the features are not standardized before fitting, so each
# coefficient's magnitude depends on that column's units; treat the values as
# "importance" with caution.
coefficients = pd.DataFrame({'Coefficient': model.coef_[0]}, index=X.columns)

# Evaluate on the held-out rows.
y_pred = model.predict(X_test)
accuracy = accuracy_score(y_test, y_pred)

# Confusion matrix with rows = actual class, columns = predicted class, both in
# the order of model.classes_.
class_labels = model.classes_
conf_matrix = confusion_matrix(y_test, y_pred, labels=class_labels)
predicted_headers = ['Predicted ' + cls for cls in class_labels]
misclassified_counts = pd.DataFrame(conf_matrix, index=class_labels, columns=predicted_headers)

# Misclassified per actual class = row total minus the diagonal (correct) entry.
correct_predictions = pd.Series(conf_matrix.diagonal(), index=class_labels)
actual_totals = misclassified_counts.sum(axis=1)
misclassified_counts['Misclassified'] = actual_totals - correct_predictions

print("Feature importance indicated by coefficient:")
print(coefficients)

print(f"The accuracy of Classification: {accuracy}")
print("The Number of patients misclassified for each target class:")
print(misclassified_counts[['Misclassified']])


Feature importance indicated by coefficient:
           Coefficient
Age          -0.046444
Weight_kg     0.066782
Height_cm    -0.014642
BMD           2.382925
The accuracy of Classification: 0.8431372549019608
The Number of patients misclassified for each target class:
             Misclassified
fracture                 6
no fracture              2


In [6]:
# Logistic regression with Lasso (L1) regularization (5pts): report the fitted
# coefficients, the test accuracy, and the misclassified count per actual class.
# Reuses X, X_train, X_test, y_train and y_test from the Ridge cell so both
# models are trained and scored on the same split.

# scikit-learn documents that random_state is used by the liblinear solver.
# Pinning it (same seed as the train/test split) keeps the L1 coefficients
# repeatable across kernel restarts instead of depending on NumPy's global
# random state.
lasso_model = LogisticRegression(penalty='l1', solver='liblinear', random_state=42)
lasso_model.fit(X_train, y_train)

# Evaluate on the held-out rows.
y_pred_lasso = lasso_model.predict(X_test)
accuracy_lasso = accuracy_score(y_test, y_pred_lasso)

# Confusion matrix with rows = actual class, columns = predicted class, both in
# the order of lasso_model.classes_.
conf_matrix_lasso = confusion_matrix(y_test, y_pred_lasso, labels=lasso_model.classes_)
misclassified_counts_lasso = pd.DataFrame(
    conf_matrix_lasso,
    index=lasso_model.classes_,
    columns=['Predicted ' + cls for cls in lasso_model.classes_],
)

# Misclassified per actual class = row total minus the diagonal (correct) entry.
correct_predictions_lasso = pd.Series(conf_matrix_lasso.diagonal(), index=lasso_model.classes_)
misclassified_counts_lasso['Misclassified'] = (
    misclassified_counts_lasso.sum(axis=1) - correct_predictions_lasso
)

# One coefficient per feature column (first row of coef_).
# NOTE(review): the features are not standardized, so coefficient magnitudes
# depend on each column's units and are not directly comparable as importance.
coefficients_lasso = pd.DataFrame(lasso_model.coef_[0], index=X.columns, columns=['Coefficient'])

print("Feature importance indicated by coefficients:")
print(coefficients_lasso)

print("The results of Lasso Regularization:")
print(f"The accuracy of Classification: {accuracy_lasso}")
print("The Number of patients misclassified for each target class:")
print(misclassified_counts_lasso[['Misclassified']])



Feature importance indicated by coefficients:
           Coefficient
Age          -0.034442
Weight_kg     0.041916
Height_cm    -0.032225
BMD           7.633766
The results of Lasso Regularization:
The accuracy of Classification: 0.8627450980392157
The Number of patients misclassified for each target class:
             Misclassified
fracture                 5
no fracture              2


In [35]:
# Side-by-side test accuracy of the two regularized models. `accuracy` comes
# from the Ridge cell and `accuracy_lasso` from the Lasso cell, so both must
# have been run first.
comparison_lines = [
    "\nComparison of classification accuracies among the regulation methods:",
    f"The accuracy of Ridge: {accuracy}",
    f"The accuracy of Lasso: {accuracy_lasso}",
]
print("\n".join(comparison_lines))


Comparison of classification accuracies among the regulation methods:
The accuracy of Ridge: 0.8431372549019608
The accuracy of Lasso: 0.8627450980392157
