In [32]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.neighbors import KNeighborsClassifier
from sklearn.metrics import f1_score, classification_report, accuracy_score, precision_score, recall_score, f1_score, roc_auc_score

In [11]:
df = pd.read_csv('../ProcessedData/stroke_cleaned.csv')

In [12]:
X = df.drop('stroke', axis=1)
y = df['stroke']

X_train, X_val, y_train, y_val = train_test_split(X, y, test_size=0.2, stratify=y, random_state=42)

In [13]:
df.select_dtypes(include='object').columns

Index([], dtype='object')

In [14]:
logreg = LogisticRegression(max_iter=1000)
logreg.fit(X_train, y_train)
y_pred_logreg = logreg.predict(X_val)
f1_logreg = f1_score(y_val, y_pred_logreg)

In [15]:
knn = KNeighborsClassifier(n_neighbors=5)
knn.fit(X_train, y_train)
y_pred_knn = knn.predict(X_val)
f1_knn = f1_score(y_val, y_pred_knn)

In [16]:
print("Logistic Regression F1 Score: {:.4f}".format(f1_logreg))
print(classification_report(y_val, y_pred_logreg))

print("\nKNN F1 Score: {:.4f}".format(f1_knn))
print(classification_report(y_val, y_pred_knn))

Logistic Regression F1 Score: 0.0545
              precision    recall  f1-score   support

           0       0.96      1.00      0.98      2348
           1       0.33      0.03      0.05       101

    accuracy                           0.96      2449
   macro avg       0.65      0.51      0.52      2449
weighted avg       0.93      0.96      0.94      2449


KNN F1 Score: 0.1818
              precision    recall  f1-score   support

           0       0.96      0.99      0.98      2348
           1       0.39      0.12      0.18       101

    accuracy                           0.96      2449
   macro avg       0.68      0.56      0.58      2449
weighted avg       0.94      0.96      0.94      2449



In [24]:
print(y.value_counts(normalize=True))

stroke
0    0.95867
1    0.04133
Name: proportion, dtype: float64


In [25]:
logreg = LogisticRegression(class_weight='balanced', max_iter=1000)
logreg.fit(X_resampled, y_resampled)

In [35]:
acc = accuracy_score(y_val, y_pred_logreg)
f1 = f1_score(y_val, y_pred_logreg)
prec = precision_score(y_val, y_pred_logreg)
rec = recall_score(y_val, y_pred_logreg)
auc = roc_auc_score(y_val, y_pred_logreg)

In [36]:
print("Logistic Regression Accuracy Score: {:.4f}".format(acc))
print("Logistic Regression F1 Score: {:.4f}".format(f1))
print("Logistic Regression Precision Score: {:.4f}".format(prec))
print("Logistic Regression Recall Score: {:.4f}".format(rec))
print("Logistic Regression AUC: {:.4f}".format(auc))

Logistic Regression Accuracy Score: 0.8640
Logistic Regression F1 Score: 0.2053
Logistic Regression Precision Score: 0.1352
Logistic Regression Recall Score: 0.4257
Logistic Regression AUC: 0.6543


In [53]:
with open('../Output/evaluation_metrics.txt', 'w') as f:
    f.write("Logistic Regression Accuracy Score: {:.4f}\n".format(acc))
    f.write("Logistic Regression F1 Score: {:.4f}\n".format(f1))
    f.write("Logistic Regression Precision Score: {:.4f}\n".format(prec))
    f.write("Logistic Regression Recall Score: {:.4f}\n".format(rec))
    f.write("Logistic Regression AUC: {:.4f}\n".format(auc))

In [37]:
print(classification_report(y_val, y_pred_logreg))

              precision    recall  f1-score   support

           0       0.97      0.88      0.93      2348
           1       0.14      0.43      0.21       101

    accuracy                           0.86      2449
   macro avg       0.55      0.65      0.57      2449
weighted avg       0.94      0.86      0.90      2449



In [38]:
test_df = pd.read_csv('../RawData/test.csv')
test_ids = test_df['id']
test_df = test_df.drop('id', axis=1)

In [39]:
test_df['gender'] = test_df['gender'].map({'Male': 0, 'Female': 1, 'Other': 2})
test_df['ever_married'] = test_df['ever_married'].map({'No': 0, 'Yes': 1})
test_df['Residence_type'] = test_df['Residence_type'].map({'Rural': 0, 'Urban': 1})
test_df = pd.get_dummies(test_df, columns=['smoking_status'], drop_first=True)
test_df = pd.get_dummies(test_df, columns=['work_type'], drop_first=True)

In [41]:
test_df = test_df[X.columns] 

In [49]:
test_logistic = logreg.predict_proba(test_df)[:, 1]

In [50]:
test_logistic = (test_logistic >= 0.5).astype(int)

In [51]:
submission = pd.DataFrame({
    'id': test_ids,
    'TARGET': test_logistic
})
submission.to_csv('../Output/submission.csv', index=False)