# Import necessary libraries

In [None]:
import numpy as np
import pandas as pd
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import classification_report, roc_auc_score, confusion_matrix
from sklearn.preprocessing import MinMaxScaler
from imblearn.over_sampling import SMOTE
import matplotlib.pyplot as plt
import seaborn as sns

In [None]:
# Load the labelled training set from the Kaggle competition input directory
df = pd.read_csv(
    filepath_or_buffer="/kaggle/input/stroke-prediction-by-123-of-ai-dec-2023/stroke_train_set.csv"
)

In [None]:
# Preprocessing: fill missing BMI values with the column median.
# Assign the result back instead of calling fillna(inplace=True) on the
# column selection: that chained in-place form is deprecated in pandas 2.x
# and leaves `df` unchanged when Copy-on-Write is enabled.
df['bmi'] = df['bmi'].fillna(df['bmi'].median())

In [None]:
# Histogram of the 'age' column using 20 bins
plt.hist(x=df['age'], bins=20, color='blue')
plt.xlabel('Age')
plt.ylabel('Count')
plt.title('Age Distribution')
plt.show()

In [None]:
# One-hot encode the categorical columns; the first level of each column is
# dropped so it acts as the implicit baseline category.
df = pd.get_dummies(
    data=df,
    columns=['gender', 'ever_married', 'work_type', 'Residence_type', 'smoking_status'],
    drop_first=True,
)

In [None]:
# Annotated heatmap of the pairwise correlations between all columns of df
plt.figure(figsize=(12, 8))
sns.heatmap(
    data=df.corr(),
    annot=True,
    cmap='viridis',
)
plt.title('Feature Correlation Heatmap')
plt.show()

In [None]:
# Normalize the features
scaler = MinMaxScaler()
df_scaled = scaler.fit_transform(df.drop(columns=['stroke']))
y = df['stroke']

In [None]:
# Class Distribution
plt.figure(figsize=(6, 6))
df['stroke'].value_counts().plot(kind='pie', autopct='%1.1f%%')
plt.title('Class Distribution of Stroke')
plt.ylabel('')
plt.show()

In [None]:
# Handling Imbalanced Dataset with SMOTE
smote = SMOTE(random_state=42)
X_sm, y_sm = smote.fit_resample(df_scaled, y)

In [None]:
# Split the data
X_train, X_test, y_train, y_test = train_test_split(X_sm, y_sm, test_size=0.2, random_state=42)

In [None]:
# Initialize the Logistic Regression classifier (created unfitted here;
# random_state is fixed to 42 for reproducibility)
logreg_model = LogisticRegression(random_state=42)

In [None]:
# Hyper-parameter search space for the logistic regression model
param_grid = dict(
    # Regularization strength
    C=[0.01, 0.1, 1, 10, 100],
    # Type of regularization
    penalty=['l1', 'l2'],
    # Solver that supports both l1 and l2 penalties
    solver=['liblinear'],
)

In [None]:
# Exhaustive search over param_grid with 5-fold cross-validation, ranking
# candidates by ROC-AUC and using all available cores.
grid_search = GridSearchCV(
    estimator=logreg_model,
    param_grid=param_grid,
    cv=5,
    scoring='roc_auc',
    n_jobs=-1,
)
grid_search.fit(X_train, y_train)

In [None]:
# Best Model
# Keep the estimator that the grid search selected as best-scoring
best_model = grid_search.best_estimator_

In [None]:
# Score the held-out split: the second predict_proba column (index 1) is
# kept as the probability, followed by the hard class labels
y_pred_proba = best_model.predict_proba(X_test)[:, 1]
y_pred = best_model.predict(X_test)

In [None]:
# Report per-class precision/recall/F1 and the ROC-AUC on the held-out split
print("Logistic Regression Classifier Performance:")
print(classification_report(y_true=y_test, y_pred=y_pred))
print("ROC-AUC:", roc_auc_score(y_true=y_test, y_score=y_pred_proba))

## --- Code for preparing test data and making predictions --- ##

In [None]:
# Load the competition test set from the Kaggle input directory
test_df = pd.read_csv(
    filepath_or_buffer="/kaggle/input/stroke-prediction-by-123-of-ai-dec-2023/stroke_test_set_nogt.csv"
)


In [None]:
# Preprocessing: fill missing BMI values in the test data.
# Impute with the TRAINING median (df['bmi']) so test rows receive the same
# fill value the model was trained with, rather than a statistic computed
# from the test set itself. The result is assigned back because the chained
# fillna(inplace=True) form is deprecated in pandas 2.x and leaves the frame
# unchanged when Copy-on-Write is enabled.
test_df['bmi'] = test_df['bmi'].fillna(df['bmi'].median())

In [None]:
# One-Hot Encoding for categorical variables in test data.
# Every level is kept here (no drop_first): drop_first removes whichever
# level sorts first *in this frame*, so if the test set lacks the training
# baseline category a different level would be dropped and the encoding
# would no longer match training. The column-alignment step that follows
# selects exactly the training feature columns, which discards the baseline
# level columns produced here.
test_df = pd.get_dummies(
    test_df,
    columns=['gender', 'ever_married', 'work_type', 'Residence_type', 'smoking_status'],
)

In [None]:
# Training feature columns (everything except the 'stroke' target) that the
# encoded test frame lacks are added as all-zero columns.
missing_cols = {
    name for name in df.columns
    if name != 'stroke' and name not in test_df.columns
}
test_df = test_df.assign(**dict.fromkeys(missing_cols, 0))

In [None]:
# Select the training feature columns (all of df except 'stroke') in training
# order, so the test frame has the same column layout as the training data
test_df = test_df.loc[:, df.columns.drop('stroke')]

In [None]:
# Apply the scaler
# transform() only: the already-fitted scaler is reused, not refitted on the
# test data
test_df_scaled = scaler.transform(test_df)

In [None]:
# Use the Logistic Regression model to make predictions on the scaled test data
# (predict() returns hard class labels, not probabilities)
y_test_pred = best_model.predict(test_df_scaled)

In [None]:

# Build the submission table: one row per prediction, with a sequential
# 'ID' column (0 .. number of predictions - 1) placed first.
# NOTE(review): IDs are positional here — confirm the competition does not
# expect an identifier column taken from the test file.
submission_df = pd.DataFrame({'stroke': y_test_pred})
submission_df.insert(loc=0, column='ID', value=range(len(y_test_pred)))

# Write the table to CSV, leaving out the DataFrame index
submission_df.to_csv(path_or_buf='submission.csv', index=False)



In [None]:
# Preview the first rows of the submission table
submission_df.head()