In [None]:
import os

import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import missingno as msn
import seaborn as sns

from sklearn.compose import ColumnTransformer
from sklearn.ensemble import IsolationForest
from sklearn.feature_selection import SelectKBest, chi2, RFE
from sklearn.impute import SimpleImputer
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import (accuracy_score, precision_score, recall_score, f1_score, confusion_matrix, 
                             roc_curve, roc_auc_score, classification_report,precision_recall_curve)
from sklearn.model_selection import train_test_split, KFold, StratifiedKFold, LeaveOneOut, learning_curve
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import StandardScaler, OneHotEncoder

In [None]:
# Location of the raw CSV. Set the H1N1_DATA_PATH environment variable to run
# the notebook on another machine; the fallback is the path the notebook was
# originally written against, so default behaviour is unchanged.
DATA_PATH = os.environ.get(
    "H1N1_DATA_PATH",
    r"C:\Users\Chinna Joka\Downloads\Logistic Regresssion Project\Logistic Regresssion Project\Dataset\h1n1_vaccine_prediction.csv",
)
df = pd.read_csv(DATA_PATH)
df.head()


In [None]:
# (rows, columns) of the raw frame as loaded.
df.shape

In [None]:
# Per-column dtypes; these drive the numeric/categorical split made further down.
df.dtypes

In [None]:
# Column dtypes together with non-null counts and memory usage.
df.info()

In [None]:
# Summary statistics; with default arguments only numeric columns are summarised.
df.describe()

In [None]:
# Share of missing values per column, in percent, worst columns first.
df.isna().mean().mul(100).sort_values(ascending=False)

In [None]:
# Absolute number of missing values per column, worst columns first.
missing_counts = df.isna().sum()
missing_counts.sort_values(ascending=False)


In [None]:
# Bar chart of how complete each column is (missingno nullity bar plot).
msn.bar(df)

In [None]:
# Heatmap of the boolean null mask: one row per record, one column per feature,
# so blocks of jointly-missing values show up as bands.
sns.heatmap(df.isnull() , cbar=True)

In [None]:
# Separate the target column from the feature frame.
Y = df['h1n1_vaccine']
X = df.drop(columns='h1n1_vaccine')

In [None]:
# Remove columns that should not be used as predictors.
# NOTE(review): 'unique_id' is presumably a row identifier and 'has_health_insur'
# is presumably dropped for its missingness — confirm the rationale.
# `columns=` already selects the axis, so the redundant `axis=1` is gone, and the
# frame is reassigned instead of mutated in place. errors='ignore' keeps the cell
# safe to re-run once the columns have been removed.
X = X.drop(columns=['unique_id', 'has_health_insur'], errors='ignore')

In [None]:
# Feature-frame dimensions after the column drop.
print(X.shape)

In [None]:
# Names of the numeric feature columns; reused by the outlier detector and the
# numeric branch of the preprocessing step.
numerical_features = X.select_dtypes(include='number').columns
print(f'numerical columns :  {len(numerical_features)}')
print(numerical_features)
X.loc[:, numerical_features].describe()

In [None]:
# Names of the string/categorical feature columns; reused by the categorical
# branch of the preprocessing step.
categorical_features = X.select_dtypes(include=['object', 'category']).columns
print(f'categorical Columns : {len(categorical_features)}')
print(categorical_features)
X.loc[:, categorical_features].describe()

In [None]:
# Numeric branch: fill gaps with the column median, then standardise.
num_transformer = Pipeline([
    ('num', SimpleImputer(strategy='median')),
    ('scaler', StandardScaler()),
])

In [None]:
# Categorical branch: fill gaps with the most frequent level, then one-hot encode
# with the first level of each feature dropped as the reference category.
# NOTE(review): with drop='first' together with handle_unknown='ignore', a level
# unseen during fit is encoded as all zeros, which is the same encoding as the
# dropped reference level — confirm that this is acceptable.
cat_transformer = Pipeline([
    ('category', SimpleImputer(strategy='most_frequent')),
    ('encoder', OneHotEncoder(drop='first', handle_unknown='ignore')),
])

In [None]:
# Flag multivariate outliers on the numeric columns and drop those rows from X and Y.
#
# The detector is fitted on a median-filled *copy* of the numeric columns. The
# preprocessing pipeline imputes these same columns, so they are expected to hold
# NaN, and IsolationForest raises on NaN input in scikit-learn versions that
# require finite values. X itself keeps its missing values; imputation for the
# model still happens inside the pipeline.
#
# NOTE(review): this cell overwrites X and Y, so running it twice removes a
# further share of rows — re-run from the cell that creates X and Y first.
# NOTE(review): filtering happens before the train/test split, so rows that end
# up in the test set are also screened by a detector fitted on all the data —
# confirm this is intended.
isf = IsolationForest(contamination=0.01, random_state=42)
numeric_filled = X[numerical_features].fillna(X[numerical_features].median())
outliers_pred = isf.fit_predict(numeric_filled)
# fit_predict labels outliers as -1 and inliers as +1; this mask keeps the inliers.
remove_outliers = outliers_pred != -1

X = X[remove_outliers].reset_index(drop=True)
Y = Y[remove_outliers].reset_index(drop=True)

In [None]:
# Route numeric columns through impute + scale and categorical columns through
# impute + one-hot encoding.
preprocessor = ColumnTransformer(
    transformers=[
        ('numeric', num_transformer, numerical_features),
        ('category', cat_transformer, categorical_features),
    ]
)

# Fit once on the full feature frame to inspect the transformed width.
# fit_transform already leaves the transformer fitted, so no second fit is needed.
# This fit is for inspection only: the model pipeline refits the preprocessor on
# the training split when its own fit() is called.
print(preprocessor.fit_transform(X).shape)
preprocessor  # display the fitted transformer as the cell output

In [None]:
# Output column names after preprocessing; requires the preprocessor to be fitted.
preprocessor.get_feature_names_out()

In [None]:
# 80/20 hold-out split, stratified on the target so both parts keep the same
# class balance; seeded for reproducibility.
x_train, x_test, y_train, y_test = train_test_split(
    X,
    Y,
    test_size=0.2,
    stratify=Y,
    random_state=42,
)

In [None]:
# End-to-end model: preprocessing followed by logistic regression, so that
# imputation, scaling and encoding are learned from whatever data fit() receives.
# random_state is pinned because scikit-learn documents it as used by the
# liblinear solver, and the rest of the notebook already seeds with 42.
logistic_regression_model = Pipeline(
    steps=[
        ('preprocessor', preprocessor),
        ('model', LogisticRegression(solver='liblinear', max_iter=1000, random_state=42)),
    ]
)

In [None]:
# Fit the preprocessing steps and the classifier on the training split only.
logistic_regression_model.fit(x_train , y_train)

In [None]:
# Hard class predictions for the held-out rows; echoed as the cell output.
y_pred = logistic_regression_model.predict(x_test)
y_pred

In [None]:
# Column 1 of predict_proba is the probability of the second entry in the
# model's classes_ (the positive class if labels are 0/1 — TODO confirm the
# label encoding).
y_prob = logistic_regression_model.predict_proba(x_test)[:,1]

In [None]:
# Threshold-based scores on the held-out split, all computed from the hard predictions.
for label, scorer in (
    ("Accuracy:", accuracy_score),
    ("Precision:", precision_score),
    ("Recall (Sensitivity):", recall_score),
    ("F1 Score:", f1_score),
):
    print(label, scorer(y_test, y_pred))

# Specificity is the true-negative rate, TN / (TN + FP). Unpacking ravel() into
# four names only works for a 2x2 (binary) confusion matrix.
cm = confusion_matrix(y_test, y_pred)
tn, fp, fn, tp = cm.ravel()
specificity = tn / (fp + tn)
print("Specificity:", specificity)

print("\nClassification Report:\n", classification_report(y_test, y_pred))

In [None]:
# Area under the ROC curve, computed from the predicted probabilities rather
# than the hard labels.
roc_auc = roc_auc_score(y_test, y_prob)
print("ROC-AUC:", roc_auc)


In [None]:
# Precision/recall trade-off across all probability thresholds.
precision, recall, thresholds = precision_recall_curve(y_test, y_prob)

fig, ax = plt.subplots()
ax.plot(recall, precision)
ax.set_xlabel('Recall')
ax.set_ylabel('Precision')
ax.set_title('Precision-Recall Curve')
plt.show()


In [None]:
# ROC curve for the held-out split, with the chance diagonal for reference.
fpr, tpr, thresholds = roc_curve(y_test, y_prob)
auc = roc_auc_score(y_test, y_prob)

fig, ax = plt.subplots(figsize=(6, 5))
ax.plot(fpr, tpr, label=f"ROC Curve (AUC = {auc:.2f})")
ax.plot([0, 1], [0, 1], 'k--')  # performance of a random classifier
ax.set_xlabel("False Positive Rate")
ax.set_ylabel("True Positive Rate (Sensitivity)")
ax.set_title("ROC Curve")
ax.legend()
plt.show()