# Logistic Regression

### 1. Import Libraries

In [1]:
from sklearn.linear_model import LogisticRegression                    # Import Logistic Regression
from sklearn.preprocessing import LabelEncoder                         # Encoding categorical data
from sklearn.model_selection import train_test_split                   # Splitting dataset into training and testing sets
from sklearn.metrics import classification_report, confusion_matrix    # Evaluating the performance of the classifier
from sklearn.pipeline import Pipeline, make_pipeline
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import RepeatedStratifiedKFold
from sklearn.model_selection import GridSearchCV

# More performance metrics
from sklearn import metrics
from sklearn.metrics import accuracy_score
from sklearn.metrics import precision_score
from sklearn.metrics import recall_score
from sklearn.metrics import f1_score
from sklearn.metrics import roc_auc_score
from sklearn.metrics import roc_curve

import pandas as pd
import numpy as np
import matplotlib.pyplot as plt

### 2. Import Data

```python 
# `file` is a placeholder for the dataset path; run only the line that matches its format.
df = pd.read_excel(file) # if the data is an Excel workbook
df = pd.read_csv(file)   # if the data is a CSV file
```

### 3. Data Cleaning

#### 3.1 Basic Overview

```python 
# Print a concise summary of the DataFrame: column names, non-null counts and dtypes
df.info()
```

#### 3.2 Datatypes

#### 3.3 Missing values (Nulls)
```python 
# Pick the ONE strategy that suits the data (these are alternatives, not a sequence).
# dropna()/fillna() return new objects, so the result must be assigned back
# for the change to take effect.
df = df.dropna()                          # Remove rows that contain nulls
df = df.dropna(axis=1)                    # Remove columns that contain nulls
df[col] = df[col].fillna(df[col].mean())  # Fill nulls in column `col` with its mean
```

#### 3.4 Outliers

```python 
df.describe()   # Descriptive statistics (use min/max/quartiles to spot outliers)
# Keep only the rows whose `column` exceeds `value`; `column` and `value` are
# placeholders - adapt the comparison to the outlier rule being applied.
filtered_df = df[df[column] > value]
```

#### 3.5 Duplicates

```python
# drop_duplicates() must be called, and it returns a new DataFrame,
# so assign the result back to actually remove the duplicate rows.
df = df.drop_duplicates()
```

### 4. EDA

- **Drop nominal variables:** `new_df = df.drop([list of columns], axis=1)`

#### Convert Categorical data to numerical

```python

# Placeholders used below: `col` is the source column name, `updated_col` the
# name of the encoded column, `list_of_columns` the columns to one-hot encode.

# Names of the columns stored with dtype "object" (typically text categories)
cat_features = df.select_dtypes(include="object").columns

from sklearn.preprocessing import LabelEncoder, OrdinalEncoder

# Categories to Numbers (automatic mapping: classes are sorted, then numbered 0..n-1)
le = LabelEncoder()
df[updated_col] = le.fit_transform(df[col])
mapping = dict(zip(le.classes_, le.transform(le.classes_)))  # category -> code

# Categories to Numbers (manual mapping)
order = [['S', 'M', 'L', 'XL']]  # one inner list per encoded column, lowest to highest
ord_enc = OrdinalEncoder(categories=order)
df[updated_col] = ord_enc.fit_transform(df[[col]])  # encoder expects 2-D input
mapping = {cat: i for i, cat in enumerate(order[0])}  # category -> code

# Create new columns for each category (one-hot encoding)
new_df = pd.get_dummies(df, columns=list_of_columns)
```

#### Correlation

```python
import seaborn as sns  # needed for the heatmap; not among the imports in section 1

# Correlate numeric/boolean columns only: recent pandas versions raise an
# error when corr() meets non-numeric (e.g. string) columns.
corr = df.select_dtypes(include=["number", "bool"]).corr()
sns.heatmap(corr, annot=True, fmt=".2f", cmap="coolwarm", center=0)
plt.title("Correlation Heatmap")
plt.show()
```

- **Correlation**: which predictors are correlated with target?
- **Multicollinearity**: which predictors are correlated among themselves?


### 5. Create input & output

```python
# `target` is a placeholder for the name of the label column.
X = df.drop(target, axis=1)  # predictors: every column except the target
# Select the target as a 1-D Series; a one-column DataFrame (df[[target]])
# makes scikit-learn emit a DataConversionWarning when fitting.
y = df[target]
# Hold out 30% of the rows for testing; the fixed seed keeps the split reproducible.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=0)
```

### 6. Develop the Logistic Regression Model

```python
# Build the classifier and train it in one step (fit() returns the estimator);
# max_iter=1000 raises the solver's iteration limit.
logclf = LogisticRegression(max_iter=1000).fit(X_train, y_train)

# Predicted class labels for the held-out test set
y_pred = logclf.predict(X_test)
```

### 7. Evaluate the Model

```python
# Confusion matrix of true vs. predicted labels (y_pred comes from section 6)
cm = confusion_matrix(y_test, y_pred)
cm_display = metrics.ConfusionMatrixDisplay(confusion_matrix=cm)
cm_display.plot()
plt.show()

print('Test accuracy = ', accuracy_score(y_test, y_pred))

# Coefficients
# NOTE: flatten() lines up with the feature names only for a binary problem,
# where coef_ has a single row; a multiclass model has one row per class.
coeff = logclf.coef_.flatten()
feature_importance = pd.DataFrame({
    "Feature": X_train.columns,
    "Coefficients": coeff
}).sort_values("Coefficients", ascending=False)
feature_importance
```

### 8. Further Evaluation

```python
# Get predicted probabilities
# Get predicted probabilities of the positive class.
# predict() returns hard labels; ROC analysis needs scores, so use
# predict_proba() and keep column 1 (assumes a binary target).
y_pred_prob = logclf.predict_proba(X_test)[:, 1]

# Compute ROC AUC
logit_roc_auc = roc_auc_score(y_test, y_pred_prob)

# Compute ROC curve
fpr, tpr, thresholds = roc_curve(y_test, y_pred_prob)

# Plot ROC curve
plt.figure()
plt.plot(fpr, tpr, label=f'Logistic Regression (area = {logit_roc_auc:.2f})')
plt.plot([0, 1], [0, 1], 'r--')  # diagonal = performance of a random classifier
plt.xlim([0.0, 1.0])
plt.ylim([0.0, 1.05])
plt.xlabel('False Positive Rate')
plt.ylabel('True Positive Rate')
plt.title('Receiver Operating Characteristic (ROC)')
plt.legend(loc="lower right")
plt.savefig('Logit_ROC')  # save before show(), which may clear the figure
plt.show()
```