# Logistic Regression

We studied the data in the previous notebook. Now we will build a model to predict the `class` label (`e` or `p`) of each mushroom from its features. We will use a logistic regression model for this purpose.

In [1]:
# Import Libraries
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.impute import SimpleImputer
from IPython import get_ipython
import os

In [2]:
# Show every column of the wide feature table, but cap the number of displayed
# rows: with millions of rows, an accidental display of a full frame would
# otherwise flood (or freeze) the notebook.
pd.set_option('display.max_columns', None)
pd.set_option('display.max_rows', 100)

In [3]:
# Load the training data; the file already carries a `kfold` column that the
# split cell further down relies on.
data = pd.read_csv(filepath_or_buffer='../input/train_folds.csv')
data.head(5)

Unnamed: 0,id,class,cap-diameter,cap-shape,cap-surface,cap-color,does-bruise-or-bleed,gill-attachment,gill-spacing,gill-color,stem-height,stem-width,stem-root,stem-surface,stem-color,veil-type,veil-color,has-ring,ring-type,spore-print-color,habitat,season,kfold
0,654337,p,13.16,x,k,w,t,d,d,n,6.93,29.49,,,w,,,f,f,,d,a,0
1,1196571,p,7.18,x,y,n,f,s,c,y,8.4,18.35,,y,y,,,f,f,,d,a,0
2,2225235,p,2.5,x,d,w,f,a,d,p,3.32,2.4,,h,w,,,f,f,,g,u,0
3,1283237,e,12.12,p,y,w,f,e,c,k,24.35,13.15,,s,w,,,t,e,,g,w,0
4,2232119,e,8.15,x,s,u,f,a,c,w,4.25,13.83,,,w,,,f,f,,d,a,0


In [4]:
# Work on an independent (deep) copy so the raw frame in `data` stays untouched
df = data.copy(deep=True)

In [5]:
# Hold out fold 0 for validation and train on every other fold
holdout = df[df.kfold == 0].reset_index(drop=True)
df_train = df[df.kfold != 0].reset_index(drop=True)

# The fold marker and the row id are bookkeeping columns, not features
df_train = df_train.drop(columns=['kfold', 'id'])

# Validation target and features
y_valid = holdout['class']
X_valid = holdout.drop(columns=['kfold', 'class', 'id'])

# Training target and features
y_train = df_train['class']
X_train = df_train.drop(columns=['class'])

# Print the shape of the training set (target included) and the validation features
print(df_train.shape, X_valid.shape)

(2493556, 21) (623389, 20)


## Preprocessing

1. Drop the columns which have a high number of missing values (note: the cells below do not currently drop any columns; every column is imputed instead).
2. Identify the categorical and numerical columns.
3. Fill the missing values in the numerical columns with the median value.
4. Fill the missing values in the categorical columns with the mode value.
5. Group the rare values of the categorical columns into a single category, so that columns with many unique values stay manageable.
6. Encode the categorical columns.
7. Scale the numerical columns.


What we are not doing in this notebook but can be done:
1. Feature engineering
2. Using advanced methods to fill the missing values
3. Using advanced methods to encode the categorical columns
4. Using advanced methods to scale the numerical columns

We will do all of this after building the baseline model, if we are not satisfied with its results.


In [6]:
def get_missing_values_summary(dataframe):
    """
    Summarise the missing values of a dataframe, one row per affected column.

    Parameters:
    dataframe (pd.DataFrame): The input dataframe to analyze.

    Returns:
    pd.DataFrame: Indexed by column name, with the columns 'Missing Count',
                  'Missing Percentage' (rounded to 2 decimals) and 'Data Type'.
                  Columns without any missing value are left out.
    """
    # Scan the frame for nulls only once and reuse the counts for both statistics
    missing_count = dataframe.isnull().sum()

    missing_values_summary = pd.DataFrame({
        'Missing Count': missing_count,
        'Missing Percentage': (missing_count / len(dataframe) * 100).round(2),
        'Data Type': dataframe.dtypes
    })

    # Keep only the columns that actually contain missing values
    return missing_values_summary[missing_values_summary['Missing Count'] > 0]


# Summarise the missing values of the training split; as the last expression of
# the cell, the resulting table is rendered as the cell output.
get_missing_values_summary(df_train)

Unnamed: 0,Missing Count,Missing Percentage,Data Type
cap-diameter,4,0.0,float64
cap-shape,32,0.0,object
cap-surface,536861,21.53,object
cap-color,8,0.0,object
does-bruise-or-bleed,5,0.0,object
gill-attachment,419243,16.81,object
gill-spacing,1006647,40.37,object
gill-color,49,0.0,object
stem-root,2205990,88.47,object
stem-surface,1585044,63.57,object


In [7]:
# Columns stored with the `object` dtype are treated as categorical;
# every remaining column is treated as numerical.
CATEGORICAL_DTYPE = 'object'
cat_cols = X_train.select_dtypes(include=CATEGORICAL_DTYPE).columns
num_cols = X_train.select_dtypes(exclude=CATEGORICAL_DTYPE).columns

In [8]:
# Impute each column group with its own strategy: median for the numerical
# columns, then the mode (most frequent value) for the categorical columns.
# The imputer is always fitted on the training split only and then applied
# to the validation split.
for columns, strategy in ((num_cols, 'median'), (cat_cols, 'most_frequent')):
    imputer = SimpleImputer(strategy=strategy)
    X_train[columns] = imputer.fit_transform(X_train[columns])
    X_valid[columns] = imputer.transform(X_valid[columns])

In [9]:
from sklearn.base import BaseEstimator, TransformerMixin
from sklearn.utils.validation import check_is_fitted


class RareCategoryReplacer(BaseEstimator, TransformerMixin):
    """Collapse infrequent and unseen categories into one placeholder value.

    During ``fit`` the relative frequency of every value is computed for each
    configured column. Values whose share is below ``proportion_threshold``
    are considered rare. During ``transform`` every value that is not one of
    the frequent ("important") categories learned in ``fit`` -- i.e. rare
    values as well as values never seen during ``fit`` -- is replaced by
    ``replacement_value``.

    Parameters:
    columns (iterable of str): Names of the columns to process.
    proportion_threshold (float): Minimum share (0-1) a category needs in the
        fitted data to be kept as is.
    replacement_value (str): Value written in place of rare/unseen categories.

    Attributes (available after ``fit``):
    rare_categories_ (dict): column name -> list of categories below the threshold.
    important_categories_ (dict): column name -> list of categories at or above it.
    """

    def __init__(self, columns, proportion_threshold=0.02, replacement_value="Others"):
        # Only hyper-parameters are stored here. Learned state (trailing
        # underscore) is created in fit(), so an unfitted instance is not
        # mistaken for a fitted one.
        self.columns = columns
        self.proportion_threshold = proportion_threshold
        self.replacement_value = replacement_value

    def fit(self, X, y=None):
        """Learn the rare and important categories of each configured column.

        Parameters:
        X (pd.DataFrame): Data containing every column listed in ``columns``.
        y: Ignored; accepted for scikit-learn API compatibility.

        Returns:
        RareCategoryReplacer: The fitted instance.
        """
        # Build fresh dicts on every call so refitting never keeps stale columns
        rare_categories = {}
        important_categories = {}

        for column in self.columns:
            # Share of each category among the non-missing values of the column
            category_percentages = X[column].value_counts(normalize=True)
            is_rare = category_percentages < self.proportion_threshold

            rare_categories[column] = category_percentages[is_rare].index.tolist()
            important_categories[column] = category_percentages[~is_rare].index.tolist()

        self.rare_categories_ = rare_categories
        self.important_categories_ = important_categories
        return self

    def transform(self, X):
        """Return a copy of ``X`` with rare/unseen categories replaced.

        Parameters:
        X (pd.DataFrame): Data containing every column listed in ``columns``.

        Returns:
        pd.DataFrame: Copy of ``X``; the input frame itself is not modified.

        Raises:
        sklearn.exceptions.NotFittedError: If called before ``fit``.
        """
        check_is_fitted(self, ["rare_categories_", "important_categories_"])

        X = X.copy()  # Work on a copy to avoid modifying the caller's data

        for column in self.columns:
            # Anything that is neither an important category nor already the
            # placeholder is replaced. This single pass covers both the rare
            # categories learned in fit() and categories never seen in fit().
            allowed_categories = self.important_categories_[column] + [self.replacement_value]
            X[column] = np.where(
                X[column].isin(allowed_categories),
                X[column],
                self.replacement_value
            )

        return X

    def fit_transform(self, X, y=None):
        """Fit on ``X`` and return the transformed copy of ``X``."""
        return self.fit(X, y).transform(X)

In [10]:
# Learn the rare categories on the training split only, then apply the same
# mapping to both splits.
replacer = RareCategoryReplacer(columns=cat_cols, proportion_threshold=0.02).fit(X_train)
X_train = replacer.transform(X_train)
X_valid = replacer.transform(X_valid)

In [11]:
# Encode the categorical columns using one-hot encoding
from sklearn.preprocessing import OneHotEncoder

# Dense output is needed to build the DataFrames below. The encoder is fitted
# on the training split only and then applied to the validation split.
encoder = OneHotEncoder(sparse_output=False, handle_unknown='ignore')
X_train_encoded = encoder.fit_transform(X_train[cat_cols])
X_valid_encoded = encoder.transform(X_valid[cat_cols])

# Create DataFrames with the encoded columns. Reusing the index of the source
# frames guarantees that the column-wise concat below aligns row by row, even
# if the frames do not carry a default 0..n-1 index.
encoded_cols = encoder.get_feature_names_out(cat_cols)
X_train_encoded = pd.DataFrame(X_train_encoded, columns=encoded_cols, index=X_train.index)
X_valid_encoded = pd.DataFrame(X_valid_encoded, columns=encoded_cols, index=X_valid.index)

# Drop the original categorical columns from the training and validation sets
X_train = X_train.drop(cat_cols, axis=1)
X_valid = X_valid.drop(cat_cols, axis=1)

# Concatenate the numerical and encoded categorical columns
X_train = pd.concat([X_train, X_train_encoded], axis=1)
X_valid = pd.concat([X_valid, X_valid_encoded], axis=1)

In [12]:
# Standardise the numerical columns; the scaler is fitted on the training
# split only and then applied to both splits.
from sklearn.preprocessing import StandardScaler

scaler = StandardScaler().fit(X_train[num_cols])
X_train[num_cols] = scaler.transform(X_train[num_cols])
X_valid[num_cols] = scaler.transform(X_valid[num_cols])

In [14]:
# Fit the logistic regression baseline on the training split and predict the
# validation split.
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import matthews_corrcoef


model = LogisticRegression(max_iter=1000, random_state=0).fit(X_train, y_train)
y_pred = model.predict(X_valid)



In [15]:
# Score the validation predictions with the Matthews correlation coefficient
mcc = matthews_corrcoef(y_true=y_valid, y_pred=y_pred)
print(f'Matthews Correlation Coefficient: {mcc}')

Matthews Correlation Coefficient: 0.6593098863497108


In [16]:
from sklearn.metrics import classification_report, accuracy_score

# Per-class precision / recall / F1 on the validation split, followed by the
# overall accuracy
report = classification_report(y_true=y_valid, y_pred=y_pred)
accuracy = accuracy_score(y_true=y_valid, y_pred=y_pred)
print(report)
print(f'Accuracy: {accuracy}')

              precision    recall  f1-score   support

           e       0.80      0.83      0.82    282309
           p       0.85      0.83      0.84    341080

    accuracy                           0.83    623389
   macro avg       0.83      0.83      0.83    623389
weighted avg       0.83      0.83      0.83    623389

Accuracy: 0.8306434665995069
