# Data exploration

In [4]:
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import seaborn as sns
from google.colab import drive
from sklearn.linear_model import LinearRegression
from sklearn.metrics import mean_squared_error, r2_score
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder, StandardScaler

# Mount Google Drive so that files under /content/drive become readable.
drive.mount('/content/drive')

# Location of the raw CSV on the mounted drive.
dataset_path = '/content/drive/MyDrive/Dataset/Supply_Chain_Dataset_New.csv'

# Read the file as latin1 text using pandas' pure-Python parser engine.
data = pd.read_csv(dataset_path, encoding='latin1', engine='python')

# Summarise column names, non-null counts and dtypes of the raw frame.
data.info()

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 180519 entries, 0 to 180518
Data columns (total 48 columns):
 #   Column                         Non-Null Count   Dtype  
---  ------                         --------------   -----  
 0   Type                           180519 non-null  object 
 1   Days for shipping (real)       180519 non-null  int64  
 2   Days for shipment (scheduled)  180519 non-null  int64  
 3   Benefit per order              180519 non-null  float64
 4   Sales per customer             180519 non-null  float64
 5   Delivery Status                180519 non-null  object 
 6   Late_delivery_risk             180519 non-null  int64  
 7   Category Id                    180519 non-null  int64  
 8   Category Name                  180519 non-null  object 
 9   Customer City                  180519 non-null  object 
 10  Customer Country       

# Preprocessing

In [5]:
# Columns that are excluded from the rest of the analysis.
unwanted_columns = [
    "Category Id",
    "Customer Email",
    "Customer Zipcode",
    "Customer Street",
    "Customer Fname",
    "Customer Id",
    "Customer Lname",
    "Customer Segment",
    "Latitude",
    "Longitude",
    "Order Customer Id",
    "order date (DateOrders)",
    "Order Id",
    "Order Item Cardprod Id",
    "Order Item Id",
    "Order Zipcode",
    "Product Card Id",
    "Product Category Id",
    "Product Status",
    "shipping date (DateOrders)",
]
# Rebind instead of dropping in place, and tolerate columns that are already
# gone, so re-running this cell does not fail with a KeyError.
# NOTE: errors="ignore" also hides misspelled names; check the column count in
# the .info() output after the first run.
data = data.drop(columns=unwanted_columns, errors="ignore")

# change object data into int or float

# Take a deep copy: the encoding step assigns into this frame, and a shallow
# copy shares its underlying data with `data`, which must keep the raw values.
hist_data = data.copy(deep=True)

from sklearn.preprocessing import LabelEncoder
def Change_obj_type(data):
    """Label-encode every text column of ``data`` in place and return it.

    Each object-dtype or pandas string-dtype column is replaced by the integer
    codes produced by a freshly fitted ``LabelEncoder``. Columns of any other
    dtype are left untouched. The fitted encoders are not kept, so the codes
    cannot be mapped back to the original labels afterwards.

    Parameters
    ----------
    data : pandas.DataFrame
        Frame to encode. It is modified in place.

    Returns
    -------
    pandas.DataFrame
        The same ``data`` object that was passed in.
    """
    for column in data.columns:
        column_dtype = data[column].dtype
        # Use explicit dtype predicates. Comparing the dtype with
        # `type(object)` (which is simply `type`) only matches object columns
        # as a side effect of NumPy's dtype coercion and never matches the
        # pandas string extension dtype.
        is_text = pd.api.types.is_object_dtype(column_dtype) or isinstance(
            column_dtype, pd.StringDtype
        )
        if is_text:
            data[column] = LabelEncoder().fit_transform(data[column])
    return data
# Change_obj_type assigns the encoded columns into its argument and returns that
# same object, so `new_data` and `hist_data` refer to one and the same frame.
new_data = Change_obj_type(hist_data)
# Show the dtypes after encoding.
new_data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 180519 entries, 0 to 180518
Data columns (total 28 columns):
 #   Column                         Non-Null Count   Dtype  
---  ------                         --------------   -----  
 0   Type                           180519 non-null  int64  
 1   Days for shipping (real)       180519 non-null  int64  
 2   Days for shipment (scheduled)  180519 non-null  int64  
 3   Benefit per order              180519 non-null  float64
 4   Sales per customer             180519 non-null  float64
 5   Delivery Status                180519 non-null  int64  
 6   Late_delivery_risk             180519 non-null  int64  
 7   Category Name                  180519 non-null  int64  
 8   Customer City                  180519 non-null  int64  
 9   Customer Country               180519 non-null  int64  
 10  Customer State                 180519 non-null  int64  
 11  Market                         180519 non-null  int64  
 12  Order City                    

# Model building

In [7]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score, classification_report, confusion_matrix
import statsmodels.api as sm

# Candidate numeric predictors, read from the label-encoded frame `new_data`.
# NOTE(review): 'Days for shipping (real)' looks like a value that is only known
# after delivery. If 'Late_delivery_risk' is derived from real vs. scheduled
# shipping days, these two columns leak the target - confirm before trusting
# any score computed from them.
feature_columns = [
    'Days for shipping (real)',
    'Days for shipment (scheduled)',
    'Benefit per order',
    'Sales per customer',
    'Order Item Discount',
    'Order Item Discount Rate',
    'Order Item Product Price',
    'Order Item Profit Ratio',
    'Order Item Quantity',
    'Sales',
    'Order Item Total',
    'Order Profit Per Order',
    'Product Price',
]
X = new_data[feature_columns]

# Target column to predict.
y = new_data['Late_delivery_risk']


Remove highly correlated features:

In [8]:
# Absolute-correlation cut-off above which two features count as redundant.
corr_threshold = 0.8  # You can adjust the threshold as needed

# Calculate the correlation matrix
correlation_matrix = X.corr()

# Strict upper triangle (k=1 skips the diagonal): entry [j, i] survives only for
# j < i, so every pair is inspected once and a feature is never compared with
# itself.
mask = np.triu(np.ones(correlation_matrix.shape), k=1).astype(bool)
highly_correlated = correlation_matrix.abs() > corr_threshold

# A feature is flagged when it is highly correlated with at least one feature
# that precedes it in X. NaN correlations compare as False and are never flagged.
flagged = np.logical_and(highly_correlated.to_numpy(), mask).any(axis=0)
correlated_features = set(correlation_matrix.columns[flagged])

# Drop highly correlated features (a list in X's column order, not a raw set)
X_no_high_corr = X.drop(
    columns=[name for name in X.columns if name in correlated_features]
)

# Display the remaining features
print("Remaining Features:")
print(X_no_high_corr.columns)

Remaining Features:
Index(['Days for shipping (real)', 'Days for shipment (scheduled)',
       'Benefit per order', 'Sales per customer', 'Order Item Discount',
       'Order Item Discount Rate', 'Order Item Product Price',
       'Order Item Quantity'],
      dtype='object')


Remove multicollinearity:

In [9]:
from statsmodels.stats.outliers_influence import variance_inflation_factor

# Variance inflation factor (VIF) of every remaining feature, used to check for
# multicollinearity among them.
# NOTE(review): the matrix is passed to variance_inflation_factor without an
# intercept column - confirm against the statsmodels documentation whether a
# constant should be added first, since that changes the reported values.
design_matrix = X_no_high_corr.values
vif_data = pd.DataFrame(
    {
        "Variable": X_no_high_corr.columns,
        "VIF": [
            variance_inflation_factor(design_matrix, position)
            for position in range(design_matrix.shape[1])
        ],
    }
)

# Display the VIF values
print(vif_data)

                        Variable        VIF
0       Days for shipping (real)   7.179267
1  Days for shipment (scheduled)   7.102015
2              Benefit per order   1.063257
3             Sales per customer  41.211407
4            Order Item Discount   6.342249
5       Order Item Discount Rate   7.213358
6       Order Item Product Price  27.330079
7            Order Item Quantity  14.728651


In [14]:
# Names of the features whose VIF exceeds the cut-off.
# NOTE(review): 40 is much higher than commonly quoted VIF cut-offs (around
# 5-10) - confirm that this value is intended.
high_vif_features = vif_data.loc[vif_data["VIF"] > 40, "Variable"]

# Remove those features from the design matrix.
X_filtered = X_no_high_corr.drop(columns=high_vif_features)

# Show the columns that remain.
X_filtered.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 180519 entries, 0 to 180518
Data columns (total 7 columns):
 #   Column                         Non-Null Count   Dtype  
---  ------                         --------------   -----  
 0   Days for shipping (real)       180519 non-null  int64  
 1   Days for shipment (scheduled)  180519 non-null  int64  
 2   Benefit per order              180519 non-null  float64
 3   Order Item Discount            180519 non-null  float64
 4   Order Item Discount Rate       180519 non-null  float64
 5   Order Item Product Price       180519 non-null  float64
 6   Order Item Quantity            180519 non-null  int64  
dtypes: float64(4), int64(3)
memory usage: 9.6 MB


**Model training:**

In [15]:
# Hold out 20% of the rows for evaluation; the fixed seed makes the split repeatable.
X_train, X_test, y_train, y_test = train_test_split(
    X_filtered,
    y,
    test_size=0.2,
    random_state=42,
)

# Fit the scaler on the training rows only, then apply the same scaling to the
# test rows so no test-set statistics enter the fit.
scaler = StandardScaler()
X_train_scaled = scaler.fit_transform(X_train)
X_test_scaled = scaler.transform(X_test)

# Regularised logistic regression. In scikit-learn C is the inverse of the
# regularisation strength, so C=0.001 is a strong penalty; other values can be tried.
model = LogisticRegression(random_state=42, C=0.001).fit(X_train_scaled, y_train)

# Predict the held-out rows and score the predictions.
y_pred = model.predict(X_test_scaled)
accuracy = accuracy_score(y_test, y_pred)
conf_matrix = confusion_matrix(y_test, y_pred)
classification_rep = classification_report(y_test, y_pred)

# Report the evaluation metrics.
print(f'Accuracy: {accuracy}')
print(f'Confusion Matrix:\n{conf_matrix}')
print(f'Classification Report:\n{classification_rep}')

# McFadden's pseudo R-squared from statsmodels.
# NOTE: this is a separate Logit fit on the scaled training data with an added
# intercept column; it is not the scikit-learn `model` fitted above, so the
# value describes the statsmodels fit.
X_train_sm = sm.add_constant(X_train_scaled)
logit_model = sm.Logit(y_train, X_train_sm)
result = logit_model.fit()
print(f"McFadden's R-squared: {result.prsquared}")

Accuracy: 0.9755151783735874
Confusion Matrix:
[[15508   884]
 [    0 19712]]
Classification Report:
              precision    recall  f1-score   support

           0       1.00      0.95      0.97     16392
           1       0.96      1.00      0.98     19712

    accuracy                           0.98     36104
   macro avg       0.98      0.97      0.98     36104
weighted avg       0.98      0.98      0.98     36104

Optimization terminated successfully.
         Current function value: 0.168615
         Iterations 9
McFadden's R-squared: 0.7550489230799637


McFadden's R-squared: 0.755. A value this high indicates a very strong fit, so the model is good for this binary (late vs. not late) classification.