### Feature Addition

In [None]:
#Feature Addition

#import libraries
import pandas as pd
import numpy as np
import holidays

# Read the date table; the two columns used below are
# 'Delivery_date' and 'Estimated_Receive_date'.
date = pd.read_csv('data/date.csv')

# Parse both date columns from month/day/year text into datetime values.
date = date.assign(
    Delivery_date=pd.to_datetime(date['Delivery_date'], format='%m/%d/%Y'),
    Estimated_Receive_date=pd.to_datetime(date['Estimated_Receive_date'], format='%m/%d/%Y'),
)

# Vietnamese national holiday calendar for 2020, used by count_holidays().
vn_holidays = holidays.Vietnam(years=2020)

#function to count weekends and weekdays between two dates
def count_weekend_weekday(start_date, end_date):
    """Count weekend days and weekdays in the inclusive range of dates.

    Args:
        start_date: First day of the range (anything ``pd.date_range`` accepts).
        end_date: Last day of the range, included in the count.

    Returns:
        tuple: ``(weekend_count, weekday_count)``. Saturday and Sunday
        (``weekday() >= 5``) are the weekend days.
    """
    days = pd.date_range(start=start_date, end=end_date)
    weekend_total = sum(1 for day in days if day.weekday() >= 5)
    # Every day that is not a weekend day is a weekday.
    return weekend_total, len(days) - weekend_total

#function to count the number of holidays and non-holidays in the given period
def count_holidays(start_date, end_date):
    """Count holiday and non-holiday days in the inclusive range of dates.

    A day is a holiday when it is a member of the module-level
    ``vn_holidays`` calendar.

    Args:
        start_date: First day of the range.
        end_date: Last day of the range, included in the count.

    Returns:
        tuple: ``(holiday_count, non_holiday_count)``.
    """
    days = pd.date_range(start=start_date, end=end_date)
    holiday_total = sum(1 for day in days if day in vn_holidays)
    # Every day that is not in the holiday calendar counts as non-holiday.
    return holiday_total, len(days) - holiday_total

# Function to calculate the Covid period (0: No Covid, 1: Covid, 2: Both)
def covid_period(start_date, end_date):
    """Classify a date range against two fixed 2020 Covid milestone dates.

    The milestones are 2020-01-23 and 2020-04-25.

    Args:
        start_date: First day of the range (a pandas/datetime timestamp).
        end_date: Last day of the range (a pandas/datetime timestamp).

    Returns:
        int: ``2`` when the range starts on or before the first milestone
        and ends on or after the second one; ``1`` when at least one
        milestone falls inside the range (bounds included) without the
        range spanning both; ``0`` otherwise.

    NOTE(review): a range lying strictly between the two milestones, or
    entirely after the second one, yields 0. Confirm this matches the
    intended "0: No Covid, 1: Covid, 2: Both" labelling.
    """
    period_1_start = pd.Timestamp(year=2020, month=1, day=23)
    period_2_start = pd.Timestamp(year=2020, month=4, day=25)

    # Checked first because spanning both milestones takes precedence
    # over merely containing one of them.
    if start_date <= period_1_start and end_date >= period_2_start:
        return 2
    if start_date <= period_1_start <= end_date or start_date <= period_2_start <= end_date:
        return 1
    return 0

# Pair up each row's delivery date with its estimated receive date once,
# then derive every feature from those pairs.
date_pairs = list(zip(date['Delivery_date'], date['Estimated_Receive_date']))

# Weekend / weekday counts per row.
date['Weekend_Count'], date['Weekday_Count'] = zip(
    *(count_weekend_weekday(sent, due) for sent, due in date_pairs)
)

# Holiday / non-holiday counts per row.
date['Holiday_Count'], date['Non_Holiday_Count'] = zip(
    *(count_holidays(sent, due) for sent, due in date_pairs)
)

# Covid status code per row (see covid_period for the meaning of 0/1/2).
date['Covid'] = [covid_period(sent, due) for sent, due in date_pairs]

# Persist the enriched table without the index column.
date.to_csv('updated_date_with_features.csv', index=False)

# Preview the first rows with the new feature columns.
print(date.head())


  Delivery_date Estimated_Receive_date  Weekend_Count  Weekday_Count  \
0    2020-01-04             2020-01-15              4              8   
1    2020-01-04             2020-01-11              3              5   
2    2020-01-09             2020-01-16              2              6   
3    2020-01-13             2020-01-19              2              5   
4    2020-01-15             2020-01-22              2              6   

   Holiday_Count  Non_Holiday_Count  Covid  
0              0                 12      0  
1              0                  8      0  
2              0                  8      0  
3              0                  7      0  
4              0                  8      0  


### Feature Selection

In [32]:
import pandas as pd
# Load the two tables used by the feature-selection cells: the
# preprocessed dataset and the balanced synthetic dataset (scenario 3).
feature_df = pd.read_csv("data/preprocessed_df_scenario3.csv")
synthetic_data = pd.read_csv("data/balanced_synthetic_data_scenario3.csv")

In [None]:
import pandas as pd
from sklearn.ensemble import RandomForestClassifier
import matplotlib.pyplot as plt
import seaborn as sns
# Split feature_df into the feature matrix and the 'Label' target.
# To inspect importances on the synthetic data instead, build X and y
# from synthetic_data.
y = feature_df['Label']
X = feature_df.drop(columns=['Label'])

# Fit a random forest of 100 trees; the fixed random_state keeps the
# result the same from run to run.
rf_model = RandomForestClassifier(n_estimators=100, random_state=42).fit(X, y)

# Per-feature importance scores reported by the fitted forest.
importances = rf_model.feature_importances_

# Tabulate feature names against their scores, most important first.
feature_importance_df = pd.DataFrame(
    {'Feature': X.columns, 'Importance': importances}
).sort_values(by='Importance', ascending=False)

# Horizontal bar chart of the ranked importances.
plt.figure(figsize=(10, 6))
sns.barplot(data=feature_importance_df, x='Importance', y='Feature')
plt.title('Feature Importance of Real Data using Random Forest')
plt.show()


# Print the ranked table as text as well.
print(feature_importance_df)


In [None]:
#MUTUAL INFO

#import libraries
from sklearn.feature_selection import mutual_info_classif
import pandas as pd

# Mutual information (MI) feature selection.
# synthetic_data and feature_df must already be loaded; both need a
# 'Label' column. MI between every feature and the label is computed
# separately on each dataset, and the features informative in both are kept.
n_top_features = 10

X_synthetic = synthetic_data.drop(columns=['Label'])
y_synthetic = synthetic_data['Label']

X_real = feature_df.drop(columns=['Label'])
y_real = feature_df['Label']

# The MI estimator is randomized; a fixed seed makes the ranking
# reproducible between runs.
mi_synthetic = mutual_info_classif(X_synthetic, y_synthetic, random_state=42)
mi_real = mutual_info_classif(X_real, y_real, random_state=42)

# One row per feature with its MI score, for each dataset.
mi_synthetic_df = pd.DataFrame({'Feature': X_synthetic.columns, 'MI_synthetic': mi_synthetic})
mi_real_df = pd.DataFrame({'Feature': X_real.columns, 'MI_real': mi_real})

# Keep only features present in both datasets (inner join on the name).
mi_df = pd.merge(mi_synthetic_df, mi_real_df, on='Feature')
mi_df['Average_MI'] = (mi_df['MI_synthetic'] + mi_df['MI_real']) / 2

# Full ranking by average MI, computed BEFORE zero-MI features are
# dropped so it can serve as the fallback pool below.
mi_df_sorted = mi_df.sort_values(by='Average_MI', ascending=False)

# Primary candidates: MI strictly positive in both datasets, ranked by
# synthetic MI and then real MI (higher MI means more informative).
mi_df = mi_df[(mi_df['MI_synthetic'] > 0) & (mi_df['MI_real'] > 0)]
mi_df = mi_df.sort_values(by=['MI_synthetic', 'MI_real'], ascending=False)
mi_df_non_zero = mi_df

# Take up to n_top_features primary candidates; if there are fewer, top
# up with the best remaining features by average MI.
top_features = mi_df_non_zero.head(n_top_features)
if len(top_features) < n_top_features:
    remaining_features = mi_df_sorted[~mi_df_sorted['Feature'].isin(top_features['Feature'])]
    top_features = pd.concat([top_features, remaining_features.head(n_top_features - len(top_features))])

# Display the selected features and save the same three columns to file.
print("Top 10 Features based on Mutual Information (MI) Scores in both datasets:")
print(top_features[['Feature', 'MI_synthetic', 'MI_real']].to_string(index=False))
top_features[['Feature', 'MI_synthetic', 'MI_real']].to_csv("data/feature_selection.csv", index=False)



Top 10 Features based on Mutual Information (MI) Scores in both datasets:
                Feature  MI_synthetic  MI_real
           Container_20      0.041351 0.075049
          Holiday_Count      0.024054 0.019407
     Delivery_lead_time      0.022952 0.143644
    Deli_lead_time_late      0.017679 0.091875
                  Korea      0.014170 0.095455
          Weekend_Count      0.012888 0.060200
Deli_date_lateness_late      0.011210 0.144285
           Confirm_late      0.010309 0.019601
          Weekday_Count      0.010208 0.036871
    Estimated_deli_time      0.005940 0.081493


### Use SMOTE to handle class imbalance: oversample so that classes "0" and "1" have equal counts

In [None]:
# Load the scenario-1 synthetic dataset into synthetic_data.
# NOTE(review): the synthetic-data SMOTE cell reads this same file again,
# so this cell looks redundant - confirm before removing.
synthetic_data = pd.read_csv("data/synthetic_data_scenario1.csv")

#### For synthetic data

In [None]:
#SMOTE 

#import library
from imblearn.over_sampling import SMOTE
# Read the dataset to balance (point this path at the synthetic or the
# real data file as needed).
synthetic_data = pd.read_csv("data/synthetic_data_scenario1.csv")

# Separate the 'Label' target from the feature columns.
y = synthetic_data['Label']
X = synthetic_data.drop(columns=['Label'])

# Resample with SMOTE; the fixed random_state keeps runs repeatable.
smote = SMOTE(random_state=42)
X_resampled, y_resampled = smote.fit_resample(X, y)

# Reassemble features and target into a single DataFrame.
balanced_synthetic_data = pd.DataFrame(X_resampled, columns=X.columns)
balanced_synthetic_data['Label'] = y_resampled

# Show the class counts after resampling.
print(balanced_synthetic_data['Label'].value_counts())

# Export the balanced dataset.
balanced_synthetic_data.to_csv("data/balanced_synthetic_data_scenario2.csv", index=False)

#### For real data

In [None]:
from imblearn.over_sampling import SMOTE

# Separate the 'Label' target from the feature columns of feature_df.
y = feature_df['Label']
X = feature_df.drop(columns=['Label'])

# Resample with SMOTE; the fixed random_state keeps runs repeatable.
smote = SMOTE(random_state=42)
X_resampled, y_resampled = smote.fit_resample(X, y)

# Reassemble features and target into a single DataFrame.
balanced_feature_df = pd.DataFrame(X_resampled, columns=X.columns)
balanced_feature_df['Label'] = y_resampled

# Show the class counts of balanced_feature_df after resampling.
print(balanced_feature_df['Label'].value_counts())