In [1]:
import numpy as np
import matplotlib.pyplot as plt
import pandas as pd
import statsmodels.formula.api as smf
import statsmodels.api as sm
from sklearn.metrics import roc_auc_score, roc_curve,auc
from datetime import datetime
import ast
from sklearn.utils import shuffle
import matplotlib.pyplot as plt
import seaborn as sns; sns.set(style='whitegrid')
from sklearn.ensemble import RandomForestClassifier
from sklearn.preprocessing import OneHotEncoder
from sklearn.impute import SimpleImputer
from sklearn.ensemble import HistGradientBoostingClassifier

In [2]:
# read Excel data and combine into one data structure
data = pd.read_excel("Handbag Data_KP5.xlsx")

In [3]:
# Make sure 'Month' column exists
data['Paid at'] = pd.to_datetime(data['Paid at'], errors='coerce')  # Handle errors just in case
data['Month'] = data['Paid at'].dt.month

# Convert 'Month' to string to make sure we get separate columns for each month
data['Month'] = data['Month'].astype(str)

# Generate dummy variables for each month
month_dummies = pd.get_dummies(data['Month'], prefix='Month')

# Concatenate the dummy variables with the original DataFrame
data = pd.concat([data, month_dummies], axis=1)

# Now 'data' contains a set of new columns: 'Month_1', 'Month_2', ..., 'Month_12',
# each indicating the presence of a month with 0 or 1.

# Optionally, you can check the result
print(data.head())

      Name Financial Status  Expired  Paid  Partially_Paid  \
0  #136493             paid        0     1               0   
1  #136491             paid        0     1               0   
2  #136490         refunded        0     0               0   
3  #136487             paid        0     1               0   
4  #136486             paid        0     1               0   

   Partially_Refunded  Pending  Refunded  Voided  Successful_Order  ...  \
0                   0        0         0       0                 1  ...   
1                   0        0         0       0                 1  ...   
2                   0        0         1       0                 0  ...   
3                   0        0         0       0                 1  ...   
4                   0        0         0       0                 1  ...   

  Month_12.0  Month_2.0  Month_3.0  Month_4.0  Month_5.0  Month_6.0  \
0      False       True      False      False      False      False   
1      False      False      False

In [4]:
# Define the state to region mapping
state_to_region = {
    'AL': 'South', 'AK': 'West', 'AZ': 'West', 'AR': 'South', 'CA': 'West',
    'CO': 'West', 'CT': 'East', 'DE': 'East', 'FL': 'South', 'GA': 'South',
    'HI': 'West', 'ID': 'West', 'IL': 'Midwest', 'IN': 'Midwest', 'IA': 'Midwest',
    'KS': 'Midwest', 'KY': 'South', 'LA': 'South', 'ME': 'East', 'MD': 'East',
    'MA': 'East', 'MI': 'Midwest', 'MN': 'Midwest', 'MS': 'South', 'MO': 'Midwest',
    'MT': 'West', 'NE': 'Midwest', 'NV': 'West', 'NH': 'East', 'NJ': 'East',
    'NM': 'West', 'NY': 'East', 'NC': 'South', 'ND': 'Midwest', 'OH': 'Midwest',
    'OK': 'South', 'OR': 'West', 'PA': 'East', 'RI': 'East', 'SC': 'South',
    'SD': 'Midwest', 'TN': 'South', 'TX': 'South', 'UT': 'West', 'VT': 'East',
    'VA': 'South', 'WA': 'West', 'WV': 'South', 'WI': 'Midwest', 'WY': 'West',
    'DC': 'East', 'PR': 'South'
}

In [5]:
df = pd.DataFrame(data)

# Function to map state to region
def map_state_to_region(state):
    return state_to_region.get(state, 'International')

# Creating a new 'Region' column by applying the mapping function
df['Region'] = df['Shipping Province'].apply(map_state_to_region)

# Function to add dummy variable columns for each region directly to the DataFrame
def add_region_columns(df):
    # Create dummy variables for the regions based on the 'Region' column
    region_dummies = pd.get_dummies(df['Region'])

    # Concatenate the dummy variables with the original DataFrame
    return pd.concat([df, region_dummies], axis=1)

print(df)

          Name    Financial Status  Expired  Paid  Partially_Paid  \
0      #136493                paid        0     1               0   
1      #136491                paid        0     1               0   
2      #136490            refunded        0     0               0   
3      #136487                paid        0     1               0   
4      #136486                paid        0     1               0   
...        ...                 ...      ...   ...             ...   
36007   #97828                paid        0     1               0   
36008   #97827                paid        0     1               0   
36009   #97826  partially_refunded        0     0               0   
36010   #97825  partially_refunded        0     0               0   
36011   #97824                paid        0     1               0   

       Partially_Refunded  Pending  Refunded  Voided  Successful_Order  ...  \
0                       0        0         0       0                 1  ...   
1            

In [6]:
trainIndices = np.arange(21607)
valIndices = np.arange(21608,len(data))
training = df.iloc[trainIndices,:]
validation = df.iloc[valIndices,:]

In [22]:
Region_encoded = pd.get_dummies(df['Region'], prefix='Region')
df = pd.concat([df, Region_encoded], axis=1)

# Now, update feature_names to include the new columns from Region_encoded
# This requires knowing or retrieving the column names generated by get_dummies
new_region_features = Region_encoded.columns.tolist()
feature_names = ['Number_of_Bags', 'Last_Chance', 'Discount', 'Lineitem_quantity', 'Subtotal', 'Month'] + new_region_features

In [24]:
trainIndices = np.arange(21607)
valIndices = np.arange(21608,len(data))
training = df.iloc[trainIndices,:]
validation = df.iloc[valIndices,:]

In [25]:
# Select features for X and the target variable y from the training set
X = training[feature_names]
y = training['Successful_Order']

# Prepare the test set features similarly
Z = validation[feature_names]

In [33]:
# Drop rows with NaN values from X and ensure y is aligned
X_clean = X.dropna()
y_clean = y.loc[X_clean.index]  # Align y with the cleaned X

In [35]:
print(f"NaN values in X: {X_clean.isnull().any().sum()}")
print(f"NaN values in y: {y_clean.isnull().any().sum()}")

NaN values in X: 0
NaN values in y: 0


In [None]:
# Now, fit your model with the cleaned datasets
model = RandomForestClassifier(n_estimators=5)
model.fit(X_clean, y_clean)

In [62]:
from sklearn.impute import SimpleImputer
from sklearn.ensemble import RandomForestClassifier
from sklearn.pipeline import Pipeline

# Create an imputer and the RandomForestClassifier
imputer = SimpleImputer(strategy='mean')  # Or another strategy as appropriate
classifier = RandomForestClassifier(n_estimators=100)

# Create a pipeline that first imputes missing values, then fits the model
pipeline = Pipeline(steps=[('imputer', imputer), ('classifier', classifier)])

# Fit the pipeline on your datasets
pipeline.fit(X, y)  # Note: Use the original X and y here, the imputer will handle NaN values

# Now you can use the pipeline to make predictions, and it will handle missing values using the specified strategy

In [63]:
# Make predictions on the test set
test_predictions = pipeline.predict(Z)

test_auc = roc_auc_score(validation['Successful_Order'], test_predictions)
print('AUC of the RandomForestClassifier on the test set is',test_auc)

AUC of the RandomForestClassifier on the test set is 0.5392391826077838


In [59]:
# Now, fit your model with the cleaned datasets
model = HistGradientBoostingClassifier()
model.fit(X, y)

In [60]:
# Make predictions on the test set
test_predictions = model.predict(Z)

In [61]:
test_auc = roc_auc_score(validation['Successful_Order'], test_predictions)
print('AUC of the HistGradientBoostingClassifier on the test set is',test_auc)

AUC of the HistGradientBoostingClassifier on the test set is 0.5218044212190314
