### Modeling

In [1]:
import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt
from sklearn.model_selection import train_test_split
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler, OneHotEncoder
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.linear_model import LinearRegression
from sklearn.ensemble import RandomForestRegressor
from xgboost import XGBRegressor
from sklearn.metrics import mean_squared_error
from sklearn.metrics import mean_absolute_error
from sklearn.metrics import r2_score
from sklearn.linear_model import Ridge, Lasso
from xgboost import XGBRegressor
from sklearn.model_selection import GridSearchCV


- from Google Drive

In [2]:
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


In [None]:
df = pd.read_csv('/content/drive/Othercomputers/My Mac/Final Project/Time-Series-Forecasting/data/cleaned_05.csv')

- from Local Computer

In [2]:
df = pd.read_csv('/Users/leahchen/Documents/LHL/Final Project/Time-Series-Forecasting/data/cleaned_05.csv')

In [3]:
df.head()

Unnamed: 0,order_id,date,country,quantity,gift_quantity,gross_revenue,profit,day_of_week_x,month_of_year_x,year_x,...,payment_CARD_CSOB,payment_CHECKOUT_CARD,payment_PAYPAL_EXPRESS,payment_CHECKOUT_APPLE,payment_CHECKOUT_BANCONTACT,payment_BANK_TRANSFER,payment_CASH,payment_CSOB_CARD,payment_INVOICE_HUF,payment_PAYPAL_IDEALO
0,1900079010,2019-06-01,Czech Republic,3,0.0,265.519767,109.789535,5,6,2019,...,0,0,0,0,0,0,0,0,0,0
1,1900079006,2019-06-01,Czech Republic,5,0.0,80.658914,34.23062,5,6,2019,...,0,0,0,0,0,0,0,0,0,0
2,1900080490,2019-06-01,Czech Republic,2,0.0,228.329457,53.247287,5,6,2019,...,1,0,0,0,0,0,0,0,0,0
3,1900078998,2019-06-01,Czech Republic,2,0.0,23.384108,11.603101,5,6,2019,...,1,0,0,0,0,0,0,0,0,0
4,1900078996,2019-06-01,Slovakia,1,0.0,45.199763,17.174569,5,6,2019,...,0,0,0,0,0,0,0,0,0,0


In [4]:
df.columns

Index(['order_id', 'date', 'country', 'quantity', 'gift_quantity',
       'gross_revenue', 'profit', 'day_of_week_x', 'month_of_year_x', 'year_x',
       'week_of_year_x', 'price_difference', 'add_on_products', 'hour_of_day',
       'morning', 'afternoon', 'evening', 'avg_sales_country_product_day',
       'total_orders', 'total_products', 'average_orders_per_day',
       'average_products_per_day', 'category_football', 'category_running',
       'category_fitness', 'category_lifestyle', 'gender_men', 'gender_women',
       'gender_unisex', 'age_adults', 'age_kids', 'color_Red', 'color_White',
       'color_Pink', 'color_Blue', 'color_Orange', 'color_Green',
       'color_Black', 'color_Silver', 'group0_Other', 'group0_Apparel',
       'group0_Footwear', 'group0_Equipment', 'group0_Nutrition',
       'payment_COD', 'payment_GOPAY_CARD', 'payment_PAYPAL',
       'payment_CARD_GOPAY', 'payment_CARD_CSOB', 'payment_CHECKOUT_CARD',
       'payment_PAYPAL_EXPRESS', 'payment_CHECKOUT_APPLE',

In [5]:
# Assuming your DataFrame is named 'df'
df1 = df.round(2)

# If you want to round specific columns, you can do it like this:
columns_to_round = ['gross_revenue','profit','average_orders_per_day','avg_sales_country_product_day','average_products_per_day']
df1 = df.copy()
for col in columns_to_round:
    df1[col] = df[col].round(2)

df1.head()

Unnamed: 0,order_id,date,country,quantity,gift_quantity,gross_revenue,profit,day_of_week_x,month_of_year_x,year_x,...,payment_CARD_CSOB,payment_CHECKOUT_CARD,payment_PAYPAL_EXPRESS,payment_CHECKOUT_APPLE,payment_CHECKOUT_BANCONTACT,payment_BANK_TRANSFER,payment_CASH,payment_CSOB_CARD,payment_INVOICE_HUF,payment_PAYPAL_IDEALO
0,1900079010,2019-06-01,Czech Republic,3,0.0,265.52,109.79,5,6,2019,...,0,0,0,0,0,0,0,0,0,0
1,1900079006,2019-06-01,Czech Republic,5,0.0,80.66,34.23,5,6,2019,...,0,0,0,0,0,0,0,0,0,0
2,1900080490,2019-06-01,Czech Republic,2,0.0,228.33,53.25,5,6,2019,...,1,0,0,0,0,0,0,0,0,0
3,1900078998,2019-06-01,Czech Republic,2,0.0,23.38,11.6,5,6,2019,...,1,0,0,0,0,0,0,0,0,0
4,1900078996,2019-06-01,Slovakia,1,0.0,45.2,17.17,5,6,2019,...,0,0,0,0,0,0,0,0,0,0


In [6]:
def encode_cyclical(df1, col, max_val):
    df1[col + '_sin'] = np.sin(2 * np.pi * df[col]/max_val)
    df1[col + '_cos'] = np.cos(2 * np.pi * df[col]/max_val)
    return df1

# Encode date-related features as cyclical features
df1 = encode_cyclical(df, 'day_of_week_x', 6)
df1 = encode_cyclical(df, 'month_of_year_x', 11)
df1 = encode_cyclical(df, 'hour_of_day', 23)
df1 = encode_cyclical(df, 'week_of_year_x', 52)

# Drop the original date-related features
df1 = df1.drop(['day_of_week_x', 'month_of_year_x', 'hour_of_day', 'year_x', 'week_of_year_x'], axis=1)

In [7]:
# Define the features (X) and the target variable (y)
X = df1.drop(columns=['date','total_products','total_orders','gross_revenue','profit','order_id','average_products_per_day','avg_sales_country_product_day'])
y = df1['gross_revenue']

In [8]:
# Split the data into training and testing sets
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

In [9]:
# Define the lists of numeric, categorical, binary, and cyclical features
numeric_features = ['quantity','average_orders_per_day','price_difference']
categorical_features = ['country','gift_quantity'] 
binary_features = ['morning', 'afternoon', 'evening']
cyclical_features = ['day_of_week_x_sin','day_of_week_x_cos','month_of_year_x_sin',
                     'month_of_year_x_cos','hour_of_day_sin','hour_of_day_cos','week_of_year_x_sin',
                     'week_of_year_x_cos']

In [10]:
# Create transformers for numeric, categorical, binary, and cyclical features
numeric_transformer = StandardScaler()
categorical_transformer = OneHotEncoder(handle_unknown='ignore')
binary_transformer = 'passthrough'

# Create a column transformer that applies the appropriate transformations to each feature subset
preprocessor = ColumnTransformer(
    transformers=[
        ('num', numeric_transformer, numeric_features),
        ('cat', categorical_transformer, categorical_features),
        ('bin', binary_transformer, binary_features),
        ('cyc', 'passthrough', cyclical_features)])

In [11]:
# Create a baseline model (mean of the target variable)
y_baseline = np.mean(y_train)

- Linear Regression Model

In [13]:
# Create a pipeline with the preprocessor and the linear regression model
lr_pipeline = Pipeline(steps=[('preprocessor', preprocessor),
                              ('regressor', LinearRegression())])

# Train and evaluate the models
pipelines = [lr_pipeline]
model_names = ['Linear Regression']

for pipeline, model_name in zip(pipelines, model_names):
    # Train the model
    pipeline.fit(X_train, y_train)
    
    # Evaluate the model
    y_pred = lr_pipeline.predict(X_test)
    mse = mean_squared_error(y_test, y_pred)
    print(f"{model_name} Mean Squared Error: {mse}")

# Calculate the mean squared error for the baseline model
mse_baseline = mean_squared_error(y_test, [y_baseline] * len(y_test))
print(f"Baseline Model Mean Squared Error: {mse_baseline}") 


Linear Regression Mean Squared Error: 2813.9813428136226
Baseline Model Mean Squared Error: 3481.813810487228


In [14]:
mae_lr = mean_absolute_error(y_test, y_pred)
mse_lr = mean_squared_error(y_test, y_pred)
rmse_lr = np.sqrt(mse_lr)
r2_lr = r2_score(y_test, y_pred)
print("Linear Regression:")
print(f"MAE: {mae_lr:.2f}")
print(f"MSE: {mse_lr:.2f}")
print(f"RMSE: {rmse_lr:.2f}")
print(f"R2:{r2_lr:.2f}")

Linear Regression:
MAE: 35.35
MSE: 2813.98
RMSE: 53.05
R2:0.19


In [15]:
# Get the Linear Regression model from the pipeline
lr_model = lr_pipeline.named_steps['regressor']

# Get the coefficients
coefficients = lr_model.coef_

# Get the preprocessor from the pipeline
preprocessor = lr_pipeline.named_steps['preprocessor']

# Transform the training data using the preprocessor
X_train_transformed = preprocessor.transform(X_train)

# Get the feature names after preprocessing
# This line assumes that the preprocessor is a ColumnTransformer
feature_names = preprocessor.get_feature_names_out()

# Create a DataFrame with feature names and their coefficients
coef_df = pd.DataFrame({'feature': feature_names, 'coefficient': coefficients})

# Sort the DataFrame by the absolute value of coefficients (descending order)
coef_df['abs_coefficient'] = coef_df['coefficient'].abs()
coef_df = coef_df.sort_values(by='abs_coefficient', ascending=False).drop(columns=['abs_coefficient'])

# Display the coefficients
print("Coefficients:")
print(coef_df)

Coefficients:
                        feature   coefficient
26       cat__gift_quantity_2.0  1.177153e+12
25       cat__gift_quantity_1.0  1.177153e+12
27       cat__gift_quantity_3.0  1.177153e+12
24       cat__gift_quantity_0.0  1.177153e+12
23  cat__country_United Kingdom  3.350567e+10
5         cat__country_Bulgaria  3.350567e+10
3          cat__country_Austria  3.350567e+10
22          cat__country_Sweden  3.350567e+10
9          cat__country_Finland  3.350567e+10
11         cat__country_Germany  3.350567e+10
20        cat__country_Slovenia  3.350567e+10
8          cat__country_Denmark  3.350567e+10
10          cat__country_France  3.350567e+10
15     cat__country_Netherlands  3.350567e+10
4          cat__country_Belgium  3.350567e+10
13         cat__country_Ireland  3.350567e+10
21           cat__country_Spain  3.350567e+10
14           cat__country_Italy  3.350567e+10
17        cat__country_Portugal  3.350567e+10
6          cat__country_Croatia  3.350567e+10
7   cat__country_Cze

- XGBoost Model

In [16]:
# Create a pipeline for the XGBoost model
xgb_pipeline = Pipeline(steps=[('preprocessor', preprocessor),
                               ('regressor', XGBRegressor(n_jobs=-1, random_state=42))])

# Train and evaluate the models
pipelines = [xgb_pipeline]
model_names = ['xgboost']

# Train and evaluate the models
for pipeline, model_name in zip(pipelines, model_names):
    # Train the model
    pipeline.fit(X_train, y_train)
    
    # Evaluate the model
    y_pred_xg = xgb_pipeline.predict(X_test)

# Calculate the mean squared error for the baseline model
y_baseline = y_train.mean()
mse_baseline = mean_squared_error(y_test, [y_baseline] * len(y_test))
print(f"Baseline Model Mean Squared Error: {mse_baseline}")


Baseline Model Mean Squared Error: 3481.813810487228


In [17]:
# Evaluate the model
y_pred_xg = xgb_pipeline.predict(X_test)
mae_xg = mean_absolute_error(y_test, y_pred_xg)
mse_xg = mean_squared_error(y_test, y_pred_xg)
rmse_xg = np.sqrt(mse_xg)
r2_xg = r2_score(y_test, y_pred_xg)
print("XGBoost :")
print(f"MAE: {mae_xg:.2f}")
print(f"MSE: {mse_xg:.2f}")
print(f"RMSE: {rmse_xg:.2f}")
print(f"R2:{r2_xg:.2f}")

XGBoost :
MAE: 28.64
MSE: 2142.73
RMSE: 46.29
R2:0.38


- Tuning the model

In [18]:
xgb_param_grid = {
    'regressor__n_estimators': [100, 200, 300,400],
    'regressor__learning_rate': [0.01, 0.1, 0.2],
    'regressor__max_depth': [3, 6, 9],
    'regressor__gamma': [0, 0.1, 0.2]
}

In [19]:
xgb_grid_search = GridSearchCV(xgb_pipeline, xgb_param_grid, scoring='neg_mean_squared_error',
                               cv=3, n_jobs=-1, verbose=2)


In [20]:
xgb_grid_search.fit(X_train, y_train)

Fitting 3 folds for each of 36 candidates, totalling 108 fits
[CV] END regressor__learning_rate=0.01, regressor__max_depth=3, regressor__n_estimators=100; total time=  48.1s
[CV] END regressor__learning_rate=0.01, regressor__max_depth=3, regressor__n_estimators=400; total time= 3.4min
[CV] END regressor__learning_rate=0.01, regressor__max_depth=6, regressor__n_estimators=400; total time= 8.0min
[CV] END regressor__learning_rate=0.01, regressor__max_depth=9, regressor__n_estimators=300; total time=10.3min
[CV] END regressor__learning_rate=0.1, regressor__max_depth=3, regressor__n_estimators=400; total time= 4.0min
[CV] END regressor__learning_rate=0.1, regressor__max_depth=6, regressor__n_estimators=200; total time= 4.2min
[CV] END regressor__learning_rate=0.1, regressor__max_depth=6, regressor__n_estimators=400; total time= 8.7min
[CV] END regressor__learning_rate=0.1, regressor__max_depth=9, regressor__n_estimators=400; total time=14.0min
[CV] END regressor__learning_rate=0.2, regress

In [21]:
best_params = xgb_grid_search.best_params_
best_pipeline = xgb_grid_search.best_estimator_


[CV] END regressor__learning_rate=0.01, regressor__max_depth=3, regressor__n_estimators=300; total time= 2.5min
[CV] END regressor__learning_rate=0.01, regressor__max_depth=6, regressor__n_estimators=200; total time= 3.6min
[CV] END regressor__learning_rate=0.01, regressor__max_depth=6, regressor__n_estimators=400; total time= 8.4min
[CV] END regressor__learning_rate=0.01, regressor__max_depth=9, regressor__n_estimators=400; total time=13.4min
[CV] END regressor__learning_rate=0.1, regressor__max_depth=6, regressor__n_estimators=300; total time= 6.3min
[CV] END regressor__learning_rate=0.1, regressor__max_depth=9, regressor__n_estimators=200; total time= 6.9min
[CV] END regressor__learning_rate=0.1, regressor__max_depth=9, regressor__n_estimators=400; total time=14.4min
[CV] END regressor__learning_rate=0.2, regressor__max_depth=6, regressor__n_estimators=400; total time= 9.1min
[CV] END regressor__learning_rate=0.2, regressor__max_depth=9, regressor__n_estimators=400; total time=11.0m

In [22]:
y_pred_best = best_pipeline.predict(X_test)
mse_best = mean_squared_error(y_test, y_pred_best)
print(f"Best XGBoost Mean Squared Error: {mse_best}")


Best XGBoost Mean Squared Error: 2118.721693316591


In [23]:
best_params = xgb_grid_search.best_params_
print("Best parameters found by grid search:")
for param, value in best_params.items():
    print(f"{param}: {value}")

Best parameters found by grid search:
regressor__learning_rate: 0.2
regressor__max_depth: 6
regressor__n_estimators: 400


In [24]:
mse_best = mean_squared_error(y_test, y_pred_best)
mae_best = mean_absolute_error(y_test, y_pred_best)
rmse_best = np.sqrt(mse_best)
r2_best = r2_score(y_test, y_pred_best)
print(f"Best XGBoost Mean Squared Error: {mse_best}")
print(f"MAE_best: {mae_best:.2f}")
print(f"RMSE_best: {rmse_best:.2f}")
print(f"R2_best:{r2_best:.2f}")

Best XGBoost Mean Squared Error: 2118.721693316591
MAE_best: 28.24
RMSE_best: 46.03
R2_best:0.39


In [30]:
xgb_feature_importances = best_pipeline.named_steps["regressor"].feature_importances_


In [31]:
# Get the feature names after one-hot encoding
cat_feature_names = best_pipeline.named_steps["preprocessor"].named_transformers_["cat"].get_feature_names_out(categorical_features)

# Combine all feature names
all_feature_names = np.concatenate([numeric_features, cat_feature_names, binary_features, cyclical_features])


In [32]:
importances_df = pd.DataFrame({"feature": all_feature_names, "importance": xgb_feature_importances})


In [36]:
importances_df.head(15)

Unnamed: 0,feature,importance
24,gift_quantity_0.0,0.214374
25,gift_quantity_1.0,0.124442
0,quantity,0.09127
21,country_Spain,0.064434
11,country_Germany,0.053458
12,country_Hungary,0.049837
2,price_difference,0.04712
3,country_Austria,0.034659
10,country_France,0.030159
18,country_Romania,0.028312


In [37]:
importances_df = importances_df.sort_values("importance", ascending=False)
selected_features = importances_df["feature"].head(15).tolist()


In [39]:
# Apply the preprocessing steps to the datasets
X_train_preprocessed = preprocessor.fit_transform(X_train)
X_test_preprocessed = preprocessor.transform(X_test)

# Convert the preprocessed data to DataFrames
X_train_preprocessed_df = pd.DataFrame(X_train_preprocessed, columns=all_feature_names)
X_test_preprocessed_df = pd.DataFrame(X_test_preprocessed, columns=all_feature_names)

# Select the chosen features from the preprocessed datasets
X_train_selected = X_train_preprocessed_df[selected_features]
X_test_selected = X_test_preprocessed_df[selected_features]

# Train and evaluate the model with the selected features
xgb_selected = XGBRegressor(learning_rate=0.2, n_estimators=400,max_depth=6, n_jobs=-1, random_state=42)
xgb_selected.fit(X_train_selected, y_train)

# Make predictions and evaluate the model
y_pred_selected = xgb_selected.predict(X_test_selected)


In [41]:
# Calculate the evaluation metrics
mae_xgselect = mean_absolute_error(y_test, y_pred_selected)
mse_xgselect = mean_squared_error(y_test, y_pred_selected)
rmse_xgselect = np.sqrt(mse_xgselect)
r2_xgselect = r2_score(y_test, y_pred_selected)

# Print the evaluation metrics
print(f"MAE: {mae_xgselect:.2f}")
print(f"MSE: {mse_xgselect:.2f}")
print(f"RMSE: {rmse_xgselect:.2f}")
print(f"R2: {r2_xgselect:.2f}")


MAE: 28.43
MSE: 2140.25
RMSE: 46.26
R2: 0.39


- Random Forest Model

In [37]:
rf_pipeline = Pipeline(steps=[('preprocessor', preprocessor),
                              ('regressor', RandomForestRegressor(n_jobs=-1, random_state=42))])

# Train and evaluate the models
pipelines = [rf_pipeline]
model_names = ['Random Forest']

for pipeline, model_name in zip(pipelines, model_names):
    # Train the model
    pipeline.fit(X_train, y_train)
    
    # Evaluate the model
    y_pred_rf = pipeline.predict(X_test)

# Calculate the mean squared error for the baseline model
y_baseline = y_train.mean()
mse_baseline = mean_squared_error(y_test, [y_baseline] * len(y_test))
print(f"Baseline Model Mean Squared Error: {mse_baseline}")

Baseline Model Mean Squared Error: 3481.813810487228


In [38]:
# Evaluate the best model
y_pred_rf = rf_pipeline.predict(X_test)
mae_rf = mean_absolute_error(y_test, y_pred_rf)
mse_rf = mean_squared_error(y_test, y_pred_rf)
rmse_rf = np.sqrt(mse_rf)
r2_rf = r2_score(y_test, y_pred_rf)

print(f"MAE: {mae_rf:.2f}")
print(f"MSE: {mse_rf:.2f}")
print(f"RMSE: {rmse_rf:.2f}")
print(f"R2:{r2_rf:.2f}")

MAE: 29.27
MSE: 2311.97
RMSE: 48.08
R2:0.34
