In [1]:
import numpy as np
import pandas as pd
from sklearn.metrics import mean_absolute_error, median_absolute_error
from sklearn.metrics import r2_score
from datetime import timedelta
import matplotlib.pyplot as plt

## Naive prediction - benchmark

This naive prediction serves as a "baseline" model: we calculate the average DSO (`avg_dso`) of each customer in `X_train` and use it as the "prediction" for the payments of those same customers in the test portion of `side_data` (the newest ARs). The goal is to create a model that performs better than this baseline.

In [2]:
# Read the input CSV and give the frame a fresh 0..n-1 index in one step
df = pd.read_csv('EDA_and_feature_analysis/data/Model_1_and_2.csv').reset_index(drop=True)

In [3]:
# Order rows chronologically so the splits below are time-based
df = df.sort_values(by='weighted_payment_date')

# Train cut-off: the payment date of the row sitting at the 80% position
split_index = int(0.8 * len(df))
date_train_split = df.iloc[split_index]['weighted_payment_date']

# Rows up to and including the cut-off date form the train data; everything
# after it is the side data (used for validation and test)
in_train_period = df['weighted_payment_date'] <= date_train_split
after_train_period = df['weighted_payment_date'] > date_train_split
X_train = df[in_train_period]
side_data = df[after_train_period]

# Second cut-off: the payment date of the row in the middle of the side data
split_index_test = int(0.5 * len(side_data))
date_val_split = side_data.iloc[split_index_test]['weighted_payment_date']

# Only the rows after the second cut-off are the test data, so the test set
# is the same as in the other models
after_val_period = side_data['weighted_payment_date'] > date_val_split
test_data = side_data[after_val_period]

# Customers that occur in the train data
unique_customer_ids_train = X_train['customer_id'].unique()

# True for test rows whose customer also occurs in the train data
mask_test_data = test_data['customer_id'].isin(unique_customer_ids_train)

# Test rows restricted to customers known from the train data
test_data_subset = test_data[mask_test_data]


In [None]:
# Sanity check of the chronological split: shapes of each set plus the
# earliest and latest payment dates in the train and test sets
train_dates = X_train['weighted_payment_date']
test_dates = test_data['weighted_payment_date']

print("Train data shape: ", X_train.shape)
print("Side data shape: ", side_data.shape)
print(" ")
print("-------------------------")
print("Test if it worked:")
print("Earliest payment date in Train data: ", train_dates.min())
print("Latest payment date in Train data: ", train_dates.max())
print("Earliest payment date in test_data: ", test_dates.min())
print("Latest payment  date in test_data: ", test_dates.max())
print("-------------------------")
print("Test data shape: ", test_data.shape)
print("Test data subset shape: ", test_data_subset.shape)

In [5]:
# Keep only the columns the naive baseline needs, identically for both sets.
# Note: this overwrites X_train / test_data_subset with narrower frames.
baseline_columns = ['customer_id', 'receipt_date', 'dso']
X_train = X_train[baseline_columns]
test_data_subset = test_data_subset[baseline_columns]

## Calculation of avg_dso as prediction

In [None]:
# Average dso per customer, computed on the train data only
tmp = X_train.groupby("customer_id")["dso"].mean()

# Add the avg_dso column through assign(), which builds a new frame.
# X_train is itself a selection taken from another frame, and writing a column
# into such a selection in place can raise pandas' SettingWithCopyWarning.
X_train = X_train.assign(avg_dso=X_train["customer_id"].map(tmp))

# Show only the first rows instead of rendering the whole frame
X_train.head()

In [None]:
# Reduce X_train to one row per (customer_id, avg_dso) pair; avg_dso is mapped
# from the per-customer mean, so this leaves a single row per customer
avg_dso_customer = X_train.drop_duplicates(subset=['customer_id', 'avg_dso'])

# Attach the train-set avg_dso to the test rows as the naive prediction.
# An avg_dso column left over from an earlier run of this cell is dropped first:
# merging onto a frame that already has it would create avg_dso_x / avg_dso_y
# and the evaluation cell would no longer find 'avg_dso'.
test_data_subset = (
    test_data_subset
    .drop(columns='avg_dso', errors='ignore')
    .merge(
        avg_dso_customer[['customer_id', 'avg_dso']],
        on='customer_id',
        # fail loudly if a customer ever carries more than one avg_dso row,
        # which would silently duplicate test rows
        validate='many_to_one',
    )
)

test_data_subset.head()

## Evaluation

In [None]:
print('------------------------------------------------')
# Errors of the naive prediction (avg_dso) against the actual dso on the test subset.
# Both metrics come from sklearn so they handle their inputs the same way:
# a hand-rolled np.mean over a pandas Series skips NaN, whereas the sklearn
# metrics reject NaN, which would make the two numbers refer to different rows.
y_true = test_data_subset['dso']
y_pred = test_data_subset['avg_dso']
MAE = mean_absolute_error(y_true, y_pred)
median_AE = median_absolute_error(y_true, y_pred)

print(f"Mean Absolute Error: {round(MAE, ndigits= 2)}")
print('------------------------------------------------')
print(f"Median Absolute Error: {round(median_AE, ndigits= 2)}")

In [None]:
# Plot distribution of dso and predicted dso (avg_dso) on the test subset

hist_columns = ['avg_dso', 'dso']

# Use one common set of 50 bin edges for both series. Passing bins=50 to each
# plot separately gives every series its own edges, so the overlaid bars would
# cover different value ranges and could not be compared bar by bar.
# NaN is removed first because the bin-edge range must be finite.
pooled_values = np.concatenate(
    [test_data_subset[col].dropna().to_numpy(dtype=float) for col in hist_columns]
)
shared_bins = np.histogram_bin_edges(pooled_values, bins=50)

fig, ax = plt.subplots()
for col in hist_columns:
    test_data_subset[col].plot(
        kind='hist', bins=shared_bins, rwidth=0.8, alpha=0.5, label=col, ax=ax
    )
ax.set_title('Actual dso vs. naive avg_dso prediction (test subset)')
ax.set_xlabel('Value')
ax.set_ylabel('Frequency')
ax.legend()
plt.show()