In [None]:
import pandas as pd
from datetime import timedelta
from utils import *
from customer_features import * 
from reminders_features import *
from clarification_features import *

# Dataset creator file

The 5 datasets evaluated in the study are created in this file using functions defined in `utils`, `customer_features`, `clarification_features` and `reminders_features`.

## Creating Datasets 1 & 2

In [None]:
# Datasets 1 and 2 share one file: the basic invoice export after
# preprocessing, with the datetime-derived features appended.
df = (
    pd.read_csv('data/invoices_basic.csv')
    .pipe(preprocessing)
    .pipe(extract_datetime_features)
)

# Write the result without the pandas index column
df.to_csv("data/dataset_1_and_2.csv", index=False)

## Creating Dataset 3: Invoice + Customer features

In [None]:
# Dataset 3 = basic invoices + customer-level features.
# Start from the same preprocessed invoice export used for datasets 1 & 2.
df = pd.read_csv('data/invoices_basic.csv').pipe(preprocessing)

# Payment behaviour features (per the original notes: near_payment_term_count,
# near_payment_term_ratio and overdue_ratio).
# Arguments: 6 = months of lookback, 4 = day window (+/-) around the due date.
print("Calculating Payment Behaviour features")
df = payment_behaviour(df, 6, 4)

# rolling_avg_dso over the past 6 months
print("Calculating Rolling avg")
df = calculate_rolling_avg(df, 6)

# outstanding_invoices, paid_invoices, issued_invoices.
# 4 months = 120 days, chosen because all invoices with DSO and payment
# terms > 120 were removed earlier.
print("Calculating Outstanding invoices")
df = calculate_outstanding_invoices(df, 4)

# Remaining steps take only the frame, so they are chained in order:
#   1. ratio of outstanding to issued invoices
#   2. binning of the count variables
#   3. binarisation of the count variables
#   4. day/month/year of due date and day of week
df = (
    df
    .pipe(ratio_outstanding)
    .pipe(binning_counts)
    .pipe(binarize_counts)
    .pipe(extract_datetime_features)
)

# Write the result without the pandas index column
df.to_csv("data/dataset_3.csv", index=False)

## Creating Dataset 4

- Inputs required: Dataset 3 and Reminders dataset

In [3]:
# Step 1: Reduce the reminders to customers that also occur in the main data.
# Note: the filter is on customer_id, not on individual invoices.

df_main = pd.read_csv('data/dataset_3.csv')
df_reminders = pd.read_csv("data/reminders.csv")

# Distinct customer ids present in the main data
known_customer_ids = df_main['customer_id'].unique()

# Keep only reminder rows whose customer_id is one of those ids
df_reminders = df_reminders.loc[df_reminders['customer_id'].isin(known_customer_ids)]

# Store the subset; later cells read it back from disk
df_reminders.to_csv("data/reminders_subset.csv", index=False)

In [None]:
# Dataset 4 = Dataset 3 + reminder features.
# Inputs: data/dataset_3.csv and the reminders subset written by the previous cell.
df_reminders = pd.read_csv("data/reminders_subset.csv")
df_main = pd.read_csv('data/dataset_3.csv')

# CSV round-trips dates as strings, so parse every date column the
# reminder helpers are given before calling them.
for date_col in ('receipt_date', 'weighted_payment_date', 'due_date'):
    df_main[date_col] = pd.to_datetime(df_main[date_col], format='%Y-%m-%d')
df_reminders['reminder_date'] = pd.to_datetime(df_reminders['reminder_date'])


print("Calculating feature 1: ")
df = count_past_reminders(df_main, df_reminders, 6)  # 6 = lookback window passed to the helper

print("Calculating feature 2: ")
# Continue from `df` (the output of feature 1), not from `df_main`: if
# count_past_reminders returns a new frame instead of mutating its input,
# passing `df_main` here would silently drop the feature-1 column.
df = average_reminder_stage(df, df_reminders, 6)

print("Binning")
df = reminders_binning(df)

print("Done")

# Persist the result: the clarifications cell below reads data/dataset_4.csv,
# so without this write the notebook cannot run top to bottom.
df.to_csv("data/dataset_4.csv", index=False)

## Creating Dataset 5: Clarifications

Clarification features added:
- Count_past_clarifications
- Count_dunning_stop
- Avg_clarification_days


In [None]:
# Dataset 5 = Dataset 4 + clarification features.
main_df = pd.read_csv("data/dataset_4.csv")
df_clarification = pd.read_csv('data/clarifications.csv')
df_reminders = pd.read_csv('data/reminders_subset.csv')

# CSV round-trips dates as strings; parse every date column used below.
df_clarification['created_at'] = pd.to_datetime(df_clarification['created_at'], format='%Y-%m-%d %H:%M:%S.%f')
df_clarification['resolved_at'] = pd.to_datetime(df_clarification['resolved_at'], format="%Y-%m-%d %H:%M:%S.%f")
for date_col in ('receipt_date', 'weighted_payment_date', 'due_date'):
    main_df[date_col] = pd.to_datetime(main_df[date_col], format='%Y-%m-%d')

# Row count before cleaning, to compare with the count printed afterwards
print("Number of clarifications before cleaning: ", len(df_clarification))


# ---- Cleaning ----

# Keep clarifications created up to the cutoff (cleaning like main data).
# The string is compared as the timestamp 2024-05-01 00:00:00, so rows
# created later on 2024-05-01 are dropped as well.
df_clarification = df_clarification[df_clarification['created_at'] <= '2024-05-01']

# Drop clarifications that are still open (no resolved_at yet)
df_clarification = df_clarification.dropna(subset=['resolved_at'])


# ---- Attach customer_id via the reminders ----

# Build a journal_entry_id -> customer_id lookup. De-duplicate it first:
# df_reminders may hold several rows for the same journal_entry_id
# (presumably one per reminder sent - TODO confirm), and a left merge against
# those repeats would multiply clarification rows and inflate the counts.
invoice_customers = df_reminders[['journal_entry_id', 'customer_id']].drop_duplicates()
df_clarification = df_clarification.merge(
    invoice_customers,
    left_on='item_id',
    right_on='journal_entry_id',
    how='left',
)

# Share of clarifications with no matching journal_entry_id (printed as a fraction)
print("Percentage of rows with missing value for journal_entry_id: ", df_clarification['journal_entry_id'].isnull().mean())

# Unmatched rows cannot be assigned to a customer, so drop them
df_clarification = df_clarification.dropna(subset=['journal_entry_id'])

# journal_entry_id now duplicates item_id; remove it
df_clarification = df_clarification.drop(columns=['journal_entry_id'])


# ---- Clarification duration ----

# Whole days between created_at and resolved_at
df_clarification["duration"] = (df_clarification["resolved_at"] - df_clarification["created_at"]).dt.days

# Drop columns that are not needed for the features
df_clarification = df_clarification.drop(columns=['restart_count', 'restart_date'])

print("Number of clarifications after cleaning: ", len(df_clarification))


# ---- Clarification features ----

df = clarification_features(main_df, df_clarification, 6)  # 6 = lookback window passed to the helper

# Bin the clarification features: the distribution is heavily skewed,
# especially for counts (most entries have no clarifications, so value = 0)
df = binned_clarification_features(df)


# Persist Dataset 5 so this notebook actually produces all five datasets
df.to_csv("data/dataset_5.csv", index=False)