# Data splits

In this notebook the dataset is split chronologically (by `weighted_payment_date`) into training, validation, and test sets:

- Split: 80/10/10 (train/validation/test)
- Scaling: the min and max values used for scaling are taken from the training data (X_train) only


In [17]:
import os

import pandas as pd

from feature_scaler import *

# Load the prepared dataset and give the rows a fresh 0..n-1 index.
# NOTE(review): the file name is hard-coded to Dataset_5 while a later cell sets
# `dataset = 5` separately — keep the two in sync when switching datasets.
dataset_path = 'EDA_and_feature_analysis/Data/Dataset_5.csv'
df = pd.read_csv(dataset_path).reset_index(drop=True)

In [18]:
### One hot encoding

# Categorical columns that get expanded into one indicator column per category
cat_to_encode = ["day_of_week"]
df = pd.get_dummies(df, columns=cat_to_encode)

# Cast every boolean column (e.g. the freshly created indicators) to 0/1 integers
# in a single astype call instead of assigning column by column.
bool_columns = [col for col, col_dtype in df.dtypes.items() if col_dtype == 'bool']
df = df.astype({col: int for col in bool_columns})

In [19]:
# Column the models are trained to predict
target = ["dso"]

# Feature columns grouped by the dataset stage that introduced them.
# Dict insertion order is the column order of the exported X_* files.
feature_groups = {
    # Kept only for the operations in this notebook, dropped before export
    "helpers": ["customer_id", "weighted_payment_date"],
    # Dataset 1: invoice features; day_of_week_* are the one-hot columns created above
    # (original author's note: day_of_due_date)
    "dataset_1": [
        "payment_terms",
        "customer_id_enc",
        "log_amount",
        *[f"day_of_week_{day}" for day in range(7)],
    ],
    # Dataset 2
    "dataset_2": ["country_enc"],
    # Dataset 3: payment features
    "dataset_3": [
        "ratio_outstanding",
        "near_payment_term_ratio",
        "overdue_ratio",
        "rolling_avg_dso",
        "paid_invoices",
        "outstanding_invoices",
        "near_payment_term_count",
    ],
    # Dataset 4: reminder features
    "dataset_4": ["binary_reminder_count", "average_reminder_stage"],
    # Dataset 5: clarification features
    "dataset_5": [
        "binary_count_past_clarifications",
        "log_avg_clarification_days",
        "binary_dunning_stop",
    ],
}

# Flatten the groups into the single ordered list used for column selection
features = [column for group in feature_groups.values() for column in group]

# Select which dataset we want to create the split for (used in the output path)
dataset = 5

In [None]:
# --- Chronological train / held-out split (first ~80% of rows by payment date -> Train) ---

# Start from a clean 0..n-1 index, then order the invoices by payment date
df = df.reset_index(drop=True).sort_values('weighted_payment_date')
# NOTE(review): the CSV is read without date parsing, so if this column holds strings the
# sort and the comparisons below are lexicographic — correct only for ISO-like
# (YYYY-MM-DD...) values. Confirm the column format.

# Row position that marks the end of the first 80% of the sorted data
split_index = int(len(df) * 0.8)

# Payment date found at that position; it becomes the cut-off date
date_train_split = df.iloc[split_index]['weighted_payment_date']

# Split on the date rather than the row position so that all invoices sharing the
# cut-off date land in Train (the ratio is therefore only approximately 80/20).
# side_data is split further into validation and test data below.
Train = df[df['weighted_payment_date'] <= date_train_split]
side_data = df[df['weighted_payment_date'] > date_train_split]

# Sanity check: shapes and date ranges of both parts
print("Total invoice data shape: ", df.shape)
print("Train_data shape: ", Train.shape)
# Label fixed: this is the combined validation + test remainder, not the final test set
print("Side_data shape: ", side_data.shape)
print("-------------------------")
print("Test if it worked:") #printing the earliest and latest dates in the train and side sets
print("Earliest date in train_data: ", Train['weighted_payment_date'].min())
print("Latest date in train_data: ", Train['weighted_payment_date'].max())
print("Earliest date in side_data: ", side_data['weighted_payment_date'].min())
print("Latest date in side_data: ", side_data['weighted_payment_date'].max())
print("-------------------------")


# Halve the held-out data chronologically (50% of 20% -> roughly 10% validation / 10% test)
split_index_test = len(side_data) // 2

# Payment date of the row at the midpoint of side_data; used as the cut-off date
date_val_split = side_data.iloc[split_index_test]['weighted_payment_date']

# Rows dated on or before the cut-off go to Val, strictly later rows to test_data
held_out_dates = side_data['weighted_payment_date']
Val = side_data.loc[held_out_dates.le(date_val_split)]
test_data = side_data.loc[held_out_dates.gt(date_val_split)]


## Subsets of val and test data restricted to customers that also occur in the train data

# Customer ids present in Train
unique_customer_ids_train = Train['customer_id'].unique()

# Boolean masks: True where the row's customer is also present in Train
mask_val_data = Val['customer_id'].isin(unique_customer_ids_train)
mask_test_data = test_data['customer_id'].isin(unique_customer_ids_train)

Val_subset = Val.loc[mask_val_data]
Test_subset = test_data.loc[mask_test_data]



"""feature_scaler(features_to_scale, X_train, X_val, X_test, scaler)"""

# Features to scale
to_scale = ["log_amount"]

# Scale the COMPLETE validation and test frames, not only the known-customer subsets.
# Previously only Val_subset / Test_subset went through feature_scaler, so X_val and
# X_test (exported as X_test_all_customers.csv) kept the raw, unscaled log_amount while
# every other split was scaled.
# NOTE(review): this relies on feature_scaler fitting on its X_train argument only (as the
# notebook header states) and returning the frames with rows and other columns unchanged —
# confirm against feature_scaler.py.
Train, Val, test_data, fitted_scaler = feature_scaler(to_scale, Train, Val, test_data)

# Re-derive the known-customer subsets from the scaled frames so that the subset and the
# full variants share identical scaled values.
Val_subset = Val[Val['customer_id'].isin(unique_customer_ids_train)]
Test_subset = test_data[test_data['customer_id'].isin(unique_customer_ids_train)]


# Feature matrix (X) and target (y) for every split
X_train = Train[features]
y_train = Train[target]

# Validation data, only customers that occur in Train
X_val_subset = Val_subset[features]
y_val_subset = Val_subset[target]

# Validation data, all customers
X_val = Val[features]
y_val = Val[target]

# Test data, only customers that occur in Train
X_test_subset = Test_subset[features]
y_test_subset = Test_subset[target]

# Test data, all customers
X_test = test_data[features]
y_test = test_data[target]




# Keep the dso column of both test variants (all customers / known customers only)
dbt_test_data = test_data["dso"]
dso_test_subset = Test_subset["dso"]


# Report the shape of every feature matrix
shape_report = [
    ("X_train shape: ", X_train),
    ("X_val shape: ", X_val),
    ("X_val_subset shape: ", X_val_subset),
    ("Test data shape: ", X_test),
    ("Test data subset shape: ", X_test_subset),
]
for shape_label, frame in shape_report:
    print(shape_label, frame.shape)

print("-------------------------")
print("Test if it worked:") #printing the earliest and latest dates of every split

# Date range per split; a short divider is printed between consecutive splits
date_report = [
    ("X_train", X_train),
    ("X_val", X_val),
    ("X_val_subset", X_val_subset),
    ("X_test", X_test),
    ("X_test_subset", X_test_subset),
]
for position, (split_name, frame) in enumerate(date_report):
    if position > 0:
        print("-----")
    payment_dates = frame['weighted_payment_date']
    print(f"Earliest payment date in {split_name}: ", payment_dates.min())
    print(f"Latest payment date in {split_name}: ", payment_dates.max())



# Columns that were only needed for splitting/filtering and must not reach the model
helper_columns = ["weighted_payment_date", "customer_id"]

# Drop the helper columns and cast the remaining features to float in one step per frame.
# X_val / y_val (all-customer validation data) are left untouched here, as before.
X_train = X_train.drop(columns=helper_columns).astype(float)
X_val_subset = X_val_subset.drop(columns=helper_columns).astype(float)
X_test_subset = X_test_subset.drop(columns=helper_columns).astype(float)
X_test = X_test.drop(columns=helper_columns).astype(float)

# Targets are cast to float as well
y_train = y_train.astype(float)
y_val_subset = y_val_subset.astype(float)
y_test_subset = y_test_subset.astype(float)
y_test = y_test.astype(float)

In [6]:
# Store csv files

# DataFrame.to_csv does not create missing parent folders, so make sure the
# target directory exists before writing (no-op when it is already there).
output_dir = f"Inputs/Dataset_{dataset}"
os.makedirs(output_dir, exist_ok=True)

# File name -> frame to export.
# X_val / X_test hold the subsets restricted to customers that occur in the training
# data; the *_all_customers files hold the complete test split.
csv_outputs = {
    "X_train.csv": X_train,
    "y_train.csv": y_train,
    "X_val.csv": X_val_subset,
    "y_val.csv": y_val_subset,
    "X_test.csv": X_test_subset,
    "y_test.csv": y_test_subset,
    "X_test_all_customers.csv": X_test,
    "y_test_all_customers.csv": y_test,
}

for file_name, frame in csv_outputs.items():
    # index=False: the row index carries no information for the downstream notebooks
    frame.to_csv(f"{output_dir}/{file_name}", index=False)