# Data Cleaning and Preprocessing


Imports


In [1]:
import pandas as pd
import numpy as np
from fitter import Fitter, get_common_distributions
from sklearn.preprocessing import StandardScaler, MinMaxScaler
import warnings
from datetime import datetime, timedelta
from scipy import stats

warnings.filterwarnings("ignore")

pd.set_option("future.no_silent_downcasting", True)

Read the orders dataset into a pandas dataframe


In [2]:
orders_df = pd.read_csv("../Data/orders_data_competition.csv")

In [3]:
orders_df.head()

Unnamed: 0,Order ID,Account ID,Market Key,Security ID,Order Type,Order Time,Order Via,Is Completed,Is Canceled,Expire Date,Execution Status,quantity,Price,Sector Name,Executed Quantity,Quantity
0,0,2312,Egypt,0,Buy,2024-01-10 10:16:10.000,Online,1,0,2024-01-10 00:00:00.000,Executed,100,66.3,Real Estate,100,100
1,1,1196,Egypt,0,Sell,2024-01-10 10:16:11.000,Online,1,0,2024-01-10 00:00:00.000,Executed,200,66.0,Real Estate,200,200
2,2,1759,Egypt,0,Buy,2024-01-10 10:16:12.000,Online,0,1,2024-01-10 00:00:00.000,Not Executed,200,65.0,Real Estate,0,200
3,3,4476,Egypt,1,Sell,2024-01-10 10:16:16.000,Online,0,1,2024-01-10 00:00:00.000,Not Executed,820,75.2,Financials,0,820
4,4,838,Egypt,2,Sell,2024-01-10 10:16:18.000,Online,0,1,2024-01-10 00:00:00.000,Not Executed,500,10.68,Materials,0,500


Read the clients dataset into a pandas dataframe


In [4]:
clients_df = pd.read_csv("../Data/clients_data_competition.csv")

In [5]:
clients_df.head()

Unnamed: 0,Client ID,Account ID,Gender,Risk Rate,Company Name,Is Closed,Is Dormant,Is Profile Suspended,Is Client Suspended,Client Type Name,OpenDate,BirthDate
0,0,0,Male,Low,HSB,0,0.0,0,0,Individuals,2/16/2014,1990-07-27
1,1,1,Female,Low,HSB,0,-1.0,1,0,Individuals,2/16/2014,1963-10-22
2,2,2,Male,Low,HSB,0,0.0,0,0,Individuals,2/17/2014,1971-05-14
3,3,3,Male,Low,HSB,0,0.0,0,0,Individuals,2/17/2014,1953-01-14
4,4,4,Male,Low,HSB,0,0.0,0,0,Individuals,2/17/2014,1976-06-09


## Account Level Dataset


### Orders Data


**_We created another dataframe "agg_orders" which represents the aggregate orders of each account_**


Drop the rows with null values


In [6]:
orders_df.dropna(inplace=True)

In [7]:
orders_df = orders_df[orders_df["Order Via"] == "Online"]

In [8]:
orders_df.drop(columns=["Order Via"], inplace=True)

In [9]:
account_ids_to_remove = clients_df[
    (clients_df["Client Type Name"] != "Individuals")
    | (clients_df["Company Name"] != "HSB")
]["Account ID"].unique()

# Remove the accounts from the accounts dataframe where Client Type Name is not 'individuals'
clients_df = clients_df[~clients_df["Account ID"].isin(account_ids_to_remove)]

# Remove the orders of these accounts from the orders dataframe
orders_df = orders_df[~orders_df["Account ID"].isin(account_ids_to_remove)]

In [10]:
clients_df.drop(columns=["Client Type Name", "Company Name"], inplace=True)

In [11]:
len(clients_df["Account ID"].unique())

11323

In [12]:
len(orders_df["Account ID"].unique())

6715

In [13]:
# Check if there are any accounts in the orders dataframe that are not in the accounts dataframe
accounts_in_orders_not_in_accounts = orders_df[
    ~orders_df["Account ID"].isin(clients_df["Account ID"])
]
if accounts_in_orders_not_in_accounts.empty:
    print(
        "There are no accounts in the orders dataframe that are not in the accounts dataframe."
    )
else:
    print(
        "There are accounts in the orders dataframe that are not in the accounts dataframe."
    )

# Check if there are any accounts in the accounts dataframe that are not in the orders dataframe
accounts_in_accounts_not_in_orders = clients_df[
    ~clients_df["Account ID"].isin(orders_df["Account ID"])
]
if accounts_in_accounts_not_in_orders.empty:
    print(
        "There are no accounts in the accounts dataframe that are not in the orders dataframe."
    )
else:
    print(
        "There are accounts in the accounts dataframe that are not in the orders dataframe."
    )

There are no accounts in the orders dataframe that are not in the accounts dataframe.
There are accounts in the accounts dataframe that are not in the orders dataframe.


In [14]:
orders_df.info()

<class 'pandas.core.frame.DataFrame'>
Index: 1833491 entries, 0 to 1987940
Data columns (total 15 columns):
 #   Column             Dtype  
---  ------             -----  
 0   Order ID           int64  
 1   Account ID         int64  
 2   Market Key         object 
 3   Security ID        int64  
 4   Order Type         object 
 5   Order Time         object 
 6   Is Completed       int64  
 7   Is Canceled        int64  
 8   Expire Date        object 
 9   Execution Status   object 
 10  quantity           int64  
 11  Price              float64
 12  Sector Name        object 
 13  Executed Quantity  int64  
 14  Quantity           int64  
dtypes: float64(1), int64(8), object(6)
memory usage: 223.8+ MB


Initialize a new Dataframe to store the aggregate orders per account


In [15]:
agg_orders = pd.DataFrame()

Add an AccountID column to the aggregate orders df


In [16]:
agg_orders["AccountID"] = clients_df["Account ID"].unique()

Compute and add the number of orders for each account


In [17]:
# Add a NumOfOrders column that has the number of orders for each account
agg_orders["NumOfOrders"] = orders_df.groupby("Account ID").size()

# Set the number of orders for the accounts that didn't place any order to 0
agg_orders["NumOfOrders"] = agg_orders["NumOfOrders"].fillna(0)

# Convert the type to int
agg_orders["NumOfOrders"] = agg_orders["NumOfOrders"].astype(int)

Compute and add the last order date for each account


In [18]:
# Convert the 'order_date' column to datetime
orders_df["Order Time"] = pd.to_datetime(orders_df["Order Time"])

# Create a new dataframe with the last order date for each account
agg_orders["LastOrder"] = orders_df.groupby("Account ID")["Order Time"].max()

# Convert the LastOrder column to datetime
agg_orders["LastOrder"] = pd.to_datetime(agg_orders["LastOrder"], format="%d-%m-%Y")

Compute and add the first order date for each account


In [19]:
# Create a new dataframe with the last order date for each account
agg_orders["FirstOrder"] = orders_df.groupby("Account ID")["Order Time"].min()

# Convert the LastOrder column to datetime
agg_orders["FirstOrder"] = pd.to_datetime(agg_orders["FirstOrder"], format="%d-%m-%Y")

Compute and add the number of completed orders for each account


In [20]:
# Add a NumOfCompleted column that has the number of completed orders for each account
agg_orders["NumOfCompleted"] = orders_df.groupby("Account ID")["Is Completed"].sum()

# Set the number of completed orders for the accounts that didn't place any order to 0
agg_orders["NumOfCompleted"] = agg_orders["NumOfCompleted"].fillna(0)

# # Convert the type to int
agg_orders["NumOfCompleted"] = agg_orders["NumOfCompleted"].astype(int)

Compute and add the number of canceled for each account


In [21]:
# Add a NumOfCanceled column that has the number of completed orders for each account
agg_orders["NumOfCanceled"] = orders_df.groupby("Account ID")["Is Canceled"].sum()

# Set the number of completed orders for the accounts that didn't place any order to 0
agg_orders["NumOfCanceled"] = agg_orders["NumOfCanceled"].fillna(0)

# # Convert the type to int
agg_orders["NumOfCanceled"] = agg_orders["NumOfCanceled"].astype(int)

Compute and add the average price of orders for each account


In [22]:
# Add a AvgPrice column that has the number of completed orders for each account
agg_orders["AvgPrice"] = (
    orders_df.groupby("Account ID")["Price"].sum()
    / orders_df.groupby("Account ID").size()
)

# Set the number of completed orders for the accounts that didn't place any order to 0
agg_orders["AvgPrice"] = agg_orders["AvgPrice"].fillna(0.0)

Compute and add the total executed quantity of orders for each account


In [23]:
# Add a TotalExecutedQuantity column that has the number of completed orders for each account
agg_orders["TotalExecutedQuantity"] = orders_df.groupby("Account ID")[
    "Executed Quantity"
].sum()

# Set the number of completed orders for the accounts that didn't place any order to 0
agg_orders["TotalExecutedQuantity"] = agg_orders["TotalExecutedQuantity"].fillna(0)

# # Convert the type to int
agg_orders["TotalExecutedQuantity"] = agg_orders["TotalExecutedQuantity"].astype(int)

Compute and add the total quantity of orders for each account


In [24]:
# Add a TotalQuantity column that has the number of completed orders for each account
agg_orders["TotalQuantity"] = orders_df.groupby("Account ID")["Quantity"].sum()

# Set the number of completed orders for the accounts that didn't place any order to 0
agg_orders["TotalQuantity"] = agg_orders["TotalQuantity"].fillna(0)

# # Convert the type to int
agg_orders["TotalQuantity"] = agg_orders["TotalQuantity"].astype(int)

Define a function to convert a categorical feature to its TFIDF dataframe and concatenate it with the aggregate orders dataframe


In [25]:
def FrequencyEncoder(column_name: str):
    temp = pd.get_dummies(orders_df, columns=[column_name])
    temp.replace({True: 1, False: 0}, inplace=True)

    frequency_encode_df = pd.DataFrame(data=agg_orders.iloc[:, 0])

    for word in orders_df[column_name].unique():
        col_name = f"{column_name}_{word}"

        frequency_encode_df[col_name.replace(" ", "")] = (
            temp.groupby("Account ID")[col_name].sum()
            / temp.groupby("Account ID").size()
        )

        frequency_encode_df[col_name.replace(" ", "")] = frequency_encode_df[
            col_name.replace(" ", "")
        ].fillna(0)

        frequency_encode_df[col_name.replace(" ", "")] = frequency_encode_df[
            col_name.replace(" ", "")
        ].astype(float)

    return pd.concat([agg_orders, frequency_encode_df.iloc[:, 1:]], axis=1)

Compute and add the Frequency Encoding of the Security ID column


In [26]:
agg_orders = FrequencyEncoder("Security ID")

Compute and add the Frequency Encoding of the Order Type column


In [27]:
agg_orders = FrequencyEncoder("Order Type")

Compute and add the Frequency Encoding of the Execution Status column


In [28]:
agg_orders = FrequencyEncoder("Execution Status")

Compute and add the Frequency Encoding of the Sector Name column


In [29]:
agg_orders = FrequencyEncoder("Sector Name")

Display the head of the agg_orders dataframe


In [30]:
agg_orders.head()

Unnamed: 0,AccountID,NumOfOrders,LastOrder,FirstOrder,NumOfCompleted,NumOfCanceled,AvgPrice,TotalExecutedQuantity,TotalQuantity,SecurityID_0,...,SectorName_INVESTMENT,SectorName_TelecommunicationServices,SectorName_REALESTATE,SectorName_Telecommunications,SectorName_FOOD,SectorName_Others,SectorName_Tourism,SectorName_ConsumerServices,SectorName_Utilities,SectorName_PharmaceuticalIndustries
0,0,0,NaT,NaT,0,0,0.0,0,0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1,1,0,NaT,NaT,0,0,0.0,0,0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2,2,21,2024-02-28 11:06:33,2022-01-04 12:06:31,17,2,7.652667,123895,137395,0.0,...,0.095238,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3,3,87,2024-03-10 12:32:34,2023-01-17 12:02:35,64,11,71.908161,682538,908478,0.0,...,0.0,0.022989,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
4,4,30,2024-02-26 12:47:33,2022-01-11 11:05:32,8,10,2.846267,356686,594953,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


Display the info of the agg_orders dataframe


In [31]:
agg_orders.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 11323 entries, 0 to 11322
Columns: 341 entries, AccountID to SectorName_PharmaceuticalIndustries
dtypes: datetime64[ns](2), float64(333), int64(6)
memory usage: 29.5 MB


### Clients Data


Drop the rows with null values


In [32]:
clients_df.dropna(inplace=True)

In [33]:
clients_df.info(verbose=True)

<class 'pandas.core.frame.DataFrame'>
Index: 11323 entries, 0 to 13521
Data columns (total 10 columns):
 #   Column                Non-Null Count  Dtype  
---  ------                --------------  -----  
 0   Client ID             11323 non-null  int64  
 1   Account ID            11323 non-null  int64  
 2   Gender                11323 non-null  object 
 3   Risk Rate             11323 non-null  object 
 4   Is Closed             11323 non-null  int64  
 5   Is Dormant            11323 non-null  float64
 6   Is Profile Suspended  11323 non-null  int64  
 7   Is Client Suspended   11323 non-null  int64  
 8   OpenDate              11323 non-null  object 
 9   BirthDate             11323 non-null  object 
dtypes: float64(1), int64(5), object(4)
memory usage: 973.1+ KB


Convert the OpenDate column to DateTime object


In [34]:
# Convert the OpenDate column to datetime
clients_df["OpenDate"] = pd.to_datetime(clients_df["OpenDate"], format="%m/%d/%Y")

Convert the BirthDate column to DateTime object, and calculate the age of the client


In [35]:
# Convert the BirthDate column to datetime
clients_df["BirthDate"] = pd.to_datetime(clients_df["BirthDate"], format="%Y-%m-%d")

In [36]:
def calculate_age(birth_date):
    # Assuming birth_date is a datetime object
    today = datetime.today()
    return (
        today.year
        - birth_date.year
        - ((today.month, today.day) < (birth_date.month, birth_date.day))
    )

In [37]:
clients_df["Age"] = clients_df["BirthDate"].apply(calculate_age)

Remove all whitespaces in the column names


In [38]:
clients_df.rename(columns=lambda x: x.replace(" ", ""), inplace=True)

Display the head of the clients dataframe


In [39]:
clients_df.head()

Unnamed: 0,ClientID,AccountID,Gender,RiskRate,IsClosed,IsDormant,IsProfileSuspended,IsClientSuspended,OpenDate,BirthDate,Age
0,0,0,Male,Low,0,0.0,0,0,2014-02-16,1990-07-27,33
1,1,1,Female,Low,0,-1.0,1,0,2014-02-16,1963-10-22,60
2,2,2,Male,Low,0,0.0,0,0,2014-02-17,1971-05-14,52
3,3,3,Male,Low,0,0.0,0,0,2014-02-17,1953-01-14,71
4,4,4,Male,Low,0,0.0,0,0,2014-02-17,1976-06-09,47


Display the info of the clients dataframe


In [40]:
clients_df.info()

<class 'pandas.core.frame.DataFrame'>
Index: 11323 entries, 0 to 13521
Data columns (total 11 columns):
 #   Column              Non-Null Count  Dtype         
---  ------              --------------  -----         
 0   ClientID            11323 non-null  int64         
 1   AccountID           11323 non-null  int64         
 2   Gender              11323 non-null  object        
 3   RiskRate            11323 non-null  object        
 4   IsClosed            11323 non-null  int64         
 5   IsDormant           11323 non-null  float64       
 6   IsProfileSuspended  11323 non-null  int64         
 7   IsClientSuspended   11323 non-null  int64         
 8   OpenDate            11323 non-null  datetime64[ns]
 9   BirthDate           11323 non-null  datetime64[ns]
 10  Age                 11323 non-null  int64         
dtypes: datetime64[ns](2), float64(1), int64(6), object(2)
memory usage: 1.0+ MB


### Merging


**_We will inner join the agg_orders and clients_df dataframes on the AccountID column_**


Perform the merging between both dataframes


In [41]:
clients_df.info()

<class 'pandas.core.frame.DataFrame'>
Index: 11323 entries, 0 to 13521
Data columns (total 11 columns):
 #   Column              Non-Null Count  Dtype         
---  ------              --------------  -----         
 0   ClientID            11323 non-null  int64         
 1   AccountID           11323 non-null  int64         
 2   Gender              11323 non-null  object        
 3   RiskRate            11323 non-null  object        
 4   IsClosed            11323 non-null  int64         
 5   IsDormant           11323 non-null  float64       
 6   IsProfileSuspended  11323 non-null  int64         
 7   IsClientSuspended   11323 non-null  int64         
 8   OpenDate            11323 non-null  datetime64[ns]
 9   BirthDate           11323 non-null  datetime64[ns]
 10  Age                 11323 non-null  int64         
dtypes: datetime64[ns](2), float64(1), int64(6), object(2)
memory usage: 1.0+ MB


In [42]:
agg_orders.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 11323 entries, 0 to 11322
Columns: 341 entries, AccountID to SectorName_PharmaceuticalIndustries
dtypes: datetime64[ns](2), float64(333), int64(6)
memory usage: 29.5 MB


In [43]:
df_account_level = pd.merge(clients_df, agg_orders, on="AccountID", how="inner")

In [44]:
# Get the current date
now = datetime.now()

In [45]:
order_rates_start = []
order_rates_end = []
quantity_rates_start = []
quantity_rates_end = []
avg_quantity_per_order_start = []
avg_quantity_per_order_end = []

In [46]:
for i, account in df_account_level.iterrows():
    midpointDate = account["OpenDate"] + (now - account["OpenDate"]) / 2
    openDate = account["OpenDate"]

    days_open_midpoint = (midpointDate - openDate).days
    days_midpoint_now = (now - midpointDate).days

    filtered_date_df = orders_df[
        (orders_df["Order Time"] >= openDate)
        & (orders_df["Order Time"] < midpointDate)
        & (orders_df["Account ID"] == account["AccountID"])
    ]

    filtered_df_2 = orders_df[
        (orders_df["Order Time"] >= midpointDate)
        & (orders_df["Order Time"] < now)
        & (orders_df["Account ID"] == account["AccountID"])
    ]

    num_of_orders_start = len(filtered_date_df)
    num_of_orders_end = len(filtered_df_2)

    quantity_ordered_start = filtered_date_df["Quantity"].sum()
    quantity_ordered_end = filtered_df_2["Quantity"].sum()

    avg_quantity_start = filtered_date_df["Quantity"].mean()
    avg_quantity_end = filtered_df_2["Quantity"].mean()

    order_rates_start.append(num_of_orders_start / days_open_midpoint)
    order_rates_end.append(num_of_orders_end / days_midpoint_now)

    quantity_rates_start.append(quantity_ordered_start / days_open_midpoint)
    quantity_rates_end.append(quantity_ordered_end / days_midpoint_now)

    avg_quantity_per_order_start.append(avg_quantity_start)
    avg_quantity_per_order_end.append(avg_quantity_end)

In [47]:
df_account_level["OrderRate_Start"] = pd.Series(order_rates_start)
df_account_level["OrderRate_End"] = pd.Series(order_rates_end)
df_account_level["QuantityOrderedRate_Start"] = pd.Series(quantity_rates_start)
df_account_level["QuantityOrderedRate_End"] = pd.Series(quantity_rates_end)
df_account_level["AvgQuantityPerOrder_Start"] = pd.Series(avg_quantity_per_order_start)
df_account_level["AvgQuantityPerOrder_End"] = pd.Series(avg_quantity_per_order_end)

In [48]:
df_account_level["AvgQuantityPerOrder_Start"] = df_account_level[
    "AvgQuantityPerOrder_Start"
].fillna(0)
df_account_level["AvgQuantityPerOrder_End"] = df_account_level[
    "AvgQuantityPerOrder_End"
].fillna(0)

In [49]:
df_account_level["ExecutedQuantityRatio"] = df_account_level.apply(
    lambda row: (
        1
        if row["TotalQuantity"] == 0
        else row["TotalExecutedQuantity"] / row["TotalQuantity"]
    ),
    axis=1,
)

Display the head of the dataframe


In [50]:
df_account_level.head()

Unnamed: 0,ClientID,AccountID,Gender,RiskRate,IsClosed,IsDormant,IsProfileSuspended,IsClientSuspended,OpenDate,BirthDate,...,SectorName_ConsumerServices,SectorName_Utilities,SectorName_PharmaceuticalIndustries,OrderRate_Start,OrderRate_End,QuantityOrderedRate_Start,QuantityOrderedRate_End,AvgQuantityPerOrder_Start,AvgQuantityPerOrder_End,ExecutedQuantityRatio
0,0,0,Male,Low,0,0.0,0,0,2014-02-16,1990-07-27,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0
1,1,1,Female,Low,0,-1.0,1,0,2014-02-16,1963-10-22,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0
2,2,2,Male,Low,0,0.0,0,0,2014-02-17,1971-05-14,...,0.0,0.0,0.0,0.0,0.011302,0.0,73.947793,0.0,6542.619048,0.901743
3,3,3,Male,Low,0,0.0,0,0,2014-02-17,1953-01-14,...,0.0,0.0,0.0,0.0,0.046825,0.0,488.95479,0.0,10442.275862,0.751298
4,4,4,Male,Low,0,0.0,0,0,2014-02-17,1976-06-09,...,0.0,0.0,0.0,0.0,0.016146,0.0,320.211518,0.0,19831.766667,0.59952


Display the info of the dataframe


In [51]:
df_account_level.info(verbose=True)

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 11323 entries, 0 to 11322
Data columns (total 358 columns):
 #    Column                                Dtype         
---   ------                                -----         
 0    ClientID                              int64         
 1    AccountID                             int64         
 2    Gender                                object        
 3    RiskRate                              object        
 4    IsClosed                              int64         
 5    IsDormant                             float64       
 6    IsProfileSuspended                    int64         
 7    IsClientSuspended                     int64         
 8    OpenDate                              datetime64[ns]
 9    BirthDate                             datetime64[ns]
 10   Age                                   int64         
 11   NumOfOrders                           int64         
 12   LastOrder                             datetime64[ns]
 13  

In [52]:
len(df_account_level["ClientID"].unique())

8871

## Client Level Dataset


In [53]:
df = pd.DataFrame()

In [54]:
# Create a new DataFrame with ClientID and corresponding account count
df = pd.DataFrame(
    {
        "ClientID": df_account_level.groupby("ClientID")["AccountID"].nunique().index,
    }
)

In [55]:
df["Gender"] = (df_account_level.groupby("ClientID")["Gender"].first().reset_index())[
    "Gender"
]
df["Age"] = (df_account_level.groupby("ClientID")["Age"].first().reset_index())["Age"]
df["IsClientSuspended"] = (
    df_account_level.groupby("ClientID")["IsClientSuspended"].first().reset_index()
)["IsClientSuspended"]
df["RiskRate"] = (
    df_account_level.groupby("ClientID")["RiskRate"].first().reset_index()
)["RiskRate"]

In [56]:
df["NumOfAccounts"] = df_account_level.groupby("ClientID").size().values

In [57]:
df["NumOfClosedAccounts"] = (
    df_account_level.groupby("ClientID")["IsClosed"].sum().values
)

In [58]:
df["NumOfSuspendedAccounts"] = (
    df_account_level.groupby("ClientID")["IsProfileSuspended"].sum().values
)

In [59]:
df["NumOfOrders"] = df_account_level.groupby("ClientID")["NumOfOrders"].sum().values

In [60]:
df["NumOfCompletedOrders"] = (
    df_account_level.groupby("ClientID")["NumOfCompleted"].sum().values
)

In [61]:
df["NumOfCanceledOrders"] = (
    df_account_level.groupby("ClientID")["NumOfCanceled"].sum().values
)

In [62]:
df["TotalExecutedQuantity"] = (
    df_account_level.groupby("ClientID")["TotalExecutedQuantity"].sum().values
)

In [63]:
df["TotalQuantity"] = df_account_level.groupby("ClientID")["TotalQuantity"].sum().values

In [64]:
df["AvgPrice"] = (
    df_account_level.groupby("ClientID")["AvgPrice"].sum()
    / df_account_level.groupby("ClientID").size()
).values

In [65]:
agg_dict = {}

for col in df_account_level.columns:
    if (
        "OrderType" in col
        or "SecurityID" in col
        or "ExecutionStatus" in col
        or "SectorName" in col
    ):
        agg_dict[col] = (
            df_account_level.groupby("ClientID")[col].sum()
            / df_account_level.groupby("ClientID").size()
        ).values

df = pd.concat([df, pd.DataFrame(agg_dict)], axis=1)

In [66]:
df["FirstOpenAccountDate"] = (
    df_account_level.groupby("ClientID")["OpenDate"].min().values
)

In [67]:
df["LastOpenAccountDate"] = (
    df_account_level.groupby("ClientID")["OpenDate"].max().values
)

In [68]:
df["FirstOrderDateAcrossAccounts"] = (
    df_account_level.groupby("ClientID")["LastOrder"].min().values
)

df["FirstOrderDateAcrossAccounts"] = pd.to_datetime(
    df["FirstOrderDateAcrossAccounts"].dt.date
)

In [69]:
df["LastOrderDateAcrossAccounts"] = (
    df_account_level.groupby("ClientID")["LastOrder"].max().values
)

df["LastOrderDateAcrossAccounts"] = pd.to_datetime(
    df["LastOrderDateAcrossAccounts"].dt.date
)

Define a function so that we can compute if the account is dormant or not.

If the account has not placed any order, we set the Is Dormant value to -1, so that we can isolate the accounts who have not placed any orders.


In [70]:
def check_dormant(date):
    one_year_before_now = datetime.now() - timedelta(days=365)

    if date < one_year_before_now:
        return 1
    else:
        return 0

Recompute the Is Dormant column


In [71]:
df["IsDormant"] = df["LastOrderDateAcrossAccounts"].apply(check_dormant)

In [72]:
df["AvgOrderRate_Start"] = (
    df_account_level.groupby("ClientID")["OrderRate_Start"].mean().values
)

In [73]:
df["AvgOrderRate_End"] = (
    df_account_level.groupby("ClientID")["OrderRate_End"].mean().values
)

In [74]:
df["AvgOrderRate_Difference"] = df["AvgOrderRate_End"] - df["AvgOrderRate_Start"]

In [75]:
df["AvgQuantityOrderedRate_Start"] = (
    df_account_level.groupby("ClientID")["QuantityOrderedRate_Start"].mean().values
)

In [76]:
df["AvgQuantityOrderedRate_End"] = (
    df_account_level.groupby("ClientID")["QuantityOrderedRate_End"].mean().values
)

In [77]:
df["AvgQuantityOrderedRate_Difference"] = (
    df["AvgQuantityOrderedRate_End"] - df["AvgQuantityOrderedRate_Start"]
)

In [78]:
df["AvgQuantityPerAccount_Start"] = (
    df_account_level.groupby("ClientID")["AvgQuantityPerOrder_Start"].mean().values
)

In [79]:
df["AvgQuantityPerAccount_End"] = (
    df_account_level.groupby("ClientID")["AvgQuantityPerOrder_End"].mean().values
)

In [80]:
df["AvgQuantityPerAccount_Difference"] = (
    df["AvgQuantityPerAccount_End"] - df["AvgQuantityPerAccount_Start"]
)

In [81]:
df["AvgExecutedQuantityRatio"] = (
    df_account_level.groupby("ClientID")["ExecutedQuantityRatio"].mean().values
)

In [82]:
df["ClosedAccountsRatio"] = df["NumOfClosedAccounts"] / df["NumOfAccounts"]

In [83]:
df["SuspendedAccountsRatio"] = df["NumOfSuspendedAccounts"] / df["NumOfAccounts"]

In [84]:
# Create the Year columns
df["FirstOpenAccountDate_Year"] = df["FirstOpenAccountDate"].dt.year

# Convert the type to int
df["FirstOpenAccountDate_Year"] = df["FirstOpenAccountDate_Year"].astype(int)

In [85]:
# Create the Year columns
df["LastOpenAccountDate_Year"] = df["LastOpenAccountDate"].dt.year

# Convert the type to int
df["LastOpenAccountDate_Year"] = df["LastOpenAccountDate_Year"].astype(int)

In [86]:
# Create the Year columns
df["FirstOrderDateAcrossAccounts_Year"] = df["FirstOrderDateAcrossAccounts"].dt.year

df["FirstOrderDateAcrossAccounts_Year"] = df[
    "FirstOrderDateAcrossAccounts_Year"
].fillna(0)

# Convert the type to int
df["FirstOrderDateAcrossAccounts_Year"] = df[
    "FirstOrderDateAcrossAccounts_Year"
].astype(int)

In [87]:
# Create the Year columns
df["LastOrderDateAcrossAccounts_Year"] = df["LastOrderDateAcrossAccounts"].dt.year

df["LastOrderDateAcrossAccounts_Year"] = df["LastOrderDateAcrossAccounts_Year"].fillna(
    0
)

# Convert the type to int
df["LastOrderDateAcrossAccounts_Year"] = df["LastOrderDateAcrossAccounts_Year"].astype(
    int
)

In [88]:
df.info(verbose=True)

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 8871 entries, 0 to 8870
Data columns (total 367 columns):
 #    Column                                Dtype         
---   ------                                -----         
 0    ClientID                              int64         
 1    Gender                                object        
 2    Age                                   int64         
 3    IsClientSuspended                     int64         
 4    RiskRate                              object        
 5    NumOfAccounts                         int64         
 6    NumOfClosedAccounts                   int64         
 7    NumOfSuspendedAccounts                int64         
 8    NumOfOrders                           int64         
 9    NumOfCompletedOrders                  int64         
 10   NumOfCanceledOrders                   int64         
 11   TotalExecutedQuantity                 int64         
 12   TotalQuantity                         int64         
 13   A

## Defining The Label


**_We defined our label to be a combination of multiple features_**


Create the label


In [89]:
df["Churned"] = (
    (df["IsDormant"] == 1)
    & ((df["AvgOrderRate_Difference"] < 0) | (df["NumOfOrders"] == 0))
    | (df["RiskRate"] == "High") & (df["AvgExecutedQuantityRatio"] <= 0.5)
    | (df["ClosedAccountsRatio"] > 0.9)
    | (df["IsClientSuspended"] == 1)
    | (df["SuspendedAccountsRatio"] >= 0.5)
)

df = df.replace({True: 1, False: 0})

df["Churned"] = df["Churned"].astype(int)

In [90]:
df.to_csv("../Data/visualization_data.csv", index=False)

In [91]:
df.info(verbose=True)

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 8871 entries, 0 to 8870
Data columns (total 368 columns):
 #    Column                                Dtype         
---   ------                                -----         
 0    ClientID                              int64         
 1    Gender                                object        
 2    Age                                   int64         
 3    IsClientSuspended                     int64         
 4    RiskRate                              object        
 5    NumOfAccounts                         int64         
 6    NumOfClosedAccounts                   int64         
 7    NumOfSuspendedAccounts                int64         
 8    NumOfOrders                           int64         
 9    NumOfCompletedOrders                  int64         
 10   NumOfCanceledOrders                   int64         
 11   TotalExecutedQuantity                 int64         
 12   TotalQuantity                         int64         
 13   A

## OHE


The columns to be one-hot encoded


In [92]:
columns = [
    "Gender",
    "RiskRate",
    "FirstOpenAccountDate_Year",
    "LastOpenAccountDate_Year",
    "FirstOrderDateAcrossAccounts_Year",
    "LastOrderDateAcrossAccounts_Year",
]

OHE these columns


In [93]:
df = pd.get_dummies(
    df,
    columns=columns,
    dtype=int,
)

Drop the columns used for creating the label to remove bias, also drop the unnecessary columns


In [94]:
df.drop(
    columns=[
        "AvgExecutedQuantityRatio",
        "IsDormant",
        "RiskRate_High",
        "ClosedAccountsRatio",
        "IsClientSuspended",
        "SuspendedAccountsRatio",
        "NumOfOrders",
        "AvgOrderRate_Difference",
        "FirstOpenAccountDate",
        "LastOpenAccountDate",
        "FirstOrderDateAcrossAccounts",
        "LastOrderDateAcrossAccounts",
        "NumOfSuspendedAccounts",
    ],
    inplace=True,
)

Remove all whitespaces in the column names


In [95]:
df.rename(columns=lambda x: x.replace(" ", ""), inplace=True)

## Normalization


The columns to be normalized


In [96]:
columns = [
    "NumOfCompletedOrders",
    "NumOfCanceledOrders",
    "AvgPrice",
    "TotalExecutedQuantity",
    "TotalQuantity",
    "Age",
    "NumOfAccounts",
    "NumOfClosedAccounts",
    "NumOfCompletedOrders",
    "NumOfCanceledOrders",
    "AvgOrderRate_Start",
    "AvgOrderRate_End",
    "AvgQuantityOrderedRate_Start",
    "AvgQuantityOrderedRate_End",
    "AvgQuantityOrderedRate_Difference",
    "AvgQuantityPerAccount_Start",
    "AvgQuantityPerAccount_End",
    "AvgQuantityPerAccount_Difference",
]

In [97]:
df = pd.read_csv("../Data/cleaned_dataset.csv")

Define a function to normalize the data based on a distribution


In [98]:
def get_normalized_data(data, dist):
    if dist == "uniform":
        return MinMaxScaler().fit_transform(data)
    elif dist == "norm":
        return StandardScaler().fit_transform(data)
    elif dist == "cauchy":
        ranked = stats.rankdata(data, method="average")
        # Convert ranks to percentiles
        percentiles = 100.0 * (ranked - 1) / (len(data) - 1)
        # Get the quantiles
        return np.percentile(data, percentiles)
    else:
        return np.log(np.abs(data.flatten()) + 1)

Define a function to get the best distribution that fits the data


In [99]:
def get_best_distribution(columns, df):
    columns_distributions_dict = {column: "" for column in columns}

    for column in columns:
        print("###### " + column + " ######")

        data = df[column].values

        f = Fitter(
            data,
            distributions=get_common_distributions(),
        )
        f.fit()
        f.summary(plot=False)
        dist = f.get_best(method="sumsquare_error")
        best_dist = ""
        for key in dist.keys():
            best_dist = key

        columns_distributions_dict[column] = str(best_dist)
        print(column)
        print(f"Best Distribution: {best_dist}")
        print()

    return columns_distributions_dict

Define a function to normalize the data based on its best distribution


In [100]:
def normalize(columns, df):
    columns_distributions_dict = get_best_distribution(columns, df)

    for column in columns_distributions_dict.keys():
        data = np.array(df[column]).reshape(-1, 1)
        df[column] = get_normalized_data(
            data=data, dist=columns_distributions_dict[column]
        )
    return columns_distributions_dict

Normalize the data


In [101]:
print(normalize(columns, df))

###### NumOfCompletedOrders ######


[32m2024-04-22 22:16:48.022[0m | [1mINFO    [0m | [36mfitter.fitter[0m:[36m_fit_single_distribution[0m:[36m337[0m - [1mFitted expon distribution with error=51.274878)[0m
[32m2024-04-22 22:16:48.036[0m | [1mINFO    [0m | [36mfitter.fitter[0m:[36m_fit_single_distribution[0m:[36m337[0m - [1mFitted norm distribution with error=42.333425)[0m
[32m2024-04-22 22:16:48.065[0m | [1mINFO    [0m | [36mfitter.fitter[0m:[36m_fit_single_distribution[0m:[36m337[0m - [1mFitted uniform distribution with error=44.384776)[0m
[32m2024-04-22 22:16:48.082[0m | [1mINFO    [0m | [36mfitter.fitter[0m:[36m_fit_single_distribution[0m:[36m337[0m - [1mFitted rayleigh distribution with error=44.863439)[0m
[32m2024-04-22 22:16:48.117[0m | [1mINFO    [0m | [36mfitter.fitter[0m:[36m_fit_single_distribution[0m:[36m337[0m - [1mFitted cauchy distribution with error=43.372838)[0m
[32m2024-04-22 22:16:48.433[0m | [1mINFO    [0m | [36mfitter.fitter[0m:[36m_fi

NumOfCompletedOrders
Best Distribution: norm

###### NumOfCanceledOrders ######


[32m2024-04-22 22:16:48.978[0m | [1mINFO    [0m | [36mfitter.fitter[0m:[36m_fit_single_distribution[0m:[36m337[0m - [1mFitted gamma distribution with error=229.50121)[0m
[32m2024-04-22 22:16:49.040[0m | [1mINFO    [0m | [36mfitter.fitter[0m:[36m_fit_single_distribution[0m:[36m337[0m - [1mFitted chi2 distribution with error=206.461286)[0m
[32m2024-04-22 22:16:49.041[0m | [1mINFO    [0m | [36mfitter.fitter[0m:[36m_fit_single_distribution[0m:[36m337[0m - [1mFitted powerlaw distribution with error=235.263055)[0m
[32m2024-04-22 22:16:49.086[0m | [1mINFO    [0m | [36mfitter.fitter[0m:[36m_fit_single_distribution[0m:[36m337[0m - [1mFitted exponpow distribution with error=156.496175)[0m
[32m2024-04-22 22:16:49.106[0m | [1mINFO    [0m | [36mfitter.fitter[0m:[36m_fit_single_distribution[0m:[36m337[0m - [1mFitted lognorm distribution with error=243.958992)[0m
[32m2024-04-22 22:16:49.191[0m | [1mINFO    [0m | [36mfitter.fitter[0m:[

NumOfCanceledOrders
Best Distribution: exponpow

###### AvgPrice ######


[32m2024-04-22 22:16:49.627[0m | [1mINFO    [0m | [36mfitter.fitter[0m:[36m_fit_single_distribution[0m:[36m337[0m - [1mFitted gamma distribution with error=0.002665)[0m
[32m2024-04-22 22:16:49.660[0m | [1mINFO    [0m | [36mfitter.fitter[0m:[36m_fit_single_distribution[0m:[36m337[0m - [1mFitted powerlaw distribution with error=0.002246)[0m
[32m2024-04-22 22:16:49.669[0m | [1mINFO    [0m | [36mfitter.fitter[0m:[36m_fit_single_distribution[0m:[36m337[0m - [1mFitted chi2 distribution with error=0.001051)[0m
[32m2024-04-22 22:16:49.711[0m | [1mINFO    [0m | [36mfitter.fitter[0m:[36m_fit_single_distribution[0m:[36m337[0m - [1mFitted exponpow distribution with error=0.000999)[0m
[32m2024-04-22 22:16:49.722[0m | [1mINFO    [0m | [36mfitter.fitter[0m:[36m_fit_single_distribution[0m:[36m337[0m - [1mFitted lognorm distribution with error=0.002633)[0m
[32m2024-04-22 22:16:49.776[0m | [1mINFO    [0m | [36mfitter.fitter[0m:[36m_fit_s

AvgPrice
Best Distribution: cauchy

###### TotalExecutedQuantity ######


[32m2024-04-22 22:16:50.220[0m | [1mINFO    [0m | [36mfitter.fitter[0m:[36m_fit_single_distribution[0m:[36m337[0m - [1mFitted gamma distribution with error=0.162832)[0m
[32m2024-04-22 22:16:50.274[0m | [1mINFO    [0m | [36mfitter.fitter[0m:[36m_fit_single_distribution[0m:[36m337[0m - [1mFitted powerlaw distribution with error=0.400024)[0m
[32m2024-04-22 22:16:50.286[0m | [1mINFO    [0m | [36mfitter.fitter[0m:[36m_fit_single_distribution[0m:[36m337[0m - [1mFitted chi2 distribution with error=0.348359)[0m
[32m2024-04-22 22:16:50.333[0m | [1mINFO    [0m | [36mfitter.fitter[0m:[36m_fit_single_distribution[0m:[36m337[0m - [1mFitted exponpow distribution with error=0.923748)[0m
[32m2024-04-22 22:16:50.344[0m | [1mINFO    [0m | [36mfitter.fitter[0m:[36m_fit_single_distribution[0m:[36m337[0m - [1mFitted lognorm distribution with error=0.490695)[0m
[32m2024-04-22 22:16:50.385[0m | [1mINFO    [0m | [36mfitter.fitter[0m:[36m_fit_s

TotalExecutedQuantity
Best Distribution: cauchy

###### TotalQuantity ######


[32m2024-04-22 22:16:50.747[0m | [1mINFO    [0m | [36mfitter.fitter[0m:[36m_fit_single_distribution[0m:[36m337[0m - [1mFitted gamma distribution with error=1.023626)[0m
[32m2024-04-22 22:16:50.771[0m | [1mINFO    [0m | [36mfitter.fitter[0m:[36m_fit_single_distribution[0m:[36m337[0m - [1mFitted chi2 distribution with error=1.102228)[0m
[32m2024-04-22 22:16:50.838[0m | [1mINFO    [0m | [36mfitter.fitter[0m:[36m_fit_single_distribution[0m:[36m337[0m - [1mFitted lognorm distribution with error=0.93271)[0m
[32m2024-04-22 22:16:50.848[0m | [1mINFO    [0m | [36mfitter.fitter[0m:[36m_fit_single_distribution[0m:[36m337[0m - [1mFitted exponpow distribution with error=16.06327)[0m
[32m2024-04-22 22:16:50.882[0m | [1mINFO    [0m | [36mfitter.fitter[0m:[36m_fit_single_distribution[0m:[36m337[0m - [1mFitted norm distribution with error=668.746696)[0m
[32m2024-04-22 22:16:50.886[0m | [1mINFO    [0m | [36mfitter.fitter[0m:[36m_fit_sing

TotalQuantity
Best Distribution: lognorm

###### Age ######


[32m2024-04-22 22:16:51.056[0m | [1mINFO    [0m | [36mfitter.fitter[0m:[36m_fit_single_distribution[0m:[36m337[0m - [1mFitted gamma distribution with error=1615.742902)[0m
[32m2024-04-22 22:16:51.071[0m | [1mINFO    [0m | [36mfitter.fitter[0m:[36m_fit_single_distribution[0m:[36m337[0m - [1mFitted chi2 distribution with error=1758.410427)[0m
[32m2024-04-22 22:16:51.086[0m | [1mINFO    [0m | [36mfitter.fitter[0m:[36m_fit_single_distribution[0m:[36m337[0m - [1mFitted exponpow distribution with error=577.569899)[0m
[32m2024-04-22 22:16:51.122[0m | [1mINFO    [0m | [36mfitter.fitter[0m:[36m_fit_single_distribution[0m:[36m337[0m - [1mFitted expon distribution with error=3912.064368)[0m
[32m2024-04-22 22:16:51.130[0m | [1mINFO    [0m | [36mfitter.fitter[0m:[36m_fit_single_distribution[0m:[36m337[0m - [1mFitted norm distribution with error=4670.220196)[0m
[32m2024-04-22 22:16:51.154[0m | [1mINFO    [0m | [36mfitter.fitter[0m:[3

Age
Best Distribution: powerlaw

###### NumOfAccounts ######


[32m2024-04-22 22:16:51.362[0m | [1mINFO    [0m | [36mfitter.fitter[0m:[36m_fit_single_distribution[0m:[36m337[0m - [1mFitted cauchy distribution with error=4923.86385)[0m
[32m2024-04-22 22:16:51.372[0m | [1mINFO    [0m | [36mfitter.fitter[0m:[36m_fit_single_distribution[0m:[36m337[0m - [1mFitted gamma distribution with error=4061.317622)[0m
[32m2024-04-22 22:16:51.390[0m | [1mINFO    [0m | [36mfitter.fitter[0m:[36m_fit_single_distribution[0m:[36m337[0m - [1mFitted chi2 distribution with error=4112.35577)[0m
[32m2024-04-22 22:16:51.410[0m | [1mINFO    [0m | [36mfitter.fitter[0m:[36m_fit_single_distribution[0m:[36m337[0m - [1mFitted exponpow distribution with error=4384.032867)[0m
[32m2024-04-22 22:16:51.440[0m | [1mINFO    [0m | [36mfitter.fitter[0m:[36m_fit_single_distribution[0m:[36m337[0m - [1mFitted norm distribution with error=11786.996283)[0m
[32m2024-04-22 22:16:51.441[0m | [1mINFO    [0m | [36mfitter.fitter[0m:[

NumOfAccounts
Best Distribution: expon

###### NumOfClosedAccounts ######


[32m2024-04-22 22:16:51.769[0m | [1mINFO    [0m | [36mfitter.fitter[0m:[36m_fit_single_distribution[0m:[36m337[0m - [1mFitted cauchy distribution with error=12617.913636)[0m
[32m2024-04-22 22:16:51.844[0m | [1mINFO    [0m | [36mfitter.fitter[0m:[36m_fit_single_distribution[0m:[36m337[0m - [1mFitted gamma distribution with error=8881.975582)[0m
[32m2024-04-22 22:16:51.851[0m | [1mINFO    [0m | [36mfitter.fitter[0m:[36m_fit_single_distribution[0m:[36m337[0m - [1mFitted chi2 distribution with error=10339.910317)[0m
[32m2024-04-22 22:16:51.889[0m | [1mINFO    [0m | [36mfitter.fitter[0m:[36m_fit_single_distribution[0m:[36m337[0m - [1mFitted exponpow distribution with error=11526.610333)[0m
[32m2024-04-22 22:16:51.902[0m | [1mINFO    [0m | [36mfitter.fitter[0m:[36m_fit_single_distribution[0m:[36m337[0m - [1mFitted powerlaw distribution with error=12543.944253)[0m
[32m2024-04-22 22:16:51.952[0m | [1mINFO    [0m | [36mfitter.fitt

NumOfClosedAccounts
Best Distribution: expon

###### NumOfCompletedOrders ######


[32m2024-04-22 22:16:52.449[0m | [1mINFO    [0m | [36mfitter.fitter[0m:[36m_fit_single_distribution[0m:[36m337[0m - [1mFitted gamma distribution with error=42.541988)[0m
[32m2024-04-22 22:16:52.500[0m | [1mINFO    [0m | [36mfitter.fitter[0m:[36m_fit_single_distribution[0m:[36m337[0m - [1mFitted powerlaw distribution with error=54.727439)[0m
[32m2024-04-22 22:16:52.514[0m | [1mINFO    [0m | [36mfitter.fitter[0m:[36m_fit_single_distribution[0m:[36m337[0m - [1mFitted chi2 distribution with error=48.418172)[0m
[32m2024-04-22 22:16:52.569[0m | [1mINFO    [0m | [36mfitter.fitter[0m:[36m_fit_single_distribution[0m:[36m337[0m - [1mFitted exponpow distribution with error=60.956898)[0m
[32m2024-04-22 22:16:52.587[0m | [1mINFO    [0m | [36mfitter.fitter[0m:[36m_fit_single_distribution[0m:[36m337[0m - [1mFitted lognorm distribution with error=61.855216)[0m
[32m2024-04-22 22:16:52.620[0m | [1mINFO    [0m | [36mfitter.fitter[0m:[36m_

NumOfCompletedOrders
Best Distribution: norm

###### NumOfCanceledOrders ######


[32m2024-04-22 22:16:53.038[0m | [1mINFO    [0m | [36mfitter.fitter[0m:[36m_fit_single_distribution[0m:[36m337[0m - [1mFitted gamma distribution with error=229.50121)[0m
[32m2024-04-22 22:16:53.104[0m | [1mINFO    [0m | [36mfitter.fitter[0m:[36m_fit_single_distribution[0m:[36m337[0m - [1mFitted powerlaw distribution with error=235.263055)[0m
[32m2024-04-22 22:16:53.111[0m | [1mINFO    [0m | [36mfitter.fitter[0m:[36m_fit_single_distribution[0m:[36m337[0m - [1mFitted chi2 distribution with error=206.461286)[0m
[32m2024-04-22 22:16:53.161[0m | [1mINFO    [0m | [36mfitter.fitter[0m:[36m_fit_single_distribution[0m:[36m337[0m - [1mFitted exponpow distribution with error=156.496175)[0m
[32m2024-04-22 22:16:53.183[0m | [1mINFO    [0m | [36mfitter.fitter[0m:[36m_fit_single_distribution[0m:[36m337[0m - [1mFitted lognorm distribution with error=243.958992)[0m
[32m2024-04-22 22:16:53.212[0m | [1mINFO    [0m | [36mfitter.fitter[0m:[

NumOfCanceledOrders
Best Distribution: exponpow

###### AvgOrderRate_Start ######


[32m2024-04-22 22:16:53.540[0m | [1mINFO    [0m | [36mfitter.fitter[0m:[36m_fit_single_distribution[0m:[36m337[0m - [1mFitted cauchy distribution with error=2423.284071)[0m
[32m2024-04-22 22:16:53.784[0m | [1mINFO    [0m | [36mfitter.fitter[0m:[36m_fit_single_distribution[0m:[36m337[0m - [1mFitted gamma distribution with error=1533.539749)[0m
[32m2024-04-22 22:16:53.820[0m | [1mINFO    [0m | [36mfitter.fitter[0m:[36m_fit_single_distribution[0m:[36m337[0m - [1mFitted chi2 distribution with error=1782.031497)[0m
[32m2024-04-22 22:16:53.825[0m | [1mINFO    [0m | [36mfitter.fitter[0m:[36m_fit_single_distribution[0m:[36m337[0m - [1mFitted powerlaw distribution with error=2389.572657)[0m
[32m2024-04-22 22:16:53.866[0m | [1mINFO    [0m | [36mfitter.fitter[0m:[36m_fit_single_distribution[0m:[36m337[0m - [1mFitted exponpow distribution with error=1559.226266)[0m
[32m2024-04-22 22:16:53.896[0m | [1mINFO    [0m | [36mfitter.fitter[

AvgOrderRate_Start
Best Distribution: expon

###### AvgOrderRate_End ######


[32m2024-04-22 22:16:54.415[0m | [1mINFO    [0m | [36mfitter.fitter[0m:[36m_fit_single_distribution[0m:[36m337[0m - [1mFitted gamma distribution with error=188.001679)[0m
[32m2024-04-22 22:16:54.435[0m | [1mINFO    [0m | [36mfitter.fitter[0m:[36m_fit_single_distribution[0m:[36m337[0m - [1mFitted powerlaw distribution with error=416.886867)[0m
[32m2024-04-22 22:16:54.442[0m | [1mINFO    [0m | [36mfitter.fitter[0m:[36m_fit_single_distribution[0m:[36m337[0m - [1mFitted chi2 distribution with error=267.47227)[0m
[32m2024-04-22 22:16:54.512[0m | [1mINFO    [0m | [36mfitter.fitter[0m:[36m_fit_single_distribution[0m:[36m337[0m - [1mFitted exponpow distribution with error=216.169939)[0m
[32m2024-04-22 22:16:54.518[0m | [1mINFO    [0m | [36mfitter.fitter[0m:[36m_fit_single_distribution[0m:[36m337[0m - [1mFitted lognorm distribution with error=435.297246)[0m
[32m2024-04-22 22:16:54.577[0m | [1mINFO    [0m | [36mfitter.fitter[0m:[

AvgOrderRate_End
Best Distribution: cauchy

###### AvgQuantityOrderedRate_Start ######


[32m2024-04-22 22:16:54.879[0m | [1mINFO    [0m | [36mfitter.fitter[0m:[36m_fit_single_distribution[0m:[36m337[0m - [1mFitted cauchy distribution with error=505.079009)[0m
[32m2024-04-22 22:16:55.051[0m | [1mINFO    [0m | [36mfitter.fitter[0m:[36m_fit_single_distribution[0m:[36m337[0m - [1mFitted gamma distribution with error=235.016725)[0m
[32m2024-04-22 22:16:55.077[0m | [1mINFO    [0m | [36mfitter.fitter[0m:[36m_fit_single_distribution[0m:[36m337[0m - [1mFitted chi2 distribution with error=451.142013)[0m
[32m2024-04-22 22:16:55.135[0m | [1mINFO    [0m | [36mfitter.fitter[0m:[36m_fit_single_distribution[0m:[36m337[0m - [1mFitted powerlaw distribution with error=496.996339)[0m
[32m2024-04-22 22:16:55.150[0m | [1mINFO    [0m | [36mfitter.fitter[0m:[36m_fit_single_distribution[0m:[36m337[0m - [1mFitted exponpow distribution with error=297.628967)[0m
[32m2024-04-22 22:16:55.177[0m | [1mINFO    [0m | [36mfitter.fitter[0m:[

AvgQuantityOrderedRate_Start
Best Distribution: gamma

###### AvgQuantityOrderedRate_End ######


[32m2024-04-22 22:16:55.680[0m | [1mINFO    [0m | [36mfitter.fitter[0m:[36m_fit_single_distribution[0m:[36m337[0m - [1mFitted gamma distribution with error=52.402019)[0m
[32m2024-04-22 22:16:55.735[0m | [1mINFO    [0m | [36mfitter.fitter[0m:[36m_fit_single_distribution[0m:[36m337[0m - [1mFitted powerlaw distribution with error=55.328976)[0m
[32m2024-04-22 22:16:55.754[0m | [1mINFO    [0m | [36mfitter.fitter[0m:[36m_fit_single_distribution[0m:[36m337[0m - [1mFitted chi2 distribution with error=49.171814)[0m
[32m2024-04-22 22:16:55.804[0m | [1mINFO    [0m | [36mfitter.fitter[0m:[36m_fit_single_distribution[0m:[36m337[0m - [1mFitted exponpow distribution with error=34.505108)[0m
[32m2024-04-22 22:16:55.817[0m | [1mINFO    [0m | [36mfitter.fitter[0m:[36m_fit_single_distribution[0m:[36m337[0m - [1mFitted lognorm distribution with error=61.957939)[0m
[32m2024-04-22 22:16:55.866[0m | [1mINFO    [0m | [36mfitter.fitter[0m:[36m_

AvgQuantityOrderedRate_End
Best Distribution: exponpow

###### AvgQuantityOrderedRate_Difference ######


[32m2024-04-22 22:16:56.102[0m | [1mINFO    [0m | [36mfitter.fitter[0m:[36m_fit_single_distribution[0m:[36m337[0m - [1mFitted gamma distribution with error=192.081014)[0m
[32m2024-04-22 22:16:56.142[0m | [1mINFO    [0m | [36mfitter.fitter[0m:[36m_fit_single_distribution[0m:[36m337[0m - [1mFitted exponpow distribution with error=193.711475)[0m
[32m2024-04-22 22:16:56.191[0m | [1mINFO    [0m | [36mfitter.fitter[0m:[36m_fit_single_distribution[0m:[36m337[0m - [1mFitted norm distribution with error=412.744074)[0m
[32m2024-04-22 22:16:56.199[0m | [1mINFO    [0m | [36mfitter.fitter[0m:[36m_fit_single_distribution[0m:[36m337[0m - [1mFitted expon distribution with error=392.543241)[0m
[32m2024-04-22 22:16:56.221[0m | [1mINFO    [0m | [36mfitter.fitter[0m:[36m_fit_single_distribution[0m:[36m337[0m - [1mFitted uniform distribution with error=408.899615)[0m
[32m2024-04-22 22:16:56.228[0m | [1mINFO    [0m | [36mfitter.fitter[0m:[36

AvgQuantityOrderedRate_Difference
Best Distribution: cauchy

###### AvgQuantityPerAccount_Start ######


[32m2024-04-22 22:16:56.511[0m | [1mINFO    [0m | [36mfitter.fitter[0m:[36m_fit_single_distribution[0m:[36m337[0m - [1mFitted cauchy distribution with error=420.902363)[0m
[32m2024-04-22 22:16:56.712[0m | [1mINFO    [0m | [36mfitter.fitter[0m:[36m_fit_single_distribution[0m:[36m337[0m - [1mFitted gamma distribution with error=313.695909)[0m
[32m2024-04-22 22:16:56.746[0m | [1mINFO    [0m | [36mfitter.fitter[0m:[36m_fit_single_distribution[0m:[36m337[0m - [1mFitted chi2 distribution with error=376.922896)[0m
[32m2024-04-22 22:16:56.755[0m | [1mINFO    [0m | [36mfitter.fitter[0m:[36m_fit_single_distribution[0m:[36m337[0m - [1mFitted powerlaw distribution with error=414.22341)[0m
[32m2024-04-22 22:16:56.792[0m | [1mINFO    [0m | [36mfitter.fitter[0m:[36m_fit_single_distribution[0m:[36m337[0m - [1mFitted exponpow distribution with error=392.506311)[0m
[32m2024-04-22 22:16:56.820[0m | [1mINFO    [0m | [36mfitter.fitter[0m:[3

AvgQuantityPerAccount_Start
Best Distribution: gamma

###### AvgQuantityPerAccount_End ######


[32m2024-04-22 22:16:57.277[0m | [1mINFO    [0m | [36mfitter.fitter[0m:[36m_fit_single_distribution[0m:[36m337[0m - [1mFitted gamma distribution with error=58.08407)[0m
[32m2024-04-22 22:16:57.302[0m | [1mINFO    [0m | [36mfitter.fitter[0m:[36m_fit_single_distribution[0m:[36m337[0m - [1mFitted powerlaw distribution with error=63.743045)[0m
[32m2024-04-22 22:16:57.330[0m | [1mINFO    [0m | [36mfitter.fitter[0m:[36m_fit_single_distribution[0m:[36m337[0m - [1mFitted chi2 distribution with error=51.356413)[0m
[32m2024-04-22 22:16:57.366[0m | [1mINFO    [0m | [36mfitter.fitter[0m:[36m_fit_single_distribution[0m:[36m337[0m - [1mFitted exponpow distribution with error=50.210993)[0m
[32m2024-04-22 22:16:57.382[0m | [1mINFO    [0m | [36mfitter.fitter[0m:[36m_fit_single_distribution[0m:[36m337[0m - [1mFitted lognorm distribution with error=69.153325)[0m
[32m2024-04-22 22:16:57.420[0m | [1mINFO    [0m | [36mfitter.fitter[0m:[36m_f

AvgQuantityPerAccount_End
Best Distribution: cauchy

###### AvgQuantityPerAccount_Difference ######


[32m2024-04-22 22:16:57.694[0m | [1mINFO    [0m | [36mfitter.fitter[0m:[36m_fit_single_distribution[0m:[36m337[0m - [1mFitted lognorm distribution with error=0.988918)[0m
[32m2024-04-22 22:16:57.734[0m | [1mINFO    [0m | [36mfitter.fitter[0m:[36m_fit_single_distribution[0m:[36m337[0m - [1mFitted gamma distribution with error=0.968907)[0m
[32m2024-04-22 22:16:57.742[0m | [1mINFO    [0m | [36mfitter.fitter[0m:[36m_fit_single_distribution[0m:[36m337[0m - [1mFitted chi2 distribution with error=0.969264)[0m
[32m2024-04-22 22:16:57.785[0m | [1mINFO    [0m | [36mfitter.fitter[0m:[36m_fit_single_distribution[0m:[36m337[0m - [1mFitted exponpow distribution with error=0.949526)[0m


AvgQuantityPerAccount_Difference
Best Distribution: norm

{'NumOfCompletedOrders': 'norm', 'NumOfCanceledOrders': 'exponpow', 'AvgPrice': 'cauchy', 'TotalExecutedQuantity': 'cauchy', 'TotalQuantity': 'lognorm', 'Age': 'powerlaw', 'NumOfAccounts': 'expon', 'NumOfClosedAccounts': 'expon', 'AvgOrderRate_Start': 'expon', 'AvgOrderRate_End': 'cauchy', 'AvgQuantityOrderedRate_Start': 'gamma', 'AvgQuantityOrderedRate_End': 'exponpow', 'AvgQuantityOrderedRate_Difference': 'cauchy', 'AvgQuantityPerAccount_Start': 'gamma', 'AvgQuantityPerAccount_End': 'cauchy', 'AvgQuantityPerAccount_Difference': 'norm'}


## Saving


Make the Churned column to be the last column


In [102]:
cols = list(df.columns)
cols.append(cols.pop(cols.index("Churned")))
df = df[cols]

Display the head of the dataframe


In [103]:
df.head()

Unnamed: 0,ClientID,Age,NumOfAccounts,NumOfClosedAccounts,NumOfCompletedOrders,NumOfCanceledOrders,TotalExecutedQuantity,TotalQuantity,AvgPrice,OrderType_Buy,...,LastOpenAccountDate_Year_2024,FirstOrderDateAcrossAccounts_Year_0,FirstOrderDateAcrossAccounts_Year_2022,FirstOrderDateAcrossAccounts_Year_2023,FirstOrderDateAcrossAccounts_Year_2024,LastOrderDateAcrossAccounts_Year_0,LastOrderDateAcrossAccounts_Year_2022,LastOrderDateAcrossAccounts_Year_2023,LastOrderDateAcrossAccounts_Year_2024,Churned
0,0,0.92025,0.423036,0.0,-0.861603,0.0,9.392745,0.423719,1.21,1.485619,...,0,0,1,0,0,0,1,0,0,0
1,1,0.967505,0.423036,0.0,-0.265479,0.423036,8.377471,0.57009,1.982429,0.0,...,0,0,1,0,0,0,1,0,0,1
2,2,0.956848,0.423036,0.0,0.223327,0.554618,11.727198,0.072226,7.652667,13.370571,...,0,0,0,0,1,0,0,0,1,0
3,3,0.979564,0.423036,0.0,0.724664,0.810237,13.433575,0.433375,72.059886,30.45519,...,0,0,0,0,1,0,0,0,1,0
4,4,0.949083,0.554618,0.0,0.105945,0.810237,12.907934,0.377431,1.572133,1.857024,...,0,0,0,0,1,0,0,0,1,0


Display the info of the dataframe


In [104]:
df.info(verbose=True)

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 9123 entries, 0 to 9122
Data columns (total 447 columns):
 #    Column                                  Dtype  
---   ------                                  -----  
 0    ClientID                                int64  
 1    Age                                     float64
 2    NumOfAccounts                           float64
 3    NumOfClosedAccounts                     float64
 4    NumOfCompletedOrders                    float64
 5    NumOfCanceledOrders                     float64
 6    TotalExecutedQuantity                   float64
 7    TotalQuantity                           float64
 8    AvgPrice                                float64
 9    OrderType_Buy                           float64
 10   OrderType_Sell                          float64
 11   SecurityID_0                            float64
 12   SecurityID_1                            float64
 13   SecurityID_2                            float64
 14   SecurityID_3          

Save the dataframe into a csv "cleaned_dataset.csv"


In [105]:
df.to_csv("../Data/cleaned_dataset.csv", index=False)