# Data Cleaning and Preprocessing


Imports


In [1]:
import pandas as pd
import numpy as np
from fitter import Fitter, get_common_distributions
from sklearn.preprocessing import StandardScaler, MinMaxScaler
import warnings
from datetime import datetime, timedelta
from scipy import stats

warnings.filterwarnings("ignore")

pd.set_option("future.no_silent_downcasting", True)

Read the orders dataset into a pandas dataframe


In [2]:
orders_df = pd.read_csv("../Data/orders_data_competition.csv")

In [3]:
orders_df.head()

Unnamed: 0,Order ID,Account ID,Market Key,Security ID,Order Type,Order Time,Order Via,Is Completed,Is Canceled,Expire Date,Execution Status,quantity,Price,Sector Name,Executed Quantity,Quantity
0,0,2312,Egypt,0,Buy,2024-01-10 10:16:10.000,Online,1,0,2024-01-10 00:00:00.000,Executed,100,66.3,Real Estate,100,100
1,1,1196,Egypt,0,Sell,2024-01-10 10:16:11.000,Online,1,0,2024-01-10 00:00:00.000,Executed,200,66.0,Real Estate,200,200
2,2,1759,Egypt,0,Buy,2024-01-10 10:16:12.000,Online,0,1,2024-01-10 00:00:00.000,Not Executed,200,65.0,Real Estate,0,200
3,3,4476,Egypt,1,Sell,2024-01-10 10:16:16.000,Online,0,1,2024-01-10 00:00:00.000,Not Executed,820,75.2,Financials,0,820
4,4,838,Egypt,2,Sell,2024-01-10 10:16:18.000,Online,0,1,2024-01-10 00:00:00.000,Not Executed,500,10.68,Materials,0,500


Read the clients dataset into a pandas dataframe


In [4]:
clients_df = pd.read_csv("../Data/clients_data_competition.csv")

In [5]:
clients_df.head()

Unnamed: 0,Client ID,Account ID,Gender,Risk Rate,Company Name,Is Closed,Is Dormant,Is Profile Suspended,Is Client Suspended,Client Type Name,OpenDate,BirthDate
0,0,0,Male,Low,HSB,0,0.0,0,0,Individuals,2/16/2014,1990-07-27
1,1,1,Female,Low,HSB,0,-1.0,1,0,Individuals,2/16/2014,1963-10-22
2,2,2,Male,Low,HSB,0,0.0,0,0,Individuals,2/17/2014,1971-05-14
3,3,3,Male,Low,HSB,0,0.0,0,0,Individuals,2/17/2014,1953-01-14
4,4,4,Male,Low,HSB,0,0.0,0,0,Individuals,2/17/2014,1976-06-09


## Account Level Dataset


### Orders Data


**_We created another dataframe "agg_orders" which represents the aggregate orders of each account_**


Drop the rows with null values


In [6]:
orders_df.dropna(inplace=True)

In [7]:
orders_df = orders_df[orders_df["Order Via"] == "Online"]

In [8]:
orders_df.drop(columns=["Order Via"], inplace=True)

In [9]:
account_ids_to_remove = clients_df[(clients_df["Client Type Name"] != "Individuals")][
    "Account ID"
].unique()

# Remove the accounts from the accounts dataframe where Client Type Name is not 'individuals'
clients_df = clients_df[~clients_df["Account ID"].isin(account_ids_to_remove)]

# Remove the orders of these accounts from the orders dataframe
orders_df = orders_df[~orders_df["Account ID"].isin(account_ids_to_remove)]

In [10]:
clients_df.drop(columns=["Client Type Name", "Company Name"], inplace=True)

In [11]:
len(clients_df["Account ID"].unique())

12773

In [12]:
len(orders_df["Account ID"].unique())

6715

In [13]:
# Check if there are any accounts in the orders dataframe that are not in the accounts dataframe
accounts_in_orders_not_in_accounts = orders_df[
    ~orders_df["Account ID"].isin(clients_df["Account ID"])
]
if accounts_in_orders_not_in_accounts.empty:
    print(
        "There are no accounts in the orders dataframe that are not in the accounts dataframe."
    )
else:
    print(
        "There are accounts in the orders dataframe that are not in the accounts dataframe."
    )

# Check if there are any accounts in the accounts dataframe that are not in the orders dataframe
accounts_in_accounts_not_in_orders = clients_df[
    ~clients_df["Account ID"].isin(orders_df["Account ID"])
]
if accounts_in_accounts_not_in_orders.empty:
    print(
        "There are no accounts in the accounts dataframe that are not in the orders dataframe."
    )
else:
    print(
        "There are accounts in the accounts dataframe that are not in the orders dataframe."
    )

There are no accounts in the orders dataframe that are not in the accounts dataframe.
There are accounts in the accounts dataframe that are not in the orders dataframe.


In [14]:
orders_df.info()

<class 'pandas.core.frame.DataFrame'>
Index: 1833491 entries, 0 to 1987940
Data columns (total 15 columns):
 #   Column             Dtype  
---  ------             -----  
 0   Order ID           int64  
 1   Account ID         int64  
 2   Market Key         object 
 3   Security ID        int64  
 4   Order Type         object 
 5   Order Time         object 
 6   Is Completed       int64  
 7   Is Canceled        int64  
 8   Expire Date        object 
 9   Execution Status   object 
 10  quantity           int64  
 11  Price              float64
 12  Sector Name        object 
 13  Executed Quantity  int64  
 14  Quantity           int64  
dtypes: float64(1), int64(8), object(6)
memory usage: 223.8+ MB


Initialize a new Dataframe to store the aggregate orders per account


In [15]:
agg_orders = pd.DataFrame()

Add an AccountID column to the aggregate orders df


In [16]:
agg_orders["AccountID"] = clients_df["Account ID"].unique()

Compute and add the number of orders for each account


In [17]:
# Add a NumOfOrders column that has the number of orders for each account
agg_orders["NumOfOrders"] = orders_df.groupby("Account ID").size()

# Set the number of orders for the accounts that didn't place any order to 0
agg_orders["NumOfOrders"] = agg_orders["NumOfOrders"].fillna(0)

# Convert the type to int
agg_orders["NumOfOrders"] = agg_orders["NumOfOrders"].astype(int)

Compute and add the last order date for each account


In [18]:
# Convert the 'order_date' column to datetime
orders_df["Order Time"] = pd.to_datetime(orders_df["Order Time"])

# Create a new dataframe with the last order date for each account
agg_orders["LastOrder"] = orders_df.groupby("Account ID")["Order Time"].max()

# Convert the LastOrder column to datetime
agg_orders["LastOrder"] = pd.to_datetime(agg_orders["LastOrder"], format="%d-%m-%Y")

Compute and add the first order date for each account


In [19]:
# Create a new dataframe with the last order date for each account
agg_orders["FirstOrder"] = orders_df.groupby("Account ID")["Order Time"].min()

# Convert the LastOrder column to datetime
agg_orders["FirstOrder"] = pd.to_datetime(agg_orders["FirstOrder"], format="%d-%m-%Y")

Compute and add the number of completed orders for each account


In [20]:
# Add a NumOfCompleted column that has the number of completed orders for each account
agg_orders["NumOfCompleted"] = orders_df.groupby("Account ID")["Is Completed"].sum()

# Set the number of completed orders for the accounts that didn't place any order to 0
agg_orders["NumOfCompleted"] = agg_orders["NumOfCompleted"].fillna(0)

# # Convert the type to int
agg_orders["NumOfCompleted"] = agg_orders["NumOfCompleted"].astype(int)

Compute and add the number of canceled for each account


In [21]:
# Add a NumOfCanceled column that has the number of completed orders for each account
agg_orders["NumOfCanceled"] = orders_df.groupby("Account ID")["Is Canceled"].sum()

# Set the number of completed orders for the accounts that didn't place any order to 0
agg_orders["NumOfCanceled"] = agg_orders["NumOfCanceled"].fillna(0)

# # Convert the type to int
agg_orders["NumOfCanceled"] = agg_orders["NumOfCanceled"].astype(int)

Compute and add the average price of orders for each account


In [22]:
# Add a AvgPrice column that has the number of completed orders for each account
agg_orders["AvgPrice"] = (
    orders_df.groupby("Account ID")["Price"].sum()
    / orders_df.groupby("Account ID").size()
)

# Set the number of completed orders for the accounts that didn't place any order to 0
agg_orders["AvgPrice"] = agg_orders["AvgPrice"].fillna(0.0)

Compute and add the total executed quantity of orders for each account


In [23]:
# Add a TotalExecutedQuantity column that has the number of completed orders for each account
agg_orders["TotalExecutedQuantity"] = orders_df.groupby("Account ID")[
    "Executed Quantity"
].sum()

# Set the number of completed orders for the accounts that didn't place any order to 0
agg_orders["TotalExecutedQuantity"] = agg_orders["TotalExecutedQuantity"].fillna(0)

# # Convert the type to int
agg_orders["TotalExecutedQuantity"] = agg_orders["TotalExecutedQuantity"].astype(int)

Compute and add the total quantity of orders for each account


In [24]:
# Add a TotalQuantity column that has the number of completed orders for each account
agg_orders["TotalQuantity"] = orders_df.groupby("Account ID")["Quantity"].sum()

# Set the number of completed orders for the accounts that didn't place any order to 0
agg_orders["TotalQuantity"] = agg_orders["TotalQuantity"].fillna(0)

# # Convert the type to int
agg_orders["TotalQuantity"] = agg_orders["TotalQuantity"].astype(int)

Define a function to convert a categorical feature to its TFIDF dataframe and concatenate it with the aggregate orders dataframe


In [25]:
def FrequencyEncoder(column_name: str):
    temp = pd.get_dummies(orders_df, columns=[column_name])
    temp.replace({True: 1, False: 0}, inplace=True)

    frequency_encode_df = pd.DataFrame(data=agg_orders.iloc[:, 0])

    for word in orders_df[column_name].unique():
        col_name = f"{column_name}_{word}"

        frequency_encode_df[col_name.replace(" ", "")] = (
            temp.groupby("Account ID")[col_name].sum()
            / temp.groupby("Account ID").size()
        )

        frequency_encode_df[col_name.replace(" ", "")] = frequency_encode_df[
            col_name.replace(" ", "")
        ].fillna(0)

        frequency_encode_df[col_name.replace(" ", "")] = frequency_encode_df[
            col_name.replace(" ", "")
        ].astype(float)

    return pd.concat([agg_orders, frequency_encode_df.iloc[:, 1:]], axis=1)

Compute and add the Frequency Encoding of the Security ID column


In [26]:
agg_orders = FrequencyEncoder("Security ID")

Compute and add the Frequency Encoding of the Order Type column


In [27]:
agg_orders = FrequencyEncoder("Order Type")

Compute and add the Frequency Encoding of the Execution Status column


In [28]:
agg_orders = FrequencyEncoder("Execution Status")

Compute and add the Frequency Encoding of the Sector Name column


In [29]:
agg_orders = FrequencyEncoder("Sector Name")

Display the head of the agg_orders dataframe


In [30]:
agg_orders.head()

Unnamed: 0,AccountID,NumOfOrders,LastOrder,FirstOrder,NumOfCompleted,NumOfCanceled,AvgPrice,TotalExecutedQuantity,TotalQuantity,SecurityID_0,...,SectorName_INVESTMENT,SectorName_TelecommunicationServices,SectorName_REALESTATE,SectorName_Telecommunications,SectorName_FOOD,SectorName_Others,SectorName_Tourism,SectorName_ConsumerServices,SectorName_Utilities,SectorName_PharmaceuticalIndustries
0,0,0,NaT,NaT,0,0,0.0,0,0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1,1,0,NaT,NaT,0,0,0.0,0,0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2,2,21,2024-02-28 11:06:33,2022-01-04 12:06:31,17,2,7.652667,123895,137395,0.0,...,0.095238,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3,3,87,2024-03-10 12:32:34,2023-01-17 12:02:35,64,11,71.908161,682538,908478,0.0,...,0.0,0.022989,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
4,4,30,2024-02-26 12:47:33,2022-01-11 11:05:32,8,10,2.846267,356686,594953,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


Display the info of the agg_orders dataframe


In [31]:
agg_orders.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 12773 entries, 0 to 12772
Columns: 341 entries, AccountID to SectorName_PharmaceuticalIndustries
dtypes: datetime64[ns](2), float64(333), int64(6)
memory usage: 33.2 MB


### Clients Data


Drop the rows with null values


In [32]:
clients_df.dropna(inplace=True)

In [33]:
clients_df.info(verbose=True)

<class 'pandas.core.frame.DataFrame'>
Index: 12773 entries, 0 to 13522
Data columns (total 10 columns):
 #   Column                Non-Null Count  Dtype  
---  ------                --------------  -----  
 0   Client ID             12773 non-null  int64  
 1   Account ID            12773 non-null  int64  
 2   Gender                12773 non-null  object 
 3   Risk Rate             12773 non-null  object 
 4   Is Closed             12773 non-null  int64  
 5   Is Dormant            12773 non-null  float64
 6   Is Profile Suspended  12773 non-null  int64  
 7   Is Client Suspended   12773 non-null  int64  
 8   OpenDate              12773 non-null  object 
 9   BirthDate             12773 non-null  object 
dtypes: float64(1), int64(5), object(4)
memory usage: 1.1+ MB


Convert the OpenDate column to DateTime object


In [34]:
# Convert the OpenDate column to datetime
clients_df["OpenDate"] = pd.to_datetime(clients_df["OpenDate"], format="%m/%d/%Y")

Convert the BirthDate column to DateTime object, and calculate the age of the client


In [35]:
# Convert the BirthDate column to datetime
clients_df["BirthDate"] = pd.to_datetime(clients_df["BirthDate"], format="%Y-%m-%d")

In [36]:
def calculate_age(birth_date):
    # Assuming birth_date is a datetime object
    today = datetime.today()
    return (
        today.year
        - birth_date.year
        - ((today.month, today.day) < (birth_date.month, birth_date.day))
    )

In [37]:
clients_df["Age"] = clients_df["BirthDate"].apply(calculate_age)

Remove all whitespaces in the column names


In [38]:
clients_df.rename(columns=lambda x: x.replace(" ", ""), inplace=True)

Display the head of the clients dataframe


In [39]:
clients_df.head()

Unnamed: 0,ClientID,AccountID,Gender,RiskRate,IsClosed,IsDormant,IsProfileSuspended,IsClientSuspended,OpenDate,BirthDate,Age
0,0,0,Male,Low,0,0.0,0,0,2014-02-16,1990-07-27,33
1,1,1,Female,Low,0,-1.0,1,0,2014-02-16,1963-10-22,60
2,2,2,Male,Low,0,0.0,0,0,2014-02-17,1971-05-14,52
3,3,3,Male,Low,0,0.0,0,0,2014-02-17,1953-01-14,71
4,4,4,Male,Low,0,0.0,0,0,2014-02-17,1976-06-09,47


Display the info of the clients dataframe


In [40]:
clients_df.info()

<class 'pandas.core.frame.DataFrame'>
Index: 12773 entries, 0 to 13522
Data columns (total 11 columns):
 #   Column              Non-Null Count  Dtype         
---  ------              --------------  -----         
 0   ClientID            12773 non-null  int64         
 1   AccountID           12773 non-null  int64         
 2   Gender              12773 non-null  object        
 3   RiskRate            12773 non-null  object        
 4   IsClosed            12773 non-null  int64         
 5   IsDormant           12773 non-null  float64       
 6   IsProfileSuspended  12773 non-null  int64         
 7   IsClientSuspended   12773 non-null  int64         
 8   OpenDate            12773 non-null  datetime64[ns]
 9   BirthDate           12773 non-null  datetime64[ns]
 10  Age                 12773 non-null  int64         
dtypes: datetime64[ns](2), float64(1), int64(6), object(2)
memory usage: 1.2+ MB


### Merging


**_We will inner join the agg_orders and clients_df dataframes on the AccountID column_**


Perform the merging between both dataframes


In [41]:
clients_df.info()

<class 'pandas.core.frame.DataFrame'>
Index: 12773 entries, 0 to 13522
Data columns (total 11 columns):
 #   Column              Non-Null Count  Dtype         
---  ------              --------------  -----         
 0   ClientID            12773 non-null  int64         
 1   AccountID           12773 non-null  int64         
 2   Gender              12773 non-null  object        
 3   RiskRate            12773 non-null  object        
 4   IsClosed            12773 non-null  int64         
 5   IsDormant           12773 non-null  float64       
 6   IsProfileSuspended  12773 non-null  int64         
 7   IsClientSuspended   12773 non-null  int64         
 8   OpenDate            12773 non-null  datetime64[ns]
 9   BirthDate           12773 non-null  datetime64[ns]
 10  Age                 12773 non-null  int64         
dtypes: datetime64[ns](2), float64(1), int64(6), object(2)
memory usage: 1.2+ MB


In [42]:
agg_orders.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 12773 entries, 0 to 12772
Columns: 341 entries, AccountID to SectorName_PharmaceuticalIndustries
dtypes: datetime64[ns](2), float64(333), int64(6)
memory usage: 33.2 MB


In [43]:
df_account_level = pd.merge(clients_df, agg_orders, on="AccountID", how="inner")

In [44]:
# Get the current date
now = datetime.now()

In [45]:
order_rates_start = []
order_rates_end = []
quantity_rates_start = []
quantity_rates_end = []
avg_quantity_per_order_start = []
avg_quantity_per_order_end = []

In [46]:
for i, account in df_account_level.iterrows():
    midpointDate = account["OpenDate"] + (now - account["OpenDate"]) / 2
    openDate = account["OpenDate"]

    days_open_midpoint = (midpointDate - openDate).days
    days_midpoint_now = (now - midpointDate).days

    filtered_date_df = orders_df[
        (orders_df["Order Time"] >= openDate)
        & (orders_df["Order Time"] < midpointDate)
        & (orders_df["Account ID"] == account["AccountID"])
    ]

    filtered_df_2 = orders_df[
        (orders_df["Order Time"] >= midpointDate)
        & (orders_df["Order Time"] < now)
        & (orders_df["Account ID"] == account["AccountID"])
    ]

    num_of_orders_start = len(filtered_date_df)
    num_of_orders_end = len(filtered_df_2)

    quantity_ordered_start = filtered_date_df["Quantity"].sum()
    quantity_ordered_end = filtered_df_2["Quantity"].sum()

    avg_quantity_start = filtered_date_df["Quantity"].mean()
    avg_quantity_end = filtered_df_2["Quantity"].mean()

    order_rates_start.append(num_of_orders_start / days_open_midpoint)
    order_rates_end.append(num_of_orders_end / days_midpoint_now)

    quantity_rates_start.append(quantity_ordered_start / days_open_midpoint)
    quantity_rates_end.append(quantity_ordered_end / days_midpoint_now)

    avg_quantity_per_order_start.append(avg_quantity_start)
    avg_quantity_per_order_end.append(avg_quantity_end)

In [47]:
df_account_level["OrderRate_Start"] = pd.Series(order_rates_start)
df_account_level["OrderRate_End"] = pd.Series(order_rates_end)
df_account_level["QuantityOrderedRate_Start"] = pd.Series(quantity_rates_start)
df_account_level["QuantityOrderedRate_End"] = pd.Series(quantity_rates_end)
df_account_level["AvgQuantityPerOrder_Start"] = pd.Series(avg_quantity_per_order_start)
df_account_level["AvgQuantityPerOrder_End"] = pd.Series(avg_quantity_per_order_end)

In [48]:
df_account_level["AvgQuantityPerOrder_Start"] = df_account_level[
    "AvgQuantityPerOrder_Start"
].fillna(0)
df_account_level["AvgQuantityPerOrder_End"] = df_account_level[
    "AvgQuantityPerOrder_End"
].fillna(0)

In [49]:
df_account_level["ExecutedQuantityRatio"] = df_account_level.apply(
    lambda row: (
        1
        if row["TotalQuantity"] == 0
        else row["TotalExecutedQuantity"] / row["TotalQuantity"]
    ),
    axis=1,
)

Display the head of the dataframe


In [50]:
df_account_level.head()

Unnamed: 0,ClientID,AccountID,Gender,RiskRate,IsClosed,IsDormant,IsProfileSuspended,IsClientSuspended,OpenDate,BirthDate,...,SectorName_ConsumerServices,SectorName_Utilities,SectorName_PharmaceuticalIndustries,OrderRate_Start,OrderRate_End,QuantityOrderedRate_Start,QuantityOrderedRate_End,AvgQuantityPerOrder_Start,AvgQuantityPerOrder_End,ExecutedQuantityRatio
0,0,0,Male,Low,0,0.0,0,0,2014-02-16,1990-07-27,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0
1,1,1,Female,Low,0,-1.0,1,0,2014-02-16,1963-10-22,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0
2,2,2,Male,Low,0,0.0,0,0,2014-02-17,1971-05-14,...,0.0,0.0,0.0,0.0,0.011302,0.0,73.947793,0.0,6542.619048,0.901743
3,3,3,Male,Low,0,0.0,0,0,2014-02-17,1953-01-14,...,0.0,0.0,0.0,0.0,0.046825,0.0,488.95479,0.0,10442.275862,0.751298
4,4,4,Male,Low,0,0.0,0,0,2014-02-17,1976-06-09,...,0.0,0.0,0.0,0.0,0.016146,0.0,320.211518,0.0,19831.766667,0.59952


Display the info of the dataframe


In [51]:
df_account_level.info(verbose=True)

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 12773 entries, 0 to 12772
Data columns (total 358 columns):
 #    Column                                Dtype         
---   ------                                -----         
 0    ClientID                              int64         
 1    AccountID                             int64         
 2    Gender                                object        
 3    RiskRate                              object        
 4    IsClosed                              int64         
 5    IsDormant                             float64       
 6    IsProfileSuspended                    int64         
 7    IsClientSuspended                     int64         
 8    OpenDate                              datetime64[ns]
 9    BirthDate                             datetime64[ns]
 10   Age                                   int64         
 11   NumOfOrders                           int64         
 12   LastOrder                             datetime64[ns]
 13  

In [52]:
len(df_account_level["ClientID"].unique())

8871

## Client Level Dataset


In [53]:
df = pd.DataFrame()

In [54]:
# Create a new DataFrame with ClientID and corresponding account count
df = pd.DataFrame(
    {
        "ClientID": df_account_level.groupby("ClientID")["AccountID"].nunique().index,
    }
)

In [55]:
df["Gender"] = (df_account_level.groupby("ClientID")["Gender"].first().reset_index())[
    "Gender"
]
df["Age"] = (df_account_level.groupby("ClientID")["Age"].first().reset_index())["Age"]
df["IsClientSuspended"] = (
    df_account_level.groupby("ClientID")["IsClientSuspended"].first().reset_index()
)["IsClientSuspended"]
df["RiskRate"] = (
    df_account_level.groupby("ClientID")["RiskRate"].first().reset_index()
)["RiskRate"]

In [56]:
df["NumOfAccounts"] = df_account_level.groupby("ClientID").size().values

In [57]:
df["NumOfClosedAccounts"] = (
    df_account_level.groupby("ClientID")["IsClosed"].sum().values
)

In [58]:
df["NumOfSuspendedAccounts"] = (
    df_account_level.groupby("ClientID")["IsProfileSuspended"].sum().values
)

In [59]:
df["NumOfOrders"] = df_account_level.groupby("ClientID")["NumOfOrders"].sum().values

In [60]:
df["NumOfCompletedOrders"] = (
    df_account_level.groupby("ClientID")["NumOfCompleted"].sum().values
)

In [61]:
df["NumOfCanceledOrders"] = (
    df_account_level.groupby("ClientID")["NumOfCanceled"].sum().values
)

In [62]:
df["TotalExecutedQuantity"] = (
    df_account_level.groupby("ClientID")["TotalExecutedQuantity"].sum().values
)

In [63]:
df["TotalQuantity"] = df_account_level.groupby("ClientID")["TotalQuantity"].sum().values

In [64]:
df["AvgPrice"] = (
    df_account_level.groupby("ClientID")["AvgPrice"].sum()
    / df_account_level.groupby("ClientID").size()
).values

In [65]:
agg_dict = {}

for col in df_account_level.columns:
    if (
        "OrderType" in col
        or "SecurityID" in col
        or "ExecutionStatus" in col
        or "SectorName" in col
    ):
        agg_dict[col] = (
            df_account_level.groupby("ClientID")[col].sum()
            / df_account_level.groupby("ClientID").size()
        ).values

df = pd.concat([df, pd.DataFrame(agg_dict)], axis=1)

In [66]:
df["FirstOpenAccountDate"] = (
    df_account_level.groupby("ClientID")["OpenDate"].min().values
)

In [67]:
df["LastOpenAccountDate"] = (
    df_account_level.groupby("ClientID")["OpenDate"].max().values
)

In [68]:
df["FirstOrderDateAcrossAccounts"] = (
    df_account_level.groupby("ClientID")["LastOrder"].min().values
)

df["FirstOrderDateAcrossAccounts"] = pd.to_datetime(
    df["FirstOrderDateAcrossAccounts"].dt.date
)

In [69]:
df["LastOrderDateAcrossAccounts"] = (
    df_account_level.groupby("ClientID")["LastOrder"].max().values
)

df["LastOrderDateAcrossAccounts"] = pd.to_datetime(
    df["LastOrderDateAcrossAccounts"].dt.date
)

Define a function so that we can compute if the account is dormant or not.

If the account has not placed any order, we set the Is Dormant value to -1, so that we can isolate the accounts who have not placed any orders.


In [70]:
def check_dormant(date):
    one_year_before_now = datetime.now() - timedelta(days=365)

    if date < one_year_before_now:
        return 1
    else:
        return 0

Recompute the Is Dormant column


In [71]:
df["IsDormant"] = df["LastOrderDateAcrossAccounts"].apply(check_dormant)

In [72]:
df["AvgOrderRate_Start"] = (
    df_account_level.groupby("ClientID")["OrderRate_Start"].mean().values
)

In [73]:
df["AvgOrderRate_End"] = (
    df_account_level.groupby("ClientID")["OrderRate_End"].mean().values
)

In [74]:
df["AvgOrderRate_Difference"] = df["AvgOrderRate_End"] - df["AvgOrderRate_Start"]

In [75]:
df["AvgQuantityOrderedRate_Start"] = (
    df_account_level.groupby("ClientID")["QuantityOrderedRate_Start"].mean().values
)

In [76]:
df["AvgQuantityOrderedRate_End"] = (
    df_account_level.groupby("ClientID")["QuantityOrderedRate_End"].mean().values
)

In [77]:
df["AvgQuantityOrderedRate_Difference"] = (
    df["AvgQuantityOrderedRate_End"] - df["AvgQuantityOrderedRate_Start"]
)

In [78]:
df["AvgQuantityPerAccount_Start"] = (
    df_account_level.groupby("ClientID")["AvgQuantityPerOrder_Start"].mean().values
)

In [79]:
df["AvgQuantityPerAccount_End"] = (
    df_account_level.groupby("ClientID")["AvgQuantityPerOrder_End"].mean().values
)

In [80]:
df["AvgQuantityPerAccount_Difference"] = (
    df["AvgQuantityPerAccount_End"] - df["AvgQuantityPerAccount_Start"]
)

In [81]:
df["AvgExecutedQuantityRatio"] = (
    df_account_level.groupby("ClientID")["ExecutedQuantityRatio"].mean().values
)

In [82]:
df["ClosedAccountsRatio"] = df["NumOfClosedAccounts"] / df["NumOfAccounts"]

In [83]:
df["SuspendedAccountsRatio"] = df["NumOfSuspendedAccounts"] / df["NumOfAccounts"]

In [84]:
# Create the Year columns
df["FirstOpenAccountDate_Year"] = df["FirstOpenAccountDate"].dt.year

# Convert the type to int
df["FirstOpenAccountDate_Year"] = df["FirstOpenAccountDate_Year"].astype(int)

In [85]:
# Create the Year columns
df["LastOpenAccountDate_Year"] = df["LastOpenAccountDate"].dt.year

# Convert the type to int
df["LastOpenAccountDate_Year"] = df["LastOpenAccountDate_Year"].astype(int)

In [86]:
# Create the Year columns
df["FirstOrderDateAcrossAccounts_Year"] = df["FirstOrderDateAcrossAccounts"].dt.year

df["FirstOrderDateAcrossAccounts_Year"] = df[
    "FirstOrderDateAcrossAccounts_Year"
].fillna(0)

# Convert the type to int
df["FirstOrderDateAcrossAccounts_Year"] = df[
    "FirstOrderDateAcrossAccounts_Year"
].astype(int)

In [87]:
# Create the Year columns
df["LastOrderDateAcrossAccounts_Year"] = df["LastOrderDateAcrossAccounts"].dt.year

df["LastOrderDateAcrossAccounts_Year"] = df["LastOrderDateAcrossAccounts_Year"].fillna(
    0
)

# Convert the type to int
df["LastOrderDateAcrossAccounts_Year"] = df["LastOrderDateAcrossAccounts_Year"].astype(
    int
)

In [88]:
df.info(verbose=True)

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 8871 entries, 0 to 8870
Data columns (total 367 columns):
 #    Column                                Dtype         
---   ------                                -----         
 0    ClientID                              int64         
 1    Gender                                object        
 2    Age                                   int64         
 3    IsClientSuspended                     int64         
 4    RiskRate                              object        
 5    NumOfAccounts                         int64         
 6    NumOfClosedAccounts                   int64         
 7    NumOfSuspendedAccounts                int64         
 8    NumOfOrders                           int64         
 9    NumOfCompletedOrders                  int64         
 10   NumOfCanceledOrders                   int64         
 11   TotalExecutedQuantity                 int64         
 12   TotalQuantity                         int64         
 13   A

## Defining The Label


**_We defined our label to be a combination of multiple features_**


Create the label


In [89]:
df["Churned"] = (
    (df["IsDormant"] == 1)
    & ((df["AvgOrderRate_Difference"] < 0) | (df["NumOfOrders"] == 0))
    | (df["RiskRate"] == "High") & (df["AvgExecutedQuantityRatio"] <= 0.5)
    | (df["ClosedAccountsRatio"] > 0.9)
    | (df["IsClientSuspended"] == 1)
    | (df["SuspendedAccountsRatio"] >= 0.5)
)

df = df.replace({True: 1, False: 0})

df["Churned"] = df["Churned"].astype(int)

In [90]:
df.to_csv("../Data/visualization_data.csv", index=False)

In [91]:
df.info(verbose=True)

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 8871 entries, 0 to 8870
Data columns (total 368 columns):
 #    Column                                Dtype         
---   ------                                -----         
 0    ClientID                              int64         
 1    Gender                                object        
 2    Age                                   int64         
 3    IsClientSuspended                     int64         
 4    RiskRate                              object        
 5    NumOfAccounts                         int64         
 6    NumOfClosedAccounts                   int64         
 7    NumOfSuspendedAccounts                int64         
 8    NumOfOrders                           int64         
 9    NumOfCompletedOrders                  int64         
 10   NumOfCanceledOrders                   int64         
 11   TotalExecutedQuantity                 int64         
 12   TotalQuantity                         int64         
 13   A

## OHE


The columns to be one-hot encoded


In [92]:
columns = [
    "Gender",
    "RiskRate",
    "FirstOpenAccountDate_Year",
    "LastOpenAccountDate_Year",
    "FirstOrderDateAcrossAccounts_Year",
    "LastOrderDateAcrossAccounts_Year",
]

OHE these columns


In [93]:
df = pd.get_dummies(
    df,
    columns=columns,
    dtype=int,
)

Drop the columns used for creating the label to remove bias, also drop the unnecessary columns


In [94]:
df.drop(
    columns=[
        "AvgExecutedQuantityRatio",
        "IsDormant",
        "RiskRate_High",
        "ClosedAccountsRatio",
        "IsClientSuspended",
        "SuspendedAccountsRatio",
        "NumOfOrders",
        "AvgOrderRate_Difference",
        "FirstOpenAccountDate",
        "LastOpenAccountDate",
        "FirstOrderDateAcrossAccounts",
        "LastOrderDateAcrossAccounts",
        "NumOfSuspendedAccounts",
    ],
    inplace=True,
)

Remove all whitespaces in the column names


In [95]:
df.rename(columns=lambda x: x.replace(" ", ""), inplace=True)

## Normalization


The columns to be normalized


In [96]:
columns = [
    "NumOfCompletedOrders",
    "NumOfCanceledOrders",
    "AvgPrice",
    "TotalExecutedQuantity",
    "TotalQuantity",
    "Age",
    "NumOfAccounts",
    "NumOfClosedAccounts",
    "NumOfCompletedOrders",
    "NumOfCanceledOrders",
    "AvgOrderRate_Start",
    "AvgOrderRate_End",
    "AvgQuantityOrderedRate_Start",
    "AvgQuantityOrderedRate_End",
    "AvgQuantityOrderedRate_Difference",
    "AvgQuantityPerAccount_Start",
    "AvgQuantityPerAccount_End",
    "AvgQuantityPerAccount_Difference",
]

In [97]:
df = pd.read_csv("../Data/cleaned_dataset.csv")

Define a function to normalize the data based on a distribution


In [98]:
def get_normalized_data(data, dist):
    if dist == "uniform":
        return MinMaxScaler().fit_transform(data)
    elif dist == "norm":
        return StandardScaler().fit_transform(data)
    elif dist == "cauchy":
        ranked = stats.rankdata(data, method="average")
        # Convert ranks to percentiles
        percentiles = 100.0 * (ranked - 1) / (len(data) - 1)
        # Get the quantiles
        return np.percentile(data, percentiles)
    else:
        return np.log(np.abs(data.flatten()) + 1)

Define a function to get the best distribution that fits the data


In [99]:
def get_best_distribution(columns, df):
    columns_distributions_dict = {column: "" for column in columns}

    for column in columns:
        print("###### " + column + " ######")

        data = df[column].values

        f = Fitter(
            data,
            distributions=get_common_distributions(),
        )
        f.fit()
        f.summary(plot=False)
        dist = f.get_best(method="sumsquare_error")
        best_dist = ""
        for key in dist.keys():
            best_dist = key

        columns_distributions_dict[column] = str(best_dist)
        print(column)
        print(f"Best Distribution: {best_dist}")
        print()

    return columns_distributions_dict

Define a function to normalize the data based on its best distribution


In [100]:
def normalize(columns, df):
    columns_distributions_dict = get_best_distribution(columns, df)

    for column in columns_distributions_dict.keys():
        data = np.array(df[column]).reshape(-1, 1)
        df[column] = get_normalized_data(
            data=data, dist=columns_distributions_dict[column]
        )
    return columns_distributions_dict

Normalize the data


In [101]:
print(normalize(columns, df))

###### NumOfCompletedOrders ######


[32m2024-04-22 21:38:11.651[0m | [1mINFO    [0m | [36mfitter.fitter[0m:[36m_fit_single_distribution[0m:[36m337[0m - [1mFitted norm distribution with error=2.366671)[0m
[32m2024-04-22 21:38:11.656[0m | [1mINFO    [0m | [36mfitter.fitter[0m:[36m_fit_single_distribution[0m:[36m337[0m - [1mFitted expon distribution with error=2.436031)[0m
[32m2024-04-22 21:38:11.680[0m | [1mINFO    [0m | [36mfitter.fitter[0m:[36m_fit_single_distribution[0m:[36m337[0m - [1mFitted uniform distribution with error=2.683179)[0m
[32m2024-04-22 21:38:11.693[0m | [1mINFO    [0m | [36mfitter.fitter[0m:[36m_fit_single_distribution[0m:[36m337[0m - [1mFitted rayleigh distribution with error=2.324286)[0m
[32m2024-04-22 21:38:11.747[0m | [1mINFO    [0m | [36mfitter.fitter[0m:[36m_fit_single_distribution[0m:[36m337[0m - [1mFitted cauchy distribution with error=2.543657)[0m
[32m2024-04-22 21:38:11.972[0m | [1mINFO    [0m | [36mfitter.fitter[0m:[36m_fit_sin

NumOfCompletedOrders
Best Distribution: rayleigh

###### NumOfCanceledOrders ######


[32m2024-04-22 21:38:12.709[0m | [1mINFO    [0m | [36mfitter.fitter[0m:[36m_fit_single_distribution[0m:[36m337[0m - [1mFitted gamma distribution with error=9.238079)[0m
[32m2024-04-22 21:38:12.761[0m | [1mINFO    [0m | [36mfitter.fitter[0m:[36m_fit_single_distribution[0m:[36m337[0m - [1mFitted chi2 distribution with error=13.440102)[0m
[32m2024-04-22 21:38:12.766[0m | [1mINFO    [0m | [36mfitter.fitter[0m:[36m_fit_single_distribution[0m:[36m337[0m - [1mFitted powerlaw distribution with error=16.77187)[0m
[32m2024-04-22 21:38:12.822[0m | [1mINFO    [0m | [36mfitter.fitter[0m:[36m_fit_single_distribution[0m:[36m337[0m - [1mFitted exponpow distribution with error=9.255947)[0m
[32m2024-04-22 21:38:12.839[0m | [1mINFO    [0m | [36mfitter.fitter[0m:[36m_fit_single_distribution[0m:[36m337[0m - [1mFitted lognorm distribution with error=17.410087)[0m
[32m2024-04-22 21:38:12.947[0m | [1mINFO    [0m | [36mfitter.fitter[0m:[36m_fit

NumOfCanceledOrders
Best Distribution: gamma

###### AvgPrice ######


[32m2024-04-22 21:38:13.457[0m | [1mINFO    [0m | [36mfitter.fitter[0m:[36m_fit_single_distribution[0m:[36m337[0m - [1mFitted gamma distribution with error=0.002665)[0m
[32m2024-04-22 21:38:13.495[0m | [1mINFO    [0m | [36mfitter.fitter[0m:[36m_fit_single_distribution[0m:[36m337[0m - [1mFitted powerlaw distribution with error=0.002246)[0m
[32m2024-04-22 21:38:13.499[0m | [1mINFO    [0m | [36mfitter.fitter[0m:[36m_fit_single_distribution[0m:[36m337[0m - [1mFitted chi2 distribution with error=0.001051)[0m
[32m2024-04-22 21:38:13.544[0m | [1mINFO    [0m | [36mfitter.fitter[0m:[36m_fit_single_distribution[0m:[36m337[0m - [1mFitted exponpow distribution with error=0.000999)[0m
[32m2024-04-22 21:38:13.558[0m | [1mINFO    [0m | [36mfitter.fitter[0m:[36m_fit_single_distribution[0m:[36m337[0m - [1mFitted lognorm distribution with error=0.002633)[0m
[32m2024-04-22 21:38:13.599[0m | [1mINFO    [0m | [36mfitter.fitter[0m:[36m_fit_s

AvgPrice
Best Distribution: cauchy

###### TotalExecutedQuantity ######


[32m2024-04-22 21:38:14.047[0m | [1mINFO    [0m | [36mfitter.fitter[0m:[36m_fit_single_distribution[0m:[36m337[0m - [1mFitted gamma distribution with error=0.162832)[0m
[32m2024-04-22 21:38:14.105[0m | [1mINFO    [0m | [36mfitter.fitter[0m:[36m_fit_single_distribution[0m:[36m337[0m - [1mFitted powerlaw distribution with error=0.400024)[0m
[32m2024-04-22 21:38:14.127[0m | [1mINFO    [0m | [36mfitter.fitter[0m:[36m_fit_single_distribution[0m:[36m337[0m - [1mFitted chi2 distribution with error=0.348359)[0m
[32m2024-04-22 21:38:14.170[0m | [1mINFO    [0m | [36mfitter.fitter[0m:[36m_fit_single_distribution[0m:[36m337[0m - [1mFitted exponpow distribution with error=0.923748)[0m
[32m2024-04-22 21:38:14.187[0m | [1mINFO    [0m | [36mfitter.fitter[0m:[36m_fit_single_distribution[0m:[36m337[0m - [1mFitted lognorm distribution with error=0.490695)[0m
[32m2024-04-22 21:38:14.233[0m | [1mINFO    [0m | [36mfitter.fitter[0m:[36m_fit_s

TotalExecutedQuantity
Best Distribution: cauchy

###### TotalQuantity ######


[32m2024-04-22 21:38:14.720[0m | [1mINFO    [0m | [36mfitter.fitter[0m:[36m_fit_single_distribution[0m:[36m337[0m - [1mFitted gamma distribution with error=0.064485)[0m
[32m2024-04-22 21:38:14.758[0m | [1mINFO    [0m | [36mfitter.fitter[0m:[36m_fit_single_distribution[0m:[36m337[0m - [1mFitted powerlaw distribution with error=0.374194)[0m
[32m2024-04-22 21:38:14.770[0m | [1mINFO    [0m | [36mfitter.fitter[0m:[36m_fit_single_distribution[0m:[36m337[0m - [1mFitted chi2 distribution with error=0.066528)[0m
[32m2024-04-22 21:38:14.838[0m | [1mINFO    [0m | [36mfitter.fitter[0m:[36m_fit_single_distribution[0m:[36m337[0m - [1mFitted exponpow distribution with error=0.85466)[0m
[32m2024-04-22 21:38:14.851[0m | [1mINFO    [0m | [36mfitter.fitter[0m:[36m_fit_single_distribution[0m:[36m337[0m - [1mFitted lognorm distribution with error=0.405244)[0m
[32m2024-04-22 21:38:14.893[0m | [1mINFO    [0m | [36mfitter.fitter[0m:[36m_fit_si

TotalQuantity
Best Distribution: norm

###### Age ######


[32m2024-04-22 21:38:15.153[0m | [1mINFO    [0m | [36mfitter.fitter[0m:[36m_fit_single_distribution[0m:[36m337[0m - [1mFitted gamma distribution with error=29.822207)[0m
[32m2024-04-22 21:38:15.173[0m | [1mINFO    [0m | [36mfitter.fitter[0m:[36m_fit_single_distribution[0m:[36m337[0m - [1mFitted chi2 distribution with error=57.061904)[0m
[32m2024-04-22 21:38:15.203[0m | [1mINFO    [0m | [36mfitter.fitter[0m:[36m_fit_single_distribution[0m:[36m337[0m - [1mFitted exponpow distribution with error=26.54599)[0m
[32m2024-04-22 21:38:15.238[0m | [1mINFO    [0m | [36mfitter.fitter[0m:[36m_fit_single_distribution[0m:[36m337[0m - [1mFitted expon distribution with error=385.302501)[0m
[32m2024-04-22 21:38:15.247[0m | [1mINFO    [0m | [36mfitter.fitter[0m:[36m_fit_single_distribution[0m:[36m337[0m - [1mFitted norm distribution with error=503.444344)[0m
[32m2024-04-22 21:38:15.265[0m | [1mINFO    [0m | [36mfitter.fitter[0m:[36m_fit_s

Age
Best Distribution: powerlaw

###### NumOfAccounts ######


[32m2024-04-22 21:38:15.514[0m | [1mINFO    [0m | [36mfitter.fitter[0m:[36m_fit_single_distribution[0m:[36m337[0m - [1mFitted cauchy distribution with error=547.186063)[0m
[32m2024-04-22 21:38:15.552[0m | [1mINFO    [0m | [36mfitter.fitter[0m:[36m_fit_single_distribution[0m:[36m337[0m - [1mFitted chi2 distribution with error=319.115718)[0m
[32m2024-04-22 21:38:15.556[0m | [1mINFO    [0m | [36mfitter.fitter[0m:[36m_fit_single_distribution[0m:[36m337[0m - [1mFitted gamma distribution with error=391.98158)[0m
[32m2024-04-22 21:38:15.583[0m | [1mINFO    [0m | [36mfitter.fitter[0m:[36m_fit_single_distribution[0m:[36m337[0m - [1mFitted exponpow distribution with error=352.323657)[0m
[32m2024-04-22 21:38:15.617[0m | [1mINFO    [0m | [36mfitter.fitter[0m:[36m_fit_single_distribution[0m:[36m337[0m - [1mFitted expon distribution with error=1394.693367)[0m
[32m2024-04-22 21:38:15.621[0m | [1mINFO    [0m | [36mfitter.fitter[0m:[36m

NumOfAccounts
Best Distribution: chi2

###### NumOfClosedAccounts ######


[32m2024-04-22 21:38:16.014[0m | [1mINFO    [0m | [36mfitter.fitter[0m:[36m_fit_single_distribution[0m:[36m337[0m - [1mFitted cauchy distribution with error=4966.577414)[0m
[32m2024-04-22 21:38:16.124[0m | [1mINFO    [0m | [36mfitter.fitter[0m:[36m_fit_single_distribution[0m:[36m337[0m - [1mFitted chi2 distribution with error=3967.455854)[0m
[32m2024-04-22 21:38:16.129[0m | [1mINFO    [0m | [36mfitter.fitter[0m:[36m_fit_single_distribution[0m:[36m337[0m - [1mFitted gamma distribution with error=3517.447596)[0m
[32m2024-04-22 21:38:16.193[0m | [1mINFO    [0m | [36mfitter.fitter[0m:[36m_fit_single_distribution[0m:[36m337[0m - [1mFitted exponpow distribution with error=3559.219944)[0m
[32m2024-04-22 21:38:16.206[0m | [1mINFO    [0m | [36mfitter.fitter[0m:[36m_fit_single_distribution[0m:[36m337[0m - [1mFitted powerlaw distribution with error=4937.480197)[0m
[32m2024-04-22 21:38:16.259[0m | [1mINFO    [0m | [36mfitter.fitter[

NumOfClosedAccounts
Best Distribution: expon

###### NumOfCompletedOrders ######


[32m2024-04-22 21:38:16.661[0m | [1mINFO    [0m | [36mfitter.fitter[0m:[36m_fit_single_distribution[0m:[36m337[0m - [1mFitted gamma distribution with error=2.344221)[0m
[32m2024-04-22 21:38:16.814[0m | [1mINFO    [0m | [36mfitter.fitter[0m:[36m_fit_single_distribution[0m:[36m337[0m - [1mFitted powerlaw distribution with error=3.268166)[0m
[32m2024-04-22 21:38:16.832[0m | [1mINFO    [0m | [36mfitter.fitter[0m:[36m_fit_single_distribution[0m:[36m337[0m - [1mFitted chi2 distribution with error=2.574435)[0m
[32m2024-04-22 21:38:16.894[0m | [1mINFO    [0m | [36mfitter.fitter[0m:[36m_fit_single_distribution[0m:[36m337[0m - [1mFitted exponpow distribution with error=2.775329)[0m
[32m2024-04-22 21:38:16.901[0m | [1mINFO    [0m | [36mfitter.fitter[0m:[36m_fit_single_distribution[0m:[36m337[0m - [1mFitted lognorm distribution with error=3.802378)[0m
[32m2024-04-22 21:38:16.946[0m | [1mINFO    [0m | [36mfitter.fitter[0m:[36m_fit_s

NumOfCompletedOrders
Best Distribution: rayleigh

###### NumOfCanceledOrders ######


[32m2024-04-22 21:38:17.491[0m | [1mINFO    [0m | [36mfitter.fitter[0m:[36m_fit_single_distribution[0m:[36m337[0m - [1mFitted gamma distribution with error=9.238079)[0m
[32m2024-04-22 21:38:17.519[0m | [1mINFO    [0m | [36mfitter.fitter[0m:[36m_fit_single_distribution[0m:[36m337[0m - [1mFitted chi2 distribution with error=13.440102)[0m
[32m2024-04-22 21:38:17.524[0m | [1mINFO    [0m | [36mfitter.fitter[0m:[36m_fit_single_distribution[0m:[36m337[0m - [1mFitted powerlaw distribution with error=16.77187)[0m
[32m2024-04-22 21:38:17.572[0m | [1mINFO    [0m | [36mfitter.fitter[0m:[36m_fit_single_distribution[0m:[36m337[0m - [1mFitted exponpow distribution with error=9.255947)[0m
[32m2024-04-22 21:38:17.593[0m | [1mINFO    [0m | [36mfitter.fitter[0m:[36m_fit_single_distribution[0m:[36m337[0m - [1mFitted lognorm distribution with error=17.410087)[0m
[32m2024-04-22 21:38:17.632[0m | [1mINFO    [0m | [36mfitter.fitter[0m:[36m_fit

NumOfCanceledOrders
Best Distribution: gamma

###### AvgOrderRate_Start ######


[32m2024-04-22 21:38:17.980[0m | [1mINFO    [0m | [36mfitter.fitter[0m:[36m_fit_single_distribution[0m:[36m337[0m - [1mFitted cauchy distribution with error=560.303193)[0m
[32m2024-04-22 21:38:18.190[0m | [1mINFO    [0m | [36mfitter.fitter[0m:[36m_fit_single_distribution[0m:[36m337[0m - [1mFitted gamma distribution with error=290.156162)[0m
[32m2024-04-22 21:38:18.217[0m | [1mINFO    [0m | [36mfitter.fitter[0m:[36m_fit_single_distribution[0m:[36m337[0m - [1mFitted chi2 distribution with error=381.88995)[0m
[32m2024-04-22 21:38:18.232[0m | [1mINFO    [0m | [36mfitter.fitter[0m:[36m_fit_single_distribution[0m:[36m337[0m - [1mFitted powerlaw distribution with error=553.038585)[0m
[32m2024-04-22 21:38:18.263[0m | [1mINFO    [0m | [36mfitter.fitter[0m:[36m_fit_single_distribution[0m:[36m337[0m - [1mFitted exponpow distribution with error=404.069173)[0m
[32m2024-04-22 21:38:18.295[0m | [1mINFO    [0m | [36mfitter.fitter[0m:[3

AvgOrderRate_Start
Best Distribution: expon

###### AvgOrderRate_End ######


[32m2024-04-22 21:38:18.891[0m | [1mINFO    [0m | [36mfitter.fitter[0m:[36m_fit_single_distribution[0m:[36m337[0m - [1mFitted gamma distribution with error=188.001679)[0m
[32m2024-04-22 21:38:18.919[0m | [1mINFO    [0m | [36mfitter.fitter[0m:[36m_fit_single_distribution[0m:[36m337[0m - [1mFitted chi2 distribution with error=267.472352)[0m
[32m2024-04-22 21:38:18.926[0m | [1mINFO    [0m | [36mfitter.fitter[0m:[36m_fit_single_distribution[0m:[36m337[0m - [1mFitted powerlaw distribution with error=416.886867)[0m
[32m2024-04-22 21:38:18.969[0m | [1mINFO    [0m | [36mfitter.fitter[0m:[36m_fit_single_distribution[0m:[36m337[0m - [1mFitted exponpow distribution with error=216.169786)[0m
[32m2024-04-22 21:38:18.988[0m | [1mINFO    [0m | [36mfitter.fitter[0m:[36m_fit_single_distribution[0m:[36m337[0m - [1mFitted lognorm distribution with error=435.297246)[0m
[32m2024-04-22 21:38:19.022[0m | [1mINFO    [0m | [36mfitter.fitter[0m:

AvgOrderRate_End
Best Distribution: cauchy

###### AvgQuantityOrderedRate_Start ######


[32m2024-04-22 21:38:19.387[0m | [1mINFO    [0m | [36mfitter.fitter[0m:[36m_fit_single_distribution[0m:[36m337[0m - [1mFitted cauchy distribution with error=21.248978)[0m
[32m2024-04-22 21:38:19.601[0m | [1mINFO    [0m | [36mfitter.fitter[0m:[36m_fit_single_distribution[0m:[36m337[0m - [1mFitted gamma distribution with error=15.527631)[0m
[32m2024-04-22 21:38:19.634[0m | [1mINFO    [0m | [36mfitter.fitter[0m:[36m_fit_single_distribution[0m:[36m337[0m - [1mFitted chi2 distribution with error=15.403487)[0m
[32m2024-04-22 21:38:19.645[0m | [1mINFO    [0m | [36mfitter.fitter[0m:[36m_fit_single_distribution[0m:[36m337[0m - [1mFitted powerlaw distribution with error=20.908158)[0m
[32m2024-04-22 21:38:19.686[0m | [1mINFO    [0m | [36mfitter.fitter[0m:[36m_fit_single_distribution[0m:[36m337[0m - [1mFitted exponpow distribution with error=13.636612)[0m
[32m2024-04-22 21:38:19.710[0m | [1mINFO    [0m | [36mfitter.fitter[0m:[36m_f

AvgQuantityOrderedRate_Start
Best Distribution: exponpow

###### AvgQuantityOrderedRate_End ######


[32m2024-04-22 21:38:20.309[0m | [1mINFO    [0m | [36mfitter.fitter[0m:[36m_fit_single_distribution[0m:[36m337[0m - [1mFitted gamma distribution with error=1.812934)[0m
[32m2024-04-22 21:38:20.371[0m | [1mINFO    [0m | [36mfitter.fitter[0m:[36m_fit_single_distribution[0m:[36m337[0m - [1mFitted chi2 distribution with error=1.331093)[0m
[32m2024-04-22 21:38:20.373[0m | [1mINFO    [0m | [36mfitter.fitter[0m:[36m_fit_single_distribution[0m:[36m337[0m - [1mFitted powerlaw distribution with error=2.379061)[0m
[32m2024-04-22 21:38:20.436[0m | [1mINFO    [0m | [36mfitter.fitter[0m:[36m_fit_single_distribution[0m:[36m337[0m - [1mFitted exponpow distribution with error=1.576526)[0m
[32m2024-04-22 21:38:20.445[0m | [1mINFO    [0m | [36mfitter.fitter[0m:[36m_fit_single_distribution[0m:[36m337[0m - [1mFitted lognorm distribution with error=2.671859)[0m
[32m2024-04-22 21:38:20.491[0m | [1mINFO    [0m | [36mfitter.fitter[0m:[36m_fit_s

AvgQuantityOrderedRate_End
Best Distribution: chi2

###### AvgQuantityOrderedRate_Difference ######


[32m2024-04-22 21:38:20.828[0m | [1mINFO    [0m | [36mfitter.fitter[0m:[36m_fit_single_distribution[0m:[36m337[0m - [1mFitted chi2 distribution with error=3.125585)[0m
[32m2024-04-22 21:38:20.831[0m | [1mINFO    [0m | [36mfitter.fitter[0m:[36m_fit_single_distribution[0m:[36m337[0m - [1mFitted gamma distribution with error=2.803067)[0m
[32m2024-04-22 21:38:20.907[0m | [1mINFO    [0m | [36mfitter.fitter[0m:[36m_fit_single_distribution[0m:[36m337[0m - [1mFitted exponpow distribution with error=2.902819)[0m
[32m2024-04-22 21:38:20.934[0m | [1mINFO    [0m | [36mfitter.fitter[0m:[36m_fit_single_distribution[0m:[36m337[0m - [1mFitted lognorm distribution with error=1.755301)[0m
[32m2024-04-22 21:38:20.966[0m | [1mINFO    [0m | [36mfitter.fitter[0m:[36m_fit_single_distribution[0m:[36m337[0m - [1mFitted norm distribution with error=11.771175)[0m
[32m2024-04-22 21:38:20.968[0m | [1mINFO    [0m | [36mfitter.fitter[0m:[36m_fit_sing

AvgQuantityOrderedRate_Difference
Best Distribution: lognorm

###### AvgQuantityPerAccount_Start ######


[32m2024-04-22 21:38:21.269[0m | [1mINFO    [0m | [36mfitter.fitter[0m:[36m_fit_single_distribution[0m:[36m337[0m - [1mFitted cauchy distribution with error=12.162131)[0m
[32m2024-04-22 21:38:21.459[0m | [1mINFO    [0m | [36mfitter.fitter[0m:[36m_fit_single_distribution[0m:[36m337[0m - [1mFitted gamma distribution with error=5.583041)[0m
[32m2024-04-22 21:38:21.508[0m | [1mINFO    [0m | [36mfitter.fitter[0m:[36m_fit_single_distribution[0m:[36m337[0m - [1mFitted chi2 distribution with error=8.629339)[0m
[32m2024-04-22 21:38:21.521[0m | [1mINFO    [0m | [36mfitter.fitter[0m:[36m_fit_single_distribution[0m:[36m337[0m - [1mFitted powerlaw distribution with error=11.967854)[0m
[32m2024-04-22 21:38:21.566[0m | [1mINFO    [0m | [36mfitter.fitter[0m:[36m_fit_single_distribution[0m:[36m337[0m - [1mFitted exponpow distribution with error=8.017303)[0m
[32m2024-04-22 21:38:21.584[0m | [1mINFO    [0m | [36mfitter.fitter[0m:[36m_fit_

AvgQuantityPerAccount_Start
Best Distribution: gamma

###### AvgQuantityPerAccount_End ######


[32m2024-04-22 21:38:22.152[0m | [1mINFO    [0m | [36mfitter.fitter[0m:[36m_fit_single_distribution[0m:[36m337[0m - [1mFitted gamma distribution with error=1.34664)[0m
[32m2024-04-22 21:38:22.204[0m | [1mINFO    [0m | [36mfitter.fitter[0m:[36m_fit_single_distribution[0m:[36m337[0m - [1mFitted powerlaw distribution with error=1.59123)[0m
[32m2024-04-22 21:38:22.218[0m | [1mINFO    [0m | [36mfitter.fitter[0m:[36m_fit_single_distribution[0m:[36m337[0m - [1mFitted chi2 distribution with error=1.58537)[0m
[32m2024-04-22 21:38:22.268[0m | [1mINFO    [0m | [36mfitter.fitter[0m:[36m_fit_single_distribution[0m:[36m337[0m - [1mFitted exponpow distribution with error=1.145041)[0m
[32m2024-04-22 21:38:22.288[0m | [1mINFO    [0m | [36mfitter.fitter[0m:[36m_fit_single_distribution[0m:[36m337[0m - [1mFitted lognorm distribution with error=1.761873)[0m
[32m2024-04-22 21:38:22.328[0m | [1mINFO    [0m | [36mfitter.fitter[0m:[36m_fit_sing

AvgQuantityPerAccount_End
Best Distribution: exponpow

###### AvgQuantityPerAccount_Difference ######


[32m2024-04-22 21:38:22.685[0m | [1mINFO    [0m | [36mfitter.fitter[0m:[36m_fit_single_distribution[0m:[36m337[0m - [1mFitted lognorm distribution with error=0.988918)[0m
[32m2024-04-22 21:38:22.712[0m | [1mINFO    [0m | [36mfitter.fitter[0m:[36m_fit_single_distribution[0m:[36m337[0m - [1mFitted chi2 distribution with error=0.969537)[0m
[32m2024-04-22 21:38:22.749[0m | [1mINFO    [0m | [36mfitter.fitter[0m:[36m_fit_single_distribution[0m:[36m337[0m - [1mFitted exponpow distribution with error=0.949526)[0m
[32m2024-04-22 21:38:22.758[0m | [1mINFO    [0m | [36mfitter.fitter[0m:[36m_fit_single_distribution[0m:[36m337[0m - [1mFitted gamma distribution with error=0.969788)[0m


AvgQuantityPerAccount_Difference
Best Distribution: norm

{'NumOfCompletedOrders': 'rayleigh', 'NumOfCanceledOrders': 'gamma', 'AvgPrice': 'cauchy', 'TotalExecutedQuantity': 'cauchy', 'TotalQuantity': 'norm', 'Age': 'powerlaw', 'NumOfAccounts': 'chi2', 'NumOfClosedAccounts': 'expon', 'AvgOrderRate_Start': 'expon', 'AvgOrderRate_End': 'cauchy', 'AvgQuantityOrderedRate_Start': 'exponpow', 'AvgQuantityOrderedRate_End': 'chi2', 'AvgQuantityOrderedRate_Difference': 'lognorm', 'AvgQuantityPerAccount_Start': 'gamma', 'AvgQuantityPerAccount_End': 'exponpow', 'AvgQuantityPerAccount_Difference': 'norm'}


## Saving


Make the Churned column to be the last column


In [102]:
cols = list(df.columns)
cols.append(cols.pop(cols.index("Churned")))
df = df[cols]

Display the head of the dataframe


In [103]:
df.head()

Unnamed: 0,ClientID,Age,NumOfAccounts,NumOfClosedAccounts,NumOfCompletedOrders,NumOfCanceledOrders,TotalExecutedQuantity,TotalQuantity,AvgPrice,OrderType_Buy,...,LastOpenAccountDate_Year_2024,FirstOrderDateAcrossAccounts_Year_0,FirstOrderDateAcrossAccounts_Year_2022,FirstOrderDateAcrossAccounts_Year_2023,FirstOrderDateAcrossAccounts_Year_2024,LastOrderDateAcrossAccounts_Year_0,LastOrderDateAcrossAccounts_Year_2022,LastOrderDateAcrossAccounts_Year_2023,LastOrderDateAcrossAccounts_Year_2024,Churned
0,0,1.509918,0.526589,0.0,0.741276,0.0,9.392745,-0.527633,1.21,1.485619,...,0,0,1,0,0,0,1,0,0,0
1,1,1.63137,0.526589,0.0,1.080418,0.526589,8.377471,-0.768427,1.982429,0.0,...,0,0,1,0,0,0,1,0,0,1
2,2,1.603479,0.526589,0.0,1.358505,0.741276,11.727198,0.074898,7.652667,13.370571,...,0,0,0,0,1,0,0,0,1,0
3,3,1.663294,0.526589,0.0,1.643721,1.248441,13.433575,0.542455,72.059886,30.45519,...,0,0,0,0,1,0,0,0,1,0
4,4,1.583341,0.741276,0.0,1.291725,1.248441,12.907934,0.458533,1.572133,1.857024,...,0,0,0,0,1,0,0,0,1,0


Display the info of the dataframe


In [104]:
df.info(verbose=True)

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 9123 entries, 0 to 9122
Data columns (total 447 columns):
 #    Column                                  Dtype  
---   ------                                  -----  
 0    ClientID                                int64  
 1    Age                                     float64
 2    NumOfAccounts                           float64
 3    NumOfClosedAccounts                     float64
 4    NumOfCompletedOrders                    float64
 5    NumOfCanceledOrders                     float64
 6    TotalExecutedQuantity                   float64
 7    TotalQuantity                           float64
 8    AvgPrice                                float64
 9    OrderType_Buy                           float64
 10   OrderType_Sell                          float64
 11   SecurityID_0                            float64
 12   SecurityID_1                            float64
 13   SecurityID_2                            float64
 14   SecurityID_3          

Save the dataframe into a csv "cleaned_dataset.csv"


In [105]:
df.to_csv("../Data/cleaned_dataset.csv", index=False)