# Data Cleaning and Preprocessing


Imports


In [356]:
import pandas as pd
import numpy as np
from fitter import Fitter, get_common_distributions
from sklearn.preprocessing import StandardScaler
from sklearn.preprocessing import MinMaxScaler
import warnings
from datetime import datetime, timedelta
from scipy import stats

warnings.filterwarnings("ignore")

pd.set_option("future.no_silent_downcasting", True)

Read the orders dataset into a pandas dataframe


In [399]:
orders_df = pd.read_csv("../Data/orders_data_competition.csv")

Read the clients dataset into a pandas dataframe


In [400]:
clients_df = pd.read_csv("../Data/clients_data_competition.csv")

## Account Level Dataset


### Orders Data


**_We created another dataframe "agg_orders" which represents the aggregate orders of each account_**


Drop the rows with null values


In [401]:
orders_df.dropna(inplace=True)

In [402]:
orders_df = orders_df[orders_df["Order Via"] == "Online"]

In [403]:
orders_df.drop(columns=["Order Via"], inplace=True)

In [404]:
clients_df = clients_df[clients_df["Client Type Name"] == "Individuals"]

In [405]:
clients_df.drop(columns=["Client Type Name"], inplace=True)

Initialize a new Dataframe to store the aggregate orders per account


In [437]:
agg_orders = pd.DataFrame()

Add an AccountID column to the aggregate orders df


In [429]:
agg_orders["AccountID"] = clients_df["Account ID"].unique()

Compute and add the number of orders for each account


In [438]:
# Add a NumOfOrders column that has the number of orders for each account
agg_orders["NumOfOrders"] = orders_df.groupby("Account ID").size()

# Set the number of orders for the accounts that didn't place any order to 0
agg_orders["NumOfOrders"] = agg_orders["NumOfOrders"].fillna(0)

# Convert the type to int
agg_orders["NumOfOrders"] = agg_orders["NumOfOrders"].astype(int)

Compute and add the last order date for each account


In [439]:
# Convert the 'order_date' column to datetime
orders_df["Order Time"] = pd.to_datetime(orders_df["Order Time"])

# Create a new dataframe with the last order date for each account
agg_orders["LastOrder"] = orders_df.groupby("Account ID")["Order Time"].max()

# Convert the LastOrder column to datetime
agg_orders["LastOrder"] = pd.to_datetime(agg_orders["LastOrder"], format="%d-%m-%Y")

Compute and add the first order date for each account


In [364]:
# Create a new dataframe with the last order date for each account
agg_orders["FirstOrder"] = orders_df.groupby("Account ID")["Order Time"].min()

# Convert the LastOrder column to datetime
agg_orders["FirstOrder"] = pd.to_datetime(agg_orders["FirstOrder"], format="%d-%m-%Y")

Compute and add the number of completed orders for each account


In [365]:
# Add a NumOfCompleted column that has the number of completed orders for each account
agg_orders["NumOfCompleted"] = orders_df.groupby("Account ID")["Is Completed"].sum()

# Set the number of completed orders for the accounts that didn't place any order to 0
agg_orders["NumOfCompleted"] = agg_orders["NumOfCompleted"].fillna(0)

# # Convert the type to int
agg_orders["NumOfCompleted"] = agg_orders["NumOfCompleted"].astype(int)

Compute and add the number of canceled for each account


In [440]:
# Add a NumOfCanceled column that has the number of completed orders for each account
agg_orders["NumOfCanceled"] = orders_df.groupby("Account ID")["Is Canceled"].sum()

# Set the number of completed orders for the accounts that didn't place any order to 0
agg_orders["NumOfCanceled"] = agg_orders["NumOfCanceled"].fillna(0)

# # Convert the type to int
agg_orders["NumOfCanceled"] = agg_orders["NumOfCanceled"].astype(int)

Compute and add the average price of orders for each account


In [441]:
# Add a AvgPrice column that has the number of completed orders for each account
agg_orders["AvgPrice"] = (
    orders_df.groupby("Account ID")["Price"].sum()
    / orders_df.groupby("Account ID").size()
)

# Set the number of completed orders for the accounts that didn't place any order to 0
agg_orders["AvgPrice"] = agg_orders["AvgPrice"].fillna(0.0)

Compute and add the total executed quantity of orders for each account


In [442]:
# Add a TotalExecutedQuantity column that has the number of completed orders for each account
agg_orders["TotalExecutedQuantity"] = orders_df.groupby("Account ID")[
    "Executed Quantity"
].sum()

# Set the number of completed orders for the accounts that didn't place any order to 0
agg_orders["TotalExecutedQuantity"] = agg_orders["TotalExecutedQuantity"].fillna(0)

# # Convert the type to int
agg_orders["TotalExecutedQuantity"] = agg_orders["TotalExecutedQuantity"].astype(int)

Compute and add the total quantity of orders for each account


In [443]:
# Add a TotalQuantity column that has the number of completed orders for each account
agg_orders["TotalQuantity"] = orders_df.groupby("Account ID")["Quantity"].sum()

# Set the number of completed orders for the accounts that didn't place any order to 0
agg_orders["TotalQuantity"] = agg_orders["TotalQuantity"].fillna(0)

# # Convert the type to int
agg_orders["TotalQuantity"] = agg_orders["TotalQuantity"].astype(int)

Define a function to compute and return an IDF vector of a data


In [444]:
def get_IDF_vector(df):
    idf = {}
    for i in df.columns:
        idf[i] = (
            np.log2(df.shape[0] / df[df[i] > 0].shape[0])
            if df[df[i] > 0].shape[0] != 0
            else 1e-6
        )
    return idf

Define a function to compute and return the TFIDF dataframe for the data


In [445]:
def calculate_TFIDF(df, idf_dict):
    tfidf_df = df.copy()
    for i in df.columns:
        tfidf_df[i] = df[i] * idf_dict[i]
    return tfidf_df

Define a function to convert a categorical feature to its TFIDF dataframe and concatenate it with the aggregate orders dataframe


In [446]:
def TFIDF(column_name: str):
    temp = pd.get_dummies(orders_df, columns=[column_name])
    temp.replace({True: 1, False: 0}, inplace=True)

    tfidf_df = pd.DataFrame(data=agg_orders.iloc[:, 0])

    for word in orders_df[column_name].unique():
        col_name = f"{column_name}_{word}"

        tfidf_df[col_name.replace(" ", "")] = temp.groupby("Account ID")[col_name].sum()

        tfidf_df[col_name.replace(" ", "")] = tfidf_df[
            col_name.replace(" ", "")
        ].fillna(0)

        tfidf_df[col_name.replace(" ", "")] = tfidf_df[
            col_name.replace(" ", "")
        ].astype(int)

    idf_dict = get_IDF_vector(tfidf_df.iloc[:, 1:])
    tfidf_df = calculate_TFIDF(tfidf_df.iloc[:, 1:], idf_dict)
    return pd.concat([agg_orders, tfidf_df], axis=1)

Compute and add the TFIDF of the Security ID column


In [448]:
agg_orders = TFIDF("Security ID")

Compute and add the TFIDF of the Order Via column


In [449]:
agg_orders = TFIDF("Order Type")

Compute and add the TFIDF of the Execution Status column


In [450]:
agg_orders = TFIDF("Execution Status")

Compute and add the TFIDF of the Sector Name column


In [451]:
agg_orders = TFIDF("Sector Name")

Display the head of the agg_orders dataframe


In [452]:
agg_orders.head()

Unnamed: 0_level_0,NumOfOrders,LastOrder,NumOfCanceled,AvgPrice,TotalExecutedQuantity,TotalQuantity,SecurityID_0,SecurityID_1,SecurityID_2,SecurityID_3,...,SectorName_INVESTMENT,SectorName_TelecommunicationServices,SectorName_REALESTATE,SectorName_Telecommunications,SectorName_FOOD,SectorName_Others,SectorName_Tourism,SectorName_ConsumerServices,SectorName_Utilities,SectorName_PharmaceuticalIndustries
Account ID,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
2,21,2024-02-28 11:06:33,2,7.652667,123895,137395,0.0,0.0,0.0,0.0,...,2.477111,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3,87,2024-03-10 12:32:34,11,71.908161,682538,908478,0.0,5.647045,5.283509,0.0,...,0.0,4.973598,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
4,30,2024-02-26 12:47:33,10,2.846267,356686,594953,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
7,28,2024-03-10 13:47:43,6,49.255714,18505,29253,9.128765,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
12,43,2024-03-03 10:46:33,6,20.552953,117399,148754,0.0,0.0,0.0,0.0,...,16.101221,0.0,0.0,0.0,2.0,0.0,0.0,0.0,0.0,0.0


Display the info of the agg_orders dataframe


In [453]:
agg_orders.info()

<class 'pandas.core.frame.DataFrame'>
Index: 6732 entries, 2 to 13521
Columns: 338 entries, NumOfOrders to SectorName_PharmaceuticalIndustries
dtypes: datetime64[ns](1), float64(333), int64(4)
memory usage: 17.4 MB


### Clients Data


Drop the rows with null values


In [375]:
clients_df.dropna(inplace=True)

In [376]:
clients_df.info(verbose=True)

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 13523 entries, 0 to 13522
Data columns (total 12 columns):
 #   Column                Non-Null Count  Dtype  
---  ------                --------------  -----  
 0   Client ID             13523 non-null  int64  
 1   Account ID            13523 non-null  int64  
 2   Gender                13523 non-null  object 
 3   Risk Rate             13523 non-null  object 
 4   Company Name          13523 non-null  object 
 5   Is Closed             13523 non-null  int64  
 6   Is Dormant            13523 non-null  float64
 7   Is Profile Suspended  13523 non-null  int64  
 8   Is Client Suspended   13523 non-null  int64  
 9   Client Type Name      13523 non-null  object 
 10  OpenDate              13523 non-null  object 
 11  BirthDate             13523 non-null  object 
dtypes: float64(1), int64(5), object(6)
memory usage: 1.2+ MB


Convert the OpenDate column to DateTime object


In [377]:
# Convert the OpenDate column to datetime
clients_df["OpenDate"] = pd.to_datetime(clients_df["OpenDate"], format="%m/%d/%Y")

Convert the BirthDate column to DateTime object, and calculate the age of the client


In [378]:
# Convert the BirthDate column to datetime
clients_df["BirthDate"] = pd.to_datetime(clients_df["BirthDate"], format="%Y-%m-%d")

In [379]:
def calculate_age(birth_date):
    # Assuming birth_date is a datetime object
    today = datetime.today()
    return (
        today.year
        - birth_date.year
        - ((today.month, today.day) < (birth_date.month, birth_date.day))
    )

In [380]:
clients_df["Age"] = clients_df["BirthDate"].apply(calculate_age)

Remove all whitespaces in the column names


In [381]:
clients_df.rename(columns=lambda x: x.replace(" ", ""), inplace=True)

Display the head of the clients dataframe


In [382]:
clients_df.head()

Unnamed: 0,ClientID,AccountID,Gender,RiskRate,CompanyName,IsClosed,IsDormant,IsProfileSuspended,IsClientSuspended,ClientTypeName,OpenDate,BirthDate,Age
0,0,0,Male,Low,HSB,0,0.0,0,0,Individuals,2014-02-16,1990-07-27,33
1,1,1,Female,Low,HSB,0,-1.0,1,0,Individuals,2014-02-16,1963-10-22,60
2,2,2,Male,Low,HSB,0,0.0,0,0,Individuals,2014-02-17,1971-05-14,52
3,3,3,Male,Low,HSB,0,0.0,0,0,Individuals,2014-02-17,1953-01-14,71
4,4,4,Male,Low,HSB,0,0.0,0,0,Individuals,2014-02-17,1976-06-09,47


Display the info of the clients dataframe


In [383]:
clients_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 13523 entries, 0 to 13522
Data columns (total 13 columns):
 #   Column              Non-Null Count  Dtype         
---  ------              --------------  -----         
 0   ClientID            13523 non-null  int64         
 1   AccountID           13523 non-null  int64         
 2   Gender              13523 non-null  object        
 3   RiskRate            13523 non-null  object        
 4   CompanyName         13523 non-null  object        
 5   IsClosed            13523 non-null  int64         
 6   IsDormant           13523 non-null  float64       
 7   IsProfileSuspended  13523 non-null  int64         
 8   IsClientSuspended   13523 non-null  int64         
 9   ClientTypeName      13523 non-null  object        
 10  OpenDate            13523 non-null  datetime64[ns]
 11  BirthDate           13523 non-null  datetime64[ns]
 12  Age                 13523 non-null  int64         
dtypes: datetime64[ns](2), float64(1), int64(6), ob

### Merging


**_We will inner join the agg_orders and clients_df dataframes on the AccountID column_**


Perform the merging between both dataframes


In [384]:
df_account_level = pd.merge(clients_df, agg_orders, on="AccountID", how="inner")

In [385]:
# Get the current date
now = datetime.now()

In [386]:
order_rates_start = []
order_rates_end = []

In [387]:
for i, account in df_account_level.iterrows():
    midpointDate = account["OpenDate"] + (now - account["OpenDate"]) / 2
    openDate = account["OpenDate"]

    days_open_midpoint = (midpointDate - openDate).days
    days_midpoint_now = (now - midpointDate).days

    num_of_orders_start = len(
        orders_df[
            (orders_df["Order Time"] >= openDate)
            & (orders_df["Order Time"] < midpointDate)
            & (orders_df["Account ID"] == account["AccountID"])
        ]
    )

    num_of_orders_end = len(
        orders_df[
            (orders_df["Order Time"] >= midpointDate)
            & (orders_df["Order Time"] < now)
            & (orders_df["Account ID"] == account["AccountID"])
        ]
    )

    order_rates_start.append(num_of_orders_start / days_open_midpoint)
    order_rates_end.append(num_of_orders_end / days_midpoint_now)

In [388]:
df_account_level["OrderRate_Start"] = pd.Series(order_rates_start)
df_account_level["OrderRate_End"] = pd.Series(order_rates_end)

In [389]:
quantity_rates_start = []
quantity_rates_end = []

In [390]:
for i, account in df_account_level.iterrows():
    midpointDate = account["OpenDate"] + (now - account["OpenDate"]) / 2
    openDate = account["OpenDate"]

    days_open_midpoint = (midpointDate - openDate).days
    days_midpoint_now = (now - midpointDate).days

    quantity_ordered_start = orders_df[
        (orders_df["Order Time"] >= openDate)
        & (orders_df["Order Time"] < midpointDate)
        & (orders_df["Account ID"] == account["AccountID"])
    ]["Quantity"].sum()

    quantity_ordered_end = orders_df[
        (orders_df["Order Time"] >= midpointDate)
        & (orders_df["Order Time"] < now)
        & (orders_df["Account ID"] == account["AccountID"])
    ]["Quantity"].sum()

    quantity_rates_start.append(quantity_ordered_start / days_open_midpoint)
    quantity_rates_end.append(quantity_ordered_end / days_midpoint_now)

In [391]:
df_account_level["QuantityOrderedRate_Start"] = pd.Series(quantity_rates_start)
df_account_level["QuantityOrderedRate_End"] = pd.Series(quantity_rates_end)

In [392]:
avg_quantity_per_order_start = []
avg_quantity_per_order_end = []

In [393]:
for i, account in df_account_level.iterrows():
    midpointDate = account["OpenDate"] + (now - account["OpenDate"]) / 2
    openDate = account["OpenDate"]

    avg_quantity_start = orders_df[
        (orders_df["Order Time"] >= openDate)
        & (orders_df["Order Time"] < midpointDate)
        & (orders_df["Account ID"] == account["AccountID"])
    ]["Quantity"].mean()

    avg_quantity_end = orders_df[
        (orders_df["Order Time"] >= midpointDate)
        & (orders_df["Order Time"] < now)
        & (orders_df["Account ID"] == account["AccountID"])
    ]["Quantity"].mean()

    avg_quantity_per_order_start.append(avg_quantity_start)
    avg_quantity_per_order_end.append(avg_quantity_end)

In [394]:
df_account_level["AvgQuantityPerOrder_Start"] = pd.Series(avg_quantity_per_order_start)
df_account_level["AvgQuantityPerOrder_End"] = pd.Series(avg_quantity_per_order_end)

In [395]:
df_account_level["AvgQuantityPerOrder_Start"] = df_account_level[
    "AvgQuantityPerOrder_Start"
].fillna(0)
df_account_level["AvgQuantityPerOrder_End"] = df_account_level[
    "AvgQuantityPerOrder_End"
].fillna(0)

In [396]:
df_account_level["ExecutedQuantityRatio"] = df_account_level.apply(
    lambda row: (
        1
        if row["TotalQuantity"] == 0
        else row["TotalExecutedQuantity"] / row["TotalQuantity"]
    ),
    axis=1,
)

Display the head of the dataframe


In [397]:
df_account_level.iloc[0:15]

Unnamed: 0,ClientID,AccountID,Gender,RiskRate,CompanyName,IsClosed,IsDormant,IsProfileSuspended,IsClientSuspended,ClientTypeName,...,AvgPrice,TotalExecutedQuantity,TotalQuantity,OrderRate_Start,OrderRate_End,QuantityOrderedRate_Start,QuantityOrderedRate_End,AvgQuantityPerOrder_Start,AvgQuantityPerOrder_End,ExecutedQuantityRatio
0,0,0,Male,Low,HSB,0,0.0,0,0,Individuals,...,1.21,12000,12000,0.0,0.001076,0.0,6.455083,0.0,6000.0,1.0
1,1,1,Female,Low,HSB,0,-1.0,1,0,Individuals,...,1.982429,4347,4529,0.0,0.003765,0.0,2.436256,0.0,647.0,0.959815
2,2,2,Male,Low,HSB,0,0.0,0,0,Individuals,...,7.652667,123895,137395,0.0,0.011302,0.0,73.947793,0.0,6542.619048,0.901743
3,3,3,Male,Low,HSB,0,0.0,0,0,Individuals,...,72.059886,682538,911078,0.0,0.047363,0.0,490.354144,0.0,10353.159091,0.749154
4,4,4,Male,Low,HSB,0,0.0,0,0,Individuals,...,2.846267,356686,594953,0.0,0.016146,0.0,320.211518,0.0,19831.766667,0.59952
5,5,5,Female,Low,HSB,0,0.0,0,0,Individuals,...,7.186667,743,788,0.0,0.000943,0.0,0.247721,0.0,262.666667,0.942893
6,6,6,Male,Low,HSB,0,0.0,0,0,Individuals,...,0.0,0,0,0.0,0.0,0.0,0.0,0.0,0.0,1.0
7,7,7,Male,Not Assigned,HSB,0,0.0,0,0,Individuals,...,49.255714,18505,29253,0.568182,0.068182,567.340909,97.5,998.52,1430.0,0.632585
8,8,8,Male,Medium,HSB,0,0.0,0,0,Individuals,...,0.0,0,0,0.0,0.0,0.0,0.0,0.0,0.0,1.0
9,9,9,Male,Not Assigned,OLT INVESTMENT INTERNATIONAL COMPANY,0,0.0,0,0,Individuals,...,0.0,0,0,0.0,0.0,0.0,0.0,0.0,0.0,1.0


Display the info of the dataframe


In [398]:
df_account_level.info(verbose=True)

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 13523 entries, 0 to 13522
Data columns (total 28 columns):
 #   Column                     Non-Null Count  Dtype         
---  ------                     --------------  -----         
 0   ClientID                   13523 non-null  int64         
 1   AccountID                  13523 non-null  int64         
 2   Gender                     13523 non-null  object        
 3   RiskRate                   13523 non-null  object        
 4   CompanyName                13523 non-null  object        
 5   IsClosed                   13523 non-null  int64         
 6   IsDormant                  13523 non-null  float64       
 7   IsProfileSuspended         13523 non-null  int64         
 8   IsClientSuspended          13523 non-null  int64         
 9   ClientTypeName             13523 non-null  object        
 10  OpenDate                   13523 non-null  datetime64[ns]
 11  BirthDate                  13523 non-null  datetime64[ns]
 12  Age 

## Client Level Dataset


In [320]:
df = pd.DataFrame()

In [355]:
df = pd.read_csv("../Data/visualization_data.csv")

In [321]:
df = (
    df_account_level.groupby("ClientID")
    .agg(
        {
            "Gender": "first",
            "Age": "first",
            "ClientTypeName": "first",
            "IsClientSuspended": "first",
            "RiskRate": "first",
        },
    )
    .reset_index()
)

In [322]:
df["NumOfAccounts"] = df_account_level.groupby("ClientID").size()

In [323]:
df["NumOfClosedAccounts"] = df_account_level.groupby("ClientID")["IsClosed"].sum()

In [324]:
df["NumOfSuspendedAccounts"] = df_account_level.groupby("ClientID")[
    "IsProfileSuspended"
].sum()

In [325]:
df["NumOfOrders"] = df_account_level.groupby("ClientID")["NumOfOrders"].sum()

In [326]:
df["NumOfCompletedOrders"] = df_account_level.groupby("ClientID")[
    "NumOfCompleted"
].sum()

In [327]:
df["NumOfCanceledOrders"] = df_account_level.groupby("ClientID")["NumOfCanceled"].sum()

In [328]:
df["TotalExecutedQuantity"] = df_account_level.groupby("ClientID")[
    "TotalExecutedQuantity"
].sum()

In [329]:
df["TotalQuantity"] = df_account_level.groupby("ClientID")["TotalQuantity"].sum()

In [330]:
df["AvgPrice"] = (
    df_account_level.groupby("ClientID")["AvgPrice"].sum()
    / df_account_level.groupby("ClientID").size()
)

In [331]:
agg_dict = {}

for col in df_account_level.columns:
    if (
        "OrderType" in col
        or "SecurityID" in col
        or "OrderVia" in col
        or "ExecutionStatus" in col
        or "SectorName" in col
    ):
        agg_dict[col] = (
            df_account_level.groupby("ClientID")[col].sum()
            / df_account_level.groupby("ClientID").size()
        )

df = pd.concat([df, pd.DataFrame(agg_dict)], axis=1)

In [332]:
df["FirstOpenAccountDate"] = df_account_level.groupby("ClientID")["OpenDate"].min()

In [333]:
df["LastOpenAccountDate"] = df_account_level.groupby("ClientID")["OpenDate"].max()

In [334]:
df["FirstOrderDateAcrossAccounts"] = df_account_level.groupby("ClientID")[
    "LastOrder"
].min()

df["FirstOrderDateAcrossAccounts"] = pd.to_datetime(
    df["FirstOrderDateAcrossAccounts"].dt.date
)

In [335]:
df["LastOrderDateAcrossAccounts"] = df_account_level.groupby("ClientID")[
    "LastOrder"
].max()

df["LastOrderDateAcrossAccounts"] = pd.to_datetime(
    df["LastOrderDateAcrossAccounts"].dt.date
)

Define a function so that we can compute if the account is dormant or not.

If the account has not placed any order, we set the Is Dormant value to -1, so that we can isolate the accounts who have not placed any orders.


In [336]:
def check_dormant(date):
    one_year_before_now = datetime.now() - timedelta(days=365)

    if date < one_year_before_now:
        return 1
    else:
        return 0

Recompute the Is Dormant column


In [337]:
df["IsDormant"] = df["LastOrderDateAcrossAccounts"].apply(check_dormant)

In [338]:
df["AvgOrderRate_Start"] = df_account_level.groupby("ClientID")[
    "OrderRate_Start"
].mean()

In [339]:
df["AvgOrderRate_End"] = df_account_level.groupby("ClientID")["OrderRate_End"].mean()

In [340]:
df["AvgOrderRate_Difference"] = df["AvgOrderRate_End"] - df["AvgOrderRate_Start"]

In [341]:
df["AvgQuantityOrderedRate_Start"] = df_account_level.groupby("ClientID")[
    "QuantityOrderedRate_Start"
].mean()

In [342]:
df["AvgQuantityOrderedRate_End"] = df_account_level.groupby("ClientID")[
    "QuantityOrderedRate_End"
].mean()

In [343]:
df["AvgQuantityOrderedRate_Difference"] = (
    df["AvgQuantityOrderedRate_End"] - df["AvgQuantityOrderedRate_Start"]
)

In [344]:
df["AvgQuantityPerAccount_Start"] = df_account_level.groupby("ClientID")[
    "AvgQuantityPerOrder_Start"
].mean()

In [345]:
df["AvgQuantityPerAccount_End"] = df_account_level.groupby("ClientID")[
    "AvgQuantityPerOrder_End"
].mean()

In [346]:
df["AvgQuantityPerAccount_Difference"] = (
    df["AvgQuantityPerAccount_End"] - df["AvgQuantityPerAccount_Start"]
)

In [347]:
df["AvgExecutedQuantityRatio"] = df_account_level.groupby("ClientID")[
    "ExecutedQuantityRatio"
].mean()

In [348]:
df["ClosedAccountsRatio"] = df["NumOfClosedAccounts"] / df["NumOfAccounts"]

In [349]:
df["SuspendedAccountsRatio"] = df["NumOfSuspendedAccounts"] / df["NumOfAccounts"]

In [350]:
df.info(verbose=True)

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 8871 entries, 0 to 8870
Data columns (total 31 columns):
 #   Column                             Non-Null Count  Dtype         
---  ------                             --------------  -----         
 0   ClientID                           8871 non-null   int64         
 1   Gender                             8871 non-null   object        
 2   Age                                8871 non-null   int64         
 3   IsClientSuspended                  8871 non-null   int64         
 4   RiskRate                           8871 non-null   object        
 5   NumOfAccounts                      8626 non-null   float64       
 6   NumOfClosedAccounts                8626 non-null   float64       
 7   NumOfSuspendedAccounts             8626 non-null   float64       
 8   NumOfOrders                        8626 non-null   float64       
 9   NumOfCompletedOrders               8626 non-null   float64       
 10  NumOfCanceledOrders                8

In [351]:
# Create the Year columns
df["FirstOpenAccountDate_Year"] = df["FirstOpenAccountDate"].dt.year

# Convert the type to int
df["FirstOpenAccountDate_Year"] = df["FirstOpenAccountDate_Year"].astype(int)

IntCastingNaNError: Cannot convert non-finite values (NA or inf) to integer

In [None]:
# Create the Year columns
df["LastOpenAccountDate_Year"] = df["LastOpenAccountDate"].dt.year

# Convert the type to int
df["LastOpenAccountDate_Year"] = df["LastOpenAccountDate_Year"].astype(int)

In [None]:
# Create the Year columns
df["FirstOrderDateAcrossAccounts_Year"] = df["FirstOrderDateAcrossAccounts"].dt.year

df["FirstOrderDateAcrossAccounts_Year"] = df[
    "FirstOrderDateAcrossAccounts_Year"
].fillna(0)

# Convert the type to int
df["FirstOrderDateAcrossAccounts_Year"] = df[
    "FirstOrderDateAcrossAccounts_Year"
].astype(int)

In [None]:
# Create the Year columns
df["LastOrderDateAcrossAccounts_Year"] = df["LastOrderDateAcrossAccounts"].dt.year

df["LastOrderDateAcrossAccounts_Year"] = df["LastOrderDateAcrossAccounts_Year"].fillna(
    0
)

# Convert the type to int
df["LastOrderDateAcrossAccounts_Year"] = df["LastOrderDateAcrossAccounts_Year"].astype(
    int
)

In [None]:
df.info(verbose=True)

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 9123 entries, 0 to 9122
Data columns (total 399 columns):
 #    Column                                Dtype         
---   ------                                -----         
 0    ClientID                              int64         
 1    Gender                                object        
 2    Age                                   int64         
 3    IsClientSuspended                     int64         
 4    ClientTypeName                        object        
 5    RiskRate                              object        
 6    NumOfAccounts                         int64         
 7    NumOfClosedAccounts                   int64         
 8    NumOfSuspendedAccounts                int64         
 9    NumOfOrders                           int64         
 10   NumOfCompletedOrders                  int64         
 11   NumOfCanceledOrders                   int64         
 12   TotalExecutedQuantity                 int64         
 13   T

## Defining The Label


**_We defined our label to be a combination of multiple features_**


Create the label


In [None]:
df["Churned"] = (
    (df["IsDormant"] == 1)
    & ((df["AvgOrderRate_Difference"] < 0) | (df["NumOfOrders"] == 0))
    | (df["RiskRate"] == "High") & (df["AvgExecutedQuantityRatio"] <= 0.5)
    | (df["ClosedAccountsRatio"] > 0.9)
    | (df["IsClientSuspended"] == 1)
    | (df["SuspendedAccountsRatio"] >= 0.5)
)

df = df.replace({True: 1, False: 0})

df["Churned"] = df["Churned"].astype(int)

In [None]:
df.to_csv("../Data/visualization_data.csv", index=False)

In [None]:
df.info(verbose=True)

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 9123 entries, 0 to 9122
Data columns (total 460 columns):
 #    Column                                  Dtype         
---   ------                                  -----         
 0    ClientID                                int64         
 1    Age                                     int64         
 2    IsClientSuspended                       int64         
 3    NumOfAccounts                           int64         
 4    NumOfClosedAccounts                     int64         
 5    NumOfSuspendedAccounts                  int64         
 6    NumOfOrders                             int64         
 7    NumOfCompletedOrders                    int64         
 8    NumOfCanceledOrders                     int64         
 9    TotalExecutedQuantity                   int64         
 10   TotalQuantity                           int64         
 11   AvgPrice                                float64       
 12   OrderType_Buy                   

## OHE


The columns to be one-hot encoded


In [None]:
columns = [
    "Gender",
    "RiskRate",
    "FirstOpenAccountDate_Year",
    "LastOpenAccountDate_Year",
    "FirstOrderDateAcrossAccounts_Year",
    "LastOrderDateAcrossAccounts_Year",
]

OHE these columns


In [None]:
df = pd.get_dummies(
    df,
    columns=columns,
    dtype=int,
)

Drop the columns used for creating the label to remove bias, also drop the unnecessary columns


In [None]:
correlations = df.corr()["Churned"].drop("Churned")
highly_correlated_features = correlations[correlations.abs() > 0.6]

print(highly_correlated_features)

NumOfSuspendedAccounts    0.648417
Name: Churned, dtype: float64


In [None]:
df.drop(
    columns=[
        "AvgExecutedQuantityRatio",
        "IsDormant",
        "RiskRate_High",
        "ClosedAccountsRatio",
        "IsClientSuspended",
        "SuspendedAccountsRatio",
        "NumOfOrders",
        "AvgOrderRate_Difference",
        "FirstOpenAccountDate",
        "LastOpenAccountDate",
        "FirstOrderDateAcrossAccounts",
        "LastOrderDateAcrossAccounts",
        "NumOfSuspendedAccounts",
        "CompanyName",
    ],
    inplace=True,
)

Remove all whitespaces in the column names


In [None]:
df.rename(columns=lambda x: x.replace(" ", ""), inplace=True)

## Normalization


The columns to be normalized


In [None]:
columns = [
    "NumOfCompletedOrders",
    "NumOfCanceledOrders",
    "AvgPrice",
    "TotalExecutedQuantity",
    "TotalQuantity",
    "Age",
    "NumOfAccounts",
    "NumOfClosedAccounts",
    "NumOfCompletedOrders",
    "NumOfCanceledOrders",
    "AvgOrderRate_Start",
    "AvgOrderRate_End",
    "AvgQuantityOrderedRate_Start",
    "AvgQuantityOrderedRate_End",
    "AvgQuantityOrderedRate_Difference",
    "AvgQuantityPerAccount_Start",
    "AvgQuantityPerAccount_End",
    "AvgQuantityPerAccount_Difference",
]

In [None]:
df = pd.read_csv("../Data/cleaned_dataset.csv")

Define a function to normalize the data based on a distribution


In [None]:
def get_normalized_data(data, dist):
    if dist == "uniform":
        return MinMaxScaler().fit_transform(data)
    elif dist == "norm":
        return StandardScaler().fit_transform(data)
    elif dist == "cauchy":
        ranked = stats.rankdata(data, method="average")
        # Convert ranks to percentiles
        percentiles = 100.0 * (ranked - 1) / (len(data) - 1)
        # Get the quantiles
        return np.percentile(data, percentiles)
    else:
        return np.log(np.abs(data.flatten()) + 1)

Define a function to get the best distribution that fits the data


In [None]:
def get_best_distribution(columns, df):
    columns_distributions_dict = {column: "" for column in columns}

    for column in columns:
        print("###### " + column + " ######")

        data = df[column].values

        f = Fitter(
            data,
            distributions=get_common_distributions(),
        )
        f.fit()
        f.summary(plot=False)
        dist = f.get_best(method="sumsquare_error")
        best_dist = ""
        for key in dist.keys():
            best_dist = key

        columns_distributions_dict[column] = str(best_dist)
        print(column)
        print(f"Best Distribution: {best_dist}")
        print()

    return columns_distributions_dict

Define a function to normalize the data based on its best distribution


In [None]:
def normalize(columns, df):
    columns_distributions_dict = get_best_distribution(columns, df)

    for column in columns_distributions_dict.keys():
        data = np.array(df[column]).reshape(-1, 1)
        df[column] = get_normalized_data(
            data=data, dist=columns_distributions_dict[column]
        )
    return columns_distributions_dict

Normalize the data


In [None]:
print(normalize(columns, df))

[32m2024-04-22 04:14:02.614[0m | [1mINFO    [0m | [36mfitter.fitter[0m:[36m_fit_single_distribution[0m:[36m337[0m - [1mFitted norm distribution with error=6e-05)[0m
[32m2024-04-22 04:14:02.617[0m | [1mINFO    [0m | [36mfitter.fitter[0m:[36m_fit_single_distribution[0m:[36m337[0m - [1mFitted expon distribution with error=1.2e-05)[0m
[32m2024-04-22 04:14:02.635[0m | [1mINFO    [0m | [36mfitter.fitter[0m:[36m_fit_single_distribution[0m:[36m337[0m - [1mFitted uniform distribution with error=7.5e-05)[0m
[32m2024-04-22 04:14:02.656[0m | [1mINFO    [0m | [36mfitter.fitter[0m:[36m_fit_single_distribution[0m:[36m337[0m - [1mFitted rayleigh distribution with error=5.5e-05)[0m
[32m2024-04-22 04:14:02.754[0m | [1mINFO    [0m | [36mfitter.fitter[0m:[36m_fit_single_distribution[0m:[36m337[0m - [1mFitted cauchy distribution with error=3.4e-05)[0m


###### NumOfCompletedOrders ######


[32m2024-04-22 04:14:03.285[0m | [1mINFO    [0m | [36mfitter.fitter[0m:[36m_fit_single_distribution[0m:[36m337[0m - [1mFitted gamma distribution with error=7.6e-05)[0m
[32m2024-04-22 04:14:03.353[0m | [1mINFO    [0m | [36mfitter.fitter[0m:[36m_fit_single_distribution[0m:[36m337[0m - [1mFitted chi2 distribution with error=4.4e-05)[0m
[32m2024-04-22 04:14:03.365[0m | [1mINFO    [0m | [36mfitter.fitter[0m:[36m_fit_single_distribution[0m:[36m337[0m - [1mFitted powerlaw distribution with error=7e-05)[0m
[32m2024-04-22 04:14:03.433[0m | [1mINFO    [0m | [36mfitter.fitter[0m:[36m_fit_single_distribution[0m:[36m337[0m - [1mFitted exponpow distribution with error=4.2e-05)[0m
[32m2024-04-22 04:14:03.464[0m | [1mINFO    [0m | [36mfitter.fitter[0m:[36m_fit_single_distribution[0m:[36m337[0m - [1mFitted lognorm distribution with error=7.5e-05)[0m
[32m2024-04-22 04:14:03.504[0m | [1mINFO    [0m | [36mfitter.fitter[0m:[36m_fit_single_d

NumOfCompletedOrders
Best Distribution: expon

###### NumOfCanceledOrders ######


[32m2024-04-22 04:14:04.178[0m | [1mINFO    [0m | [36mfitter.fitter[0m:[36m_fit_single_distribution[0m:[36m337[0m - [1mFitted gamma distribution with error=0.00039)[0m
[32m2024-04-22 04:14:04.237[0m | [1mINFO    [0m | [36mfitter.fitter[0m:[36m_fit_single_distribution[0m:[36m337[0m - [1mFitted chi2 distribution with error=0.000227)[0m
[32m2024-04-22 04:14:04.268[0m | [1mINFO    [0m | [36mfitter.fitter[0m:[36m_fit_single_distribution[0m:[36m337[0m - [1mFitted powerlaw distribution with error=0.000382)[0m
[32m2024-04-22 04:14:04.327[0m | [1mINFO    [0m | [36mfitter.fitter[0m:[36m_fit_single_distribution[0m:[36m337[0m - [1mFitted exponpow distribution with error=0.000301)[0m
[32m2024-04-22 04:14:04.368[0m | [1mINFO    [0m | [36mfitter.fitter[0m:[36m_fit_single_distribution[0m:[36m337[0m - [1mFitted lognorm distribution with error=0.000388)[0m
[32m2024-04-22 04:14:04.423[0m | [1mINFO    [0m | [36mfitter.fitter[0m:[36m_fit_si

NumOfCanceledOrders
Best Distribution: expon

###### AvgPrice ######


[32m2024-04-22 04:14:05.085[0m | [1mINFO    [0m | [36mfitter.fitter[0m:[36m_fit_single_distribution[0m:[36m337[0m - [1mFitted gamma distribution with error=0.002665)[0m
[32m2024-04-22 04:14:05.128[0m | [1mINFO    [0m | [36mfitter.fitter[0m:[36m_fit_single_distribution[0m:[36m337[0m - [1mFitted chi2 distribution with error=0.001051)[0m
[32m2024-04-22 04:14:05.138[0m | [1mINFO    [0m | [36mfitter.fitter[0m:[36m_fit_single_distribution[0m:[36m337[0m - [1mFitted powerlaw distribution with error=0.002246)[0m
[32m2024-04-22 04:14:05.211[0m | [1mINFO    [0m | [36mfitter.fitter[0m:[36m_fit_single_distribution[0m:[36m337[0m - [1mFitted exponpow distribution with error=0.000999)[0m
[32m2024-04-22 04:14:05.238[0m | [1mINFO    [0m | [36mfitter.fitter[0m:[36m_fit_single_distribution[0m:[36m337[0m - [1mFitted lognorm distribution with error=0.002633)[0m
[32m2024-04-22 04:14:05.270[0m | [1mINFO    [0m | [36mfitter.fitter[0m:[36m_fit_s

AvgPrice
Best Distribution: cauchy

###### TotalExecutedQuantity ######


[32m2024-04-22 04:14:05.541[0m | [1mINFO    [0m | [36mfitter.fitter[0m:[36m_fit_single_distribution[0m:[36m337[0m - [1mFitted gamma distribution with error=0.0)[0m
[32m2024-04-22 04:14:05.546[0m | [1mINFO    [0m | [36mfitter.fitter[0m:[36m_fit_single_distribution[0m:[36m337[0m - [1mFitted cauchy distribution with error=0.0)[0m
[32m2024-04-22 04:14:05.935[0m | [1mINFO    [0m | [36mfitter.fitter[0m:[36m_fit_single_distribution[0m:[36m337[0m - [1mFitted chi2 distribution with error=0.0)[0m
[32m2024-04-22 04:14:05.964[0m | [1mINFO    [0m | [36mfitter.fitter[0m:[36m_fit_single_distribution[0m:[36m337[0m - [1mFitted powerlaw distribution with error=0.0)[0m
[32m2024-04-22 04:14:06.033[0m | [1mINFO    [0m | [36mfitter.fitter[0m:[36m_fit_single_distribution[0m:[36m337[0m - [1mFitted exponpow distribution with error=0.0)[0m
[32m2024-04-22 04:14:06.060[0m | [1mINFO    [0m | [36mfitter.fitter[0m:[36m_fit_single_distribution[0m:[3

TotalExecutedQuantity
Best Distribution: expon

###### TotalQuantity ######


[32m2024-04-22 04:14:06.363[0m | [1mINFO    [0m | [36mfitter.fitter[0m:[36m_fit_single_distribution[0m:[36m337[0m - [1mFitted gamma distribution with error=0.0)[0m
[32m2024-04-22 04:14:06.397[0m | [1mINFO    [0m | [36mfitter.fitter[0m:[36m_fit_single_distribution[0m:[36m337[0m - [1mFitted cauchy distribution with error=0.0)[0m
[32m2024-04-22 04:14:06.773[0m | [1mINFO    [0m | [36mfitter.fitter[0m:[36m_fit_single_distribution[0m:[36m337[0m - [1mFitted chi2 distribution with error=0.0)[0m
[32m2024-04-22 04:14:06.796[0m | [1mINFO    [0m | [36mfitter.fitter[0m:[36m_fit_single_distribution[0m:[36m337[0m - [1mFitted powerlaw distribution with error=0.0)[0m
[32m2024-04-22 04:14:06.865[0m | [1mINFO    [0m | [36mfitter.fitter[0m:[36m_fit_single_distribution[0m:[36m337[0m - [1mFitted exponpow distribution with error=0.0)[0m
[32m2024-04-22 04:14:06.900[0m | [1mINFO    [0m | [36mfitter.fitter[0m:[36m_fit_single_distribution[0m:[3

TotalQuantity
Best Distribution: expon

###### Age ######


[32m2024-04-22 04:14:07.293[0m | [1mINFO    [0m | [36mfitter.fitter[0m:[36m_fit_single_distribution[0m:[36m337[0m - [1mFitted gamma distribution with error=0.020381)[0m
[32m2024-04-22 04:14:07.311[0m | [1mINFO    [0m | [36mfitter.fitter[0m:[36m_fit_single_distribution[0m:[36m337[0m - [1mFitted chi2 distribution with error=0.020363)[0m
[32m2024-04-22 04:14:07.383[0m | [1mINFO    [0m | [36mfitter.fitter[0m:[36m_fit_single_distribution[0m:[36m337[0m - [1mFitted exponpow distribution with error=0.098524)[0m
[32m2024-04-22 04:14:07.425[0m | [1mINFO    [0m | [36mfitter.fitter[0m:[36m_fit_single_distribution[0m:[36m337[0m - [1mFitted expon distribution with error=0.410975)[0m
[32m2024-04-22 04:14:07.444[0m | [1mINFO    [0m | [36mfitter.fitter[0m:[36m_fit_single_distribution[0m:[36m337[0m - [1mFitted norm distribution with error=1.740438)[0m
[32m2024-04-22 04:14:07.467[0m | [1mINFO    [0m | [36mfitter.fitter[0m:[36m_fit_single_

Age
Best Distribution: powerlaw

###### NumOfAccounts ######


[32m2024-04-22 04:14:07.637[0m | [1mINFO    [0m | [36mfitter.fitter[0m:[36m_fit_single_distribution[0m:[36m337[0m - [1mFitted lognorm distribution with error=2.569952)[0m
[32m2024-04-22 04:14:07.811[0m | [1mINFO    [0m | [36mfitter.fitter[0m:[36m_fit_single_distribution[0m:[36m337[0m - [1mFitted cauchy distribution with error=2.693142)[0m
[32m2024-04-22 04:14:07.845[0m | [1mINFO    [0m | [36mfitter.fitter[0m:[36m_fit_single_distribution[0m:[36m337[0m - [1mFitted chi2 distribution with error=0.72936)[0m
[32m2024-04-22 04:14:07.884[0m | [1mINFO    [0m | [36mfitter.fitter[0m:[36m_fit_single_distribution[0m:[36m337[0m - [1mFitted gamma distribution with error=2.134466)[0m
[32m2024-04-22 04:14:07.903[0m | [1mINFO    [0m | [36mfitter.fitter[0m:[36m_fit_single_distribution[0m:[36m337[0m - [1mFitted exponpow distribution with error=0.92792)[0m
[32m2024-04-22 04:14:07.938[0m | [1mINFO    [0m | [36mfitter.fitter[0m:[36m_fit_singl

NumOfAccounts
Best Distribution: expon

###### NumOfClosedAccounts ######


[32m2024-04-22 04:14:08.468[0m | [1mINFO    [0m | [36mfitter.fitter[0m:[36m_fit_single_distribution[0m:[36m337[0m - [1mFitted cauchy distribution with error=1060.536483)[0m
[32m2024-04-22 04:14:08.600[0m | [1mINFO    [0m | [36mfitter.fitter[0m:[36m_fit_single_distribution[0m:[36m337[0m - [1mFitted gamma distribution with error=783.769224)[0m
[32m2024-04-22 04:14:08.639[0m | [1mINFO    [0m | [36mfitter.fitter[0m:[36m_fit_single_distribution[0m:[36m337[0m - [1mFitted chi2 distribution with error=796.319105)[0m
[32m2024-04-22 04:14:08.706[0m | [1mINFO    [0m | [36mfitter.fitter[0m:[36m_fit_single_distribution[0m:[36m337[0m - [1mFitted exponpow distribution with error=960.704916)[0m
[32m2024-04-22 04:14:08.737[0m | [1mINFO    [0m | [36mfitter.fitter[0m:[36m_fit_single_distribution[0m:[36m337[0m - [1mFitted powerlaw distribution with error=1054.329319)[0m
[32m2024-04-22 04:14:08.805[0m | [1mINFO    [0m | [36mfitter.fitter[0m:

NumOfClosedAccounts
Best Distribution: expon

###### NumOfCompletedOrders ######


[32m2024-04-22 04:14:09.605[0m | [1mINFO    [0m | [36mfitter.fitter[0m:[36m_fit_single_distribution[0m:[36m337[0m - [1mFitted gamma distribution with error=7.6e-05)[0m
[32m2024-04-22 04:14:09.708[0m | [1mINFO    [0m | [36mfitter.fitter[0m:[36m_fit_single_distribution[0m:[36m337[0m - [1mFitted chi2 distribution with error=4.4e-05)[0m
[32m2024-04-22 04:14:09.714[0m | [1mINFO    [0m | [36mfitter.fitter[0m:[36m_fit_single_distribution[0m:[36m337[0m - [1mFitted powerlaw distribution with error=7e-05)[0m
[32m2024-04-22 04:14:09.802[0m | [1mINFO    [0m | [36mfitter.fitter[0m:[36m_fit_single_distribution[0m:[36m337[0m - [1mFitted exponpow distribution with error=4.2e-05)[0m
[32m2024-04-22 04:14:09.849[0m | [1mINFO    [0m | [36mfitter.fitter[0m:[36m_fit_single_distribution[0m:[36m337[0m - [1mFitted lognorm distribution with error=7.5e-05)[0m
[32m2024-04-22 04:14:09.885[0m | [1mINFO    [0m | [36mfitter.fitter[0m:[36m_fit_single_d

NumOfCompletedOrders
Best Distribution: expon

###### NumOfCanceledOrders ######


[32m2024-04-22 04:14:10.478[0m | [1mINFO    [0m | [36mfitter.fitter[0m:[36m_fit_single_distribution[0m:[36m337[0m - [1mFitted gamma distribution with error=0.00039)[0m
[32m2024-04-22 04:14:10.522[0m | [1mINFO    [0m | [36mfitter.fitter[0m:[36m_fit_single_distribution[0m:[36m337[0m - [1mFitted chi2 distribution with error=0.000227)[0m
[32m2024-04-22 04:14:10.562[0m | [1mINFO    [0m | [36mfitter.fitter[0m:[36m_fit_single_distribution[0m:[36m337[0m - [1mFitted powerlaw distribution with error=0.000382)[0m
[32m2024-04-22 04:14:10.603[0m | [1mINFO    [0m | [36mfitter.fitter[0m:[36m_fit_single_distribution[0m:[36m337[0m - [1mFitted exponpow distribution with error=0.000301)[0m
[32m2024-04-22 04:14:10.635[0m | [1mINFO    [0m | [36mfitter.fitter[0m:[36m_fit_single_distribution[0m:[36m337[0m - [1mFitted lognorm distribution with error=0.000388)[0m
[32m2024-04-22 04:14:10.667[0m | [1mINFO    [0m | [36mfitter.fitter[0m:[36m_fit_si

NumOfCanceledOrders
Best Distribution: expon

###### AvgOrderRate_Start ######


[32m2024-04-22 04:14:11.095[0m | [1mINFO    [0m | [36mfitter.fitter[0m:[36m_fit_single_distribution[0m:[36m337[0m - [1mFitted cauchy distribution with error=14.647401)[0m
[32m2024-04-22 04:14:11.307[0m | [1mINFO    [0m | [36mfitter.fitter[0m:[36m_fit_single_distribution[0m:[36m337[0m - [1mFitted gamma distribution with error=13.226904)[0m
[32m2024-04-22 04:14:11.359[0m | [1mINFO    [0m | [36mfitter.fitter[0m:[36m_fit_single_distribution[0m:[36m337[0m - [1mFitted chi2 distribution with error=7.349342)[0m
[32m2024-04-22 04:14:11.367[0m | [1mINFO    [0m | [36mfitter.fitter[0m:[36m_fit_single_distribution[0m:[36m337[0m - [1mFitted powerlaw distribution with error=14.488773)[0m
[32m2024-04-22 04:14:11.397[0m | [1mINFO    [0m | [36mfitter.fitter[0m:[36m_fit_single_distribution[0m:[36m337[0m - [1mFitted exponpow distribution with error=8.422577)[0m
[32m2024-04-22 04:14:11.430[0m | [1mINFO    [0m | [36mfitter.fitter[0m:[36m_fit

AvgOrderRate_Start
Best Distribution: expon

###### AvgOrderRate_End ######


[32m2024-04-22 04:14:12.123[0m | [1mINFO    [0m | [36mfitter.fitter[0m:[36m_fit_single_distribution[0m:[36m337[0m - [1mFitted gamma distribution with error=17.87766)[0m
[32m2024-04-22 04:14:12.165[0m | [1mINFO    [0m | [36mfitter.fitter[0m:[36m_fit_single_distribution[0m:[36m337[0m - [1mFitted powerlaw distribution with error=20.575272)[0m
[32m2024-04-22 04:14:12.183[0m | [1mINFO    [0m | [36mfitter.fitter[0m:[36m_fit_single_distribution[0m:[36m337[0m - [1mFitted chi2 distribution with error=11.524445)[0m
[32m2024-04-22 04:14:12.230[0m | [1mINFO    [0m | [36mfitter.fitter[0m:[36m_fit_single_distribution[0m:[36m337[0m - [1mFitted exponpow distribution with error=11.959317)[0m
[32m2024-04-22 04:14:12.248[0m | [1mINFO    [0m | [36mfitter.fitter[0m:[36m_fit_single_distribution[0m:[36m337[0m - [1mFitted lognorm distribution with error=21.215858)[0m
[32m2024-04-22 04:14:12.279[0m | [1mINFO    [0m | [36mfitter.fitter[0m:[36m_f

AvgOrderRate_End
Best Distribution: expon

###### AvgQuantityOrderedRate_Start ######


[32m2024-04-22 04:14:12.735[0m | [1mINFO    [0m | [36mfitter.fitter[0m:[36m_fit_single_distribution[0m:[36m337[0m - [1mFitted cauchy distribution with error=0.0)[0m
[32m2024-04-22 04:14:12.903[0m | [1mINFO    [0m | [36mfitter.fitter[0m:[36m_fit_single_distribution[0m:[36m337[0m - [1mFitted gamma distribution with error=0.0)[0m
[32m2024-04-22 04:14:12.967[0m | [1mINFO    [0m | [36mfitter.fitter[0m:[36m_fit_single_distribution[0m:[36m337[0m - [1mFitted chi2 distribution with error=0.0)[0m
[32m2024-04-22 04:14:12.998[0m | [1mINFO    [0m | [36mfitter.fitter[0m:[36m_fit_single_distribution[0m:[36m337[0m - [1mFitted powerlaw distribution with error=0.0)[0m
[32m2024-04-22 04:14:13.017[0m | [1mINFO    [0m | [36mfitter.fitter[0m:[36m_fit_single_distribution[0m:[36m337[0m - [1mFitted exponpow distribution with error=0.0)[0m
[32m2024-04-22 04:14:13.054[0m | [1mINFO    [0m | [36mfitter.fitter[0m:[36m_fit_single_distribution[0m:[3

AvgQuantityOrderedRate_Start
Best Distribution: expon

###### AvgQuantityOrderedRate_End ######


[32m2024-04-22 04:14:13.301[0m | [1mINFO    [0m | [36mfitter.fitter[0m:[36m_fit_single_distribution[0m:[36m337[0m - [1mFitted cauchy distribution with error=0.0)[0m
[32m2024-04-22 04:14:13.751[0m | [1mINFO    [0m | [36mfitter.fitter[0m:[36m_fit_single_distribution[0m:[36m337[0m - [1mFitted gamma distribution with error=0.0)[0m
[32m2024-04-22 04:14:13.837[0m | [1mINFO    [0m | [36mfitter.fitter[0m:[36m_fit_single_distribution[0m:[36m337[0m - [1mFitted chi2 distribution with error=0.0)[0m
[32m2024-04-22 04:14:13.842[0m | [1mINFO    [0m | [36mfitter.fitter[0m:[36m_fit_single_distribution[0m:[36m337[0m - [1mFitted powerlaw distribution with error=0.0)[0m
[32m2024-04-22 04:14:13.892[0m | [1mINFO    [0m | [36mfitter.fitter[0m:[36m_fit_single_distribution[0m:[36m337[0m - [1mFitted exponpow distribution with error=0.0)[0m
[32m2024-04-22 04:14:13.916[0m | [1mINFO    [0m | [36mfitter.fitter[0m:[36m_fit_single_distribution[0m:[3

AvgQuantityOrderedRate_End
Best Distribution: expon

###### AvgQuantityOrderedRate_Difference ######


[32m2024-04-22 04:14:14.132[0m | [1mINFO    [0m | [36mfitter.fitter[0m:[36m_fit_single_distribution[0m:[36m337[0m - [1mFitted powerlaw distribution with error=0.0)[0m
[32m2024-04-22 04:14:14.388[0m | [1mINFO    [0m | [36mfitter.fitter[0m:[36m_fit_single_distribution[0m:[36m337[0m - [1mFitted exponpow distribution with error=0.0)[0m
[32m2024-04-22 04:14:14.488[0m | [1mINFO    [0m | [36mfitter.fitter[0m:[36m_fit_single_distribution[0m:[36m337[0m - [1mFitted gamma distribution with error=0.0)[0m
[32m2024-04-22 04:14:14.516[0m | [1mINFO    [0m | [36mfitter.fitter[0m:[36m_fit_single_distribution[0m:[36m337[0m - [1mFitted chi2 distribution with error=0.0)[0m
[32m2024-04-22 04:14:14.523[0m | [1mINFO    [0m | [36mfitter.fitter[0m:[36m_fit_single_distribution[0m:[36m337[0m - [1mFitted lognorm distribution with error=0.0)[0m
[32m2024-04-22 04:14:14.572[0m | [1mINFO    [0m | [36mfitter.fitter[0m:[36m_fit_single_distribution[0m:[

AvgQuantityOrderedRate_Difference
Best Distribution: norm

###### AvgQuantityPerAccount_Start ######


[32m2024-04-22 04:14:14.918[0m | [1mINFO    [0m | [36mfitter.fitter[0m:[36m_fit_single_distribution[0m:[36m337[0m - [1mFitted gamma distribution with error=0.0)[0m
[32m2024-04-22 04:14:15.014[0m | [1mINFO    [0m | [36mfitter.fitter[0m:[36m_fit_single_distribution[0m:[36m337[0m - [1mFitted cauchy distribution with error=0.0)[0m
[32m2024-04-22 04:14:15.237[0m | [1mINFO    [0m | [36mfitter.fitter[0m:[36m_fit_single_distribution[0m:[36m337[0m - [1mFitted chi2 distribution with error=0.0)[0m
[32m2024-04-22 04:14:15.291[0m | [1mINFO    [0m | [36mfitter.fitter[0m:[36m_fit_single_distribution[0m:[36m337[0m - [1mFitted powerlaw distribution with error=0.0)[0m
[32m2024-04-22 04:14:15.339[0m | [1mINFO    [0m | [36mfitter.fitter[0m:[36m_fit_single_distribution[0m:[36m337[0m - [1mFitted exponpow distribution with error=0.0)[0m
[32m2024-04-22 04:14:15.385[0m | [1mINFO    [0m | [36mfitter.fitter[0m:[36m_fit_single_distribution[0m:[3

AvgQuantityPerAccount_Start
Best Distribution: rayleigh

###### AvgQuantityPerAccount_End ######


[32m2024-04-22 04:14:15.648[0m | [1mINFO    [0m | [36mfitter.fitter[0m:[36m_fit_single_distribution[0m:[36m337[0m - [1mFitted cauchy distribution with error=0.0)[0m
[32m2024-04-22 04:14:15.752[0m | [1mINFO    [0m | [36mfitter.fitter[0m:[36m_fit_single_distribution[0m:[36m337[0m - [1mFitted gamma distribution with error=0.0)[0m
[32m2024-04-22 04:14:16.139[0m | [1mINFO    [0m | [36mfitter.fitter[0m:[36m_fit_single_distribution[0m:[36m337[0m - [1mFitted powerlaw distribution with error=0.0)[0m
[32m2024-04-22 04:14:16.151[0m | [1mINFO    [0m | [36mfitter.fitter[0m:[36m_fit_single_distribution[0m:[36m337[0m - [1mFitted chi2 distribution with error=0.0)[0m
[32m2024-04-22 04:14:16.235[0m | [1mINFO    [0m | [36mfitter.fitter[0m:[36m_fit_single_distribution[0m:[36m337[0m - [1mFitted exponpow distribution with error=0.0)[0m
[32m2024-04-22 04:14:16.272[0m | [1mINFO    [0m | [36mfitter.fitter[0m:[36m_fit_single_distribution[0m:[3

AvgQuantityPerAccount_End
Best Distribution: expon

###### AvgQuantityPerAccount_Difference ######


[32m2024-04-22 04:14:16.510[0m | [1mINFO    [0m | [36mfitter.fitter[0m:[36m_fit_single_distribution[0m:[36m337[0m - [1mFitted cauchy distribution with error=0.0)[0m
[32m2024-04-22 04:14:16.545[0m | [1mINFO    [0m | [36mfitter.fitter[0m:[36m_fit_single_distribution[0m:[36m337[0m - [1mFitted gamma distribution with error=0.0)[0m
[32m2024-04-22 04:14:16.553[0m | [1mINFO    [0m | [36mfitter.fitter[0m:[36m_fit_single_distribution[0m:[36m337[0m - [1mFitted chi2 distribution with error=0.0)[0m
[32m2024-04-22 04:14:16.675[0m | [1mINFO    [0m | [36mfitter.fitter[0m:[36m_fit_single_distribution[0m:[36m337[0m - [1mFitted exponpow distribution with error=0.0)[0m
[32m2024-04-22 04:14:16.709[0m | [1mINFO    [0m | [36mfitter.fitter[0m:[36m_fit_single_distribution[0m:[36m337[0m - [1mFitted lognorm distribution with error=0.0)[0m


AvgQuantityPerAccount_Difference
Best Distribution: norm

{'NumOfCompletedOrders': 'expon', 'NumOfCanceledOrders': 'expon', 'AvgPrice': 'cauchy', 'TotalExecutedQuantity': 'expon', 'TotalQuantity': 'expon', 'Age': 'powerlaw', 'NumOfAccounts': 'expon', 'NumOfClosedAccounts': 'expon', 'AvgOrderRate_Start': 'expon', 'AvgOrderRate_End': 'expon', 'AvgQuantityOrderedRate_Start': 'expon', 'AvgQuantityOrderedRate_End': 'expon', 'AvgQuantityOrderedRate_Difference': 'norm', 'AvgQuantityPerAccount_Start': 'rayleigh', 'AvgQuantityPerAccount_End': 'expon', 'AvgQuantityPerAccount_Difference': 'norm'}


## Saving


Make the Churned column to be the last column


In [None]:
cols = list(df.columns)
cols.append(cols.pop(cols.index("Churned")))
df = df[cols]

Display the head of the dataframe


In [None]:
df.head()

Unnamed: 0,ClientID,Age,NumOfAccounts,NumOfClosedAccounts,NumOfCompletedOrders,NumOfCanceledOrders,TotalExecutedQuantity,TotalQuantity,AvgPrice,OrderType_Buy,...,LastOpenAccountDate_Year_2024,FirstOrderDateAcrossAccounts_Year_0,FirstOrderDateAcrossAccounts_Year_2022,FirstOrderDateAcrossAccounts_Year_2023,FirstOrderDateAcrossAccounts_Year_2024,LastOrderDateAcrossAccounts_Year_0,LastOrderDateAcrossAccounts_Year_2022,LastOrderDateAcrossAccounts_Year_2023,LastOrderDateAcrossAccounts_Year_2024,Churned
0,0,3.526361,0.693147,0.0,1.098612,0.0,9.392745,9.392745,1.21,1.485619,...,0,0,1,0,0,0,1,0,0,0
1,1,4.110874,0.693147,0.0,1.94591,0.693147,8.377471,8.418477,1.982429,0.0,...,0,0,1,0,0,0,1,0,0,1
2,2,3.970292,0.693147,0.0,2.890372,1.098612,11.727198,11.830623,7.652667,13.370571,...,0,0,0,0,1,0,0,0,1,0
3,3,4.276666,0.693147,0.0,4.174387,2.484907,13.433575,13.722385,72.059886,30.45519,...,0,0,0,0,1,0,0,0,1,0
4,4,3.871201,1.098612,0.0,2.639057,2.484907,12.907934,13.38283,1.572133,1.857024,...,0,0,0,0,1,0,0,0,1,0


Display the info of the dataframe


In [None]:
df.info(verbose=True)

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 9123 entries, 0 to 9122
Data columns (total 447 columns):
 #    Column                                  Dtype  
---   ------                                  -----  
 0    ClientID                                int64  
 1    Age                                     float64
 2    NumOfAccounts                           float64
 3    NumOfClosedAccounts                     float64
 4    NumOfCompletedOrders                    float64
 5    NumOfCanceledOrders                     float64
 6    TotalExecutedQuantity                   float64
 7    TotalQuantity                           float64
 8    AvgPrice                                float64
 9    OrderType_Buy                           float64
 10   OrderType_Sell                          float64
 11   SecurityID_0                            float64
 12   SecurityID_1                            float64
 13   SecurityID_2                            float64
 14   SecurityID_3          

Save the dataframe into a csv "cleaned_dataset.csv"


In [None]:
df.to_csv("../Data/cleaned_dataset.csv", index=False)