# Data Cleaning and Preprocessing


Imports


In [1]:
import pandas as pd
import numpy as np
from fitter import Fitter, get_common_distributions
from sklearn.preprocessing import StandardScaler
from sklearn.preprocessing import MinMaxScaler
import warnings
from datetime import datetime
import seaborn as sns

# Silence every warning for the rest of the notebook.
# NOTE(review): this also hides pandas deprecation warnings — consider narrowing the filter.
warnings.filterwarnings("ignore")

# Opt in to pandas' future "no silent downcasting" behaviour
pd.set_option("future.no_silent_downcasting", True)

Read the orders dataset into a pandas dataframe


In [2]:
# Load the raw order records; the path is relative to the notebook's working directory
orders_df = pd.read_csv("orders_data_competition.csv")

Read the clients dataset into a pandas dataframe


In [3]:
# Load the raw client/account records; the path is relative to the notebook's working directory
clients_df = pd.read_csv("clients_data_competition.csv")

## Orders Data


**_We created another dataframe "agg_orders" which represents the aggregate orders of each account_**


Drop the rows with null values


In [4]:
# Drop every order row that has at least one missing value (mutates orders_df in place)
orders_df.dropna(inplace=True)

Initialize a new Dataframe to store the aggregate orders per account


In [5]:
# Start from an empty frame; the per-account columns are added to it in the following cells
agg_orders = pd.DataFrame()

Add an AccountID column to the aggregate orders df


In [6]:
# One row per distinct account id found in the clients data
# (at this point clients_df has not yet had its null rows dropped)
agg_orders["AccountID"] = clients_df["Account ID"].unique()

Compute and add the number of orders for each account


In [7]:
# Number of orders placed by each account, indexed by Account ID
orders_per_account = orders_df.groupby("Account ID").size()

# Look the count up by AccountID value instead of relying on the row index
# happening to equal the account id. Accounts that never placed an order
# have no entry in the lookup and get 0.
agg_orders["NumOfOrders"] = (
    agg_orders["AccountID"].map(orders_per_account).fillna(0).astype(int)
)

Compute and add the last order for each account:

-   Split them into 2 columns for Month and Year of last order
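-   Note: these columns are not one-hot encoded below; they are kept in the visualization dataset and dropped from the modelling dataframe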


In [8]:
# Parse the "Order Time" column into datetimes
orders_df["Order Time"] = pd.to_datetime(orders_df["Order Time"])

# Most recent order timestamp of each account, indexed by Account ID
last_order_by_account = orders_df.groupby("Account ID")["Order Time"].max()

# Look the timestamp up by AccountID value instead of relying on the row index
# happening to equal the account id; accounts without orders get NaT.
# pd.to_datetime keeps the column datetime-typed even when nothing matched.
agg_orders["LastOrder"] = pd.to_datetime(
    agg_orders["AccountID"].map(last_order_by_account)
)

# Month and year of the last order as integers. Accounts that never placed an
# order get 0 as a placeholder (for visualization purposes).
agg_orders["LastOrder_Month"] = (
    agg_orders["LastOrder"].dt.month.fillna(0).astype(int)
)
agg_orders["LastOrder_Year"] = agg_orders["LastOrder"].dt.year.fillna(0).astype(int)

Compute and add the number of completed orders for each account


In [9]:
# Number of completed orders of each account, indexed by Account ID
completed_per_account = orders_df.groupby("Account ID")["Is Completed"].sum()

# Look the count up by AccountID value instead of relying on the row index
# happening to equal the account id. Accounts without orders get 0.
agg_orders["NumOfCompleted"] = (
    agg_orders["AccountID"].map(completed_per_account).fillna(0).astype(int)
)

Compute and add the number of canceled orders for each account


In [10]:
# Number of canceled orders of each account, indexed by Account ID
canceled_per_account = orders_df.groupby("Account ID")["Is Canceled"].sum()

# Look the count up by AccountID value instead of relying on the row index
# happening to equal the account id. Accounts without orders get 0.
agg_orders["NumOfCanceled"] = (
    agg_orders["AccountID"].map(canceled_per_account).fillna(0).astype(int)
)

Compute and add the average price of orders for each account


In [11]:
# Average order price of each account (sum of prices / number of orders),
# indexed by Account ID. The groupby is built once and reused.
orders_by_account = orders_df.groupby("Account ID")
avg_price_per_account = orders_by_account["Price"].sum() / orders_by_account.size()

# Look the average up by AccountID value instead of relying on the row index
# happening to equal the account id. Accounts without orders get 0.0.
agg_orders["AvgPrice"] = (
    agg_orders["AccountID"].map(avg_price_per_account).fillna(0.0)
)

Compute and add the total executed quantity of orders for each account


In [12]:
# Total executed quantity of each account, indexed by Account ID
executed_qty_per_account = orders_df.groupby("Account ID")["Executed Quantity"].sum()

# Look the total up by AccountID value instead of relying on the row index
# happening to equal the account id. Accounts without orders get 0.
agg_orders["TotalExecutedQuantity"] = (
    agg_orders["AccountID"].map(executed_qty_per_account).fillna(0).astype(int)
)

Compute and add the total quantity of orders for each account


In [13]:
# Total ordered quantity of each account, indexed by Account ID
quantity_per_account = orders_df.groupby("Account ID")["Quantity"].sum()

# Look the total up by AccountID value instead of relying on the row index
# happening to equal the account id. Accounts without orders get 0.
agg_orders["TotalQuantity"] = (
    agg_orders["AccountID"].map(quantity_per_account).fillna(0).astype(int)
)

In [14]:
# Snapshot of the aggregates before the TF-IDF columns are appended;
# merged with the clients data later to build df_no_TFIDF
agg_orders_copy = agg_orders.copy()

Define a function to compute and return an IDF vector of a data


In [15]:
def get_IDF_vector(df):
    """Compute an inverse-document-frequency weight for every column of ``df``.

    Each row is treated as a document and each column as a term; a term
    occurs in a document when its value is strictly greater than 0.

    Parameters
    ----------
    df : pandas.DataFrame
        Term-count matrix (rows = documents, columns = terms).

    Returns
    -------
    dict
        Maps each column label to ``log2(n_rows / n_rows_containing_term)``.
        A term that occurs in no row gets the placeholder weight ``1e-6``
        instead, which avoids a division by zero.
    """
    n_docs = df.shape[0]
    idf = {}
    for term in df.columns:
        # Count the documents with a boolean sum rather than building a
        # filtered copy of the whole frame (twice) for every column.
        doc_freq = int((df[term] > 0).sum())
        idf[term] = np.log2(n_docs / doc_freq) if doc_freq != 0 else 1e-6
    return idf

Define a function to compute and return the TFIDF dataframe for the data


In [16]:
def calculate_TFIDF(df, idf_dict):
    """Scale every column of a term-count frame by its IDF weight.

    Parameters
    ----------
    df : pandas.DataFrame
        Term counts, one column per term.
    idf_dict : dict
        IDF weight for each column label of ``df``.

    Returns
    -------
    pandas.DataFrame
        A new frame with the same index and columns as ``df`` in which each
        column has been multiplied by its weight. ``df`` is left unchanged.
    """
    weighted = df.copy()
    for term, counts in df.items():
        weighted[term] = counts * idf_dict[term]
    return weighted

Define a function to convert a categorical feature to its TFIDF dataframe and concatenate it with the aggregate orders dataframe


In [17]:
def TFIDF(column_name: str):
    """Append TF-IDF features for one categorical order column to ``agg_orders``.

    Every account is treated as a document and every distinct value of
    ``orders_df[column_name]`` as a term. The term frequency is the number of
    orders of the account that carry that value; the weights come from
    ``get_IDF_vector`` and are applied by ``calculate_TFIDF``.

    Reads the notebook-level ``orders_df`` and ``agg_orders`` frames; neither
    of them is modified.

    Parameters
    ----------
    column_name : str
        Name of the categorical column of ``orders_df`` to encode.

    Returns
    -------
    pandas.DataFrame
        ``agg_orders`` with one ``<column_name>_<value>`` column (spaces
        removed from the name) appended per distinct value.
    """
    # Term counts in a single pass: one row per account that placed orders,
    # one column per distinct value.
    counts = (
        orders_df.groupby(["Account ID", column_name]).size().unstack(fill_value=0)
    )

    # Keep the value columns in order of first appearance in orders_df
    counts = counts.reindex(
        columns=orders_df[column_name].dropna().unique(), fill_value=0
    )

    # Align on the AccountID value (not on the row position); accounts that
    # placed no orders get all-zero counts. Then adopt agg_orders' index so
    # the final concat lines the rows up.
    counts = counts.reindex(agg_orders["AccountID"].to_numpy(), fill_value=0)
    counts.index = agg_orders.index

    counts.columns = [
        f"{column_name}_{value}".replace(" ", "") for value in counts.columns
    ]
    # If two values collapse to the same name once spaces are removed, keep the last one
    counts = counts.loc[:, ~counts.columns.duplicated(keep="last")].astype(int)

    idf_dict = get_IDF_vector(counts)
    tfidf_df = calculate_TFIDF(counts, idf_dict)
    return pd.concat([agg_orders, tfidf_df], axis=1)

Compute and add the TFIDF of the Order Type column


In [18]:
# Append one TF-IDF column per distinct "Order Type" value.
# Re-running this cell appends the same columns again, because TFIDF concatenates onto the current agg_orders.
agg_orders = TFIDF("Order Type")

Compute and add the TFIDF of the Security ID column


In [19]:
# Append one TF-IDF column per distinct "Security ID" value (not idempotent: a re-run appends duplicates)
agg_orders = TFIDF("Security ID")

Compute and add the TFIDF of the Order Via column


In [20]:
# Append one TF-IDF column per distinct "Order Via" value (not idempotent: a re-run appends duplicates)
agg_orders = TFIDF("Order Via")

Compute and add the TFIDF of the Execution Status column


In [21]:
# Append one TF-IDF column per distinct "Execution Status" value (not idempotent: a re-run appends duplicates)
agg_orders = TFIDF("Execution Status")

Compute and add the TFIDF of the Sector Name column


In [22]:
# Append one TF-IDF column per distinct "Sector Name" value (not idempotent: a re-run appends duplicates)
agg_orders = TFIDF("Sector Name")

Display the head of the agg_orders dataframe


In [23]:
# Preview the first rows of the aggregates, including the appended TF-IDF columns
agg_orders.head()

Unnamed: 0,AccountID,NumOfOrders,LastOrder,LastOrder_Month,LastOrder_Year,NumOfCompleted,NumOfCanceled,AvgPrice,TotalExecutedQuantity,TotalQuantity,...,SectorName_TelecommunicationServices,SectorName_REALESTATE,SectorName_Telecommunications,SectorName_FOOD,SectorName_Others,SectorName_Tourism,SectorName_ConsumerServices,SectorName_Utilities,SectorName_Trade,SectorName_PharmaceuticalIndustries
0,0,2,2022-10-19 12:24:04,10,2022,2,0,1.21,12000,12000,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1,1,7,2022-09-14 12:56:56,9,2022,6,1,1.982429,4347,4529,...,0.0,0.0,0.0,2.797573,0.0,0.0,0.0,0.0,0.0,0.0
2,2,21,2024-02-28 11:06:33,2,2024,17,2,7.652667,123895,137395,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3,3,88,2024-03-10 12:32:34,3,2024,64,11,72.059886,682538,911078,...,6.297068,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
4,4,30,2024-02-26 12:47:33,2,2024,8,10,2.846267,356686,594953,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


Display the info of the agg_orders dataframe


In [24]:
# Summarize row count, column count and dtypes of the aggregates
agg_orders.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 13523 entries, 0 to 13522
Columns: 377 entries, AccountID to SectorName_PharmaceuticalIndustries
dtypes: datetime64[ns](1), float64(368), int64(8)
memory usage: 38.9 MB


## Clients Data


Drop the rows with null values


In [25]:
# Drop client rows containing any missing value (in place; the index is not reset)
clients_df.dropna(inplace=True)

Split the OpenDate column into the month and year in which the account was opened. These columns are one-hot encoded later, in the OHE section.


In [26]:
# Parse OpenDate (month/day/year strings) into datetimes
clients_df["OpenDate"] = pd.to_datetime(clients_df["OpenDate"], format="%m/%d/%Y")

# Derive the month and year the account was opened, stored as integers
clients_df["OpenDate_Month"] = clients_df["OpenDate"].dt.month.astype(int)
clients_df["OpenDate_Year"] = clients_df["OpenDate"].dt.year.astype(int)

Split the BirthDate column into the month and year of the client's birth. These columns are one-hot encoded later, in the OHE section.


In [27]:
# Parse BirthDate (year-month-day strings) into datetimes
clients_df["BirthDate"] = pd.to_datetime(clients_df["BirthDate"], format="%Y-%m-%d")

# Derive the client's birth month and birth year, stored as integers
clients_df["BirthDate_Month"] = clients_df["BirthDate"].dt.month.astype(int)
clients_df["BirthDate_Year"] = clients_df["BirthDate"].dt.year.astype(int)

Define a function so that we can compute if the account is dormant or not


In [28]:
def is_dormant(date, cutoff=datetime(2023, 4, 1)):
    """Flag an account as dormant when its last order predates ``cutoff``.

    Parameters
    ----------
    date : datetime-like
        Timestamp of the account's last order.
    cutoff : datetime, optional
        Orders strictly before this moment count as dormant. Defaults to
        1 April 2023.

    Returns
    -------
    int
        1 if ``date < cutoff``, otherwise 0. A missing timestamp (``NaT``)
        compares as False and therefore yields 0.
        NOTE(review): this means accounts that never placed an order are
        treated as not dormant — confirm that this is intended.
    """
    return 1 if date < cutoff else 0

Recompute the Is Dormant column


In [29]:
# Look up each client's last order by Account ID, not by row label:
# clients_df and agg_orders are different frames whose indexes only coincide
# when no client rows were dropped and account ids are unique. Accounts with
# no orders have LastOrder = NaT, which is_dormant maps to 0.
last_order_lookup = agg_orders.set_index("AccountID")["LastOrder"]
clients_df["Is Dormant"] = (
    clients_df["Account ID"].map(last_order_lookup).apply(is_dormant)
)

Remove all whitespaces in the column names


In [30]:
# Strip spaces from every column name (e.g. "Account ID" -> "AccountID"), in place
clients_df.rename(columns=lambda x: x.replace(" ", ""), inplace=True)

Display the head of the clients dataframe


In [31]:
# Preview the first rows of the cleaned clients data
clients_df.head()

Unnamed: 0,ClientID,AccountID,Gender,RiskRate,CompanyName,IsClosed,IsDormant,IsProfileSuspended,IsClientSuspended,ClientTypeName,OpenDate,BirthDate,OpenDate_Month,OpenDate_Year,BirthDate_Month,BirthDate_Year
0,0,0,Male,Low,HSB,0,1,0,0,Individuals,2014-02-16,1990-07-27,2,2014,7,1990
1,1,1,Female,Low,HSB,0,1,1,0,Individuals,2014-02-16,1963-10-22,2,2014,10,1963
2,2,2,Male,Low,HSB,0,0,0,0,Individuals,2014-02-17,1971-05-14,2,2014,5,1971
3,3,3,Male,Low,HSB,0,0,0,0,Individuals,2014-02-17,1953-01-14,2,2014,1,1953
4,4,4,Male,Low,HSB,0,0,0,0,Individuals,2014-02-17,1976-06-09,2,2014,6,1976


Display the info of the clients dataframe


In [32]:
# Summarize columns, non-null counts and dtypes of the cleaned clients data
clients_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 13523 entries, 0 to 13522
Data columns (total 16 columns):
 #   Column              Non-Null Count  Dtype         
---  ------              --------------  -----         
 0   ClientID            13523 non-null  int64         
 1   AccountID           13523 non-null  int64         
 2   Gender              13523 non-null  object        
 3   RiskRate            13523 non-null  object        
 4   CompanyName         13523 non-null  object        
 5   IsClosed            13523 non-null  int64         
 6   IsDormant           13523 non-null  int64         
 7   IsProfileSuspended  13523 non-null  int64         
 8   IsClientSuspended   13523 non-null  int64         
 9   ClientTypeName      13523 non-null  object        
 10  OpenDate            13523 non-null  datetime64[ns]
 11  BirthDate           13523 non-null  datetime64[ns]
 12  OpenDate_Month      13523 non-null  int64         
 13  OpenDate_Year       13523 non-null  int64     

## Merging


**_We will inner join the agg_orders and clients_df dataframes on the AccountID column_**


Perform the merging between both dataframes


In [33]:
# Inner join: keep only accounts present in both frames, matched on AccountID
df = pd.merge(clients_df, agg_orders, on="AccountID", how="inner")

Display the head of the dataframe


In [34]:
# Preview the first rows of the merged frame
df.head()

Unnamed: 0,ClientID,AccountID,Gender,RiskRate,CompanyName,IsClosed,IsDormant,IsProfileSuspended,IsClientSuspended,ClientTypeName,...,SectorName_TelecommunicationServices,SectorName_REALESTATE,SectorName_Telecommunications,SectorName_FOOD,SectorName_Others,SectorName_Tourism,SectorName_ConsumerServices,SectorName_Utilities,SectorName_Trade,SectorName_PharmaceuticalIndustries
0,0,0,Male,Low,HSB,0,1,0,0,Individuals,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1,1,1,Female,Low,HSB,0,1,1,0,Individuals,...,0.0,0.0,0.0,2.797573,0.0,0.0,0.0,0.0,0.0,0.0
2,2,2,Male,Low,HSB,0,0,0,0,Individuals,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3,3,3,Male,Low,HSB,0,0,0,0,Individuals,...,6.297068,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
4,4,4,Male,Low,HSB,0,0,0,0,Individuals,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


Display the info of the dataframe


In [35]:
# Summarize size and dtypes of the merged frame
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 13523 entries, 0 to 13522
Columns: 392 entries, ClientID to SectorName_PharmaceuticalIndustries
dtypes: datetime64[ns](3), float64(368), int64(17), object(4)
memory usage: 40.4+ MB


In [36]:
# Same inner join, but against the aggregate snapshot taken before the TF-IDF columns were added
df_no_TFIDF = pd.merge(clients_df, agg_orders_copy, on="AccountID", how="inner")

## Defining The Label


**_We defined our label to be a combination of multiple features_**

-   TotalQuantity
-   TotalExecutedQuantity
-   IsDormant
-   RiskRate_High
-   IsClosed
-   IsClientSuspended
-   IsProfileSuspended


Create the label


In [37]:
# Share of the ordered quantity that was actually executed. Accounts with no
# ordered quantity get a ratio of 1, so they never trigger the
# "low executed quantity ratio" rule below. Computed column-wise rather than
# with a row-by-row apply over the whole frame.
has_quantity = df["TotalQuantity"] != 0
df["ExecutedQuantityRatio"] = (
    df["TotalExecutedQuantity"] / df["TotalQuantity"]
).where(has_quantity, 1.0)

df["Churned"] = (
    # Churned if no order for a long time and account is dormant
    (df["IsDormant"] == 1)
    # Churned if high risk rate and low executed quantity ratio
    # (parenthesized explicitly; `&` already binds tighter than `|`)
    | ((df["RiskRate"] == "High") & (df["ExecutedQuantityRatio"] < 0.5))
    # Churned if account is closed
    | (df["IsClosed"] == 1)
    # Churned if client is suspended
    | (df["IsClientSuspended"] == 1)
    # Churned if the profile is suspended
    | (df["IsProfileSuspended"] == 1)
)

# Turn any boolean values in the frame into 1/0
df = df.replace({True: 1, False: 0})

# Store the label as an integer column
df["Churned"] = df["Churned"].astype(int)

In [38]:
# Copy the label onto the visualization frame; the assignment aligns the two frames on their row index
df_no_TFIDF["Churned"] = df["Churned"]

## OHE


The columns to be one-hot encoded


In [39]:
# Categorical columns to one-hot encode.
# NOTE(review): the name `columns` is reassigned in the Normalization section,
# so re-running the OHE cell after that point would use the wrong list.
columns = [
    "BirthDate_Month",
    "BirthDate_Year",
    "Gender",
    "RiskRate",
    "ClientTypeName",
    "OpenDate_Month",
    "OpenDate_Year",
]

OHE these columns


In [40]:
# Replace each listed column with integer 0/1 indicator columns, one per distinct value
df = pd.get_dummies(
    df,
    columns=columns,
    dtype=int,
)

Drop the columns used to create the label (to avoid label leakage), and also drop the columns that are not needed for modelling


In [41]:
# Remove the label inputs, the raw date columns and CompanyName from the modelling frame.
# NOTE(review): TotalQuantity and TotalExecutedQuantity, which ExecutedQuantityRatio
# was computed from, are not dropped here — confirm that this is intended.
# NOTE(review): "RiskRate_High" only exists if at least one row had RiskRate == "High";
# otherwise this drop raises a KeyError.
df.drop(
    columns=[
        "ExecutedQuantityRatio",
        "IsDormant",
        "RiskRate_High",
        "IsClosed",
        "IsClientSuspended",
        "IsProfileSuspended",
        "OpenDate",
        "BirthDate",
        "LastOrder",
        "LastOrder_Year",
        "LastOrder_Month",
        "CompanyName",  # As we are only interested in HSB
    ],
    inplace=True,
)

# The visualization frame only loses the raw datetime columns; their month/year parts stay
df_no_TFIDF.drop(
    columns=[
        "OpenDate",
        "BirthDate",
        "LastOrder",
    ],
    inplace=True,
)

Remove all whitespaces in the column names


In [42]:
# Strip spaces from every column name of the modelling frame, in place
df.rename(columns=lambda x: x.replace(" ", ""), inplace=True)

## Normalization


The columns to be normalized


In [43]:
# Numeric aggregate columns to normalize.
# This rebinds `columns`, which previously held the one-hot-encoding column list.
columns = [
    "NumOfCompleted",
    "NumOfCanceled",
    "AvgPrice",
    "TotalExecutedQuantity",
    "NumOfOrders",
    "TotalQuantity",
]

Define a function to normalize the data based on a distribution


In [44]:
def get_normalized_data(data, dist):
    """Normalize one feature according to the distribution that fits it best.

    Parameters
    ----------
    data : numpy.ndarray
        Feature values; the caller in this notebook passes a single column
        reshaped to ``(n, 1)``.
    dist : str
        Name of the best-fitting distribution.

    Returns
    -------
    numpy.ndarray
        * ``"uniform"`` -> ``MinMaxScaler`` output (same shape as ``data``)
        * ``"norm"`` -> ``StandardScaler`` output (same shape as ``data``)
        * ``"lognorm"`` / ``"expon"`` -> flattened ``log(|x| + 1)``

    Raises
    ------
    ValueError
        If ``dist`` is any other distribution. Falling through and returning
        ``None`` would make the caller overwrite the whole column with
        missing values without any error.
    """
    if dist == "uniform":
        return MinMaxScaler().fit_transform(data)
    if dist == "norm":
        return StandardScaler().fit_transform(data)
    if dist in ("lognorm", "expon"):
        # log1p(x) is log(1 + x), computed accurately for small x
        return np.log1p(np.abs(data.flatten()))
    raise ValueError(
        f"No normalization defined for distribution {dist!r}; "
        "supported: 'uniform', 'norm', 'lognorm', 'expon'"
    )

Define a function to get the best distribution that fits the data


In [45]:
def get_best_distribution(columns, df):
    """Find the best-fitting common distribution for each requested column.

    For every column a ``Fitter`` is run over the distributions returned by
    ``get_common_distributions()`` and the winner is chosen by
    ``get_best(method="sumsquare_error")``. Progress is printed per column.

    Parameters
    ----------
    columns : list of str
        Names of the columns of ``df`` to fit.
    df : pandas.DataFrame
        Frame holding those columns.

    Returns
    -------
    dict
        Maps each column name to the name of its best-fitting distribution
        (an empty string if the fitter reported none).
    """
    best_by_column = dict.fromkeys(columns, "")

    for column in columns:
        print("###### " + column + " ######")

        fitter = Fitter(
            df[column].values,
            distributions=get_common_distributions(),
        )
        fitter.fit()
        fitter.summary(plot=False)

        # get_best returns a mapping keyed by distribution name; keep its last key
        best_fit = fitter.get_best(method="sumsquare_error")
        winner = list(best_fit.keys())[-1] if len(best_fit.keys()) > 0 else ""

        best_by_column[column] = str(winner)
        print(column)
        print(f"Best Distribution: {winner}")
        print()

    return best_by_column

Define a function to normalize the data based on its best distribution


In [46]:
def normalize(columns, df):
    """Normalize the given columns of ``df`` in place.

    The best-fitting distribution of each column is found with
    ``get_best_distribution`` and the column is then overwritten with the
    output of ``get_normalized_data`` for that distribution.

    Parameters
    ----------
    columns : list of str
        Names of the columns to normalize.
    df : pandas.DataFrame
        Frame to modify.

    Returns
    -------
    dict
        The column -> distribution-name mapping that was used.
    """
    best_fits = get_best_distribution(columns, df)

    for name, dist_name in best_fits.items():
        # Hand the values over as a single-column 2-D array
        column_values = np.array(df[name]).reshape(-1, 1)
        df[name] = get_normalized_data(data=column_values, dist=dist_name)

    return best_fits

Normalize the data


In [47]:
# Normalize the selected columns of df in place and print the distribution chosen for each.
# Not idempotent: re-running this cell transforms the already-normalized values again.
print(normalize(columns, df))

[32m2024-04-19 20:56:59.800[0m | [1mINFO    [0m | [36mfitter.fitter[0m:[36m_fit_single_distribution[0m:[36m337[0m - [1mFitted norm distribution with error=0.000123)[0m
[32m2024-04-19 20:56:59.806[0m | [1mINFO    [0m | [36mfitter.fitter[0m:[36m_fit_single_distribution[0m:[36m337[0m - [1mFitted expon distribution with error=2.4e-05)[0m
[32m2024-04-19 20:56:59.834[0m | [1mINFO    [0m | [36mfitter.fitter[0m:[36m_fit_single_distribution[0m:[36m337[0m - [1mFitted uniform distribution with error=0.00015)[0m
[32m2024-04-19 20:56:59.863[0m | [1mINFO    [0m | [36mfitter.fitter[0m:[36m_fit_single_distribution[0m:[36m337[0m - [1mFitted rayleigh distribution with error=0.000114)[0m


###### NumOfCompleted ######


[32m2024-04-19 20:57:00.023[0m | [1mINFO    [0m | [36mfitter.fitter[0m:[36m_fit_single_distribution[0m:[36m337[0m - [1mFitted cauchy distribution with error=0.000117)[0m
[32m2024-04-19 20:57:00.644[0m | [1mINFO    [0m | [36mfitter.fitter[0m:[36m_fit_single_distribution[0m:[36m337[0m - [1mFitted gamma distribution with error=0.000152)[0m
[32m2024-04-19 20:57:00.727[0m | [1mINFO    [0m | [36mfitter.fitter[0m:[36m_fit_single_distribution[0m:[36m337[0m - [1mFitted chi2 distribution with error=9.3e-05)[0m
[32m2024-04-19 20:57:00.744[0m | [1mINFO    [0m | [36mfitter.fitter[0m:[36m_fit_single_distribution[0m:[36m337[0m - [1mFitted powerlaw distribution with error=0.000149)[0m
[32m2024-04-19 20:57:00.829[0m | [1mINFO    [0m | [36mfitter.fitter[0m:[36m_fit_single_distribution[0m:[36m337[0m - [1mFitted exponpow distribution with error=8.4e-05)[0m
[32m2024-04-19 20:57:00.878[0m | [1mINFO    [0m | [36mfitter.fitter[0m:[36m_fit_sing

NumOfCompleted
Best Distribution: expon

###### NumOfCanceled ######


[32m2024-04-19 20:57:01.685[0m | [1mINFO    [0m | [36mfitter.fitter[0m:[36m_fit_single_distribution[0m:[36m337[0m - [1mFitted gamma distribution with error=0.000531)[0m
[32m2024-04-19 20:57:01.715[0m | [1mINFO    [0m | [36mfitter.fitter[0m:[36m_fit_single_distribution[0m:[36m337[0m - [1mFitted chi2 distribution with error=0.000302)[0m
[32m2024-04-19 20:57:01.795[0m | [1mINFO    [0m | [36mfitter.fitter[0m:[36m_fit_single_distribution[0m:[36m337[0m - [1mFitted powerlaw distribution with error=0.000524)[0m
[32m2024-04-19 20:57:01.870[0m | [1mINFO    [0m | [36mfitter.fitter[0m:[36m_fit_single_distribution[0m:[36m337[0m - [1mFitted exponpow distribution with error=0.000295)[0m
[32m2024-04-19 20:57:01.926[0m | [1mINFO    [0m | [36mfitter.fitter[0m:[36m_fit_single_distribution[0m:[36m337[0m - [1mFitted lognorm distribution with error=0.000529)[0m
[32m2024-04-19 20:57:01.969[0m | [1mINFO    [0m | [36mfitter.fitter[0m:[36m_fit_s

NumOfCanceled
Best Distribution: expon

###### AvgPrice ######


[32m2024-04-19 20:57:02.767[0m | [1mINFO    [0m | [36mfitter.fitter[0m:[36m_fit_single_distribution[0m:[36m337[0m - [1mFitted gamma distribution with error=0.001539)[0m
[32m2024-04-19 20:57:02.859[0m | [1mINFO    [0m | [36mfitter.fitter[0m:[36m_fit_single_distribution[0m:[36m337[0m - [1mFitted powerlaw distribution with error=0.001501)[0m
[32m2024-04-19 20:57:02.868[0m | [1mINFO    [0m | [36mfitter.fitter[0m:[36m_fit_single_distribution[0m:[36m337[0m - [1mFitted chi2 distribution with error=0.000597)[0m
[32m2024-04-19 20:57:02.984[0m | [1mINFO    [0m | [36mfitter.fitter[0m:[36m_fit_single_distribution[0m:[36m337[0m - [1mFitted exponpow distribution with error=0.000729)[0m
[32m2024-04-19 20:57:03.020[0m | [1mINFO    [0m | [36mfitter.fitter[0m:[36m_fit_single_distribution[0m:[36m337[0m - [1mFitted lognorm distribution with error=0.001531)[0m
[32m2024-04-19 20:57:03.059[0m | [1mINFO    [0m | [36mfitter.fitter[0m:[36m_fit_s

AvgPrice
Best Distribution: expon

###### TotalExecutedQuantity ######


[32m2024-04-19 20:57:03.367[0m | [1mINFO    [0m | [36mfitter.fitter[0m:[36m_fit_single_distribution[0m:[36m337[0m - [1mFitted cauchy distribution with error=0.0)[0m
[32m2024-04-19 20:57:03.506[0m | [1mINFO    [0m | [36mfitter.fitter[0m:[36m_fit_single_distribution[0m:[36m337[0m - [1mFitted gamma distribution with error=0.0)[0m
[32m2024-04-19 20:57:03.866[0m | [1mINFO    [0m | [36mfitter.fitter[0m:[36m_fit_single_distribution[0m:[36m337[0m - [1mFitted chi2 distribution with error=0.0)[0m
[32m2024-04-19 20:57:03.919[0m | [1mINFO    [0m | [36mfitter.fitter[0m:[36m_fit_single_distribution[0m:[36m337[0m - [1mFitted powerlaw distribution with error=0.0)[0m
[32m2024-04-19 20:57:03.989[0m | [1mINFO    [0m | [36mfitter.fitter[0m:[36m_fit_single_distribution[0m:[36m337[0m - [1mFitted exponpow distribution with error=0.0)[0m
[32m2024-04-19 20:57:04.043[0m | [1mINFO    [0m | [36mfitter.fitter[0m:[36m_fit_single_distribution[0m:[3

TotalExecutedQuantity
Best Distribution: expon

###### NumOfOrders ######


[32m2024-04-19 20:57:04.838[0m | [1mINFO    [0m | [36mfitter.fitter[0m:[36m_fit_single_distribution[0m:[36m337[0m - [1mFitted gamma distribution with error=4.6e-05)[0m
[32m2024-04-19 20:57:04.908[0m | [1mINFO    [0m | [36mfitter.fitter[0m:[36m_fit_single_distribution[0m:[36m337[0m - [1mFitted chi2 distribution with error=2.8e-05)[0m
[32m2024-04-19 20:57:04.943[0m | [1mINFO    [0m | [36mfitter.fitter[0m:[36m_fit_single_distribution[0m:[36m337[0m - [1mFitted powerlaw distribution with error=4.5e-05)[0m
[32m2024-04-19 20:57:05.012[0m | [1mINFO    [0m | [36mfitter.fitter[0m:[36m_fit_single_distribution[0m:[36m337[0m - [1mFitted exponpow distribution with error=3.2e-05)[0m
[32m2024-04-19 20:57:05.050[0m | [1mINFO    [0m | [36mfitter.fitter[0m:[36m_fit_single_distribution[0m:[36m337[0m - [1mFitted lognorm distribution with error=4.6e-05)[0m
[32m2024-04-19 20:57:05.089[0m | [1mINFO    [0m | [36mfitter.fitter[0m:[36m_fit_single

NumOfOrders
Best Distribution: expon

###### TotalQuantity ######


[32m2024-04-19 20:57:05.343[0m | [1mINFO    [0m | [36mfitter.fitter[0m:[36m_fit_single_distribution[0m:[36m337[0m - [1mFitted gamma distribution with error=0.0)[0m
[32m2024-04-19 20:57:05.385[0m | [1mINFO    [0m | [36mfitter.fitter[0m:[36m_fit_single_distribution[0m:[36m337[0m - [1mFitted cauchy distribution with error=0.0)[0m
[32m2024-04-19 20:57:05.711[0m | [1mINFO    [0m | [36mfitter.fitter[0m:[36m_fit_single_distribution[0m:[36m337[0m - [1mFitted chi2 distribution with error=0.0)[0m
[32m2024-04-19 20:57:05.747[0m | [1mINFO    [0m | [36mfitter.fitter[0m:[36m_fit_single_distribution[0m:[36m337[0m - [1mFitted powerlaw distribution with error=0.0)[0m
[32m2024-04-19 20:57:05.798[0m | [1mINFO    [0m | [36mfitter.fitter[0m:[36m_fit_single_distribution[0m:[36m337[0m - [1mFitted exponpow distribution with error=0.0)[0m
[32m2024-04-19 20:57:05.835[0m | [1mINFO    [0m | [36mfitter.fitter[0m:[36m_fit_single_distribution[0m:[3

TotalQuantity
Best Distribution: expon

{'NumOfCompleted': 'expon', 'NumOfCanceled': 'expon', 'AvgPrice': 'expon', 'TotalExecutedQuantity': 'expon', 'NumOfOrders': 'expon', 'TotalQuantity': 'expon'}


## Saving


Move the Churned column so that it is the last column


In [48]:
# Reorder the columns so that the label comes last
cols = df.columns.tolist()
cols.remove("Churned")
cols.append("Churned")
df = df[cols]

Display the head of the dataframe


In [49]:
# Preview the first rows of the final modelling frame (label in the last column)
df.head()

Unnamed: 0,ClientID,AccountID,NumOfOrders,NumOfCompleted,NumOfCanceled,AvgPrice,TotalExecutedQuantity,TotalQuantity,OrderType_Buy,OrderType_Sell,...,OpenDate_Year_2016,OpenDate_Year_2017,OpenDate_Year_2018,OpenDate_Year_2019,OpenDate_Year_2020,OpenDate_Year_2021,OpenDate_Year_2022,OpenDate_Year_2023,OpenDate_Year_2024,Churned
0,0,0,1.098612,1.098612,0.0,0.792993,9.392745,9.392745,1.485619,0.0,...,0,0,0,0,0,0,0,0,0,1
1,1,1,2.079442,1.94591,0.693147,1.092738,8.377471,8.418477,0.0,4.035913,...,0,0,0,0,0,0,0,0,0,1
2,2,2,3.091042,2.890372,1.098612,2.157868,11.727198,11.830623,13.370571,1.729677,...,0,0,0,0,0,0,0,0,0,0
3,3,3,4.488636,4.174387,2.484907,4.291279,13.433575,13.722385,30.45519,27.09827,...,0,0,0,0,0,0,0,0,0,0
4,4,4,3.433987,2.197225,2.397895,1.347103,12.784614,13.296239,1.485619,16.14365,...,0,0,0,0,0,0,0,0,0,0


Display the info of the dataframe


In [50]:
# Summarize size and dtypes of the final modelling frame
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 13523 entries, 0 to 13522
Columns: 493 entries, ClientID to Churned
dtypes: float64(373), int64(120)
memory usage: 50.9 MB


Save the dataframe into a csv "cleaned_dataset.csv"


In [51]:
# Write the modelling frame to the notebook's working directory, without the row index
df.to_csv("cleaned_dataset.csv", index=False)

In [52]:
# Preview the first rows of the visualization frame (no TF-IDF or one-hot columns)
df_no_TFIDF.head()

Unnamed: 0,ClientID,AccountID,Gender,RiskRate,CompanyName,IsClosed,IsDormant,IsProfileSuspended,IsClientSuspended,ClientTypeName,...,BirthDate_Year,NumOfOrders,LastOrder_Month,LastOrder_Year,NumOfCompleted,NumOfCanceled,AvgPrice,TotalExecutedQuantity,TotalQuantity,Churned
0,0,0,Male,Low,HSB,0,1,0,0,Individuals,...,1990,2,10,2022,2,0,1.21,12000,12000,1
1,1,1,Female,Low,HSB,0,1,1,0,Individuals,...,1963,7,9,2022,6,1,1.982429,4347,4529,1
2,2,2,Male,Low,HSB,0,0,0,0,Individuals,...,1971,21,2,2024,17,2,7.652667,123895,137395,0
3,3,3,Male,Low,HSB,0,0,0,0,Individuals,...,1953,88,3,2024,64,11,72.059886,682538,911078,0
4,4,4,Male,Low,HSB,0,0,0,0,Individuals,...,1976,30,2,2024,8,10,2.846267,356686,594953,0


In [53]:
# Summarize columns, non-null counts and dtypes of the visualization frame
df_no_TFIDF.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 13523 entries, 0 to 13522
Data columns (total 23 columns):
 #   Column                 Non-Null Count  Dtype  
---  ------                 --------------  -----  
 0   ClientID               13523 non-null  int64  
 1   AccountID              13523 non-null  int64  
 2   Gender                 13523 non-null  object 
 3   RiskRate               13523 non-null  object 
 4   CompanyName            13523 non-null  object 
 5   IsClosed               13523 non-null  int64  
 6   IsDormant              13523 non-null  int64  
 7   IsProfileSuspended     13523 non-null  int64  
 8   IsClientSuspended      13523 non-null  int64  
 9   ClientTypeName         13523 non-null  object 
 10  OpenDate_Month         13523 non-null  int64  
 11  OpenDate_Year          13523 non-null  int64  
 12  BirthDate_Month        13523 non-null  int64  
 13  BirthDate_Year         13523 non-null  int64  
 14  NumOfOrders            13523 non-null  int64  
 15  La

Save the dataframe with no TFIDF and OHE columns into a csv


In [54]:
# Write the visualization frame without the row index.
# NOTE(review): this goes to ../Data/, whereas the inputs and cleaned_dataset.csv use the working directory — confirm the path.
df_no_TFIDF.to_csv("../Data/visualization_data.csv", index=False)