In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.preprocessing import MinMaxScaler, StandardScaler

In [2]:
# Load the account-level dataset used throughout this notebook.
visualization_csv = "../Data/visualization_data.csv"
df = pd.read_csv(visualization_csv)

In [3]:
# Remove the CompanyName column. Rebinding instead of mutating in place, together
# with errors="ignore", keeps this cell safe to re-run: a second execution no
# longer raises KeyError because the column is already gone.
df = df.drop(columns=["CompanyName"], errors="ignore")

In [4]:
# Preview the first rows of the loaded frame.
df.head()

Unnamed: 0,ClientID,AccountID,Gender,RiskRate,IsClosed,IsDormant,IsProfileSuspended,IsClientSuspended,ClientTypeName,OpenDate_Month,...,SectorName_REALESTATE,SectorName_Telecommunications,SectorName_FOOD,SectorName_Others,SectorName_Tourism,SectorName_ConsumerServices,SectorName_Utilities,SectorName_Trade,SectorName_PharmaceuticalIndustries,Churned
0,0,0,Male,Low,0,1,0,0,Individuals,2,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1
1,1,1,Female,Low,0,1,1,0,Individuals,2,...,0.0,0.0,2.797573,0.0,0.0,0.0,0.0,0.0,0.0,1
2,2,2,Male,Low,0,0,0,0,Individuals,2,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0
3,3,3,Male,Low,0,0,0,0,Individuals,2,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0
4,4,4,Male,Low,0,0,0,0,Individuals,2,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0


In [5]:
# Print the frame summary: index range, column count, dtypes and memory usage.
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 13523 entries, 0 to 13522
Columns: 388 entries, ClientID to Churned
dtypes: float64(368), int64(17), object(3)
memory usage: 40.0+ MB


In [6]:
# Placeholder for the per-client summary table. The following cell rebinds
# `agg_accounts` to the grouped result, so this empty frame is never read.
agg_accounts = pd.DataFrame()

In [7]:
# Client-level attributes: take the value from the first account row of each
# client (GroupBy "first" picks the first non-null entry per group).
client_level_columns = [
    "Gender",
    "Age",
    "IsClientSuspended",
    "ClientTypeName",
    "RiskRate",
]
first_value_per_column = dict.fromkeys(client_level_columns, "first")

per_client_profile = df.groupby("ClientID").agg(first_value_per_column)

# Move ClientID from the index back into a regular column; the frame gets a
# fresh positional index.
agg_accounts = per_client_profile.reset_index()

In [8]:
# Number of account rows per client.
# agg_accounts carries a positional 0..n-1 index (ClientID is a column), while the
# groupby result is indexed by ClientID. Direct assignment aligns on index labels
# and is only right when ClientID values happen to equal row positions, so look
# each client up by its ClientID instead.
accounts_per_client = df.groupby("ClientID").size()
agg_accounts["NumOfAccounts"] = agg_accounts["ClientID"].map(accounts_per_client)

In [9]:
# Sum of the IsClosed flag over each client's accounts.
# Looked up through the ClientID column rather than assigned directly: the
# groupby result is indexed by ClientID, agg_accounts by row position, and a
# direct assignment would align those unrelated labels.
closed_per_client = df.groupby("ClientID")["IsClosed"].sum()
agg_accounts["NumOfClosedAccounts"] = agg_accounts["ClientID"].map(closed_per_client)

In [10]:
# Sum of the IsDormant flag over each client's accounts.
# Looked up through the ClientID column rather than assigned directly: the
# groupby result is indexed by ClientID, agg_accounts by row position, and a
# direct assignment would align those unrelated labels.
dormant_per_client = df.groupby("ClientID")["IsDormant"].sum()
agg_accounts["NumOfDormantAccounts"] = agg_accounts["ClientID"].map(dormant_per_client)

In [11]:
# Sum of the IsProfileSuspended flag over each client's accounts.
# Looked up through the ClientID column rather than assigned directly: the
# groupby result is indexed by ClientID, agg_accounts by row position, and a
# direct assignment would align those unrelated labels.
suspended_per_client = df.groupby("ClientID")["IsProfileSuspended"].sum()
agg_accounts["NumOfSuspendedAccounts"] = agg_accounts["ClientID"].map(
    suspended_per_client
)

In [12]:
# Total NumOfOrders across each client's accounts.
# Looked up through the ClientID column rather than assigned directly: the
# groupby result is indexed by ClientID, agg_accounts by row position, and a
# direct assignment would align those unrelated labels.
orders_per_client = df.groupby("ClientID")["NumOfOrders"].sum()
agg_accounts["NumOfOrders"] = agg_accounts["ClientID"].map(orders_per_client)

In [13]:
# Total NumOfCompleted across each client's accounts.
# Looked up through the ClientID column rather than assigned directly: the
# groupby result is indexed by ClientID, agg_accounts by row position, and a
# direct assignment would align those unrelated labels.
completed_per_client = df.groupby("ClientID")["NumOfCompleted"].sum()
agg_accounts["NumOfCompletedOrders"] = agg_accounts["ClientID"].map(
    completed_per_client
)

In [14]:
# Total NumOfCanceled across each client's accounts.
# Looked up through the ClientID column rather than assigned directly: the
# groupby result is indexed by ClientID, agg_accounts by row position, and a
# direct assignment would align those unrelated labels.
canceled_per_client = df.groupby("ClientID")["NumOfCanceled"].sum()
agg_accounts["NumOfCanceledOrders"] = agg_accounts["ClientID"].map(
    canceled_per_client
)

In [15]:
# Total TotalExecutedQuantity across each client's accounts.
# Looked up through the ClientID column rather than assigned directly: the
# groupby result is indexed by ClientID, agg_accounts by row position, and a
# direct assignment would align those unrelated labels.
executed_qty_per_client = df.groupby("ClientID")["TotalExecutedQuantity"].sum()
agg_accounts["TotalExecutedQuantity"] = agg_accounts["ClientID"].map(
    executed_qty_per_client
)

In [16]:
# Total TotalQuantity across each client's accounts.
# Looked up through the ClientID column rather than assigned directly: the
# groupby result is indexed by ClientID, agg_accounts by row position, and a
# direct assignment would align those unrelated labels.
quantity_per_client = df.groupby("ClientID")["TotalQuantity"].sum()
agg_accounts["TotalQuantity"] = agg_accounts["ClientID"].map(quantity_per_client)

In [17]:
# Per-client average of the account-level AvgPrice values.
# Computed as sum() / size() (not mean()), so an account with a missing AvgPrice
# still counts in the denominator. The groupby is built once and reused.
accounts_by_client = df.groupby("ClientID")
avg_price_per_client = (
    accounts_by_client["AvgPrice"].sum() / accounts_by_client.size()
)

# The result is indexed by ClientID while agg_accounts has a positional index,
# so look each client up by its ClientID instead of relying on index alignment.
agg_accounts["AvgPrice"] = agg_accounts["ClientID"].map(avg_price_per_client)

In [18]:
# Per-client average (sum over the client's accounts divided by the number of
# accounts) for every column whose name contains one of these tags.
feature_tags = (
    "OrderType",
    "SecurityID",
    "OrderVia",
    "ExecutionStatus",
    "SectorName",
)
tagged_columns = [
    col for col in df.columns if any(tag in col for tag in feature_tags)
]

# Group once and aggregate all tagged columns together; the previous loop rebuilt
# two groupbys for every single column.
accounts_by_client = df.groupby("ClientID")
per_client_means = accounts_by_client[tagged_columns].sum().div(
    accounts_by_client.size(), axis=0
)

# Same mapping the loop used to build: column name -> Series indexed by ClientID.
agg_dict = dict(per_client_means.items())

# Join on the ClientID column. agg_accounts has a positional index and the
# aggregates are indexed by ClientID, so a plain axis=1 concat would line rows up
# by unrelated labels. join() also raises if the cell is re-run, instead of
# silently appending duplicate columns.
agg_accounts = agg_accounts.join(per_client_means, on="ClientID")

In [19]:
# Build the account opening date from its year and month parts, pinned to the
# first day of the month (the day is a fixed "-01" suffix).
open_year_text = df["OpenDate_Year"].astype(str)
open_month_text = df["OpenDate_Month"].astype(str)
opened_date_text = open_year_text + "-" + open_month_text + "-01"

df["OpenedDate"] = pd.to_datetime(opened_date_text)

In [20]:
# Most recent OpenedDate among each client's accounts.
# Looked up through the ClientID column rather than assigned directly: the
# groupby result is indexed by ClientID, agg_accounts by row position, and a
# direct assignment would align those unrelated labels.
last_opened_per_client = df.groupby("ClientID")["OpenedDate"].max()
agg_accounts["LastOpenedAccountDate"] = agg_accounts["ClientID"].map(
    last_opened_per_client
)

In [21]:
# Build the last-order date from its year and month parts, pinned to the first
# day of the month. errors="coerce" turns any string that cannot be parsed into
# NaT instead of raising.
last_order_year_text = df["LastOrder_Year"].astype(str)
last_order_month_text = df["LastOrder_Month"].astype(str)
last_order_text = last_order_year_text + "-" + last_order_month_text + "-01"

df["LastOrderDate"] = pd.to_datetime(last_order_text, errors="coerce")

In [22]:
# Most recent order date across each client's accounts.
# Fix: this previously aggregated "OpenedDate" (copy-paste from the
# LastOpenedAccountDate cell), so the column duplicated the opening date instead
# of reflecting order activity. It now uses "LastOrderDate".
# The output column name keeps its original spelling so existing references
# to it continue to work.
last_order_per_client = df.groupby("ClientID")["LastOrderDate"].max()

# The groupby result is indexed by ClientID while agg_accounts has a positional
# index, so look each client up by its ClientID rather than aligning on index.
agg_accounts["LastOrderDateAccrossAccounts"] = agg_accounts["ClientID"].map(
    last_order_per_client
)

In [52]:
# Sum of the Churned flag over each client's accounts.
# Looked up through the ClientID column rather than assigned directly: the
# groupby result is indexed by ClientID, agg_accounts by row position, and a
# direct assignment would align those unrelated labels.
churned_per_client = df.groupby("ClientID")["Churned"].sum()
agg_accounts["NumOfChurnedAccounts"] = agg_accounts["ClientID"].map(churned_per_client)

In [54]:
def getChurnProb(row):
    """Return a churn score for one client row.

    Parameters
    ----------
    row : mapping-like
        Must provide "NumOfChurnedAccounts"; "NumOfAccounts" is read only when
        at least one account has churned.

    Returns
    -------
    int or float
        0 when the client has no churned accounts, otherwise the add-one
        smoothed ratio (churned + 1) / (accounts + 2).
    """
    churned_accounts = row["NumOfChurnedAccounts"]

    # Clients with no churned accounts are scored exactly 0, without smoothing.
    if churned_accounts == 0:
        return 0

    return (churned_accounts + 1) / (row["NumOfAccounts"] + 2)

In [55]:
# Score every client with getChurnProb. Only the two columns the function reads
# are passed to the row-wise apply, so pandas does not have to materialise a
# full-width row object for every client.
churn_inputs = agg_accounts[["NumOfChurnedAccounts", "NumOfAccounts"]]
agg_accounts["ChurnedProbability"] = churn_inputs.apply(getChurnProb, axis=1)

In [60]:
# Keep only the clients whose churn score is at least 0.8.
is_high_probability = agg_accounts["ChurnedProbability"].ge(0.8)
high_probability_churned_df = agg_accounts.loc[is_high_probability]

In [62]:
# Subset of the high-probability frame restricted to object-dtype columns.
object_dtypes = ["object"]
categorical_features = high_probability_churned_df.select_dtypes(
    include=object_dtypes
)