# Data Cleaning and Preprocessing


In [1]:
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.preprocessing import LabelEncoder
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import warnings

# Silence every warning for the rest of the session.
# NOTE(review): this also hides pandas deprecation/dtype warnings raised by the
# cells below — consider narrowing the filter to specific categories.
warnings.filterwarnings("ignore")

# Enable the pandas "future.no_silent_downcasting" option.
# NOTE(review): presumably set for the True/False -> 1/0 `replace` call in TFIDF — confirm.
pd.set_option("future.no_silent_downcasting", True)

In [2]:
# Load the raw order records; the path is relative to the current working directory.
orders_df = pd.read_csv("orders_data_competition.csv")

In [3]:
# Discard every order row that has a missing value in any column.
# Rebinding (rather than mutating in place) keeps the row labels of the kept rows.
orders_df = orders_df.dropna()

In [4]:
# Load the client/account records; the path is relative to the current working directory.
clients_df = pd.read_csv("clients_data_competition.csv")

In [5]:
# Join each order to its client record on "Account ID". The inner join keeps only
# accounts that appear in both frames, so accounts without orders are absent from df.
df = clients_df.merge(orders_df, on="Account ID", how="inner")

In [6]:
# Summarise the merged client/order frame: row count, column names and dtypes.
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1987936 entries, 0 to 1987935
Data columns (total 27 columns):
 #   Column                Dtype  
---  ------                -----  
 0   Client ID             int64  
 1   Account ID            int64  
 2   Gender                object 
 3   Risk Rate             object 
 4   Company Name          object 
 5   Is Closed             int64  
 6   Is Dormant            float64
 7   Is Profile Suspended  int64  
 8   Is Client Suspended   int64  
 9   Client Type Name      object 
 10  OpenDate              object 
 11  BirthDate             object 
 12  Order ID              int64  
 13  Market Key            object 
 14  Security ID           int64  
 15  Order Type            object 
 16  Order Time            object 
 17  Order Via             object 
 18  Is Completed          int64  
 19  Is Canceled           int64  
 20  Expire Date           object 
 21  Execution Status      object 
 22  quantity              int64  
 23  Price  

In [7]:
# Preview the first rows of the merged client/order frame.
df.head()

Unnamed: 0,Client ID,Account ID,Gender,Risk Rate,Company Name,Is Closed,Is Dormant,Is Profile Suspended,Is Client Suspended,Client Type Name,...,Order Via,Is Completed,Is Canceled,Expire Date,Execution Status,quantity,Price,Sector Name,Executed Quantity,Quantity
0,0,0,Male,Low,HSB,0,0.0,0,0,Individuals,...,Call Center,1,0,2022-10-19 00:00:00.000,Executed,8000,1.22,Basic Materials,8000,8000
1,0,0,Male,Low,HSB,0,0.0,0,0,Individuals,...,Call Center,1,0,2022-10-19 00:00:00.000,Executed,4000,1.2,Basic Materials,4000,4000
2,1,1,Female,Low,HSB,0,-1.0,1,0,Individuals,...,Telephone,0,1,2022-09-13 00:00:00.000,Not Executed,182,1.162,Basic Materials,0,182
3,1,1,Female,Low,HSB,0,-1.0,1,0,Individuals,...,Telephone,1,0,2022-09-14 00:00:00.000,Executed,182,1.131,Basic Materials,182,182
4,1,1,Female,Low,HSB,0,-1.0,1,0,Individuals,...,Telephone,1,0,2022-09-14 00:00:00.000,Executed,182,5.69,Industries,182,182


In [8]:
# Summarise the cleaned orders frame: row count, column names and dtypes.
orders_df.info()

<class 'pandas.core.frame.DataFrame'>
Index: 1987936 entries, 0 to 1987942
Data columns (total 16 columns):
 #   Column             Dtype  
---  ------             -----  
 0   Order ID           int64  
 1   Account ID         int64  
 2   Market Key         object 
 3   Security ID        int64  
 4   Order Type         object 
 5   Order Time         object 
 6   Order Via          object 
 7   Is Completed       int64  
 8   Is Canceled        int64  
 9   Expire Date        object 
 10  Execution Status   object 
 11  quantity           int64  
 12  Price              float64
 13  Sector Name        object 
 14  Executed Quantity  int64  
 15  Quantity           int64  
dtypes: float64(1), int64(8), object(7)
memory usage: 257.8+ MB


In [9]:
# Start with an empty frame; the following cells add one row per account and
# one column per aggregated order feature.
agg_orders = pd.DataFrame()

In [10]:
# Seed the aggregate frame with one row per distinct Account ID found in clients_df.
# The frame keeps a default positional index (0..n-1), not an Account ID index.
agg_orders["AccountID"] = clients_df["Account ID"].unique()

In [11]:
# Number of orders placed by each account, indexed by Account ID.
orders_per_account = orders_df.groupby("Account ID").size()

# Look the count up by AccountID *value*. Assigning the grouped Series directly
# would align it with agg_orders' positional index and silently mis-assign counts
# whenever Account IDs are not exactly 0..n-1 in row order.
# Accounts that placed no orders get 0, and the column is stored as int.
agg_orders["NumOfOrders"] = (
    agg_orders["AccountID"].map(orders_per_account).fillna(0).astype(int)
)

In [12]:
# Parse the "Order Time" column into datetimes so it can be compared and aggregated.
orders_df["Order Time"] = pd.to_datetime(orders_df["Order Time"])

# Most recent order time per account, indexed by Account ID.
last_order_by_account = orders_df.groupby("Account ID")["Order Time"].max()

# Look the timestamp up by AccountID *value* rather than relying on index
# alignment with agg_orders' positional index. Accounts without orders stay NaT.
agg_orders["LastOrder"] = agg_orders["AccountID"].map(last_order_by_account)

In [13]:
# Number of completed orders per account (sum of the 0/1 "Is Completed" flag),
# indexed by Account ID.
completed_per_account = orders_df.groupby("Account ID")["Is Completed"].sum()

# Look the count up by AccountID *value*; direct assignment would align on
# agg_orders' positional index instead of on the account identifier.
# Accounts that placed no orders get 0, and the column is stored as int.
agg_orders["NumOfCompleted"] = (
    agg_orders["AccountID"].map(completed_per_account).fillna(0).astype(int)
)

In [14]:
# Number of canceled orders per account (sum of the 0/1 "Is Canceled" flag),
# indexed by Account ID.
canceled_per_account = orders_df.groupby("Account ID")["Is Canceled"].sum()

# Look the count up by AccountID *value*; direct assignment would align on
# agg_orders' positional index instead of on the account identifier.
# Accounts that placed no orders get 0, and the column is stored as int.
agg_orders["NumOfCanceled"] = (
    agg_orders["AccountID"].map(canceled_per_account).fillna(0).astype(int)
)

In [15]:
# Mean order price per account, indexed by Account ID. `mean()` replaces the
# hand-rolled sum / size; the two agree whenever "Price" has no missing values.
avg_price_by_account = orders_df.groupby("Account ID")["Price"].mean()

# Look the mean up by AccountID *value*; direct assignment would align on
# agg_orders' positional index instead of on the account identifier.
# Accounts that placed no orders have no price; they are recorded as 0.0.
agg_orders["AvgPrice"] = (
    agg_orders["AccountID"].map(avg_price_by_account).fillna(0.0)
)

In [16]:
# Total executed quantity per account, indexed by Account ID.
executed_qty_by_account = orders_df.groupby("Account ID")["Executed Quantity"].sum()

# Look the total up by AccountID *value*; direct assignment would align on
# agg_orders' positional index instead of on the account identifier.
# Accounts that placed no orders get 0, and the column is stored as int.
agg_orders["TotalExecutedQuantity"] = (
    agg_orders["AccountID"].map(executed_qty_by_account).fillna(0).astype(int)
)

In [17]:
# Total ordered quantity per account (sum of "Quantity"), indexed by Account ID.
total_qty_by_account = orders_df.groupby("Account ID")["Quantity"].sum()

# Look the total up by AccountID *value*; direct assignment would align on
# agg_orders' positional index instead of on the account identifier.
# Accounts that placed no orders get 0, and the column is stored as int.
agg_orders["TotalQuantity"] = (
    agg_orders["AccountID"].map(total_qty_by_account).fillna(0).astype(int)
)

In [18]:
def calculating_IDF(df):
    """Compute an inverse-document-frequency weight for every column of ``df``.

    Each row is treated as a document and each column as a term: a term
    "occurs" in a document when the cell value is greater than zero.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame of per-row term counts (values are compared with ``> 0``).

    Returns
    -------
    dict
        Maps each column label to ``log2(n_rows / n_rows_with_positive_value)``,
        or to ``1e-6`` when no row has a positive value in that column.
    """
    n_rows = df.shape[0]
    idf = {}
    for column in df.columns:
        # Count matching rows with a boolean sum; this avoids building the
        # filtered DataFrame (twice) for every column.
        n_positive = int((df[column] > 0).sum())
        # Guard against division by zero for terms that never occur.
        idf[column] = np.log2(n_rows / n_positive) if n_positive != 0 else 1e-6
    return idf

In [19]:
def calculate_TFIDF(df, idf_dict):
    """Scale every column of ``df`` by its IDF weight.

    Parameters
    ----------
    df : pandas.DataFrame
        Term-frequency frame; left unmodified.
    idf_dict : dict
        Weight for each column label of ``df``; a missing label raises ``KeyError``.

    Returns
    -------
    pandas.DataFrame
        A copy of ``df`` whose columns are multiplied by their weights.
    """
    weighted = df.copy()
    for label, term_counts in df.items():
        weighted[label] = term_counts * idf_dict[label]
    return weighted

In [20]:
def TFIDF(column_name: str):
    """Append per-account TF-IDF features for one categorical order column.

    Reads the notebook-level ``orders_df`` and ``agg_orders`` frames. For every
    distinct value of ``orders_df[column_name]`` a column named
    ``f"{column_name}_{value}"`` (with spaces removed) is produced. It holds the
    number of orders the account placed with that value, multiplied by the
    value's weight from ``calculating_IDF``.

    Parameters
    ----------
    column_name : str
        Name of the column in ``orders_df`` whose values are treated as terms.

    Returns
    -------
    pandas.DataFrame
        ``agg_orders`` with the new feature columns concatenated on the right.
        ``agg_orders`` itself is not modified, so calling this twice for the same
        column and rebinding the result yields duplicate column names.
    """
    # Term frequencies: one row per Account ID, one column per category value.
    # Counting directly avoids one-hot encoding the whole orders frame and then
    # running a True/False replacement over every one of its columns.
    counts = (
        orders_df.groupby(["Account ID", column_name]).size().unstack(fill_value=0)
    )

    # Keep the feature columns in order of first appearance in orders_df.
    categories = orders_df[column_name].dropna().unique()
    counts = counts.reindex(columns=categories, fill_value=0)
    counts.columns = [
        f"{column_name}_{value}".replace(" ", "") for value in categories
    ]

    # Line the counts up with agg_orders by AccountID *value*. Relying on index
    # alignment would match Account IDs against agg_orders' positional index.
    # Accounts that placed no orders get a count of 0 for every category.
    counts = counts.reindex(agg_orders["AccountID"].to_numpy(), fill_value=0)
    counts.index = agg_orders.index
    counts = counts.astype(int)

    idf_dict = calculating_IDF(counts)
    tfidf_df = calculate_TFIDF(counts, idf_dict)
    return pd.concat([agg_orders, tfidf_df], axis=1)

In [21]:
# Append TF-IDF features derived from the "Order Type" column.
# NOTE: this rebinds agg_orders, so re-running the cell appends the same columns again.
agg_orders = TFIDF("Order Type")

In [22]:
# Append TF-IDF features derived from the "Security ID" column (one column per security).
# NOTE: this rebinds agg_orders, so re-running the cell appends the same columns again.
agg_orders = TFIDF("Security ID")

In [23]:
# Append TF-IDF features derived from the "Order Via" column.
# NOTE: this rebinds agg_orders, so re-running the cell appends the same columns again.
agg_orders = TFIDF("Order Via")

In [24]:
# Append TF-IDF features derived from the "Execution Status" column.
# NOTE: this rebinds agg_orders, so re-running the cell appends the same columns again.
agg_orders = TFIDF("Execution Status")

In [25]:
# Append TF-IDF features derived from the "Sector Name" column.
# NOTE: this rebinds agg_orders, so re-running the cell appends the same columns again.
agg_orders = TFIDF("Sector Name")

In [26]:
# Preview the first rows of the per-account feature frame (aggregates plus TF-IDF columns).
agg_orders.head()

Unnamed: 0,AccountID,NumOfOrders,LastOrder,NumOfCompleted,NumOfCanceled,AvgPrice,TotalExecutedQuantity,TotalQuantity,OrderType_Buy,OrderType_Sell,...,SectorName_TelecommunicationServices,SectorName_REALESTATE,SectorName_Telecommunications,SectorName_FOOD,SectorName_Others,SectorName_Tourism,SectorName_ConsumerServices,SectorName_Utilities,SectorName_Trade,SectorName_PharmaceuticalIndustries
0,0,2,2022-10-19 12:24:04,2,0,1.21,12000,12000,1.485619,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1,1,7,2022-09-14 12:56:56,6,1,1.982429,4347,4529,0.0,4.035913,...,0.0,0.0,0.0,2.797573,0.0,0.0,0.0,0.0,0.0,0.0
2,2,21,2024-02-28 11:06:33,17,2,7.652667,123895,137395,13.370571,1.729677,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3,3,88,2024-03-10 12:32:34,64,11,72.059886,682538,911078,30.45519,27.09827,...,6.297068,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
4,4,30,2024-02-26 12:47:33,8,10,2.846267,356686,594953,1.485619,16.14365,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
