## ML Project: Payment Transaction Data From Online Casino Players

Dataset to download: https://data.mendeley.com/datasets/9j5gcygnwg/1\
Ghaharian, Kasra (2023), “Raw payments transaction data from online casino players and online sports bettors”, Mendeley Data, V1, doi: 10.17632/9j5gcygnwg.1

Article: https://www.sciencedirect.com/science/article/pii/S2352340923001956
### Exploratory Data Analysis

In [None]:
# Data management
import pandas as pd
import numpy as np
pd.options.mode.chained_assignment = None  
from datetime import datetime as dt

# Visualizations
import seaborn as sns
import matplotlib.pyplot as plt
import plotly.graph_objects as go
from sklearn.preprocessing import LabelEncoder

In [None]:
# Read the raw transaction export (expected in the notebook's working
# directory) and preview the first rows.
casino = pd.read_csv(filepath_or_buffer="Online_casino_DIB.csv")
casino.head(n=5)

In [None]:
# Inspect the dtype pandas inferred for each column of the raw frame.
casino.dtypes

In [None]:
# Parse the request timestamp column into pandas datetimes so the .dt
# accessors and date comparisons used further down work.
casino["ReqTimeUTC"] = pd.to_datetime(casino["ReqTimeUTC"])

In [None]:
# Summary statistics for every column except the float64 ones.
casino.describe(exclude = 'float64')

In [None]:
# Keep only transactions requested up to the cutoff instant (inclusive).
# NOTE(review): the cutoff is midnight at the *start* of 29 Feb 2020, so the
# rest of that day is dropped - confirm this is intended.
cutoff = '2020-02-29 00:00:00+00:00'
in_window = casino["ReqTimeUTC"] <= cutoff
casino = casino.loc[in_window].copy()

In [None]:
# List the distinct transaction type values present in the data.
casino.TransactionType.unique()

In [None]:
# Replace the long transaction-type names with short codes.
# Series.map yields NaN for any value that is not a key of the lookup, so
# re-running this cell on already-recoded data blanks the column.
type_codes = {
    'LOYALTYCARDDEBIT': 'L2D',
    'LOYALTYCARDCREDITCL': 'L1D',
    'LOYALTYCARDCREDIT': 'L2W',
}
casino["TransactionType"] = casino["TransactionType"].map(type_codes)

In [None]:
# Count rows per (account, transaction type); after count() every remaining
# column - including ReqTimeUTC - holds that per-group row count.
user_type_ = (
    casino
    .groupby(by=["AccountIdentifier", "TransactionType"])
    .count()
    .reset_index()
)

# Cumulative density of the per-account transaction counts, one step curve
# per transaction type.
sns.histplot(
    data=user_type_,
    x="ReqTimeUTC",
    hue="TransactionType",
    stat="density",
    cumulative=True,
    element="step",
    fill=False,
)

In [None]:
# Show the column names available before filtering.
casino.columns

In [None]:
# Keep only approved "L2D" transactions, ordered by account and request time.
#
# `casino` deliberately keeps its original column names: every later cell
# (top-20 rankings, hour/weekday heatmap, daily activity, monthly cohorts)
# still addresses AccountIdentifier / ReqTimeUTC / TransactionAmount /
# TransactionType. Overwriting `casino` with a three-column, renamed frame
# made those cells fail with KeyError on Restart & Run All, and made this
# cell fail when re-run.
is_approved_l2d = (casino.TransactionType == "L2D") & (casino.Status == "APPROVED")

casino = (
    casino[is_approved_l2d]
    .sort_values(["AccountIdentifier", "ReqTimeUTC"])
    .reset_index(drop=True)
)

# Compact three-column view with short names, stored under its own name so
# that `casino` stays usable by the cells below.
casino_l2d = (
    casino[['AccountIdentifier', 'ReqTimeUTC', 'TransactionAmount']]
    .rename(columns={'AccountIdentifier': 'user', 'ReqTimeUTC': 'timest',
                     'TransactionAmount': 'amount'})
)

casino_l2d.head()

In [None]:
# Preview the first rows of the current `casino` frame.
casino.head()

In [None]:
# Top 20 accounts by number of transactions. After count() the ReqTimeUTC
# column holds the per-account row count; rows are sorted ascending, then
# reversed, then truncated to the first 20.
per_account_counts = (
    casino
    .groupby(by="AccountIdentifier")
    .count()
    .reset_index()
    [["AccountIdentifier", "ReqTimeUTC"]]
)
top_20_freq = (
    per_account_counts
    .sort_values(by="ReqTimeUTC")
    .iloc[::-1]
    .head(20)
)

# Top 20 accounts by summed transaction amount, using the same
# sort-ascending / reverse / truncate sequence.
per_account_totals = (
    casino.loc[:, ["AccountIdentifier", "TransactionAmount"]]
    .groupby(by="AccountIdentifier")
    .sum()
    .reset_index()
)
top_20_spenders = (
    per_account_totals
    .sort_values(by="TransactionAmount")
    .iloc[::-1]
    .head(20)
)

In [None]:
# Side-by-side horizontal bar charts: transaction count (left) and summed
# amount (right) for the two top-20 tables.
fig, ax = plt.subplots(ncols=2, figsize=(15, 5))

panels = [
    (top_20_freq, "ReqTimeUTC"),
    (top_20_spenders, "TransactionAmount"),
]
for axis, (frame, value_col) in zip(ax, panels):
    sns.barplot(data=frame, x=value_col, y="AccountIdentifier", ax=axis)

In [None]:
# Lookup from pandas' weekday number (0 = Monday) to a label whose numeric
# prefix makes the labels sort in calendar order.
weekday_names = ["Monday", "Tuesday", "Wednesday", "Thursday", "Friday",
                 "Saturday", "Sunday"]
day_of_week_dict = {
    idx: f"{idx + 1}_{name}" for idx, name in enumerate(weekday_names)
}

# Derive hour-of-day and labelled weekday from the request timestamp.
casino["hour"] = casino["ReqTimeUTC"].dt.hour
casino["day_of_week"] = (
    casino["ReqTimeUTC"].dt.day_of_week.map(day_of_week_dict)
)

casino

In [None]:
# Number of transactions for every (hour, weekday) combination; count()
# tallies the non-null TransactionType values in each group.
h_d_g = (
    casino.loc[:, ["hour", "day_of_week", "TransactionType"]]
    .groupby(by=["hour", "day_of_week"])
    .count()
    .reset_index()
)

# Reshape to a weekday x hour grid (one value per cell, so the default mean
# aggregation returns that value) and draw it as a heatmap.
casino_hm = h_d_g.pivot_table(
    index="day_of_week",
    columns="hour",
    values="TransactionType",
)

sns.heatmap(casino_hm)

In [None]:
# Truncate each request timestamp to its calendar day.
casino["ReqDateUTC"] = casino["ReqTimeUTC"].dt.floor('D')

# One row per (account, day): summed amount and number of transactions.
daily_activity = (
    casino
    .groupby(by=["AccountIdentifier", "ReqDateUTC"])
    .agg(
        TransactionAmount=("TransactionAmount", "sum"),
        TransactionType=("TransactionType", "count"),
    )
    .reset_index()
)

daily_activity

In [None]:
# First 20 active days of the account labelled "customer1", plotted as a
# daily-amount line.
is_customer1 = daily_activity["AccountIdentifier"] == "customer1"
c1 = daily_activity.loc[is_customer1].iloc[:20]
sns.lineplot(data=c1, x="ReqDateUTC", y="TransactionAmount")

In [None]:
# Expand every customer's daily activity to a gap-free calendar running from
# their first to their last active day; days without activity get 0 for the
# amount and the transaction count.
#
# The per-customer frames are collected in a list and concatenated once:
# calling pd.concat inside the loop re-copies the accumulated frame on every
# iteration (quadratic in the number of customers), and groupby avoids a
# full boolean scan of `daily_activity` per customer.
customer_frames = []

for customer_id, customer_df in daily_activity.groupby("AccountIdentifier",
                                                       sort=False):
    full_range = pd.date_range(customer_df.ReqDateUTC.min(),
                               customer_df.ReqDateUTC.max(), freq="D")

    customer_frames.append(
        customer_df
        .set_index(keys="ReqDateUTC")
        # A plain list is passed rather than the DatetimeIndex itself -
        # presumably so the index keeps its "ReqDateUTC" name for
        # reset_index(); verify before changing.
        .reindex(list(full_range), fill_value=0)
        # fill_value=0 also lands in AccountIdentifier on the inserted days,
        # so restore the real id on every row.
        .assign(AccountIdentifier=customer_id)
        .reset_index()
    )

if customer_frames:
    full_customer_df = pd.concat(customer_frames, ignore_index=True)
else:
    # pd.concat raises on an empty list; keep an empty frame instead.
    full_customer_df = pd.DataFrame()

In [None]:
# Display the calendar-filled per-customer daily activity table.
full_customer_df

In [None]:
# Tag each daily row with its calendar month.
full_customer_df["month"] = full_customer_df["ReqDateUTC"].dt.to_period("M")

# One row per (month, account) in which the account has at least one row;
# only the two key columns are kept.
customer_month = (
    full_customer_df
    .groupby(by=["month", "AccountIdentifier"])
    .count()
    .reset_index()
    .iloc[:, :2]
)

# First and last month seen for every account.
months_by_customer = customer_month.groupby(by="AccountIdentifier")
start_ = months_by_customer.min().reset_index()
end_ = months_by_customer.max().reset_index()

# The merge suffixes the two month columns: month_x = first, month_y = last.
start_end_df = start_.merge(end_, on="AccountIdentifier")

# Number of accounts for every (first month, last month) pair; after count()
# the AccountIdentifier column holds that number.
source_target_value = (
    start_end_df
    .groupby(by=["month_x", "month_y"])
    .count()
    .reset_index()
)

In [None]:
# Display the (first month, last month) -> account count table.
source_target_value

In [None]:

# Fit the encoder on every month that can appear as a Sankey node, i.e. the
# union of first months (month_x) and last months (month_y). Fitting on
# month_x alone makes le.transform(month_y) raise for any month in which
# some customer stopped but no customer started.
all_months = pd.concat(
    [source_target_value.month_x, source_target_value.month_y],
    ignore_index=True,
).unique()

le = LabelEncoder()

le.fit_transform(all_months)

In [None]:
# Sankey diagram of customer lifetime: each link runs from a customer's first
# active month to their last one, weighted by the number of customers.
#
# Node labels and the title are derived from the encoder's own classes so
# that node index k always carries the label of the k-th encoded month, and
# the title states the period actually present in the data (the previous
# hard-coded title named 2023-2024 although the data is cut off in
# February 2020).
node_months = pd.PeriodIndex(le.classes_, freq="M")
node_labels = node_months.strftime("%b").tolist()

if len(node_months):
    lifetime_title = (
        f"Customer Lifetime: {node_months[0].strftime('%B %Y')} - "
        f"{node_months[-1].strftime('%B %Y')}"
    )
else:
    lifetime_title = "Customer Lifetime"

fig = go.Figure(data=[go.Sankey(
    node = dict(
      pad = 100,
      thickness = 10,
      line = dict(color = "gray", width = 0.5),
      label = node_labels,
      color = "orange"
    ),

    link = dict(
      source = le.transform(source_target_value.month_x.tolist()),
      target = le.transform(source_target_value.month_y.tolist()),
      value = source_target_value.AccountIdentifier.tolist(),
      hovercolor=["black"],
  ))])

fig.update_layout(title_text=lifetime_title,
                  font_size=10)
fig.show()

In [None]:
# Split the (first month, last month) table: rows whose first and last month
# coincide go to `fast_quiter_`, the rest stay in `source_target_value`.
# Because `source_target_value` is overwritten, re-running this cell leaves
# `fast_quiter_` empty.
same_month = source_target_value["month_x"] == source_target_value["month_y"]

fast_quiter_ = source_target_value[same_month]

source_target_value = source_target_value[~same_month]

In [None]:
# Number of (month, account) rows per month, i.e. accounts present in each
# month; after count() the AccountIdentifier column holds that number.
tc_df = (
    customer_month
    .groupby(by="month")
    .count()
    .reset_index()
)

tc_df

In [None]:
# Per first month, the number of accounts whose first and last month coincide.
fq_columns = ["month_x", "AccountIdentifier"]
fq_df = fast_quiter_.loc[:, fq_columns]

fq_df

In [None]:
# Join the per-month account totals with the per-month count of accounts
# whose first and last month coincide, then give the two count columns
# readable names.
#
# The original first line ended in a backslash followed by a space, which is
# a SyntaxError; inside parentheses no line continuation is needed at all.
# NOTE(review): the default inner join drops months that have no row in
# fq_df - confirm that is the intended behaviour for the plot below.
q_t = (
    pd.merge(tc_df, fq_df, left_on="month", right_on="month_x")
    .loc[:, ["month", "AccountIdentifier_x", "AccountIdentifier_y"]]
    .rename(columns={"AccountIdentifier_x": "total",
                     "AccountIdentifier_y": "quit"})
)
q_t


In [None]:
# Grouped bars per month: total accounts vs. accounts that quit in the same
# month they started. melt() turns the two count columns into
# (variable, value) pairs so seaborn can colour them by `hue`.
fig, ax = plt.subplots(figsize=(15, 5))
q_t_long = q_t.melt(id_vars=["month"])
sns.barplot(data=q_t_long, x="month", y="value", hue="variable", ax=ax)