In [None]:
import pandas as pd
import numpy as np
from datetime import datetime as dt
pd.options.mode.chained_assignment = None
from sklearn.preprocessing import LabelEncoder

In [None]:
import seaborn as sns
import plotly.graph_objects as go 
import matplotlib.pyplot as plt

Use the online sports gaming data set to perform data analysis and determine preliminary features

In [None]:
# Load the raw transaction export; the path is relative to the notebook's working directory.
df_raw = pd.read_csv('Online_sports_DIB.csv')

Do basic data gathering and cleaning on the df
1. ReqTimeUTC is type object, so convert it to datetime
2. Four columns have 447853 entries but Status has only 447847 - find out why
3. Shorten the column titles to something more manageable
4. The data period ends 2020-02-29; exclude any rows past that date (there are some)
5. Re-code the transaction types into shorter, meaningful codes
6. Strip 'customer' off the user column, leaving only the customer number


In [None]:
# Work on a copy so that df_raw keeps the data exactly as it was read from disk.
df = df_raw.copy()

In [None]:
# Preview the first rows of the raw columns.
df.head()

In [None]:
# Column dtypes as inferred by read_csv.
df.dtypes

In [None]:
# Non-null counts per column; this is where a shortfall in a column shows up.
df.info()

In [None]:
# Summary statistics for the numeric columns.
df.describe()

In [None]:
# Summary (count / unique / top / freq) for the object-dtype columns.
df.describe(include='O')

In [None]:
# Parse the request timestamps into a datetime column so that date
# comparisons and the .dt accessors can be used on it.
df['ReqTimeUTC'] = pd.to_datetime(df['ReqTimeUTC'])
df.dtypes

In [None]:
# Keep only the transactions inside the data period, which ends on 2020-02-29.
# The bound is exclusive at midnight on 1 March (UTC) so that every transaction
# made during 29 February is retained. Comparing with
# `<= '2020-02-29 00:00:00+00:00'` would discard everything after the first
# instant of that day.
period_end_exclusive = pd.Timestamp('2020-03-01', tz='UTC')

df = df[df['ReqTimeUTC'] < period_end_exclusive].copy()
df.head()

In [None]:
# Isolate and show the rows whose Status value is missing.
status_is_missing = df['Status'].isna()
missing_entries = df.loc[status_is_missing]
print(missing_entries)

In [None]:
# Drop the rows that have a missing value in the 'Status' column,
# then re-check the non-null counts.
df = df.dropna(subset=['Status'])
df.info()


In [None]:
# List the distinct transaction type labels before re-coding them.
df.TransactionType.unique()

Mapping for TransactionType 
LOYALTYCARDCREDITACH: L1D, LOYALTYCARDCREDITCL: L1D,
LOYALTYCARDDEBIT: L2D,  LOYALTYCARDCREDIT: L2W

In [None]:
# Collapse the verbose transaction labels into short codes.
# Series.map turns any label that is not a key of the dict into NaN, so
# re-running this cell on the already-mapped column would blank it out.
type_codes = {
    'LOYALTYCARDCREDITACH': 'L1D',
    'LOYALTYCARDCREDITCL': 'L1D',
    'LOYALTYCARDDEBIT': 'L2D',
    'LOYALTYCARDCREDIT': 'L2W',
}
df['TransactionType'] = df['TransactionType'].map(type_codes)

df

In [None]:
# Replace the long source column names with short working names.
short_names = {
    'AccountIdentifier': 'user',
    'ReqTimeUTC': 'date',
    'Status': 'status',
    'TransactionType': 'type',
    'TransactionAmount': 'amount',
}
df = df.rename(columns=short_names)

df

In [None]:
# Remove the literal text 'customer' from the user column so only the id
# number is left. The ids remain strings after this step.
user_ids = df['user'].str.replace('customer', '', regex=False)
df['user'] = user_ids

df

In [None]:
# Group the df by user and type to see what we've got

In [None]:
# Count rows for every (user, type) pair; each remaining column holds its non-null count.
df.groupby(['user', 'type']).count().reset_index()

In [None]:
# Transactions per (user, type) pair, kept for plotting.
user_type_ = (
    df.groupby(['user', 'type'])
    .count()
    .reset_index()
)

Plot the result as a cumulative density histogram

In [None]:
# Cumulative density of the per-user transaction counts, one step line per type.
# After groupby().count() the 'date' column holds the count, not a date.
sns.histplot(data=user_type_, x='date', hue='type',
             cumulative=True, stat='density',
             element='step', fill=False)

As with the casino data, the L1D and L2D curves map closely together, and the L2W curve is closer to them than in the casino plot,
but it is still not useful.
Drop the L1D and L2W data and keep only the L2D APPROVED data for analysis

In [None]:
# Separate working frame so the cleaned df is left intact.
df_working = df.copy()
df_working.head(2)

In [None]:
# Keep only the approved L2D transactions.
is_l2d = df_working['type'] == 'L2D'
is_approved = df_working['status'] == 'APPROVED'

df_working = df_working[is_l2d & is_approved].reset_index(drop=True)
df_working.head(2)

In [None]:
# Keep the four analysis columns and order the rows by user, then by time.
# The user ids are strings, so the user ordering is lexicographic.
df_working = (
    df_working[['user', 'date', 'type', 'amount']]
    .sort_values(['user', 'date'])
    .reset_index(drop=True)
)

df_working.head()

Find the top 20 dollar depositors and the top 20 most frequent depositors
1. get a frequency count
2. reset indices
3. select first 2 cols
4. sort by the count (held in the date column), reverse the sort and take the top 20 values
5. put the top 20 in a new df and clean it up

In [None]:
# Step 1: number of rows per user (every column shows its non-null count).
df_working.groupby('user').count()

In [None]:
# Step 2: move 'user' out of the index and back into a column.
df_working.groupby('user').count().reset_index()

In [None]:
# Step 3: .iloc[:, 0:2] keeps all rows and the first two columns (user and date);
# the 'date' column now holds the per-user count.

df_working.groupby('user').count().reset_index()\
    .iloc[:,0:2]


In [None]:
# Step 4: sort ascending by the count, reverse with [::-1] so the order is
# high to low, then take the first 20 rows.

df_working.groupby('user').count().reset_index()\
    .iloc[:,0:2].sort_values('date')[::-1][0:20]


In [None]:
# Top 20 users by number of deposits. After count(), the 'date' column
# carries the per-user row count.
deposit_counts = df_working.groupby('user').count().reset_index().iloc[:, 0:2]
counts_high_to_low = deposit_counts.sort_values('date').iloc[::-1]
top_20freq_df = counts_high_to_low.iloc[0:20]

top_20freq_df

Repeat for the top 20 dollar value depositors

In [None]:
# Top 20 users by total deposited amount.
deposit_totals = df_working[['user', 'amount']].groupby('user').sum().reset_index()
totals_high_to_low = deposit_totals.sort_values('amount').iloc[::-1]
top_20dep_df = totals_high_to_low.iloc[0:20]

top_20dep_df

Plot freq and dep df's side by side to see how they match up

In [None]:
# Show the two top-20 rankings next to each other.
fig, ax = plt.subplots(ncols=2, figsize=(15, 5))

# In top_20freq_df the 'date' column holds the number of deposits per user
# (it is the result of groupby().count()), so the axis is relabelled.
sns.barplot(data=top_20freq_df, x='date', y='user', ax=ax[0])
ax[0].set(title='Top 20 users by number of deposits',
          xlabel='number of deposits', ylabel='user')

sns.barplot(data=top_20dep_df, x='amount', y='user', ax=ax[1])
ax[1].set(title='Top 20 users by total deposit amount',
          xlabel='total amount deposited', ylabel='user')

fig.tight_layout()


Use pd.merge to find the users common to both the frequency and the deposit-value df's and print them out

In [None]:
# Inner join on 'user': only users present in both top-20 lists survive.
common_customers = top_20freq_df.merge(top_20dep_df, on='user')
print(common_customers)

There are only four customers who appear as top-20 on both lists. This suggests that the relationship
between the number of deposits and the value of those deposits is not strong.

The data contains date/time information on the transactions.
Use this data to see if there is a discernible pattern to patron activity using a heatmap

In [None]:
# Hour of day (0-23) taken from each transaction timestamp.
df_working['hour'] = df_working['date'].dt.hour

In [None]:
# Day of the week as an integer (Monday = 0 ... Sunday = 6).
df_working['day_of_week'] = df_working['date'].dt.day_of_week

df_working

In [None]:
# Translate the integer weekday into its name (0 -> Monday ... 6 -> Sunday).
# Series.map yields NaN for values that are not keys of the dict, so this
# cell must run on the integer column, not on already-mapped names.
day_of_the_week_dict = dict(enumerate([
    'Monday',
    'Tuesday',
    'Wednesday',
    'Thursday',
    'Friday',
    'Saturday',
    'Sunday',
]))

df_working['day_of_week'] = df_working['day_of_week'].map(day_of_the_week_dict)

df_working

Create the data for the heatmap.
Need a matrix of counts with day of the week as the index and hours of the day as the cols.
Use pandas pivot_table to organize the data

In [None]:
# Number of transactions for each (hour, day_of_week) pair; the count lands in the 'type' column.
df_working[['hour', 'day_of_week', 'type']] \
.groupby(['hour', 'day_of_week']).count().reset_index()

In [None]:
# Transaction count per (hour, day_of_week) pair; the count is held in 'type'.
h_d_g = (
    df_working[['hour', 'day_of_week', 'type']]
    .groupby(['hour', 'day_of_week'])
    .count()
    .reset_index()
)

h_d_g

In [None]:
# Reshape the counts into a day-of-week x hour grid for the heatmap.
# The day names are strings, so pivot_table orders the rows alphabetically
# (Friday, Monday, Saturday, ...). They are reindexed into calendar order to
# make the heatmap readable. Day/hour combinations with no transactions are
# shown as 0 rather than left empty.
day_order = ['Monday', 'Tuesday', 'Wednesday', 'Thursday',
             'Friday', 'Saturday', 'Sunday']

df_heatmap = pd.pivot_table(h_d_g, values='type', index='day_of_week',
                            columns='hour', fill_value=0) \
    .reindex(day_order, fill_value=0)

In [None]:
# Rows are days of the week, columns are hours; colour encodes the transaction count.
sns.heatmap(df_heatmap)

The heatmap shows that deposit activity is concentrated in the am and diminishes significantly in the pm. 
This is consistent with patrons replenishing their accounts before gaming.

Consider developing the data for a heatmap of L2W to see if that shows similar patterns

We can look at patron daily activity but there are likely gaps that will prevent plotting this.
Confirm on one patron and fill in gaps as necessary

In [None]:
# Check which columns are available before aggregating.
df_working.columns

In [None]:
# Aggregate to one row per user per calendar day: total amount and number
# of transactions.
# The 'date' column holds full timestamps, so grouping on it directly gives
# one group per distinct timestamp rather than per day. The timestamps are
# therefore normalised to midnight first. This also lets the later daily
# reindex line up with real activity rows.
activity_day = df_working['date'].dt.normalize()

daily_activity = (
    df_working.groupby(['user', activity_day])
    .agg({'amount': 'sum', 'type': 'count'})
    .reset_index()
)

daily_activity

In [None]:
# Inspect the first 20 rows for a single user. The user ids are strings
# (the result of the earlier str.replace), so compare against a string such as '4'.

c4 = daily_activity[daily_activity.user == '4'][0:20]
c4

Fill the gaps in the dates

In [None]:
# Expand every user's activity onto a continuous daily calendar running from
# that user's first to last recorded date. Days with no row get amount = 0
# and type (the transaction count) = 0.
# The per-user frames are collected in a list and concatenated once, because
# growing a DataFrame with pd.concat inside the loop copies it every pass.
filled_frames = []

# sort=False keeps users in order of first appearance, as .unique() would.
for customer_id, customer_df in daily_activity.groupby('user', sort=False):
    full_range = pd.date_range(customer_df.date.min(), customer_df.date.max(), freq='D')

    customer_df = (
        customer_df.set_index('date')
        .reindex(full_range, fill_value=0)
        # keep the index named 'date' so reset_index restores the column name
        .rename_axis('date')
    )

    # fill_value=0 also overwrote 'user' on the inserted rows; restore it.
    customer_df['user'] = customer_id

    filled_frames.append(customer_df.reset_index())

# pd.concat raises on an empty list, so fall back to an empty frame.
if filled_frames:
    full_customer_df = pd.concat(filled_frames, ignore_index=True)
else:
    full_customer_df = pd.DataFrame()

In [None]:
# Sanity check: number of null values per column after the gap fill.
print(full_customer_df.isnull().sum())

Try the Sankey plot again. Lots of checks on the way

In [None]:
# Bucket every row by calendar month (monthly Period values).
full_customer_df['month'] = full_customer_df.date.dt.to_period('M')

In [None]:
# One row per (month, user) pair; .iloc[:, :2] keeps only the month and user columns.
customer_month = full_customer_df.groupby(['month', 'user']).count().reset_index().iloc[:, :2]

In [None]:
# Earliest month in which each user appears.
start_ = customer_month.groupby('user').min().reset_index()

In [None]:
# Latest month in which each user appears.
end_ = customer_month.groupby('user').max().reset_index()

In [None]:
# Join first and last month per user; pandas suffixes the shared 'month' column,
# giving month_x (from start_) and month_y (from end_).
start_end_df = pd.merge(start_, end_, on='user')

In [None]:
# Number of users for each (first month, last month) combination; the count lands in 'user'.
source_target_value = start_end_df.groupby(['month_x', 'month_y']).count().reset_index()

In [None]:
# Inspect the (first month, last month, user count) table that feeds the Sankey links.
source_target_value

Use LabelEncoder to set up the Sankey data

In [None]:
# Encode months as consecutive integers for the Sankey node indices.
# The encoder is fitted on the union of start and end months. Fitting on
# month_x alone makes le.transform(month_y) raise for any month that occurs
# only as an end month.
le = LabelEncoder()
all_months = pd.concat([source_target_value.month_x,
                        source_target_value.month_y]).unique()
le.fit(all_months)
le.classes_

In [None]:
# Sankey diagram of users flowing from their first active month (source)
# to their last active month (target); link width is the number of users.
# Node labels are built from the fitted encoder so that label i always names
# the month encoded as i. A hard-coded month list would be wrong whenever
# the encoder does not hold exactly that sequence of months.
# NOTE(review): assumes le.classes_ holds pandas Period objects - confirm.
node_labels = [month.strftime('%b %Y') for month in le.classes_]

fig = go.Figure(data=[go.Sankey(
    node=dict(
        pad=100,
        thickness=10,
        line=dict(color='gray', width=0.5),
        label=node_labels,
        color='orange',
    ),
    link=dict(
        source=le.transform(source_target_value.month_x.tolist()),
        target=le.transform(source_target_value.month_y.tolist()),
        value=source_target_value.user.tolist(),
    ),
)])

fig.update_layout(title_text='Customer Activity: Mar-Feb')

fig.show()

Who in the data exits gaming the quickest?

In [None]:
# Full (first month, last month, user count) table before it is split.
source_target_value

In [None]:
# Split the table: users whose first and last month coincide go into
# fast_quiter_, everyone else stays in source_target_value.
same_month = source_target_value.month_x == source_target_value.month_y

fast_quiter_ = source_target_value[same_month]

source_target_value = source_target_value[~same_month]

In [None]:
# Rows where the first and last month are the same.
fast_quiter_

In [None]:
# Remaining rows, where the first and last month differ.
source_target_value

In [None]:
# Number of customer_month rows (one per user) in each month; the count lands in 'user'.
tc_df = customer_month.groupby('month').count().reset_index()

tc_df

In [None]:
# Per month, the number of users whose first and last month are both that month.
fq_df = fast_quiter_[['month_x', 'user']]

fq_df


In [None]:
# Per-month totals next to the number of single-month ("quit") users.
# A left join keeps every month from tc_df; an inner join would silently
# drop months that have no single-month users. Those months get quit = 0.
q_t = (
    pd.merge(tc_df, fq_df, how='left', left_on='month', right_on='month_x')
    [['month', 'user_x', 'user_y']]
    .rename(columns={'user_x': 'total', 'user_y': 'quit'})
)
q_t['quit'] = q_t['quit'].fillna(0).astype(int)

q_t

In [None]:
# Grouped bars per month: melt() turns the remaining q_t columns into
# (variable, value) pairs so seaborn can colour them by variable.
fig, ax = plt.subplots(figsize=(15, 5))
q_t_long = q_t.melt(id_vars=['month'])
sns.barplot(data=q_t_long, x='month', y='value', hue='variable', ax=ax)