# Exploring Project

#### Library imports

In [1]:
# import configs
from configure import conf
# import libraries
import pyodbc
import pandas as pd
import numpy as np

import seaborn as sns
import matplotlib.pyplot as plt

from IPython.core.display_functions import display

#### Connection to the server

In [2]:
# Open a pyodbc connection to the SQL database; the connection string is read
# from the 'db' entry of the project configuration.
print('*** Connecting to SQL Server... ***')
db_connection_string = str(conf.configs['db'])
conn = pyodbc.connect(db_connection_string)
print('*** Connection Established. ***')

*** Connecting to SQL Server... ***
*** Connection Established. ***


#### Query the database

In [3]:
# SQL query that will be sent; the response is the table of interest.
# Selects BetID, ClientID, BetDateTime and TotalStake from dbo.Bet together with
# the CategoryClass reached through BetDetail -> Event -> LEventClass ->
# LMasterEventClass -> LCategoryClass.
#
# Predicates applied in the WHERE clause:
#   - c.IsTest=0 and c.IsBookmaker=0       (client flags)
#   - verify.IsVerified=1                  (client verification flag)
#   - b.FreeBetID=0                        (presumably excludes free bets -- TODO confirm)
#   - Event.EventTypeID=1                  (meaning of type 1 not shown here -- TODO confirm)
#   - CategoryClass LIKE '%NSW%'           (category name contains 'NSW')
#
# NOTE(review): the WHERE clause filters on columns of left-joined tables
# (Client, LVerification, Event, LCategoryClass). Rows without a match carry NULLs
# that fail those predicates, so these LEFT JOINs behave like inner joins.
# NOTE(review): the join to dbo.BetDetail presumably returns one row per bet leg,
# so a single BetID may appear several times in the result -- verify; the
# cleaning cell downstream drops duplicates.
query_str = '''
select b.BetID,
b.ClientID,
b.BetDateTime,
b.TotalStake,
LCategoryClass.CategoryClass

from dbo.Bet as b

left join dbo.Client as c on b.ClientID=c.ClientID
left join dbo.LVerification as verify on c.VerificationID=verify.VerificationID

left join dbo.BetDetail on dbo.BetDetail.BetID=b.BetID
left join dbo.[Event] on dbo.BetDetail.EventID=dbo.[Event].EventID

left join dbo.LEventClass on dbo.[Event].EventClassID=dbo.LEventClass.EventClassID
left join dbo.LMasterEventClass on dbo.LEventClass.MasterEventClassID=dbo.LMasterEventClass.MasterEventClassID
left join dbo.LCategoryClass on dbo.LMasterEventClass.CategoryClassID=dbo.LCategoryClass.CategoryClassID

where c.IsTest=0 and c.IsBookmaker=0 and verify.IsVerified=1 and b.FreeBetID=0 and dbo.[Event].EventTypeID=1 and dbo.LCategoryClass.CategoryClass LIKE '%NSW%';
'''

In [4]:
# Run the query over the open connection and keep the response as a pandas
# DataFrame (df_raw is the untouched starting point for the cells below).
sql_query = pd.read_sql_query(sql=query_str, con=conn)
df_raw = pd.DataFrame(data=sql_query)

# Cap how many columns/rows the notebook renders before showing the frame.
pd.options.display.max_columns = 8
pd.options.display.max_rows = 30
display(df_raw)



Unnamed: 0,BetID,ClientID,BetDateTime,TotalStake,CategoryClass
0,6266820,6131,2020-01-08 03:29:14.480,300.0,NSW Metro Thoroughbred
1,6310324,6131,2020-01-14 04:34:36.667,100.0,NSW Country Thoroughbred
2,6315404,6131,2020-01-15 06:47:22.810,50.0,NSW Metro Thoroughbred
3,6411986,6131,2020-01-29 04:26:32.320,50.0,NSW Country Thoroughbred
4,6525967,6131,2020-02-15 01:15:06.303,50.0,NSW Metro Thoroughbred
...,...,...,...,...,...
1415848,6106041,59123,2019-12-14 04:10:47.190,20.0,NSW Metro Thoroughbred
1415849,5584880,59189,2019-10-12 07:03:46.147,5.0,NSW Metro Thoroughbred
1415850,5427431,59191,2019-09-23 06:14:33.960,20.0,NSW Country Thoroughbred
1415851,5435921,59264,2019-09-25 05:43:34.543,25.0,NSW Metro Thoroughbred


In [5]:
# Preview the first 100 rows of the raw query result
df_raw.head(n=100)

Unnamed: 0,BetID,ClientID,BetDateTime,TotalStake,CategoryClass
0,6266820,6131,2020-01-08 03:29:14.480,300.0,NSW Metro Thoroughbred
1,6310324,6131,2020-01-14 04:34:36.667,100.0,NSW Country Thoroughbred
2,6315404,6131,2020-01-15 06:47:22.810,50.0,NSW Metro Thoroughbred
3,6411986,6131,2020-01-29 04:26:32.320,50.0,NSW Country Thoroughbred
4,6525967,6131,2020-02-15 01:15:06.303,50.0,NSW Metro Thoroughbred
...,...,...,...,...,...
95,7873272,6178,2020-05-25 03:02:40.070,100.0,NSW Country Thoroughbred
96,7962273,6178,2020-05-30 05:09:54.673,50.0,NSW Metro Thoroughbred
97,8070142,6178,2020-06-06 03:09:22.190,50.0,NSW Metro Thoroughbred
98,8072057,6178,2020-06-06 03:44:08.173,70.0,NSW Metro Thoroughbred


In [6]:
# Work on a copy so the raw query result stays untouched
df = df_raw.copy()

# Add the calendar year of each bet as "BetYear", placed at column position 3
# (directly after BetDateTime)
df.insert(
    loc=3,
    column="BetYear",
    value=df["BetDateTime"].dt.year,
    allow_duplicates=True,
)

# Keep one row per (ClientID, BetDateTime, BetID, TotalStake) combination;
# repeats are suspected to come from multi's / the left joins in the query
df = df.drop_duplicates(subset=["ClientID", "BetDateTime", "BetID", "TotalStake"])

# Discard rows whose stake is zero or negative
df = df.loc[df["TotalStake"] > 0]

display(df)

Unnamed: 0,BetID,ClientID,BetDateTime,BetYear,TotalStake,CategoryClass
0,6266820,6131,2020-01-08 03:29:14.480,2020,300.0,NSW Metro Thoroughbred
1,6310324,6131,2020-01-14 04:34:36.667,2020,100.0,NSW Country Thoroughbred
2,6315404,6131,2020-01-15 06:47:22.810,2020,50.0,NSW Metro Thoroughbred
3,6411986,6131,2020-01-29 04:26:32.320,2020,50.0,NSW Country Thoroughbred
4,6525967,6131,2020-02-15 01:15:06.303,2020,50.0,NSW Metro Thoroughbred
...,...,...,...,...,...,...
1415848,6106041,59123,2019-12-14 04:10:47.190,2019,20.0,NSW Metro Thoroughbred
1415849,5584880,59189,2019-10-12 07:03:46.147,2019,5.0,NSW Metro Thoroughbred
1415850,5427431,59191,2019-09-23 06:14:33.960,2019,20.0,NSW Country Thoroughbred
1415851,5435921,59264,2019-09-25 05:43:34.543,2019,25.0,NSW Metro Thoroughbred


In [7]:
# Determine the TotalStake over 2019, then show the total staked for every year.

# Take only bets placed in 2019. Boolean-mask indexing already returns a new
# frame, so no explicit .copy() of the full frame is needed first.
df_2019 = df[df["BetYear"] == 2019]

# Sum up the total stake amounts
totalStaked = df_2019["TotalStake"].sum()
print("NSW Thoroughbred (Racing) Turnover (2019): $", round(totalStaked, 2))
# NOTE(review): 0.25 is a hard-coded 25% estimate factor; what it estimates is
# not documented in this notebook -- confirm and name it in a config cell.
print("Estimated (25%): $", round(totalStaked*0.25, 2))


# Total staked by year. groupby/sum does not modify df, so it is applied to df
# directly instead of to a throwaway copy.
df_temp = df.groupby(["BetYear"])["TotalStake"].sum()
display(df_temp)

NSW Thoroughbred (Racing) Turnover (2019): $ 25552310.12
Estimated (25%): $ 6388077.53


BetYear
2016    6.883862e+06
2017    3.507139e+07
2018    2.486363e+07
2019    2.555231e+07
2020    1.462236e+07
Name: TotalStake, dtype: float64

## Explore the database

In [19]:
# Pull up to 100 rows from the CLIENT table (archive partitions 0 and 1) so its
# columns and dtypes can be inspected.
query = '''
SELECT TOP(100) *
FROM CLIENT AS C WITH (NOLOCK)
WHERE C.ARCHIVEPARTITION IN (0,1)
OPTION (MAXDOP 1)

'''

# NOTE(review): X holds the same statement as `query` (minus one trailing blank
# line) and is not used in this cell -- candidate for removal if no later cell
# relies on it.
X = '''
SELECT TOP(100) *
FROM CLIENT AS C WITH (NOLOCK)
WHERE C.ARCHIVEPARTITION IN (0,1)
OPTION (MAXDOP 1)
'''

# Send the SQL query, and record the response as a pandas DataFrame.
test_sql_query = pd.read_sql_query(query, conn)
df_raw_test = pd.DataFrame(test_sql_query)

# Lift the display limits set earlier so nothing is truncated from here on.
pd.set_option('display.max_columns', None)
pd.set_option('display.max_rows', None)

# List every column with its dtype and non-null count.
# `show_counts` replaces the deprecated `null_counts` keyword, which emitted a
# FutureWarning and is no longer accepted by pandas 2.x.
df_raw_test.info(verbose=True, show_counts=True)

  df_raw_test.info(True,null_counts=True)


<class 'pandas.core.frame.DataFrame'>
RangeIndex: 100 entries, 0 to 99
Data columns (total 36 columns):
 #   Column                       Non-Null Count  Dtype         
---  ------                       --------------  -----         
 0   ArchivePartition             100 non-null    int64         
 1   ClientID                     100 non-null    int64         
 2   OrgID                        100 non-null    int64         
 3   AccountNumber                100 non-null    object        
 4   Gender                       100 non-null    object        
 5   SignupDate                   100 non-null    datetime64[ns]
 6   SignupChannelID              100 non-null    int64         
 7   IsCreditClient               100 non-null    bool          
 8   ClientProfileID              100 non-null    int64         
 9   BetInterceptRuleID           100 non-null    int64         
 10  IsTest                       100 non-null    bool          
 11  IsBookmaker                  100 non-null    b