In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from datetime import datetime
from pytz import timezone

In [2]:
# Read data
df_zadatak = pd.read_csv('./data/zadatak.csv', sep=';', header=None, names=['player_id', 'trans_hour', 'product', 'transaction_type', 'amount', 'cnt'])

In [3]:
df_igraci = pd.read_csv('./data/igraci.csv', sep=',', header=None, names=['player_id', 'birth_date', 'city', 'registration_date', 'registration_hour', 'is_opt_out', 'registration_terminal'])

In [4]:
df_zadatak['trans_hour'] = pd.to_datetime(df_zadatak['trans_hour']).dt.tz_localize(None)

In [5]:
def all_transactions_in_last_30_days(df, current_date):
    """Returns a dataset of all transactions in last 30 days"""
    df = df[df['trans_hour'].between(current_date - pd.Timedelta(days=30), current_date)]
    return df

In [6]:
all_transactions_in_last_30_days(df_zadatak, pd.to_datetime('9/9/2022').to_datetime64())['player_id'].nunique()

9161

In [7]:
def active_in_last_30_days(df_zadatak, current_date):
    """Returns a dataset of active users in the last 30 days"""
    # Filter data
    df_zadatak = df_zadatak[df_zadatak['trans_hour'].between(current_date - pd.Timedelta(days=30).to_timedelta64(), current_date)]
    return df_zadatak

def active_in_next_30_days(df_zadatak, df_active_in_last_30_days, current_date):
    """Returns a list of active users in the next 30 days"""
    # Filter data
    df_zadatak = df_zadatak[df_zadatak['transaction_type'].isin(['Deposit', 'TicketPayin'])]
    df_zadatak = df_zadatak[df_zadatak['trans_hour'].between(current_date, current_date + pd.Timedelta(days=30).to_timedelta64())]
    df_active_in_next_30_days = df_zadatak[df_zadatak['player_id'].isin(df_active_in_last_30_days['player_id'])]
    return df_active_in_next_30_days

def inactive_in_next_30_days(df_active_in_last_30_days, df_active_in_next_30_days, current_date):
    """Returns a list of inactive users in the last 30 days"""
    # Filter data
    df_inactive_in_next_30_days = df_active_in_last_30_days[~df_active_in_last_30_days['player_id'].isin(df_active_in_next_30_days['player_id'])]
    return df_inactive_in_next_30_days

In [8]:
current_date = pd.to_datetime('9/9/2022').to_datetime64()
current_date

numpy.datetime64('2022-09-09T00:00:00.000000000')

In [9]:
def df_players_with_churning(df_igraci, df_active_before, df_inactive_next):
    """Returns a dataset of players with churning"""
    # Merge data
    
    df_igraci = df_igraci[df_igraci['player_id'].isin(df_active_before['player_id'])]
    df_igraci['churned'] = df_igraci['player_id'].isin(df_inactive_next['player_id'])
    return df_igraci

In [10]:
# def player_total_payout_per_product(df_zadatak, current_date: pd.Timestamp):
#     """Returns a dataset of product total payout"""
#     df_zadatak
#     df_zadatak = df_zadatak[df_zadatak['transaction_type'] == 'TicketWin']
#     df_zadatak = df_zadatak.groupby(['player_id', 'product']).sum()
#     return df_zadatak

# def player_total_payin_per_product(df_zadatak, current_date):
#     """Returns a dataset of product total payin"""
#     df_zadatak = df_zadatak[df_zadatak['transaction_type'] == 'TicketPayin']
#     df_zadatak = df_zadatak.groupby(['player_id', 'product']).sum()
#     return df_zadatak

In [11]:
def all_transactions_in_last_30_days(df, current_date):
    """Returns a dataset of all transactions in last 30 days"""
    df = df[df['trans_hour'].between(current_date - pd.Timedelta(days=30), current_date)]
    return df

In [12]:
def total_win_for_player(df, current_date):
    """Returns a dataset of total profit for each player"""
    df = all_transactions_in_last_30_days(df, current_date)
    df = df[df['transaction_type'] == 'TicketWin']
    df = df.groupby(['player_id']).sum()
    df.rename(columns={'amount': 'total_win'}, inplace=True)
    df.rename(columns={'cnt': 'cnt_win'}, inplace=True)
    return df

def total_payin_for_player(df, current_date):
    """Returns a dataset of total payin for each player"""
    df = all_transactions_in_last_30_days(df, current_date)
    df = df[df['transaction_type'] == 'TicketPayin']
    df = df.groupby(['player_id']).sum()
    df.rename(columns={'amount': 'total_payin'}, inplace=True)
    df.rename(columns={'cnt': 'cnt_payin'}, inplace=True)
    return df

In [13]:
def total_profit_for_player(df, current_date):
    """Returns a dataset of total profit for each player"""
    df = pd.merge(total_win_for_player(df, current_date), total_payin_for_player(df, current_date), how='right', on='player_id')

    # fill all NAn values with 0
    df.fillna(0, inplace=True)

    # calculate total profit
    df['total_profit'] = df['total_win'] - df['total_payin']
    return df['total_profit'].to_frame()

In [14]:
def total_deposit_for_player(df, current_date):
    """Returns a dataset of total deposit for each player"""
    df = all_transactions_in_last_30_days(df, current_date)
    df = df[df['transaction_type'] == 'Deposit']
    df = df.groupby(['player_id']).sum()
    df.rename(columns={'amount': 'total_deposit'}, inplace=True)
    df.rename(columns={'cnt': 'cnt_deposit'}, inplace=True)

    #fill all NAn values with 0
    df.fillna(0, inplace=True)
    return df

def total_withdrawal_for_player(df, current_date):
    """Returns a dataset of total withdrawal for each player"""
    df = all_transactions_in_last_30_days(df, current_date)
    df = df[df['transaction_type'] == 'Withdrawal']
    df = df.groupby(['player_id']).sum()
    df.rename(columns={'amount': 'total_withdrawal'}, inplace=True)
    df.rename(columns={'cnt': 'cnt_withdrawal'}, inplace=True)

    #fill all NAn values with 0
    df.fillna(0, inplace=True)
    return df

def total_bonus_for_player(df, current_date):
    """Returns a dataset of total bonus for each player"""
    df = all_transactions_in_last_30_days(df, current_date)
    df = df[df['transaction_type'] == 'Bonus']
    df = df.groupby(['player_id']).sum()
    df.rename(columns={'amount': 'total_bonus'}, inplace=True)
    df.rename(columns={'cnt': 'cnt_bonus'}, inplace=True)

    #fill all NAn values with 0
    df.fillna(0, inplace=True)
    
    return df

def total_deposit_minus_withdrawal_for_player(df, current_date):
    """Returns a dataset of total account for each player"""
    df = pd.merge(total_deposit_for_player(df, current_date), total_withdrawal_for_player(df, current_date), how='left', on='player_id')
    
    # fill all NAn values with 0
    df.fillna(0, inplace=True)
    return df

In [15]:
# total_deposit_minus_withdrawal_for_player(df_zadatak, current_date)

def total_account_without_profit_for_player(df, current_date):
    """Returns a dataset of total account for each player"""
    df = pd.merge(total_deposit_minus_withdrawal_for_player(df, current_date), total_bonus_for_player(df, current_date), how='left', on='player_id')
    
    # fill all NAn values with 0
    df.fillna(0, inplace=True)

    # calculate total account
    df['total_account'] = df['total_deposit'] - df['total_withdrawal'] + df['total_bonus']
    return df['total_account'].to_frame()

In [16]:
def total_account_for_player(df, current_date):
    """Returns a dataset of total account for each player"""
    df = pd.merge(total_account_without_profit_for_player(df, current_date), total_profit_for_player(df, current_date), how='right', on='player_id')
    
    # fill all NAn values with 0
    df.fillna(0, inplace=True)

    # calculate total account
    df['total_account'] = df['total_account'] + df['total_profit']

    # round total account to 2 decimal places
    df['total_account'] = df['total_account'].round(2)
    return df['total_account'].to_frame()

In [17]:
def total_days_played_for_player_in_last_30(df, current_date):
    """Returns a dataset of total days played for each player"""
    df = all_transactions_in_last_30_days(df, current_date)
    df = df[df['transaction_type'] == 'TicketPayin']
    df['trans_hour'] = df['trans_hour'].dt.date
    df = df.groupby(['player_id', 'trans_hour']).count()
    df = df.groupby(['player_id']).count()
    df.rename(columns={'product': 'total_days_played'}, inplace=True)

    #fill all NAn values with 0
    df.fillna(0, inplace=True)
    
    return df

# total_days_played_for_player_in_last_30(df_zadatak, current_date).head(50)

In [18]:
def days_since_last_win_for_player(df, current_date):
    """Returns a dataset of days since last win for each player"""
    df = all_transactions_in_last_30_days(df, current_date)
    df = df[df['transaction_type'] == 'TicketWin']
    df = df.groupby(['player_id'])
    df = df['trans_hour'].max()
    df = df.to_frame()
    df['trans_hour'] = df['trans_hour'].apply(lambda x: current_date - x)
    df['trans_hour'] = df['trans_hour'].apply(lambda x: x.days)
    # rename trans hour to days since last win
    df.rename(columns={'trans_hour': 'days_since_last_win'}, inplace=True)

    #fill all NAn values with 31
    df.fillna(0, inplace=True)
    return df


In [19]:
def win_loss_ratio(df, current_date):
    """Returns a dataset of win loss ratio for each player"""
    df = pd.merge(total_win_for_player(df, current_date), total_payin_for_player(df, current_date), how='right', on='player_id')
    
    # fill all NAn values with 0
    df.fillna(0, inplace=True)

    # calculate win loss ratio
    df['win_loss_ratio'] = df['cnt_win'] / df['cnt_payin']
    return df

In [20]:
def over_five_deposits(df, current_date):
    """Returns a dataset of players who did deposit in last 30 days"""
    df = all_transactions_in_last_30_days(df, current_date)
    df = df[df['transaction_type'] == 'Deposit']
    df = df.groupby(['player_id']).count()
    df['over_five_deposits'] = df['cnt'] > 5
    df = df['over_five_deposits'].to_frame()
    df['over_five_deposits'] = df['over_five_deposits'].apply(lambda x: 1 if x else 0)
    return df

In [21]:
current_date = pd.to_datetime('9/9/2022').to_datetime64()

In [22]:
def over_thirty_years_old(df, current_date):
    """Returns a dataset of players who are over 30 years old"""

    df['birth_date'] = pd.to_datetime(df['birth_date'])

    df = df.groupby(['player_id']).max()
    df['age'] = df['birth_date'].apply(lambda x: current_date - x)
    df['age'] = df['age'].apply(lambda x: x.days / 365)
    df['over_thirty_years_old'] = df['age'] > 30
    df = df['over_thirty_years_old'].to_frame()
    df['over_thirty_years_old'] = df['over_thirty_years_old'].apply(lambda x: 1 if x else 0)
    return df

In [23]:
over_thirty_years_old(df_igraci, current_date)

Unnamed: 0_level_0,over_thirty_years_old
player_id,Unnamed: 1_level_1
1,1
2,0
3,1
4,1
5,1
...,...
35497,0
35498,0
35499,1
35500,0


In [24]:
def registered_for_over_a_year(df, current_date):
    """Returns a dataset of players who registered for over a year"""
    df = df.groupby(['player_id']).max()
    df['registration_date'] = pd.to_datetime(df['registration_date'])
    df['registered_for_over_a_year'] = df['registration_date'].apply(lambda x: current_date - x)
    df['registered_for_over_a_year'] = df['registered_for_over_a_year'].apply(lambda x: x.days / 365)
    df['registered_for_over_a_year'] = df['registered_for_over_a_year'] > 1
    df = df['registered_for_over_a_year'].to_frame()
    df['registered_for_over_a_year'] = df['registered_for_over_a_year'].apply(lambda x: 1 if x else 0)
    return df

In [25]:
def registered_for_over_a_month(df, current_date):
    """Returns a dataset of players who registered for over a month"""
    df = df.groupby(['player_id']).max()
    df['registration_date'] = pd.to_datetime(df['registration_date'])
    df['registered_for_over_a_month'] = df['registration_date'].apply(lambda x: current_date - x)
    df['registered_for_over_a_month'] = df['registered_for_over_a_month'].apply(lambda x: x.days / 30)
    df['registered_for_over_a_month'] = df['registered_for_over_a_month'] > 1
    df = df['registered_for_over_a_month'].to_frame()
    df['registered_for_over_a_month'] = df['registered_for_over_a_month'].apply(lambda x: 1 if x else 0)
    return df

In [26]:
# how many different types of games did player play in last 30 days
def different_games_played_in_last_30_days(df, current_date):
    """Returns a dataset of how many different types of games did player play in last 30 days"""
    df = all_transactions_in_last_30_days(df, current_date)
    df = df[df['transaction_type'] == 'TicketPayin']
    df = df.groupby(['player_id', 'product']).count()
    df = df.groupby(['player_id']).count()
    df.rename(columns={'cnt': 'different_games_played_in_last_30_days'}, inplace=True)
    return df


In [27]:
def reg_on_mobile_web(df, current_date):
    """Returns a dataset of players who registered on mobile web"""
    df = df.groupby(['player_id']).max()
    df['registration_terminal'] = df['registration_terminal'].apply(lambda x: 1 if x == ('MobileWeb' or 'Mobile Web') else 0)
    df.rename(columns={'registration_terminal': 'reg_on_mobile_web'}, inplace=True)
    df.drop(['is_opt_out', 'registration_hour', 'registration_date', 'city', 'birth_date'], axis=1, inplace=True)
    return df

In [28]:
reg_on_mobile_web(df_igraci, current_date)

Unnamed: 0_level_0,reg_on_mobile_web
player_id,Unnamed: 1_level_1
1,0
2,1
3,0
4,1
5,0
...,...
35497,0
35498,0
35499,0
35500,0


In [29]:
def dfForAppend(df_zadatak, df_igraci, current_date):
    df_active_before = active_in_last_30_days(df_zadatak, current_date)
    df_active_next = active_in_next_30_days(df_zadatak, df_active_before, current_date)
    df_inactive_next = inactive_in_next_30_days(df_active_before, df_active_next, current_date)
    df = df_players_with_churning(df_igraci, df_active_before, df_inactive_next)
    df = pd.merge(df, total_account_for_player(df_zadatak, current_date), how='left', on='player_id')
    df = pd.merge(df, total_days_played_for_player_in_last_30(df_zadatak, current_date), how='left', on='player_id')
    df = pd.merge(df, days_since_last_win_for_player(df_zadatak, current_date), how='left', on='player_id')
    df = pd.merge(df, total_profit_for_player(df_zadatak, current_date), how='left', on='player_id')
    df = pd.merge(df, win_loss_ratio(df_zadatak, current_date), how='left', on='player_id')
    df = pd.merge(df, over_five_deposits(df_zadatak, current_date), how='left', on='player_id')
    df = pd.merge(df, different_games_played_in_last_30_days(df_zadatak, current_date), how='left', on='player_id')
    df = pd.merge(df, reg_on_mobile_web(df_igraci, current_date), how='left', on='player_id')
    
    # drop coloumns city registration_hour registration terminal product and count
    df.drop(['registration_hour', 'city', 'cnt', 'total_win', 'cnt_win', 'total_payin', 'cnt_payin', 'transaction_type_x', 'amount_x', 'transaction_type_y', 'amount_y', 'trans_hour', 'registration_terminal'], axis=1, inplace=True)

    # change birth date to age
    df['age'] = df['birth_date'].apply(lambda x: current_date - pd.to_datetime(x))
    df['age'] = df['age'].apply(lambda x: int((x.days / 365)))

    # change registration date to yyyy-mm-dd
    df['registration_date'] = pd.to_datetime(df['registration_date']).dt.year

    # substract 2000 from registration date
    df['registration_date'] = df['registration_date'].apply(lambda x: x - 2014)

    df.drop(['birth_date'], axis=1, inplace=True)
    
    return df

In [30]:
current_date = pd.to_datetime('9/9/2022').to_datetime64()
print(current_date)
df = dfForAppend(df_zadatak, df_igraci, current_date)

current_date = pd.to_datetime('29/11/2022').to_datetime64()
print(current_date)
newdf1 = dfForAppend(df_zadatak, df_igraci, current_date)

current_date = pd.to_datetime('22/8/2022').to_datetime64()
print(current_date)
newdf2 = dfForAppend(df_zadatak, df_igraci, current_date)

current_date = pd.to_datetime('21/7/2022').to_datetime64()
print(current_date)
newdf3 = dfForAppend(df_zadatak, df_igraci, current_date)

current_date = pd.to_datetime('4/10/2022').to_datetime64()
print(current_date)
newdf4 = dfForAppend(df_zadatak, df_igraci, current_date)

current_date = pd.to_datetime('7/8/2022').to_datetime64()
print(current_date)
newdf5 = dfForAppend(df_zadatak, df_igraci, current_date)

current_date = pd.to_datetime('2/2/2022').to_datetime64()
print(current_date)
newdf6 = dfForAppend(df_zadatak, df_igraci, current_date)

current_date = pd.to_datetime('10/9/2021').to_datetime64()
print(current_date)
newdf7 = dfForAppend(df_zadatak, df_igraci, current_date)

current_date = pd.to_datetime('27/4/2021').to_datetime64()
print(current_date)
newdf8 = dfForAppend(df_zadatak, df_igraci, current_date)

current_date = pd.to_datetime('3/3/2021').to_datetime64()
print(current_date)
newdf9 = dfForAppend(df_zadatak, df_igraci, current_date)

current_date = pd.to_datetime('16/6/2022').to_datetime64()
print(current_date)
newdf10 = dfForAppend(df_zadatak, df_igraci, current_date)

current_date = pd.to_datetime('25/11/2021').to_datetime64()
print(current_date)
newdf11 = dfForAppend(df_zadatak, df_igraci, current_date)

current_date = pd.to_datetime('20/11/2021').to_datetime64()
print(current_date)
newdf12 = dfForAppend(df_zadatak, df_igraci, current_date)

current_date = pd.to_datetime('2/12/2021').to_datetime64()
print(current_date)
newdf13 = dfForAppend(df_zadatak, df_igraci, current_date)

current_date = pd.to_datetime('1/11/2021').to_datetime64()
print(current_date)
newdf14 = dfForAppend(df_zadatak, df_igraci, current_date)

current_date = pd.to_datetime('4/4/2022').to_datetime64()
print(current_date)
newdf15 = dfForAppend(df_zadatak, df_igraci, current_date)

current_date = pd.to_datetime('11/5/2022').to_datetime64()
print(current_date)
newdf16 = dfForAppend(df_zadatak, df_igraci, current_date)

current_date = pd.to_datetime('28/9/2021').to_datetime64()
print(current_date)
newdf17 = dfForAppend(df_zadatak, df_igraci, current_date)

current_date = pd.to_datetime('1/7/2022').to_datetime64()
print(current_date)
newdf18 = dfForAppend(df_zadatak, df_igraci, current_date)

current_date = pd.to_datetime('1/5/2022').to_datetime64()
print(current_date)
newdf19 = dfForAppend(df_zadatak, df_igraci, current_date)

2022-09-09T00:00:00.000000000


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df_igraci['churned'] = df_igraci['player_id'].isin(df_inactive_next['player_id'])
  df = df.groupby(['player_id']).sum()
  df = df.groupby(['player_id']).sum()
  df = df.groupby(['player_id']).sum()
  df = df.groupby(['player_id']).sum()
  df = df.groupby(['player_id']).sum()
  df = df.groupby(['player_id']).sum()
  df = df.groupby(['player_id']).sum()
  df = df.groupby(['player_id']).sum()
  df = df.groupby(['player_id']).sum()
  current_date = pd.to_datetime('29/11/2022').to_datetime64()


2022-11-29T00:00:00.000000000


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df_igraci['churned'] = df_igraci['player_id'].isin(df_inactive_next['player_id'])
  df = df.groupby(['player_id']).sum()
  df = df.groupby(['player_id']).sum()
  df = df.groupby(['player_id']).sum()
  df = df.groupby(['player_id']).sum()
  df = df.groupby(['player_id']).sum()
  df = df.groupby(['player_id']).sum()
  df = df.groupby(['player_id']).sum()
  df = df.groupby(['player_id']).sum()
  df = df.groupby(['player_id']).sum()
  current_date = pd.to_datetime('22/8/2022').to_datetime64()


2022-08-22T00:00:00.000000000


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df_igraci['churned'] = df_igraci['player_id'].isin(df_inactive_next['player_id'])
  df = df.groupby(['player_id']).sum()
  df = df.groupby(['player_id']).sum()
  df = df.groupby(['player_id']).sum()
  df = df.groupby(['player_id']).sum()
  df = df.groupby(['player_id']).sum()
  df = df.groupby(['player_id']).sum()
  df = df.groupby(['player_id']).sum()
  df = df.groupby(['player_id']).sum()
  df = df.groupby(['player_id']).sum()
  current_date = pd.to_datetime('21/7/2022').to_datetime64()


2022-07-21T00:00:00.000000000


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df_igraci['churned'] = df_igraci['player_id'].isin(df_inactive_next['player_id'])
  df = df.groupby(['player_id']).sum()
  df = df.groupby(['player_id']).sum()
  df = df.groupby(['player_id']).sum()
  df = df.groupby(['player_id']).sum()
  df = df.groupby(['player_id']).sum()
  df = df.groupby(['player_id']).sum()
  df = df.groupby(['player_id']).sum()
  df = df.groupby(['player_id']).sum()
  df = df.groupby(['player_id']).sum()


2022-04-10T00:00:00.000000000


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df_igraci['churned'] = df_igraci['player_id'].isin(df_inactive_next['player_id'])
  df = df.groupby(['player_id']).sum()
  df = df.groupby(['player_id']).sum()
  df = df.groupby(['player_id']).sum()
  df = df.groupby(['player_id']).sum()
  df = df.groupby(['player_id']).sum()
  df = df.groupby(['player_id']).sum()
  df = df.groupby(['player_id']).sum()
  df = df.groupby(['player_id']).sum()
  df = df.groupby(['player_id']).sum()


2022-07-08T00:00:00.000000000


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df_igraci['churned'] = df_igraci['player_id'].isin(df_inactive_next['player_id'])
  df = df.groupby(['player_id']).sum()
  df = df.groupby(['player_id']).sum()
  df = df.groupby(['player_id']).sum()
  df = df.groupby(['player_id']).sum()
  df = df.groupby(['player_id']).sum()
  df = df.groupby(['player_id']).sum()
  df = df.groupby(['player_id']).sum()
  df = df.groupby(['player_id']).sum()
  df = df.groupby(['player_id']).sum()


2022-02-02T00:00:00.000000000


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df_igraci['churned'] = df_igraci['player_id'].isin(df_inactive_next['player_id'])
  df = df.groupby(['player_id']).sum()
  df = df.groupby(['player_id']).sum()
  df = df.groupby(['player_id']).sum()
  df = df.groupby(['player_id']).sum()
  df = df.groupby(['player_id']).sum()
  df = df.groupby(['player_id']).sum()
  df = df.groupby(['player_id']).sum()
  df = df.groupby(['player_id']).sum()
  df = df.groupby(['player_id']).sum()


2021-10-09T00:00:00.000000000


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df_igraci['churned'] = df_igraci['player_id'].isin(df_inactive_next['player_id'])
  df = df.groupby(['player_id']).sum()
  df = df.groupby(['player_id']).sum()
  df = df.groupby(['player_id']).sum()
  df = df.groupby(['player_id']).sum()
  df = df.groupby(['player_id']).sum()
  df = df.groupby(['player_id']).sum()
  df = df.groupby(['player_id']).sum()
  df = df.groupby(['player_id']).sum()
  df = df.groupby(['player_id']).sum()
  current_date = pd.to_datetime('27/4/2021').to_datetime64()


2021-04-27T00:00:00.000000000


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df_igraci['churned'] = df_igraci['player_id'].isin(df_inactive_next['player_id'])
  df = df.groupby(['player_id']).sum()
  df = df.groupby(['player_id']).sum()
  df = df.groupby(['player_id']).sum()
  df = df.groupby(['player_id']).sum()
  df = df.groupby(['player_id']).sum()
  df = df.groupby(['player_id']).sum()
  df = df.groupby(['player_id']).sum()
  df = df.groupby(['player_id']).sum()
  df = df.groupby(['player_id']).sum()


2021-03-03T00:00:00.000000000


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df_igraci['churned'] = df_igraci['player_id'].isin(df_inactive_next['player_id'])
  df = df.groupby(['player_id']).sum()
  df = df.groupby(['player_id']).sum()
  df = df.groupby(['player_id']).sum()
  df = df.groupby(['player_id']).sum()
  df = df.groupby(['player_id']).sum()
  df = df.groupby(['player_id']).sum()
  df = df.groupby(['player_id']).sum()
  df = df.groupby(['player_id']).sum()
  df = df.groupby(['player_id']).sum()
  current_date = pd.to_datetime('16/6/2022').to_datetime64()


2022-06-16T00:00:00.000000000


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df_igraci['churned'] = df_igraci['player_id'].isin(df_inactive_next['player_id'])
  df = df.groupby(['player_id']).sum()
  df = df.groupby(['player_id']).sum()
  df = df.groupby(['player_id']).sum()
  df = df.groupby(['player_id']).sum()
  df = df.groupby(['player_id']).sum()
  df = df.groupby(['player_id']).sum()
  df = df.groupby(['player_id']).sum()
  df = df.groupby(['player_id']).sum()
  df = df.groupby(['player_id']).sum()
  current_date = pd.to_datetime('25/11/2021').to_datetime64()


2021-11-25T00:00:00.000000000


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df_igraci['churned'] = df_igraci['player_id'].isin(df_inactive_next['player_id'])
  df = df.groupby(['player_id']).sum()
  df = df.groupby(['player_id']).sum()
  df = df.groupby(['player_id']).sum()
  df = df.groupby(['player_id']).sum()
  df = df.groupby(['player_id']).sum()
  df = df.groupby(['player_id']).sum()
  df = df.groupby(['player_id']).sum()
  df = df.groupby(['player_id']).sum()
  df = df.groupby(['player_id']).sum()
  current_date = pd.to_datetime('20/11/2021').to_datetime64()


2021-11-20T00:00:00.000000000


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df_igraci['churned'] = df_igraci['player_id'].isin(df_inactive_next['player_id'])
  df = df.groupby(['player_id']).sum()
  df = df.groupby(['player_id']).sum()
  df = df.groupby(['player_id']).sum()
  df = df.groupby(['player_id']).sum()
  df = df.groupby(['player_id']).sum()
  df = df.groupby(['player_id']).sum()
  df = df.groupby(['player_id']).sum()
  df = df.groupby(['player_id']).sum()
  df = df.groupby(['player_id']).sum()


2021-02-12T00:00:00.000000000


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df_igraci['churned'] = df_igraci['player_id'].isin(df_inactive_next['player_id'])
  df = df.groupby(['player_id']).sum()
  df = df.groupby(['player_id']).sum()
  df = df.groupby(['player_id']).sum()
  df = df.groupby(['player_id']).sum()
  df = df.groupby(['player_id']).sum()
  df = df.groupby(['player_id']).sum()
  df = df.groupby(['player_id']).sum()
  df = df.groupby(['player_id']).sum()
  df = df.groupby(['player_id']).sum()


2021-01-11T00:00:00.000000000


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df_igraci['churned'] = df_igraci['player_id'].isin(df_inactive_next['player_id'])
  df = df.groupby(['player_id']).sum()
  df = df.groupby(['player_id']).sum()
  df = df.groupby(['player_id']).sum()
  df = df.groupby(['player_id']).sum()
  df = df.groupby(['player_id']).sum()
  df = df.groupby(['player_id']).sum()
  df = df.groupby(['player_id']).sum()
  df = df.groupby(['player_id']).sum()
  df = df.groupby(['player_id']).sum()


2022-04-04T00:00:00.000000000


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df_igraci['churned'] = df_igraci['player_id'].isin(df_inactive_next['player_id'])
  df = df.groupby(['player_id']).sum()
  df = df.groupby(['player_id']).sum()
  df = df.groupby(['player_id']).sum()
  df = df.groupby(['player_id']).sum()
  df = df.groupby(['player_id']).sum()
  df = df.groupby(['player_id']).sum()
  df = df.groupby(['player_id']).sum()
  df = df.groupby(['player_id']).sum()
  df = df.groupby(['player_id']).sum()


2022-11-05T00:00:00.000000000


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df_igraci['churned'] = df_igraci['player_id'].isin(df_inactive_next['player_id'])
  df = df.groupby(['player_id']).sum()
  df = df.groupby(['player_id']).sum()
  df = df.groupby(['player_id']).sum()
  df = df.groupby(['player_id']).sum()
  df = df.groupby(['player_id']).sum()
  df = df.groupby(['player_id']).sum()
  df = df.groupby(['player_id']).sum()
  df = df.groupby(['player_id']).sum()
  df = df.groupby(['player_id']).sum()
  current_date = pd.to_datetime('28/9/2021').to_datetime64()


2021-09-28T00:00:00.000000000


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df_igraci['churned'] = df_igraci['player_id'].isin(df_inactive_next['player_id'])
  df = df.groupby(['player_id']).sum()
  df = df.groupby(['player_id']).sum()
  df = df.groupby(['player_id']).sum()
  df = df.groupby(['player_id']).sum()
  df = df.groupby(['player_id']).sum()
  df = df.groupby(['player_id']).sum()
  df = df.groupby(['player_id']).sum()
  df = df.groupby(['player_id']).sum()
  df = df.groupby(['player_id']).sum()


2022-01-07T00:00:00.000000000


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df_igraci['churned'] = df_igraci['player_id'].isin(df_inactive_next['player_id'])
  df = df.groupby(['player_id']).sum()
  df = df.groupby(['player_id']).sum()
  df = df.groupby(['player_id']).sum()
  df = df.groupby(['player_id']).sum()
  df = df.groupby(['player_id']).sum()
  df = df.groupby(['player_id']).sum()
  df = df.groupby(['player_id']).sum()
  df = df.groupby(['player_id']).sum()
  df = df.groupby(['player_id']).sum()


2022-01-05T00:00:00.000000000


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df_igraci['churned'] = df_igraci['player_id'].isin(df_inactive_next['player_id'])
  df = df.groupby(['player_id']).sum()
  df = df.groupby(['player_id']).sum()
  df = df.groupby(['player_id']).sum()
  df = df.groupby(['player_id']).sum()
  df = df.groupby(['player_id']).sum()
  df = df.groupby(['player_id']).sum()
  df = df.groupby(['player_id']).sum()
  df = df.groupby(['player_id']).sum()
  df = df.groupby(['player_id']).sum()


In [31]:
frames = [df, newdf1, newdf2, newdf3, newdf4, newdf5, newdf6, newdf7, newdf8, newdf9, newdf10, newdf11, newdf12, newdf13, newdf14, newdf15, newdf16, newdf17, newdf18, newdf19]
df = pd.concat(frames)

df['days_since_last_win'] = df['days_since_last_win'].fillna(31)
df['over_five_deposits'] = df['over_five_deposits'].fillna(0)

df = df[df['win_loss_ratio'] <= 1]

df.to_csv('./export/big_training_set_v10.csv', index=False)

In [32]:
df['churned'].value_counts()

False    143559
True      26835
Name: churned, dtype: int64

In [33]:
df

Unnamed: 0,player_id,registration_date,is_opt_out,churned,total_account,total_days_played,days_since_last_win,total_profit,win_loss_ratio,over_five_deposits,different_games_played_in_last_30_days,reg_on_mobile_web,age
0,1,0,False,False,-0.13,14.0,0.0,-194.66,0.604892,1.0,2.0,0,34
1,3,0,False,False,-0.74,2.0,31.0,-9.79,0.000000,0.0,1.0,0,46
2,4,0,False,False,-0.70,28.0,0.0,-139.65,0.905895,1.0,3.0,1,51
3,5,0,False,False,-0.30,7.0,2.0,-197.09,0.992151,1.0,2.0,0,41
4,7,0,False,False,0.84,5.0,19.0,-5.62,0.303797,0.0,1.0,0,27
...,...,...,...,...,...,...,...,...,...,...,...,...,...
9151,26457,8,True,False,0.11,1.0,0.0,-11.20,0.258333,0.0,1.0,0,42
9152,26458,8,False,False,-0.27,1.0,0.0,-2.53,0.315789,0.0,1.0,0,37
9153,26460,8,True,True,0.55,1.0,0.0,-1.71,0.181818,0.0,2.0,0,28
9154,26461,8,False,False,0.55,1.0,0.0,-1.71,0.375000,0.0,1.0,0,36
