# Lootbox addiction predictions

Applying the model trained with the gambling data to predict the risk score of addiction to lootboxes.

This should be the final step in the data processing. We already have the trained random forest classifier model from the gambling data `randomforestclassifier_gambling.pkl`, and the analytic dataset for lootbox purchase data `df_purchases_analytic.pkl` (or its weekly equivalent).

It's time to make the predictions on the lootbox dataset. Should be pretty straightforward.

In [153]:
import pandas as pd
import numpy as np
import pickle

from sklearn.model_selection import GridSearchCV 
from sklearn.ensemble import RandomForestClassifier

In [2]:
# Load the random forest classifier trained on the gambling data.
# The model was trained with scikit-learn==1.0.2 (newer versions might not work).
# NOTE: unpickling can execute arbitrary code -- only load model files from a trusted source.
# The context manager guarantees the file handle is closed once the model is loaded.
with open('../gambling_dataset/ML_model/randomforestclassifier_gambling.pkl', 'rb') as model_file:
    rm_classifier = pickle.load(model_file)

# Analytic dataset for the lootbox purchase data; this is what the model will score
df_purchases_analytic = pd.read_pickle('../processed_dataframes/df_purchases_analytic.pkl')

In [3]:
# Feature matrix for the model: every column of the analytic dataframe except the 'user' identifier
X_test = df_purchases_analytic.drop('user', axis=1)

# Per-class probabilities for every row (used later as the confidence score)
probs = rm_classifier.predict_proba(X_test)

# Hard class predictions for the same rows
y_pred = rm_classifier.predict(X_test)

In [6]:
# Second probability column -- presumably the "addicted" class; confirm against rm_classifier.classes_
positive_class_probs = probs[:, 1]

print(positive_class_probs)
print(len(positive_class_probs))  # Total valid unique users

[0.3173612  0.31898341 0.31732872 ... 0.31898341 0.26748408 0.24735735]
291103


In [7]:
# Attach the predicted class and the second-column probability to the main dataframe
df_purchases_analytic = df_purchases_analytic.assign(
    addiction=y_pred,
    confidence_score=probs[:, 1],
)

In [8]:
# Show the users predicted as addicted (addiction == 1), highest confidence score first
is_flagged = df_purchases_analytic['addiction'].eq(1)
display(df_purchases_analytic.loc[is_flagged].sort_values('confidence_score', ascending=False))

Unnamed: 0,user,sum_stakes_fixedodds,sum_bets_fixedodds,bettingdays_fixedodds,duration_fixedodds,frequency_fixedodds,bets_per_day_fixedodds,euros_per_bet_fixedodds,net_loss_fixedodds,percent_lost_fixedodds,addiction,confidence_score
168113,S6***-X8PQ,1881.00,747,37,65,0.569231,20.189189,2.518072,739.27,0.393020,1,0.763758
235508,SM***-ULGN,3592.50,1437,25,64,0.390625,57.480000,2.500000,1346.12,0.374703,1,0.760925
140632,AY***-WMEQ,1622.50,649,17,63,0.269841,38.176471,2.500000,832.21,0.512918,1,0.758449
288329,SZ***-FXHC,6027.50,2411,46,66,0.696970,52.413043,2.500000,3951.39,0.655560,1,0.757595
13507,A4***-YPFN,4407.50,1763,27,65,0.415385,65.296296,2.500000,1429.17,0.324259,1,0.757148
...,...,...,...,...,...,...,...,...,...,...,...,...
172177,S7***-UHWC,208.50,83,6,43,0.139535,13.833333,2.512048,183.41,0.879664,1,0.500660
38923,AA***-LUTJ,194.40,203,9,29,0.310345,22.555556,0.957635,187.06,0.962243,1,0.500518
253864,SR***-UTCN,338.50,135,6,21,0.285714,22.500000,2.507407,189.71,0.560443,1,0.500506
121338,AU***-PGEG,217.50,87,3,7,0.428571,29.000000,2.500000,196.30,0.902529,1,0.500404


In [9]:
# Save the dataframe, now including the 'addiction' and 'confidence_score' columns, to a pickle file.
# NOTE: the last cell of this notebook writes to this same path again after adding 'improving'.
df_purchases_analytic.to_pickle('../processed_dataframes/df_purchases_analytic_predictions.pkl')

## Now let's build a new dataframe with the weekly risk score of each user

In [98]:
# Get the dates for the files in the weekly analytic dataframes
import os
from datetime import datetime

def get_dates(dir_path='../processed_dataframes/df_purchases_analytic_weekly'):
    """Return the sorted snapshot dates found in the weekly analytic dataframe directory.

    Only files ending in '.pkl' whose name starts with a valid 'yyyy-mm-dd' date
    are considered; anything else in the directory is ignored so that stray files
    never produce a bogus date.

    Parameters
    ----------
    dir_path : str, optional
        Directory containing the weekly pickles. Defaults to the project's
        weekly analytic dataframe directory.

    Returns
    -------
    list of str
        Date strings ('yyyy-mm-dd') in ascending order, one per matching file.

    Raises
    ------
    FileNotFoundError
        If ``dir_path`` does not exist (raised by ``os.listdir``).
    """
    date_list = []
    for file_name in os.listdir(dir_path):
        if not file_name.endswith('.pkl'):
            continue

        date_str = file_name[:10]  # extract first 10 characters (yyyy-mm-dd)
        try:
            # Validation only: the string itself is what gets stored
            datetime.strptime(date_str, '%Y-%m-%d')
        except ValueError:
            continue  # not a dated snapshot file
        date_list.append(date_str)

    # ISO dates sort chronologically when sorted as plain strings
    date_list.sort()
    return date_list

In [99]:
# Returns a df with the predictions for each user until a given date
def get_predictions_date(datelimit):
    """Score the most recent weekly analytic dataframe dated on or before ``datelimit``.

    The available dates come from ``get_dates()``; the latest one not after
    ``datelimit`` is selected, its pickle is loaded, and the global
    ``rm_classifier`` is applied to every column except 'user'.

    Parameters
    ----------
    datelimit : str
        Upper bound for the snapshot date. It is compared with ``<=`` against the
        'yyyy-mm-dd' strings returned by ``get_dates()``, so it should use the
        same format.

    Returns
    -------
    pandas.DataFrame
        One row per row of the loaded dataframe, with columns 'user',
        'addiction' (predicted class), 'confidence_score' (second column of
        ``predict_proba``) and 'date' (the snapshot date that was used).

    Raises
    ------
    ValueError
        If no snapshot is dated on or before ``datelimit``.
    """
    # Get the most recent date from those available
    eligible_dates = [date for date in get_dates() if date <= datelimit]
    if not eligible_dates:
        raise ValueError(f"No weekly analytic dataframe dated on or before {datelimit}")
    date = max(eligible_dates)  # the most recent one

    print("\nProcessing dataframe for data until", date)
    df_purchases_analytic = pd.read_pickle(f"../processed_dataframes/df_purchases_analytic_weekly/{date}_df_purchases_analytic.pkl")
    print(df_purchases_analytic.shape)

    # Drop 'user' column from df_purchases_analytic before applying the model
    X_test = df_purchases_analytic.drop(columns=['user'])

    # Apply the model to make the predictions based on the lootbox data
    y_pred = rm_classifier.predict(X_test)

    # Store the class probabilities (confidence score)
    probs = rm_classifier.predict_proba(X_test)

    # Add the predicted addiction series (y_pred) to the (local) dataframe
    df_purchases_analytic['addiction'] = y_pred
    df_purchases_analytic['confidence_score'] = probs[:, 1]

    print(df_purchases_analytic[df_purchases_analytic['addiction'] == 1].shape)

    # Keep only the identifier and the scores, tagged with the snapshot date.
    # .copy() makes the result independent of the loaded dataframe.
    df_scores = df_purchases_analytic[['user', 'addiction', 'confidence_score']].copy()
    df_scores['date'] = date

    return df_scores

In [100]:
# Now generate the predictions for the available dates and concatenate them into a single df

date_list = get_dates()

# Collect one dataframe per snapshot date and concatenate once at the end;
# concatenating inside the loop re-copies all accumulated rows on every iteration.
weekly_frames = []
for date in date_list:
    df_date = get_predictions_date(date)
    weekly_frames.append(df_date)

# pd.concat raises on an empty list, so fall back to an empty dataframe when no dates exist
if weekly_frames:
    df_purchases_analytic_predictions_date = pd.concat(weekly_frames, axis=0, ignore_index=True)
else:
    df_purchases_analytic_predictions_date = pd.DataFrame()

print(df_purchases_analytic_predictions_date.shape)


Processing dataframe for data until 2022-12-18
(10004, 10)
(12, 12)

Processing dataframe for data until 2022-12-25
(34359, 10)
(51, 12)

Processing dataframe for data until 2023-01-01
(71373, 10)
(135, 12)

Processing dataframe for data until 2023-01-08
(102025, 10)
(231, 12)

Processing dataframe for data until 2023-01-15
(133448, 10)
(319, 12)

Processing dataframe for data until 2023-01-22
(165591, 10)
(404, 12)

Processing dataframe for data until 2023-01-29
(204155, 10)
(455, 12)

Processing dataframe for data until 2023-02-05
(236431, 10)
(516, 12)

Processing dataframe for data until 2023-02-12
(264699, 10)
(577, 12)
(1222085, 4)


In [176]:
# Save the concatenated per-date predictions (user, addiction, confidence_score, date) to a pickle file
df_purchases_analytic_predictions_date.to_pickle('../processed_dataframes/df_purchases_analytic_predictions_date.pkl')

## Find out which users are improving from their addiction

It will be a new feature (`improving`) in `df_purchases_analytic`, which is then saved as `df_purchases_analytic_predictions.pkl`.

In [175]:
# Short alias for the per-date predictions dataframe.
# This is a plain assignment, so `df` refers to the same object (it is not a copy).
df = df_purchases_analytic_predictions_date # just to shorten it...

In [167]:
# Full weekly history of every user who was predicted as addicted on at least one date

# Keep all rows of a user as soon as any of that user's rows has addiction == 1
grouped = df.groupby('user')
grouped_addicted = grouped.filter(lambda user_rows: user_rows['addiction'].eq(1).any())

# List of those users (needed further down)
addicted_users = grouped_addicted['user'].unique().tolist()

# Order each user's rows chronologically
grouped_addicted = grouped_addicted.sort_values(['user', 'date'], ascending=True)

display(grouped_addicted)

Unnamed: 0,user,addiction,confidence_score,date
115821,A2***-3NGC,0,0.220282,2023-01-08
217862,A2***-3NGC,0,0.339411,2023-01-15
351338,A2***-3NGC,0,0.487099,2023-01-22
516958,A2***-3NGC,1,0.541913,2023-01-29
721139,A2***-3NGC,1,0.569346,2023-02-05
...,...,...,...,...
351119,SZ***-YTTE,0,0.213828,2023-01-15
516690,SZ***-YTTE,0,0.426837,2023-01-22
720827,SZ***-YTTE,0,0.491275,2023-01-29
957234,SZ***-YTTE,1,0.506916,2023-02-05


In [168]:
# Compare each user's latest confidence score with that same user's previous one.

# Sort explicitly so the result does not depend on the row order left by an earlier cell
grouped_addicted = grouped_addicted.sort_values(by=['user', 'date'])

# Previous confidence score *within each user*. A plain .shift() over the whole frame
# would hand the first row of every user the last score of the preceding user.
# Users with a single record get NaN here and therefore fall into none of the lists below.
grouped_addicted['prev_confidence_score'] = (
    grouped_addicted.groupby('user')['confidence_score'].shift()
)

# Only keep the last date
grouped_addicted = grouped_addicted[grouped_addicted['date'] == grouped_addicted['date'].max()]

# Filter for users where the confidence score has decreased, stayed the same, or increased
decreased_users = grouped_addicted[grouped_addicted['confidence_score'] < grouped_addicted['prev_confidence_score']]
equal_users = grouped_addicted[grouped_addicted['confidence_score'] == grouped_addicted['prev_confidence_score']]
increased_users = grouped_addicted[grouped_addicted['confidence_score'] > grouped_addicted['prev_confidence_score']]

# Convert to list
decreased_users = decreased_users['user'].unique().tolist()
equal_users = equal_users['user'].unique().tolist()
increased_users = increased_users['user'].unique().tolist()

print("Improving users:", decreased_users)

Improving users: ['A2***-3NGC', 'A6***-TBHN', 'A7***-UAHQ', 'A7***-WMHN', 'A9***-EGEQ', 'A9***-ZLFQ', 'AA***-LSPG', 'AA***-M6YC', 'AC***-T5SG', 'AH***-6ARJ', 'AJ***-XJEN', 'AK***-5NVJ', 'AK***-VZFG', 'AM***-EEPQ', 'AN***-YLHQ', 'AQ***-V3FG', 'AQ***-W3EE', 'AR***-TLEQ', 'AR***-XBEN', 'AR***-XPFQ', 'AU***-SMEQ', 'AV***-UZGE', 'AV***-XRFQ', 'AW***-XLEQ', 'AX***-YDFN', 'AY***-G7NL', 'S2***-SCGN', 'S4***-WYEG', 'S5***-RKGJ', 'S5***-UMFN', 'S7***-9GQN', 'S7***-ZMEQ', 'SB***-2RNC', 'SB***-XLFN', 'SE***-RFYL', 'SM***-V4TL', 'SM***-XLGN', 'SN***-VSTG', 'SN***-XKGN', 'SP***-XMFQ', 'SR***-4JPG', 'SR***-HYPL', 'SR***-UZGG', 'SS***-8UTE', 'SS***-MEGQ', 'ST***-UNFQ', 'SU***-YMGQ', 'SV***-VREN', 'SX***-WMGN', 'SY***-X2EG']


In [170]:
# Look at the complete weekly history of one of the users listed above
df.loc[df['user'].eq('AR***-XPFQ')].sort_values('date')

Unnamed: 0,user,addiction,confidence_score,date
3731,AR***-XPFQ,0,0.219467,2022-12-18
22848,AR***-XPFQ,0,0.378296,2022-12-25
71342,AR***-XPFQ,1,0.615087,2023-01-01
154316,AR***-XPFQ,1,0.723545,2023-01-08
268118,AR***-XPFQ,1,0.723545,2023-01-15
413532,AR***-XPFQ,1,0.749586,2023-01-22
593661,AR***-XPFQ,1,0.748931,2023-01-29
809869,AR***-XPFQ,1,0.744774,2023-02-05
1056911,AR***-XPFQ,1,0.735229,2023-02-12


In [171]:
# Add an 'improving' feature to df_purchases_analytic describing the latest score trend

# Every user starts as NaN (no trend available)
df_purchases_analytic['improving'] = np.nan

# 1  -> confidence score decreased (improving)
# 0  -> confidence score unchanged
# -1 -> confidence score increased (worsening)
for trend_code, trend_users in ((1, decreased_users), (0, equal_users), (-1, increased_users)):
    in_trend = df_purchases_analytic['user'].isin(trend_users)
    df_purchases_analytic.loc[in_trend, 'improving'] = trend_code

In [172]:
# Preview the first 50 rows belonging to users who were flagged as addicted on at least one date
was_flagged = df_purchases_analytic['user'].isin(addicted_users)
display(df_purchases_analytic.loc[was_flagged].head(50))

Unnamed: 0,user,sum_stakes_fixedodds,sum_bets_fixedodds,bettingdays_fixedodds,duration_fixedodds,frequency_fixedodds,bets_per_day_fixedodds,euros_per_bet_fixedodds,net_loss_fixedodds,percent_lost_fixedodds,addiction,confidence_score,improving
224,A2***-3NGC,397.75,162,23,48,0.479167,7.043478,2.455247,347.67,0.874092,1,0.595472,1.0
598,A2***-6DQL,1982.5,793,42,59,0.711864,18.880952,2.5,787.03,0.396989,1,0.735405,-1.0
2860,A2***-NWHN,672.5,269,5,20,0.25,53.8,2.5,-193.22,-0.287316,1,0.502333,0.0
2949,A2***-PFYJ,415.9,428,45,66,0.681818,9.511111,0.971729,399.08,0.959558,1,0.602808,-1.0
3677,A2***-U4XA,455.0,178,2,2,1.0,89.0,2.55618,290.01,0.637385,1,0.582661,0.0
4743,A3***-2JCA,416.5,166,12,50,0.24,13.833333,2.509036,362.41,0.870132,1,0.654866,-1.0
5112,A3***-5KSA,385.1,159,26,57,0.45614,6.115385,2.422013,278.37,0.722851,1,0.567018,-1.0
5415,A3***-7XBL,234.0,78,2,15,0.133333,39.0,3.0,223.61,0.955598,1,0.528156,0.0
5531,A3***-8UQN,215.9,224,4,23,0.173913,56.0,0.963839,203.39,0.942057,1,0.511471,0.0
5554,A3***-8ZBC,426.5,170,10,40,0.25,17.0,2.508824,174.73,0.409683,1,0.517819,0.0


In [173]:
# Save the dataframe with predictions, now also carrying the 'improving' column, to a pickle file.
# NOTE: this overwrites the file written earlier in this notebook (same path).
df_purchases_analytic.to_pickle('../processed_dataframes/df_purchases_analytic_predictions.pkl')