In [1]:
import bz2
from datetime import datetime
import logging
import os
import platform
import time
from urllib.parse import urlparse

import matplotlib.pyplot as plt
import numpy as np
import seaborn as sns
import statsmodels as sm
# Load the submodule explicitly: `import statsmodels` alone does not guarantee that
# `sm.stats.weightstats` (used by the t-test cells) is reachable as an attribute.
import statsmodels.stats.weightstats
from statsmodels.stats.outliers_influence import variance_inflation_factor
from sklearn.base import BaseEstimator
from sklearn.base import TransformerMixin
from sklearn.preprocessing import LabelEncoder
import tqdm

os.environ["MODIN_ENGINE"] = "ray"  # Modin will be using the Ray engine
# import pandas as pd
import modin.pandas as pd

import tensorflow as tf

In [2]:
def _version_tuple(version, parts=2):
    """
    Return the first ``parts`` dot-separated components of ``version`` as a tuple of ints.

    Only the leading digits of each component are used (e.g. ``'0rc1'`` -> ``0``),
    so pre-release suffixes do not break the comparison.
    """
    numbers = []
    for chunk in version.split('.')[:parts]:
        digits = ''
        for char in chunk:
            if not char.isdigit():
                break
            digits += char
        numbers.append(int(digits) if digits else 0)
    return tuple(numbers)


python_version = platform.python_version()
tensorflow_version = tf.__version__

# Compare numerically: as plain strings '3.10' sorts before '3.7' and '10.0' before '2.'
if _version_tuple(python_version) < (3, 7):
    print("This notebook requires Python 3.7+ to run. You are running Python {}".format(python_version))
    print("Continue using at your own risk")
else:
    print("Running Python {}".format(python_version))

if _version_tuple(tensorflow_version, parts=1) < (2,):
    print("This notebook requires TensorFlow 2+ to run. You are computing with TensorFlow {}".format(tensorflow_version))
    print("Results are not guaranteed to be correct. Continue using at your own risk")
else:
    print("Computing with TensorFlow {}".format(tensorflow_version))

Running Python 3.7.7
Computing with TensorFlow 2.2.0


In [3]:
%matplotlib inline
# Seed for TensorFlow's random generator; the same value is passed to H2O AutoML later on
random_seed = 12345
tf.random.set_seed(random_seed)

In [4]:
# Some common variables
# Note: if you move this notebook to some other path, you will have to adjust the DATA_DIR variable
DATA_DIR = os.path.join(os.getcwd(), 'data')

PLAYS_URI = 'https://storage.googleapis.com/king-airnd-recruitment-sandbox-data/toydata_mltest.tar.gz'
PLAYS_FILENAME = 'game-plays.csv'
# Column name -> dtype used when parsing the CSV.
# The n* columns are read as strings here and cast to float in a later cell.
PLAYS_FILE_OPTIONS = {'id': 'string', # sample index (unique)
                      'c1': 'string', # categorical player feature
                      'c2': 'string', # categorical player feature
                      'c3': 'string', # categorical player feature
                      'c4': 'string', # categorical player feature
                      'c5': 'boolean', # categorical player feature
                      'c6': 'string', # categorical player feature
                      'player_group': 'string', # version of the experiment (A or B)
                      'n1': 'string', # engagement intensity (last 30 days before A/B test experiment)
                      'n2': 'string', # numerical player feature (collected before the test experiment)
                      'n3': 'string', # numerical player feature (collected before the test experiment)
                      'n4': 'string', # numerical player feature (collected before the test experiment)
                      'n5': 'string', # numerical player feature (collected before the test experiment)
                      'n6': 'string', # numerical player feature (collected before the test experiment)
                      'n7': 'string', # numerical player feature (collected before the test experiment)
                      'n8': 'string', # numerical player feature (collected before the test experiment)
                      'n9': 'string', # numerical player feature (collected before the test experiment)
                      'n10': 'string', # numerical player feature (collected before the test experiment)
                      'n11': 'string', # numerical player feature (collected before the test experiment)
                      'n12': 'string', # numerical player feature (collected before the test experiment)
                      'n13': 'string', # engagement intensity (first 7 days of A/B test experiment)
                      'n14': 'string', # monetization metric (first 7 days of A/B test experiment expenditure)
                     }

In [5]:
# Some helper functions
def is_url(uri):
    """
    Tells whether the given string is a web address, i.e. uses the http or https scheme
    """
    scheme = urlparse(uri).scheme
    return scheme == 'http' or scheme == 'https'

def str_to_date(date, format='%Y-%m-%d'):
    """
    Parses a textual date and returns the matching date object.
    `format` is the strptime pattern the text must follow (YYYY-MM-DD by default)
    """
    parsed = datetime.strptime(date, format)
    return parsed.date()

def remap_list(source, init=0):
    """
    Assigns consecutive integers (starting at `init`) to the distinct ids of a list,
    in order of first appearance.
    Returns two dictionaries: id -> integer and integer -> id
    """
    source_index = {}
    index_source = {}

    # dict.fromkeys discards repeated ids while keeping first-seen order
    for offset, item in enumerate(dict.fromkeys(source)):
        position = init + offset
        source_index[item] = position
        index_source[position] = item

    return source_index, index_source

# Taken from https://stackoverflow.com/a/51724511
class LabelEncoderByCol(BaseEstimator, TransformerMixin):
    """
    Label-encodes several DataFrame columns, each with its own LabelEncoder.

    Missing values are replaced by the sentinel string 'NaN' and are left
    un-encoded, so a transformed column holds integer codes plus 'NaN' markers.
    A genuine category literally named 'NaN' is therefore treated as missing.
    """

    def __init__(self, col):
        # List of column names in the DataFrame that should be encoded
        self.col = col
        # Dictionary storing a LabelEncoder for each column
        self.le_dic = {el: LabelEncoder() for el in self.col}

    def fit(self, x, y=None):
        """
        Fits one encoder per configured column using only its non-missing values.
        The input frame is left untouched. Returns self.
        """
        for el in self.col:
            # Work on a filled copy of the column so the caller's frame is not modified
            filled = x[el].fillna('NaN')
            # Only use the values that are not 'NaN' to fit the Encoder
            self.le_dic[el].fit(filled[filled != 'NaN'])
        return self

    def transform(self, x, y=None):
        """
        Replaces the non-missing values of every configured column by their integer code.
        The frame is modified IN PLACE and also returned; missing values become 'NaN'.
        Values not seen during fit make the underlying LabelEncoder raise.
        """
        # Fill missing values with the string 'NaN'
        x[self.col] = x[self.col].fillna('NaN')
        for el in self.col:
            # Values that have to be encoded (everything except the 'NaN' sentinel)
            a = x[el][x[el] != 'NaN']
            # Private object-typed copy: it must hold ints next to 'NaN' strings and
            # must not share memory with the column it is about to replace
            b = x[el].to_numpy(dtype=object, copy=True)
            # Replace the elements in the ndarray that are not 'NaN'
            # using the transformer
            b[b != 'NaN'] = self.le_dic[el].transform(a)
            # Overwrite the column in the DataFrame
            x[el] = b
        # return the transformed DataFrame
        return x

def load_file(filenames, options=None, bzip2=True):
    """
    Loads the data contained in one or more CSV files and returns it as a single DataFrame

    Parameters:
        filenames: a file name or a list/tuple of file names. http/https URLs are used
                   as given; any other name is resolved relative to DATA_DIR.
        options:   optional column -> dtype mapping, forwarded to read_csv as `dtype`.
        bzip2:     when True (default) '.bz2' is appended to every name and the file is
                   decompressed with the bz2 module before parsing.

    Returns:
        The concatenation of all files, re-indexed from 0.
    """
    def read_csv_as_pd(csv_path, options=None):
        # dtype=None is read_csv's default, so a missing/empty mapping changes nothing
        return pd.read_csv(csv_path, dtype=options or None)

    def read_bz2_as_pd(filepath, options=None):
        # NOTE(review): bz2.open reads local files, so URL inputs presumably need bzip2=False
        with bz2.open(filepath, "rt") as f:
            return read_csv_as_pd(f, options)

    if not isinstance(filenames, (list, tuple)):
        filenames = [filenames]

    paths = [name if is_url(name) else os.path.join(DATA_DIR, name) for name in filenames]

    if bzip2:
        frames = [read_bz2_as_pd(path + '.bz2', options) for path in paths]
    else:
        frames = [read_csv_as_pd(path, options) for path in paths]

    return pd.concat(frames, ignore_index=True)

def save_recommendations(recommendations,
                         filename,
                         user_col='visitorid',
                         item_col_prefix='item_',
                         n=100):
    """
    Writes recommendations to a CSV file, one row per user.

    `recommendations` maps each user id to its `n` recommended items. The file gets a
    `user_col` column followed by `n` item columns named `item_col_prefix` + position.
    """
    item_columns = [item_col_prefix + str(position) for position in range(n)]

    table = (
        pd.DataFrame.from_dict(recommendations, orient='index', columns=item_columns)
        .rename_axis(user_col)
        .reset_index()
    )

    table.to_csv(filename, index=False)

In [6]:
# Load game plays (bzip2 defaults to True, so 'game-plays.csv.bz2' is read from DATA_DIR)
raw_plays = load_file(PLAYS_FILENAME, PLAYS_FILE_OPTIONS)        

To request implementation, send an email to feature_requests@modin.org.


In [7]:
# Peek at the last 10 rows to sanity-check the parsed columns
raw_plays.tail(10)

Unnamed: 0,id,c1,c2,c3,c4,c5,c6,player_group,n1,n2,...,n5,n6,n7,n8,n9,n10,n11,n12,n13,n14
49990,49990,Vk4,aW9z,ZXM,,True,LTA1OjAw,A,225.784499,796.480029,...,302.043115,1471893643.580557,2.868624,839.613398,,0.215628,1.860212,0.057979,60.171193,0.127638
49991,49991,SVI,,YXI,dml2b,True,KzAxOjAw,B,532.933509,1953.215192,...,706.229869,2691458889.223356,1474.47428,1892.847031,,4.969846,12.108273,0.0,148.292031,0.650241
49992,49992,SVQ,aW9z,ZXM,,True,LTA1OjAw,A,185.79959,649.437864,...,249.430808,1365676429.748203,1.567043,699.187024,,0.154988,1.324797,1.258639,49.205591,0.097786
49993,49993,QlI,YW5kc,ZW4,YXBwb,True,,A,78.605459,264.176769,...,107.450487,1.587175,0.281583,318.067027,,0.055314,0.371792,0.027475,21.896281,0.037713
49994,49994,Vk4,aW9z,ZXM,,True,KzAzOjAw,A,230.254507,812.997304,...,307.919271,3300792462.877718,3.073821,855.318967,,0.223661,1.927667,0.50287,61.419137,0.131299
49995,49995,R0I,YW5kc,ZXM,,True,LTA0OjAw,A,145.520371,502.846716,...,196.294021,2728730244.340174,0.856128,557.47951,,0.109462,0.894437,0.027793,38.571046,0.072299
49996,49996,Q0E,aW9z,ZnI,aHVhd,True,KzAzOjAw,A,265.792743,944.802399,...,354.61339,2048980296.158362,5.416289,980.27084,,0.299305,2.524877,0.197182,71.471077,0.163053
49997,49997,VVM,YW5kc,ZW4,c2Ftc,True,KzA1OjMw,A,17.169584,55.014058,...,24.484764,-0.234342,0.052445,80.387571,,0.016381,0.058887,0.678615,7.015246,0.010946
49998,49998,UEU,aW9z,aXQ,bW90b,True,KzA4OjAw,B,441.861705,1606.896007,...,586.067364,2170715048.897887,175.055901,1593.317859,,1.526828,7.591714,0.095397,122.558052,0.418659
49999,49999,R0I,YW5kc,ZXM,,True,LTA0OjAw,A,148.854774,514.913095,...,200.699976,1284566844.863385,0.900638,569.240242,,0.112773,0.926387,0.225178,39.434342,0.074257


In [8]:
# Column dtypes and non-null counts
raw_plays.info()



<class 'modin.pandas.dataframe.DataFrame'>
RangeIndex: 50000 entries, 0 to 49999
Data columns (total 22 columns):
 #   Column        Non-Null Count  Dtype  
---  ------        --------------  -----  
 0   id            50000 non-null  string 
 1   c1            45054 non-null  string 
 2   c2            44943 non-null  string 
 3   c3            50000 non-null  string 
 4   c4            42445 non-null  string 
 5   c5            50000 non-null  boolean
 6   c6            44913 non-null  string 
 7   player_group  50000 non-null  string 
 8   n1            50000 non-null  string 
 9   n2            50000 non-null  string 
 10  n3            50000 non-null  string 
 11  n4            50000 non-null  string 
 12  n5            50000 non-null  string 
 13  n6            50000 non-null  string 
 14  n7            50000 non-null  string 
 15  n8            50000 non-null  string 
 16  n9            2257 non-null   string 
 17  n10           50000 non-null  string 
 18  n11           50000 n

In [9]:
# Summary statistics, computed on the result of `_to_pandas()`
# NOTE(review): `_to_pandas` is a private Modin method — confirm it stays available across versions
raw_plays._to_pandas().describe()

Unnamed: 0,id,c1,c2,c3,c4,c5,c6,player_group,n1,n2,...,n5,n6,n7,n8,n9,n10,n11,n12,n13,n14
count,50000,45054,44943,50000,42445,50000,44913,50000,50000.0,50000.0,...,50000.0,50000.0,50000.0,50000.0,2257.0,50000.0,50000.0,50000.0,50000.0,50000.0
unique,50000,174,4,29,268,2,35,2,49932.0,49952.0,...,49961.0,49863.0,48136.0,49865.0,2257.0,45690.0,46673.0,45445.0,49979.0,42458.0
top,2115,SU4,YW5kc,ZW4,c2Ftc,True,KzAyOjAw,A,-0.999999,-0.999999,...,-0.999999,-0.222614,0.001478,-1.0,45.582528,inf,0.0,0.0,1.795469,0.000846
freq,1,7641,29980,26894,13891,49956,7449,35280,6.0,5.0,...,5.0,2.0,6.0,45.0,1.0,293.0,203.0,125.0,2.0,10.0


In [10]:
# Number of rows for each value of the boolean feature c5
raw_plays.groupby('c5').agg(['count']).c5



Unnamed: 0_level_0,count
c5,Unnamed: 1_level_1
False,44
True,49956


In [11]:
# Number of missing values per column
raw_plays.isnull().sum(axis = 0).to_frame('# NAs')

Unnamed: 0,# NAs
id,0
c1,4946
c2,5057
c3,0
c4,7555
c5,0
c6,5087
player_group,0
n1,0
n2,0


In [12]:
# Dropping id field as it is useless
# Dropping n9 numerical feature as it has very few valid values
# Dropping c5 categorical feature (which is a boolean though) as it has very few False values (44 vs 49956)
# errors='ignore' keeps the cell re-runnable once the columns are already gone
raw_plays = raw_plays.drop(columns=['id', 'n9', 'c5'], errors='ignore')

In [13]:
# Convert numerical string features to floats
# (raw string: '\d' in a plain string literal is an invalid escape sequence)
raw_plays_numerical_columns = raw_plays.filter(regex=r'^n\d{1,2}$').columns
for col in raw_plays_numerical_columns:
    raw_plays[col] = raw_plays[col].astype(float)

In [None]:
raw_plays_categorical_columns = raw_plays.filter(regex=r'^c\d$').columns
raw_plays_player_group_columns = raw_plays.filter(like='player_group').columns
# Only the c* features are label-encoded. player_group keeps its 'A'/'B' labels because
# the hypothesis-test cells further down select rows with `raw_plays.player_group == 'A'`
# and `== 'B'`, which would match nothing once the column held integer codes.
le_col = LabelEncoderByCol(list(raw_plays_categorical_columns))
# transform() returns the frame it was given; rebinding makes the data flow explicit
raw_plays = le_col.fit(raw_plays).transform(raw_plays)
raw_plays.info()

In [None]:
# Inspect the first 10 rows
raw_plays.head(10)

In [None]:
# Pairwise correlation matrix of the frame's columns
raw_plays.corr()

In [None]:
# Scratch check: label-encode a single column (c6) with a stand-alone encoder.
# LabelEncoder is already imported in the first cell.
enc = LabelEncoder()
raw_plays_categorical_columns = raw_plays.filter(regex=r'^c\d$').columns
raw_plays_player_group_columns = raw_plays.filter(like='player_group').columns

# NOTE(review): if c6 has already been through LabelEncoderByCol it holds integer codes
# mixed with 'NaN' strings — confirm the encoder accepts that before relying on this cell.
enc.fit_transform(raw_plays['c6'])

In [None]:
# Heat map of the numeric feature values, hiding the cells that are missing.
# Convert to pandas once and keep numeric columns only: heatmap cannot draw strings.
raw_plays_numeric = raw_plays._to_pandas().select_dtypes(include='number')
raw_plays_nas_mask = raw_plays_numeric.isnull()
fig, ax = plt.subplots(figsize=(15,15)) # Sample figsize in inches
# NOTE(review): annot=True writes a number into every cell; on the full frame this is
# very slow and unreadable — consider plotting a sample or an aggregate instead.
sns.heatmap(raw_plays_numeric,
            annot=True,
            fmt='.2f',
            square=True,
            mask=raw_plays_nas_mask,
            ax=ax)

In [None]:
# Absolute pairwise correlations, computed once and reused for data and tick labels.
# matplotlib and seaborn are already imported in the first cell.
raw_plays_corr = raw_plays.corr().abs()

fig, ax = plt.subplots(figsize=(15,15)) # Sample figsize in inches
# No explicit mask: a mask must have the shape of the plotted matrix (the row-level null
# mask of raw_plays does not), and heatmap already hides cells whose value is missing.
sns.heatmap(raw_plays_corr,
            annot=True,
            fmt='.2f',
            square=True,
            xticklabels=raw_plays_corr.columns,
            yticklabels=raw_plays_corr.columns,
            ax=ax)

In [None]:
# Summary statistics again, after the column drops and conversions in the cells above
raw_plays._to_pandas().describe()

### Question 1

Identify a globally better design for all players, i.e., if you would have to choose one of the two designs, A or B, for all the players, which one would you choose? Why?

##### Some answer
We have to compare the two samples with a hypothesis test on the features measured during the experiment, i.e., n13 and n14, using statistical metrics computed separately for each player_group.

Conveniently, from the statistics course I took exactly 18 years ago, Student's t-test is handy for this purpose.
If you already have the summary statistics, use this calculator: https://www.evanmiller.org/ab-testing/t-test.html; otherwise, see below for how to compute them, obtain the p-value, and draw conclusions about the hypotheses.
Some references: https://blog.minitab.com/blog/adventures-in-statistics-2/understanding-t-tests-1-sample-2-sample-and-paired-t-tests

In [None]:
# Compute some descriptive statistics of A and B.
player_group_A_n13 = raw_plays[(raw_plays.player_group == 'A')].n13
player_group_B_n13 = raw_plays[(raw_plays.player_group == 'B')].n13

player_group_A_n14 = raw_plays[(raw_plays.player_group == 'A')].n14
player_group_B_n14 = raw_plays[(raw_plays.player_group == 'B')].n14

# One loop instead of four copy-pasted print statements, so the report format cannot drift
samples = (
    ("Player group A - n13", player_group_A_n13),
    ("Player group B - n13", player_group_B_n13),
    ("Player group A - n14", player_group_A_n14),
    ("Player group B - n14", player_group_B_n14),
)

for label, sample in samples:
    print("%s\n\tMean: %g\n\tStd: %g\n\tSize: %g" % (label,
                                                     sample.mean(),
                                                     sample.std(),
                                                     sample.size))

In [None]:
# Welch's t-test (usevar='unequal') comparing engagement (n13) between groups A and B.
player_group_A_n13 = raw_plays[(raw_plays.player_group == 'A')].n13
player_group_B_n13 = raw_plays[(raw_plays.player_group == 'B')].n13

# statsmodels states the alternative for mean(A) - mean(B); every label below is the
# alternative hypothesis H1 that a small p-value supports.
n13_alternatives = (
    ('two-sided', 'A != B'),
    ('smaller', 'A < B'),
    ('larger', 'A > B'),
)

for alternative, h1_label in n13_alternatives:
    t_stat, p_value, dof = sm.stats.weightstats.ttest_ind(player_group_A_n13,
                                                          player_group_B_n13,
                                                          alternative=alternative,
                                                          usevar='unequal')
    print("T-test - n13 (H1: %s)\n\tt = %g\n\tp = %g\n\tdf = %g" % (h1_label, t_stat, p_value, dof))

Engagement-wise (n13), there is some difference: player group B seems to be more engaged (on average) than group A.

In [None]:
# Welch's t-test (usevar='unequal') comparing monetization (n14) between groups A and B.
player_group_A_n14 = raw_plays[(raw_plays.player_group == 'A')].n14
player_group_B_n14 = raw_plays[(raw_plays.player_group == 'B')].n14

# statsmodels states the alternative for mean(A) - mean(B); every label below is the
# alternative hypothesis H1 that a small p-value supports.
n14_alternatives = (
    ('two-sided', 'A != B'),
    ('smaller', 'A < B'),
    ('larger', 'A > B'),
)

for alternative, h1_label in n14_alternatives:
    t_stat, p_value, dof = sm.stats.weightstats.ttest_ind(player_group_A_n14,
                                                          player_group_B_n14,
                                                          alternative=alternative,
                                                          usevar='unequal')
    print("T-test - n14 (H1: %s)\n\tt = %g\n\tp = %g\n\tdf = %g" % (h1_label, t_stat, p_value, dof))

Monetization-wise (n14), there is no difference between the two player groups.

### Question 2, 3 and 4

Design a model-based approach to recommend a design (i.e. A or B) for each player to maximize his/her future monetization metric while trying to avoid any decline of engagement intensity.

Report the model performance (training/evaluation/testing) to justify the selection of your approach, model, and settings.

Please serve your trained model in a way so that prediction/recommendation for other players can be easily carried out.

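**TODO:** describe the modelling approach, report its training/evaluation/testing performance, and explain how the trained model is served.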

In [None]:
# Absolute pairwise correlations (selecting every column first is a no-op)
raw_plays.corr().abs()

In [None]:
# Some variables can most likely be removed, so let's check for correlations
raw_plays_corr = raw_plays.corr().abs()
fig, ax = plt.subplots(figsize=(15,15)) # Sample figsize in inches
# Tick labels come from the plotted matrix itself, so they always match its rows/columns
sns.heatmap(data=raw_plays_corr,
            annot=True,
            fmt='.2f',
            square=True,
            xticklabels=raw_plays_corr.columns,
            yticklabels=raw_plays_corr.columns,
            ax=ax,
           )

In [None]:
# Let's drop what is not needed. Don't forget though...
# 'n1': engagement intensity (last 30 days before A/B test experiment)
# 'n13': engagement intensity (first 7 days of A/B test experiment)
# 'n14': monetization metric (first 7 days of A/B test experiment expenditure)

# n1 == (n2, n5, n13)
# n14 == n11
# errors='ignore' keeps the cell re-runnable once the columns are already gone
raw_plays = raw_plays.drop(columns=['n2', 'n5', 'n13', 'n11'], errors='ignore')

In [None]:
# Refresh the list of numerical columns now that some of them were dropped
raw_plays_numerical_columns = raw_plays.filter(regex=r'^n\d{1,2}$').columns
raw_plays_corr = raw_plays.corr().abs()
fig, ax = plt.subplots(figsize=(15,15)) # Sample figsize in inches
# Tick labels come from the plotted matrix itself, so they always match its rows/columns
sns.heatmap(data=raw_plays_corr,
            annot=True,
            fmt='.2f',
            square=True,
            xticklabels=raw_plays_corr.columns,
            yticklabels=raw_plays_corr.columns,
            ax=ax,
           )

In [None]:
# Number of rows in each experiment group
raw_plays.groupby('player_group').agg(['count']).player_group

In [None]:
import h2o
from h2o.automl import H2OAutoML

In [None]:
# Initialise the H2O runtime used by the AutoML cells below
h2o.init()

In [None]:
# Hand the data to H2O: convert the Modin frame with `_to_pandas()`, then wrap it in an H2OFrame
raw_plays_h2o = h2o.H2OFrame(raw_plays._to_pandas())

In [None]:
# Per-column summary of the H2O frame
raw_plays_h2o.describe()

In [None]:
# Convert the response column to a factor (categorical), presumably so that AutoML
# treats the task as classification — confirm against the H2O documentation
raw_plays_h2o['player_group'] = raw_plays_h2o['player_group'].asfactor()
# raw_plays_h2o['n9'] = raw_plays_h2o['n9'].asnumeric()

In [None]:
# Identify predictors and response
# x starts as the list of all column names; the response is then removed from it,
# leaving every other column as a predictor
x = raw_plays_h2o.columns
y = "player_group"
x.remove(y)

In [None]:
# 70% / 15% / 15% split; seeded so the same rows land in each split on every run
plays_train, plays_test, plays_validation = raw_plays_h2o.split_frame(ratios=[.7, .15], seed=random_seed)

In [None]:
# Train on the training split only and rank the models on the validation split;
# plays_test stays untouched so it can be used for the final performance report.
aml = H2OAutoML(max_models = 7, seed = random_seed, balance_classes=True)
aml.train(x = x, y = y, training_frame = plays_train, leaderboard_frame = plays_validation)

In [None]:
# Leaderboard of the trained models with all extra columns requested; display every row
lb = h2o.automl.get_leaderboard(aml, extra_columns = 'ALL')
lb.head(rows=lb.nrows)

In [None]:
# Predictions for every row of the full frame
# NOTE(review): raw_plays_h2o contains every player, including any rows used for
# training — use plays_test instead if a held-out evaluation is wanted.
aml.predict(raw_plays_h2o)

In [None]:
# First 10 rows of the H2O frame
raw_plays_h2o.head(10)

In [None]:
# Per-column summary of the plays_test split
plays_test.describe()