In [2]:
import numpy as np
import pandas as pd
import plotly.express as px
import plotly.graph_objects as go
from plotly.subplots import make_subplots
import os
# Notebook-wide pandas settings.
# Turn off the chained-assignment (SettingWithCopy) warning.
pd.set_option('mode.chained_assignment', None)
# Show floating point numbers with exactly three decimals.
pd.set_option('display.float_format', '{:.3f}'.format)

# Read in data

In [3]:
# Directory that holds the versioned training-set files.
ts_dir = '../data/training_sets/'

# Only regular, non-hidden files are candidates, so that sub-folders or
# OS/editor artefacts (e.g. .DS_Store) can never be picked up as the data file.
candidate_files = sorted(
    entry for entry in os.listdir(ts_dir)
    if not entry.startswith('.') and os.path.isfile(os.path.join(ts_dir, entry))
)
if not candidate_files:
    raise FileNotFoundError('No training set files found in ' + ts_dir)

# The latest version is taken to be the last file name in lexicographic order.
# NOTE(review): this assumes sortable names (zero-padded versions / ISO dates) - confirm.
file_name = candidate_files[-1]

# Read the data and preview the first row.
df = pd.read_csv(os.path.join(ts_dir, file_name))
df.head(1)

Unnamed: 0,user_pseudo_id,retentionScore,add_sticker_tapped,app_clear_data,app_exception,app_remove,app_update,challenge_created,challenge_shared,comment_liked,...,screen_view,search_input_tapped,search_tab_selected,sticker_image_cut,sticker_image_selected,sticker_uploaded,user_blocked,user_items_uploaded,user_unblocked,user_unfollowed
0,b77c070ea6bebab1b35869f2cea470bb,2.124,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


## Reading data dictionary

In [4]:
# Location of the data dictionary: one row per column of the training set,
# with its type, role and usage flags.
dict_dir = '../data/meta/data_dict.xlsx'

# Load the workbook with the openpyxl engine and preview the first rows.
dic = pd.read_excel(io=dict_dir, engine='openpyxl')
dic.head(5)

Unnamed: 0,number,name,description,type,binary,role,use,comment
0,1,user_pseudo_id,unique user identifier,id,N,id,,
1,2,retentionScore,user retention score,num,N,target,Y,
2,3,add_sticker_tapped,event,num,N,predictor,Y,
3,4,app_clear_data,event,num,N,predictor,Y,
4,5,app_exception,event,num,N,predictor,Y,


## Numerical, Categorical Datasets

In [5]:
# Split the columns into numerical and categorical ones, as declared in the
# `type` column of the data dictionary.
num = dic.loc[dic['type'] == 'num', 'name'].tolist()
cat = dic.loc[dic['type'] == 'cat', 'name'].tolist()

# Name of the first column whose role is 'target'.
target = dic.loc[dic['role'] == 'target', 'name'].values[0]

# Take explicit copies: a later cell adds a column to df_num, and that must
# be a write to an independent frame rather than to a selection of df.
df_num = df[num].copy()
df_cat = df[cat].copy()

## Categorical Preprocessing

The raw dataset contains no categorical features. However, if categorical features are present in the training data, they should be marked as categorical.

In [6]:
# Give every column listed as categorical the pandas `category` dtype.
for cat_col in cat:
    df[cat_col] = df[cat_col].astype('category')

## Descriptive Statistics

The descriptive statistics of the numerical variables show that most features in our dataset, including the target variable, have positively skewed (right-skewed) distributions.

In [7]:
# Summary statistics (count, mean, std, min, quartiles, max) of every numerical column.
df_num.describe()

Unnamed: 0,retentionScore,add_sticker_tapped,app_clear_data,app_exception,app_remove,app_update,challenge_created,challenge_shared,comment_liked,comment_posted,...,screen_view,search_input_tapped,search_tab_selected,sticker_image_cut,sticker_image_selected,sticker_uploaded,user_blocked,user_items_uploaded,user_unblocked,user_unfollowed
count,373296.0,373296.0,373296.0,373296.0,373296.0,373296.0,373296.0,373296.0,373296.0,373296.0,...,373296.0,373296.0,373296.0,373296.0,373296.0,373296.0,373296.0,373296.0,373296.0,373296.0
mean,20.811,0.183,0.001,0.004,0.097,0.017,0.019,0.007,0.202,0.302,...,60.099,0.119,0.031,0.093,0.16,0.118,0.0,0.193,0.0,0.051
std,129.201,5.272,0.234,0.176,0.297,0.132,0.354,0.388,8.784,10.261,...,561.666,2.352,0.68,2.129,4.211,3.851,0.01,6.618,0.023,3.905
min,0.001,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
25%,0.067,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
50%,0.133,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
75%,0.2,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
max,20526.061,1977.0,119.0,67.0,2.0,2.0,56.0,133.0,1554.0,3028.0,...,80424.0,476.0,127.0,558.0,1284.0,1113.0,5.0,1818.0,11.0,1557.0


# Nonzero percentages of features

In [8]:
# For every numerical column: number of nonzero values, total number of rows,
# and the resulting percentage, ordered from the densest column to the sparsest.
df_zeroratio = (
    pd.DataFrame({
        'nonzero_entries': df_num.ne(0).sum(axis=0),
        'all_entries': len(df_num),
    })
    .assign(percent_nonzero=lambda frame: frame['nonzero_entries'].mul(100).div(frame['all_entries']))
    .sort_values(by='percent_nonzero', ascending=False)
)

In [None]:
# The target is not a feature, so it is left out of the chart. It is removed
# by name instead of by position: after sorting, the first row is only the
# target as long as no other column ties with it at the top.
feature_ratio = df_zeroratio.drop(index=target, errors='ignore')

# Horizontal bars: percentage on x, feature name on y, value printed next to each bar.
fig = px.bar(
    x=feature_ratio.percent_nonzero,
    y=feature_ratio.index,
    text=feature_ratio.percent_nonzero,
    template='plotly_dark',
)
fig.update_layout(title={'text': 'Nonzero percentage in the feature values'})
fig.update_xaxes(title='Nonzero entries (%)')
fig.update_yaxes(title='Feature')
fig.update_traces(texttemplate='%{text:.2f}%', textposition='outside', textfont_size=14, marker_color='indianred')

# Export the chart for the documentation site.
fig.write_html("../documentation/docs/assets/nonzero_percent.html")

# Skewness in the feature distributions

In [None]:
# Skewness of every numerical column, largest first.
skewness = df_num.skew().sort_values(ascending=False)

In [None]:
# One colour per bar: the target variable is highlighted, every feature is grey.
# Comparing against `target` (taken from the data dictionary) keeps this cell
# in sync with the rest of the notebook and cannot fail if the column is absent.
colors = ['indianred' if column == target else 'lightslategray' for column in skewness.index]

fig = px.bar(x=skewness.index, y=skewness.values, text=skewness.values, template='plotly_dark')
fig.update_layout(title={'text': 'Skewness of the features and the target variable'})
fig.update_traces(texttemplate='%{text:.0f}', textposition='outside', textfont_size=14, marker_color=colors)

# Export the chart for the documentation site.
fig.write_html("../documentation/docs/assets/skewness.html")

# Correlations

In [14]:
# Pairwise correlation matrix. Only numeric (and boolean) columns are kept:
# categorical columns cannot be correlated, and recent pandas versions raise
# on them instead of silently dropping them.
correlations = df[num + cat].select_dtypes(include=['number', 'bool']).corr()

# Long format: one entry per ordered (column, column) pair.
corr_pairs = correlations.unstack()

# Pairs of two *different* columns with |correlation| > 0.5, sorted by value.
# A column paired with itself is recognised by its labels, not by testing the
# value against 1, which is unreliable for floats and would also discard
# perfectly correlated distinct columns.
# Each pair appears twice, as (a, b) and (b, a).
is_self_pair = corr_pairs.index.get_level_values(0) == corr_pairs.index.get_level_values(1)
strong_pairs = corr_pairs[~is_self_pair & (corr_pairs.abs() > 0.5)].sort_values(kind="quicksort")

In [None]:
# Hide the upper triangle (diagonal included) so every pair is drawn only once.
mask = np.triu(np.ones_like(correlations, dtype=bool))
df_mask = correlations.mask(mask)

# Rows of z run along the y axis and columns of z along the x axis, so the
# frame's index labels y and its columns label x.
fig = go.Figure(
    data=go.Heatmap(
        z=df_mask.values,
        x=df_mask.columns,
        y=df_mask.index,
        colorscale=px.colors.diverging.RdBu_r,
        showscale=True,
        ygap=1,
        xgap=1,
        zmin=-1,
        zmax=1,
    )
)
fig.update_layout(title='Correlations Heatmap', template='plotly_dark')

# Export the chart for the documentation site.
fig.write_html("../documentation/docs/assets/corr.html")

# Feature-Target Correlations

In [15]:
# Long format of the correlation matrix with columns level_0, level_1, correlation.
corr_mat = correlations.stack().reset_index(name="correlation")

# Rows that relate the target to a feature. The target's correlation with
# itself is excluded by name, so the result does not depend on where the
# target sits in the column order.
is_target_row = corr_mat['level_0'] == target
is_self_pair = corr_mat['level_1'] == target
corr_target = corr_mat[is_target_row & ~is_self_pair]

# Order by absolute correlation, weakest first, so the strongest are at the end.
corr_target = corr_target.reindex(corr_target.correlation.abs().sort_values().index)

# The ten features most strongly correlated with the target.
corr_target_top = corr_target[-10:]

In [None]:
# Bar chart of the features in corr_target_top; bar colour encodes the
# magnitude of the correlation on a fixed 0-1 scale.
fig = px.bar(
    corr_target_top,
    x='level_1',
    y='correlation',
    color=abs(corr_target_top.correlation),
    template='plotly_dark',
    text='correlation',
    color_continuous_scale='Reds',
    range_color=[0, 1],
)
fig.update_layout(title={'text': 'The first 10 most correlated features with target variable'}, coloraxis_colorbar=dict(title="Correlation Magnitude"))

# Zero line to separate positive from negative correlations.
fig.add_hline(y=0)
fig.update_traces(texttemplate='%{text:.2f}', textposition='outside', textfont_size=14)

# Export the chart for the documentation site.
fig.write_html("../documentation/docs/assets/corr_target.html")

## Top 5 correlated features with target as imp_var 

In [None]:
# Names of the last five rows of corr_target, i.e. the five features with the
# largest absolute correlation with the target.
imp_var = corr_target['level_1'].tail(5).tolist()

## Log-transformed and rounded target variable

In [None]:
# Discrete retention category: natural log of the retention score, rounded to
# the nearest integer. Computed on the whole column at once, and added via
# assign() so that a new frame is bound to df_num instead of writing into a
# selection of df.
# NOTE(review): a score of 0 would give -inf; the describe() output above shows min = 0.001.
df_num = df_num.assign(ret_disc=np.log(df_num['retentionScore']).round())

In [None]:
# Number of users in each retention category.
df_ret_vc = df_num['ret_disc'].value_counts()

# One bar per category, category on x and user count on y.
fig = px.bar(
    x=df_ret_vc.index,
    y=df_ret_vc.values,
    template='plotly_dark',
)
fig.update_xaxes(title_text='Retention Category')
fig.update_yaxes(title_text='# of Users')

# Export the chart for the documentation site.
fig.write_html("../documentation/docs/assets/target_value_counts.html")

# Mean (min-max normalized) event counts for each retention category

In [None]:
# Work on an independent copy so the normalization never writes into df_num.
df_norm = df_num[['ret_disc'] + imp_var].copy()

# Min-max scale every important feature to the [0, 1] range in one step.
# A constant column has max == min and therefore becomes NaN.
feature_min = df_norm[imp_var].min()
feature_max = df_norm[imp_var].max()
df_norm[imp_var] = (df_norm[imp_var] - feature_min) / (feature_max - feature_min)

# Mean normalized value per retention category, reshaped to one row per
# feature (index 'imp_var') and one column per retention category.
df_mean = (
    df_norm.groupby('ret_disc').mean()
    .stack()
    .reset_index(name='mean')
    .rename(columns={'level_1': 'imp_var'})
    .pivot(index='imp_var', columns='ret_disc', values='mean')
)

In [None]:
# One subplot row per important feature; the row count follows imp_var so the
# grid and row_titles always match the number of features.
fig = make_subplots(rows=len(imp_var), cols=1, shared_xaxes=True, x_title='Retention Category', y_title='Mean Event Count', row_titles=imp_var)

# Retention categories, used as x positions of the bars.
x = df_mean.columns.tolist()

# Qualitative palette repeated twice, applied bar by bar below.
colors = px.colors.qualitative.Set3*2

# Build each bar trace with plotly express and move it into its subplot row.
for ii, i in enumerate(imp_var, start=1):
    figs = px.bar(x=x, y=df_mean.loc[i].values, color=x)
    fig.add_trace(figs['data'][0], row=ii, col=1)

fig.update_layout(template='plotly_dark')
fig.update_traces(marker_color=colors)

# Export the chart for the documentation site.
fig.write_html("../documentation/docs/assets/mean_imp_bar.html")

# Boxplot

In [None]:
# Sort by retention category so the boxes appear in category order.
df_ret_sorted = df_num.sort_values(by='ret_disc')

# One subplot row per important feature.
fig = make_subplots(rows=len(imp_var), cols=1, shared_xaxes=True, x_title='Retention Category', y_title='Event Count', row_titles=imp_var)

for ii, i in enumerate(imp_var, start=1):
    # plotly express produces the notched box traces for this feature.
    figs = px.box(df_ret_sorted, x='ret_disc', y=i, color='ret_disc', notched=True)
    # Copy every trace that was actually produced, so the loop does not rely
    # on a category list defined in another cell.
    for box_trace in figs.data:
        fig.add_trace(box_trace, row=ii, col=1)

fig.update_layout(template='plotly_dark', showlegend=False)

# Export the chart for the documentation site.
fig.write_html("../documentation/docs/assets/box.html")

## item_added Boxplot

In [None]:
# Notched box plot of item_added for every retention category.
fig = px.box(
    df_ret_sorted,
    x='ret_disc',
    y='item_added',
    color='ret_disc',
    notched=True,
    template='plotly_dark',
)
# Export the chart for the documentation site.
fig.write_html("../documentation/docs/assets/box_item_added.html")

# item_added distribution w.r.t. retention score

In [None]:
# Scatter of item_added against the raw retention score, coloured by the
# log-discrete retention category, with an OLS trendline, a box plot on the
# x margin and a violin plot on the y margin.
fig = px.scatter(
    df_num,
    x="retentionScore",
    y="item_added",
    color="ret_disc",
    marginal_x="box",
    marginal_y="violin",
    trendline="ols",
    template="plotly_dark",
)
fig.update_layout(title={'text': 'Distribution of item_added versus retentionScore colored by log-discrete retention'})
# Export the chart for the documentation site.
fig.write_html("../documentation/docs/assets/item_added_vs_retention.html")