In [None]:
import os

import pandas as pd


### Analysis Ideas

- 

### Read the data

In [None]:
# Location of the BigQuery CSV export. The BQ_RESULTS_CSV environment variable
# overrides the default so the notebook is not tied to one machine's home
# directory; when it is unset the original path is used unchanged.
FILE_PATH = os.environ.get(
    "BQ_RESULTS_CSV",
    "/Users/emulie/Downloads/bq-results-20250220-145824-1740063653584.csv",
)
df_raw = pd.read_csv(FILE_PATH)

In [None]:
# Preview the first rows of the raw export
df_raw.head()

In [None]:
# Largest observed day offset in the export. Series.max() is vectorized and
# skips missing values, unlike the builtin max(), whose result depends on
# element order when NaN is present.
df_raw['days_since_first_open'].max()

In [None]:
# Event-count columns used as engagement metrics in the correlation analyses
metrics = [
    'listening_sessions',
    'play_content',
    'pause_content',
    'play_previous',
    'play_next',
    'screen_content_playing',
    'screen_recorder',
    'download_content',
    'screen_playlist_modal',
    'sleep_recorder_landed',
    'create_timer',
    'toggle_favorite',
    'create_favorite',
    'create_favorite_result',
    'mixer_add_music',
    'show_isochronic_dialog',
    'mixer_drawer_clear_all',
]

In [None]:
# Isolate the rows of one specific user for manual inspection
USER_PSEUDO_ID = "B01640707036421AB94DAC64D95CDC50"
df = df_raw.loc[df_raw['user_pseudo_id'].eq(USER_PSEUDO_ID)]

In [None]:
# Display every row belonging to the user selected via USER_PSEUDO_ID
df

### Cohorting users based on max day in app

We observe each user for 2 months and classify them into tiers based on the last day (counted from first open) on which they were active:
- T0: active on day 0 only
- T1: last active on day 1
- T2: last active between day 2 and day 7
- T3: last active between day 8 and day 30
- T4: last active between day 31 and day 60

To be counted in a tier, a user must at least:
- open the mixer/player, i.e. reach `screen_content_playing` or `screen_mixer`

In [None]:
# One row per user: the largest `days_since_first_open` value seen for that user
df_cohort = (
    df_raw
    .groupby('user_pseudo_id')['days_since_first_open']
    .max()
    .rename('max_days_open')
    .reset_index()
)

In [None]:
# Share of users (in percent) for each value of the last active day
num_users = len(df_cohort)
df_cohort['max_days_open'].value_counts().div(num_users).mul(100)

### Joining the user cohort

In [None]:
# Attach each user's `max_days_open` to every one of their rows (inner join on the user id)
df_cohorted = df_raw.merge(df_cohort, on='user_pseudo_id')

In [None]:
def get_user_cohort(day: int):
    """Map a day offset to a tier label.

    Args:
        day: Number of days since first open (here: a user's maximum value).

    Returns:
        'T0' for day 0, 'T1' for day 1, 'T2' for days 2-7, 'T3' for days 8-30,
        and 'T4' for every other value (including anything outside 0-30).
    """
    if day == 0:
        return 'T0'
    if day == 1:
        return 'T1'
    # Day 1 was handled above, so the lower bound is exclusive here
    if 1 < day <= 7:
        return 'T2'
    if 8 <= day <= 30:
        return 'T3'
    # Catch-all: no upper bound is enforced for the last tier
    return 'T4'

# Label every row with its user's tier. The tier is derived from df_cohorted's
# own `max_days_open` column (brought in by the merge) so the result shares
# df_cohorted's index. Assigning a Series built from the per-user frame
# `df_cohort` would be aligned on the unrelated 0..num_users-1 index: the first
# rows would get another user's tier and all remaining rows would be NaN.
df_cohorted['cohort'] = df_cohorted['max_days_open'].apply(get_user_cohort)

In [None]:
# --- get distribution per Tiers
# Computed from df_cohort (exactly one row per user) so the percentages are
# shares of users; df_cohorted holds many rows per user, so counting its rows
# and dividing by num_users would not yield a user distribution.
df_cohort['max_days_open'].apply(get_user_cohort).value_counts() / num_users * 100

In [None]:
# Preview the merged frame, now including the `max_days_open` and `cohort` columns
df_cohorted.head()

### Comparing user behavior per cohort

- Perform a correlation analysis on each cohort

In [None]:
def get_cohort_metric_correlation(df_cohorted, cohort):
    """Return the pairwise correlation matrix of the metric columns for one cohort.

    Args:
        df_cohorted: Frame with a 'cohort' column and the columns listed in the
            module-level `metrics` list.
        cohort: Cohort label to select (rows where 'cohort' equals this value).

    Returns:
        DataFrame produced by `.corr()` over the `metrics` columns of the
        selected rows.
    """
    in_cohort = df_cohorted['cohort'] == cohort
    return df_cohorted.loc[in_cohort, metrics].corr()

In [None]:
# Build one metric-correlation matrix per cohort label present in the data
cohorts = df_cohorted['cohort'].unique()
correlation_matrices = {
    cohort_label: get_cohort_metric_correlation(df_cohorted, cohort_label)
    for cohort_label in cohorts
}

In [None]:
# --- visualizing correlation with heatmaps

import seaborn as sns
import matplotlib.pyplot as plt

# squeeze=False makes plt.subplots always return a 2-D array of Axes, so the
# loop also works when there is only a single cohort (otherwise a bare Axes
# object is returned and indexing it fails).
fig, axes = plt.subplots(1, len(cohorts), figsize=(15, 5), squeeze=False)

for ax, cohort in zip(axes[0], cohorts):
    # Fixed [-1, 1] colour range so the panels are directly comparable
    sns.heatmap(
        correlation_matrices[cohort],
        annot=True,
        cmap="coolwarm",
        vmin=-1,
        vmax=1,
        ax=ax,
    )
    ax.set_title(f"Correlation Matrix - {cohort}")

plt.show()


In [None]:
# --- Compute Correlation difference between cohorts
import numpy as np

def compare_cohort_correlation(correlation_matrices, cohort1, cohort2):
    """Plot a heatmap of the signed difference between two cohorts' correlation matrices.

    Args:
        correlation_matrices: Mapping from cohort label to correlation DataFrame.
        cohort1: Label of the cohort whose matrix is the minuend.
        cohort2: Label of the cohort whose matrix is subtracted.

    The difference is signed (cohort1 minus cohort2), not absolute; the colour
    map is centred on 0. Nothing is returned; the figure is shown.
    """
    delta = correlation_matrices[cohort1].sub(correlation_matrices[cohort2])

    plt.figure(figsize=(8, 6))
    sns.heatmap(delta, annot=True, cmap="coolwarm", center=0)
    plt.title(f"Correlation Difference: {cohort1} vs {cohort2}")
    plt.show()


In [None]:
# Plot how the metric correlations differ between the T0 and T1 cohorts
compare_cohort_correlation(correlation_matrices, 'T0', 'T1')

In [None]:
# --- compute statistical significance of correlation differences
from scipy.stats import fisher_exact

def fisher_z_test(r1, r2, n1, n2):
    """Fisher's Z-test statistic for the difference between two correlations.

    Args:
        r1: Correlation coefficient observed in the first sample.
        r2: Correlation coefficient observed in the second sample.
        n1: Number of observations behind r1.
        n2: Number of observations behind r2.

    Returns:
        (atanh(r1) - atanh(r2)) / sqrt(1/(n1-3) + 1/(n2-3)).
        NaN when either sample has 3 or fewer observations (the standard error
        is undefined there) or when a correlation is NaN / outside [-1, 1];
        +/-inf when exactly one correlation is +/-1.
    """
    # The standard error needs n > 3 in both samples; n == 3 would divide by
    # zero and n < 3 would take the square root of a negative number.
    if n1 <= 3 or n2 <= 3:
        return np.nan

    # arctanh is the Fisher transform 0.5 * log((1 + r) / (1 - r)). It maps
    # r = +/-1 to +/-inf instead of raising ZeroDivisionError on Python floats;
    # the resulting floating-point warnings are expected and silenced.
    with np.errstate(divide='ignore', invalid='ignore'):
        z1 = np.arctanh(r1)
        z2 = np.arctanh(r2)
        se = np.sqrt(1 / (n1 - 3) + 1 / (n2 - 3))
        z_score = (z1 - z2) / se
    return z_score

def compute_statistical_correlation_differences(correlation_matrices, cohort1, cohort2, metric1, metric2):
    """Fisher z-score for how one metric pair's correlation differs between two cohorts.

    Args:
        correlation_matrices: Mapping from cohort label to correlation DataFrame.
        cohort1: Label of the first cohort.
        cohort2: Label of the second cohort.
        metric1: Row label of the metric pair in the correlation matrices.
        metric2: Column label of the metric pair in the correlation matrices.

    Returns:
        The value returned by `fisher_z_test` for the two correlations and the
        two cohort sizes.
    """
    r1 = correlation_matrices[cohort1].loc[metric1, metric2]
    r2 = correlation_matrices[cohort2].loc[metric1, metric2]

    # Sample sizes are the row counts of each cohort in the module-level
    # `df_cohorted` frame, i.e. the rows the correlations were computed from.
    # The single-user `df` subset must not be used here: it has no 'cohort'
    # column and does not represent the cohorts.
    n1 = int((df_cohorted['cohort'] == cohort1).sum())
    n2 = int((df_cohorted['cohort'] == cohort2).sum())

    return fisher_z_test(r1, r2, n1, n2)

def find_cohorts_correlation_difference(correlation_matrices, cohort1, cohort2, threshold=1.96):
    """Print the metric pairs whose correlation differs significantly between two cohorts.

    Every unordered pair of distinct entries of the module-level `metrics` list
    is tested once (upper triangle of the correlation matrix).

    Args:
        correlation_matrices: Mapping from cohort label to correlation DataFrame.
        cohort1: Label of the first cohort.
        cohort2: Label of the second cohort.
        threshold: Critical |z| value; the default 1.96 corresponds to a
            two-sided test at the 5% level.
    """
    for i, metric1 in enumerate(metrics):
        for metric2 in metrics[i + 1:]:
            z_score = compute_statistical_correlation_differences(correlation_matrices, cohort1, cohort2, metric1, metric2)
            # Two-sided: a strongly negative z is as significant as a strongly
            # positive one. NaN scores compare False and are skipped.
            if abs(z_score) > threshold:
                print(f"Correlation {cohort1}-{cohort2} for {metric1}-{metric2}: {z_score}")


In [None]:
# Print the metric pairs whose correlation z-score between T0 and T4 passes the threshold
find_cohorts_correlation_difference(correlation_matrices, 'T0', 'T4')

In [None]:
# Print the metric pairs whose correlation z-score between T0 and T1 passes the threshold
find_cohorts_correlation_difference(correlation_matrices, 'T0', 'T1')


In [None]:
# Metric correlation matrix for cohort T0
get_cohort_metric_correlation(df_cohorted, 'T0')

In [None]:
# Metric correlation matrix for cohort T1
get_cohort_metric_correlation(df_cohorted, 'T1')

In [None]:
# Metric correlation matrix for cohort T2
get_cohort_metric_correlation(df_cohorted, 'T2')

In [None]:
# Metric correlation matrix for cohort T3
get_cohort_metric_correlation(df_cohorted, 'T3')

In [None]:
# Metric correlation matrix for cohort T4
get_cohort_metric_correlation(df_cohorted, 'T4')

### Comparing cohort behavior on their first day

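$$Z = \frac{(\bar{X}_1 - \bar{X}_2) - \mu}{\sqrt{\sigma_1^2 + \sigma_2^2}}$$

where $\bar{X}_i$ and $\sigma_i^2$ are the mean and variance of a metric in cohort $i$, and $\mu$ is the hypothesized difference between the cohorts (0 here).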

In [None]:
# Preview the cohorted frame before filtering it down to first-day rows
df_cohorted.head()

In [None]:
# Keep only the rows whose event date equals the user's first-open date
is_first_day = df_cohorted['first_open_date'].eq(df_cohorted['event_date'])
df_firstday = df_cohorted.loc[is_first_day]

In [None]:
# --- split the first-day rows into the two tiers under comparison,
# dropping the identifier / date / label columns from each subset
tierA, tierB = 'T0', 'T3'
is_tierA = df_firstday['cohort'] == tierA
is_tierB = df_firstday['cohort'] == tierB
df_tierA, df_tierB = (
    df_firstday.loc[tier_mask].drop(
        columns=['user_pseudo_id', 'os', 'first_open_date', 'event_date', 'time_bucket', 'cohort']
    )
    for tier_mask in (is_tierA, is_tierB)
)



In [None]:
# Column-wise standardized difference of first-day means between tier A and
# tier B: (mean_A - mean_B) / sqrt(var_A + var_B).
# NOTE(review): the variances are not divided by the sample sizes, so this is an
# effect-size-style ratio rather than a two-sample z statistic of the means —
# confirm the intent before comparing it against thresholds such as 1.96.
(df_tierA.mean() - df_tierB.mean()) / (np.sqrt(df_tierA.var() + df_tierB.var()))

In [None]:
from scipy.stats import zscore

# Standardize every column against the pooled (tier A + tier B) distribution.
# Z-scoring each tier on its own centres every column at 0 within that tier,
# so the difference of the mean z-scores would be ~0 by construction.
df_pooled = pd.concat([df_tierA, df_tierB])
z_pooled = df_pooled.apply(zscore)

# Rows were concatenated as [tier A, tier B]; split them back by position
n_tierA = len(df_tierA)
z_tierA = z_pooled.iloc[:n_tierA]
z_tierB = z_pooled.iloc[n_tierA:]

# Positive value: the column is higher for tier A, in pooled standard deviations
zscore_diff = z_tierA.mean() - z_tierB.mean()


In [None]:
# --- computing the average user for T4