# Event_params formatting and visualization analysis

In [73]:
#Import pandas, matplotlib.pyplot, and seaborn 
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
import os
import json

plt.style.use(['dark_background'])

from library.sb_utils import save_file

In [2]:
with open('./data/hymnal_data_export.json', encoding="utf8") as f:
    data = f.readlines()
    data = [json.loads(line) for line in data]

In [3]:
df = pd.json_normalize(data, max_level=1)
df.head(1).T

Unnamed: 0,0
event_date,20201107
event_timestamp,1604801718108000
event_name,session_start
event_params,"[{'key': 'firebase_event_origin', 'value': {'s..."
event_previous_timestamp,1604785674744000
event_bundle_sequence_id,13
event_server_timestamp_offset,754702
user_pseudo_id,cb052c8ce7b261aecf783ce043089fb3
user_properties,"[{'key': 'ga_session_id', 'value': {'int_value..."
user_first_touch_timestamp,1562977643627000


In [4]:
#Creating index column for merging reference later on
df['index1'] = df.index
df.head(1).T

Unnamed: 0,0
event_date,20201107
event_timestamp,1604801718108000
event_name,session_start
event_params,"[{'key': 'firebase_event_origin', 'value': {'s..."
event_previous_timestamp,1604785674744000
event_bundle_sequence_id,13
event_server_timestamp_offset,754702
user_pseudo_id,cb052c8ce7b261aecf783ce043089fb3
user_properties,"[{'key': 'ga_session_id', 'value': {'int_value..."
user_first_touch_timestamp,1562977643627000


In [5]:
# Opening json file by reading each line as a new object
with open('./data/hymnal_data_formatted.json', encoding="utf8") as f:
    data1 = f.readlines()
    data1 = [json.loads(line) for line in data1]

In [6]:
df_event_params = pd.json_normalize(
    data1, 
    record_path='event_params',
    record_prefix='params_',
    meta=['index1'],
    max_level=1
    )
print(df_event_params.params_key.unique())
df_event_params 

['firebase_event_origin' 'ga_session_id' 'engaged_session_event'
 'session_engaged' 'ga_session_number' 'firebase_screen_id'
 'firebase_screen_class' 'engagement_time_msec' 'entrances' 'freeride'
 'firebase_previous_id' 'firebase_previous_class' 'item_subcategory'
 'recommended' 'item_category' 'item_name' 'item_number' 'item_id'
 'debug_event' 'timestamp' 'previous_os_version' 'campaign_info_source'
 'medium' 'source' 'system_app' 'system_app_update'
 'update_with_analytics' 'firebase_conversion' 'previous_first_open_count'
 'search_term' 'search_book' 'search_type' 'search_subcategory'
 'search_category' 'previous_app_version']


Unnamed: 0,params_key,params_value.string_value,params_value.int_value,params_value.double_value,index1
0,firebase_event_origin,auto,,,0
1,ga_session_id,,1604801718,,0
2,engaged_session_event,,1,,0
3,session_engaged,,1,,0
4,ga_session_number,,8,,0
...,...,...,...,...,...
86127,recommended,,0,,10938
86128,firebase_screen_class,MainActivity,,,10938
86129,item_subcategory,Life in Eternity,,,10938
86130,ga_session_number,,121,,10938


In [7]:
df_event_params.params_key.value_counts()

firebase_event_origin        10939
ga_session_number            10921
ga_session_id                10921
engaged_session_event        10645
firebase_screen_id           10486
firebase_screen_class        10486
engagement_time_msec          4707
firebase_previous_id          3629
firebase_previous_class       3629
recommended                    974
item_id                        974
item_number                    974
item_name                      974
item_category                  951
item_subcategory               904
search_book                    802
search_type                    802
search_term                    802
entrances                      473
session_engaged                452
debug_event                    242
freeride                       212
search_category                 59
search_subcategory              59
previous_os_version             57
timestamp                       11
update_with_analytics           10
firebase_conversion             10
previous_first_open_

In [8]:
df_event_params = df_event_params.set_index(['index1', 'params_key'])

### Creating for loops to clean event_params dataframe

In [9]:
parameters = ['firebase_event_origin', 'ga_session_id', 'engaged_session_event',
       'session_engaged', 'ga_session_number', 'firebase_screen_id',
       'firebase_screen_class', 'engagement_time_msec', 'entrances',
       'freeride', 'firebase_previous_id', 'firebase_previous_class',
       'item_subcategory', 'recommended', 'item_category', 'item_name',
       'item_number', 'item_id', 'debug_event', 
       'previous_os_version', 'campaign_info_source', 'medium', 'source',
       'system_app', 'system_app_update', 'update_with_analytics',
       'firebase_conversion', 'previous_first_open_count', 'search_term',
       'search_book', 'search_type', 'search_subcategory', 'search_category',
       'previous_app_version']
df_dict = {}

for i in parameters:
    df_name = 'df_' + i 
    df_dict[df_name] = df_event_params.xs(i, level=1, drop_level=False).reset_index(level=[0,1], drop=False)
    df_dict[df_name] = df_dict[df_name].dropna(axis=1)
    df_dict[df_name] = df_dict[df_name].rename(columns={df_dict[df_name].columns[2]:df_dict[df_name].iloc[0,1]})
    df_dict[df_name] = df_dict[df_name].drop(columns=df_dict[df_name].columns[1])

In [10]:
# Checking the shape of newly created dataframe
for i in parameters:
    print('Shape of df_' + i + ':', df_dict['df_'+i].shape) 

Shape of df_firebase_event_origin: (10939, 2)
Shape of df_ga_session_id: (10921, 2)
Shape of df_engaged_session_event: (10645, 2)
Shape of df_session_engaged: (452, 2)
Shape of df_ga_session_number: (10921, 2)
Shape of df_firebase_screen_id: (10486, 2)
Shape of df_firebase_screen_class: (10486, 2)
Shape of df_engagement_time_msec: (4707, 2)
Shape of df_entrances: (473, 2)
Shape of df_freeride: (212, 2)
Shape of df_firebase_previous_id: (3629, 2)
Shape of df_firebase_previous_class: (3629, 2)
Shape of df_item_subcategory: (904, 2)
Shape of df_recommended: (974, 2)
Shape of df_item_category: (951, 2)
Shape of df_item_name: (974, 2)
Shape of df_item_number: (974, 2)
Shape of df_item_id: (974, 2)
Shape of df_debug_event: (242, 2)
Shape of df_previous_os_version: (57, 2)
Shape of df_campaign_info_source: (3, 2)
Shape of df_medium: (3, 2)
Shape of df_source: (3, 2)
Shape of df_system_app: (3, 2)
Shape of df_system_app_update: (3, 2)
Shape of df_update_with_analytics: (10, 2)
Shape of df_fire

In [11]:
 # Timestamp was singled out since it had NaN values in columns where values existed
df_timestamp = df_event_params.xs('timestamp', level=1, drop_level=False)\
    .reset_index(level=[0,1], drop=False)\
    .drop(labels=['params_value.string_value','params_key'], axis=1)\
    .rename(columns={'params_value.int_value':'timestamp','params_value.double_value':'timestamp_double'})
df_timestamp

Unnamed: 0,index1,timestamp,timestamp_double
0,315,1604771210.0,
1,482,,1604788000.0
2,2542,1604757574.0,
3,2576,1604760358.0,
4,3465,,1604750000.0
5,4448,,1604737000.0
6,4449,,1604745000.0
7,6047,,1604802000.0
8,6048,,1604803000.0
9,8031,,1604801000.0


In [12]:
# Merging all sub-dataframes into one
df_event_params_merged = df_dict['df_'+parameters[0]]
for i in parameters:
    df_event_params_merged = df_event_params_merged.merge(df_dict['df_'+i], on='index1', how='left')

df_event_params_merged = df_event_params_merged.merge(df_timestamp, on='index1', how='left')

In [13]:
df_event_params_merged.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 10939 entries, 0 to 10938
Data columns (total 38 columns):
 #   Column                     Non-Null Count  Dtype  
---  ------                     --------------  -----  
 0   index1                     10939 non-null  int64  
 1   firebase_event_origin_x    10939 non-null  object 
 2   firebase_event_origin_y    10939 non-null  object 
 3   ga_session_id              10921 non-null  object 
 4   engaged_session_event      10645 non-null  object 
 5   session_engaged            452 non-null    object 
 6   ga_session_number          10921 non-null  object 
 7   firebase_screen_id         10486 non-null  object 
 8   firebase_screen_class      10486 non-null  object 
 9   engagement_time_msec       4707 non-null   object 
 10  entrances                  473 non-null    object 
 11  freeride                   212 non-null    object 
 12  firebase_previous_id       3629 non-null   object 
 13  firebase_previous_class    3629 non-null   obj

### Saving dataframe into csv files

In [14]:
# save the data to a new csv file
datapath = '../Project Hymnal/data'
save_file(df_event_params_merged, 'event_params_merged.csv', datapath)

Writing file.  "../Project Hymnal/data\event_params_merged.csv"


# Initial attempt at cleaning the dataframe

### Inspecting a few index using 2 different methods

In [15]:
df_event_params.iloc[df_event_params.index.get_level_values('index1') == 6]

Unnamed: 0_level_0,Unnamed: 1_level_0,params_value.string_value,params_value.int_value,params_value.double_value
index1,params_key,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
6,firebase_previous_class,ViewerActivity,,
6,firebase_previous_id,,5.886278566822991e+18,
6,firebase_screen_class,ViewerActivity,,
6,engaged_session_event,,1.0,
6,engagement_time_msec,,1713.0,
6,firebase_event_origin,auto,,
6,firebase_screen_id,,5.886278566822991e+18,
6,ga_session_number,,7.0,
6,ga_session_id,,1604785674.0,


In [16]:
df_event_params.query('index1 >= 1 and index1 <= 3')

Unnamed: 0_level_0,Unnamed: 1_level_0,params_value.string_value,params_value.int_value,params_value.double_value
index1,params_key,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
1,ga_session_number,,7.0,
1,firebase_event_origin,auto,,
1,session_engaged,,1.0,
1,engaged_session_event,,1.0,
1,ga_session_id,,1604785674.0,
2,firebase_event_origin,auto,,
2,engaged_session_event,,1.0,
2,firebase_screen_id,,5.886278566822991e+18,
2,firebase_screen_class,MainActivity,,
2,ga_session_id,,1604785674.0,


### df_event_params dtypes conversion

In [17]:
# Converting column 2 into float dtype
df_event_params[['params_value.int_value']] = df_event_params[['params_value.int_value']].apply(pd.to_numeric)

In [18]:
df_event_params.info

<bound method DataFrame.info of                              params_value.string_value  \
index1 params_key                                        
0      firebase_event_origin                      auto   
       ga_session_id                               NaN   
       engaged_session_event                       NaN   
       session_engaged                             NaN   
       ga_session_number                           NaN   
...                                                ...   
10938  recommended                                 NaN   
       firebase_screen_class              MainActivity   
       item_subcategory               Life in Eternity   
       ga_session_number                           NaN   
       item_number                                 NaN   

                              params_value.int_value  \
index1 params_key                                      
0      firebase_event_origin                     NaN   
       ga_session_id                    1.604

for every row, notna() then save to new column

In [19]:
#Function to display the full dataframe
def print_full(x):
    pd.set_option('display.max_rows', len(x))
    print(x)
    pd.reset_option('display.max_rows')

### Inspecting item params_key values

In [20]:
#df_event_params.xs('item_number', level=1, drop_level=False).info()
df_event_params.xs('item_number', level=1, drop_level=False)['params_value.int_value'].value_counts()

1.0       23
912.0     17
2128.0    15
984.0     13
1918.0    10
          ..
910.0      1
306.0      1
1586.0     1
1356.0     1
475.0      1
Name: params_value.int_value, Length: 472, dtype: int64

In [21]:
#df_event_params.xs('item_id', level=1, drop_level=False).info()
df_event_params.xs('item_id', level=1, drop_level=False)['params_value.string_value'].value_counts()

-Kz10RN3pzqTPrY8Knk1    23
-Kz13CMkpzqTPrY8Knk1    17
-LbluzPaAs1Wu5WhoO-g    15
-Kz13P3opzqTPrY8Knk1    13
-L31eQKmUrAqj7a_EFX9    10
                        ..
-Kz1-KstpzqTPrY8Knk1     1
-Kz13SuVpzqTPrY8Knk1     1
-Kz10Zp4pzqTPrY8Knk1     1
-Kz10lWgpzqTPrY8Knk1     1
-Kz11dw7pzqTPrY8Knk1     1
Name: params_value.string_value, Length: 494, dtype: int64

In [22]:
#df_event_params.xs('item_name', level=1, drop_level=False).info()
df_event_params.xs('item_name', level=1, drop_level=False)['params_value.string_value'].value_counts()

Glory be to God the Father                           23
Christ to minister is service                        17
Lord, thank You for a new day                        14
River of living water                                13
O God, Thou art the source of life                   10
                                                     ..
I've given up the world because                       1
O God of burning altar fire,                          1
Oh how sweet it is just to know my Christ!            1
How sweet is the story of Christ's boundless love     1
Savior, I by faith am touching                        1
Name: params_value.string_value, Length: 480, dtype: int64

In [23]:
#df_event_params.xs('recommended', level=1, drop_level=False).info()
df_event_params.xs('recommended', level=1, drop_level=False)['params_value.int_value'].value_counts()

0.0    971
1.0      3
Name: params_value.int_value, dtype: int64

So far only 3 users have used the recommended section. This could be an important feature of the app that needs improvement.

In [24]:
#df_event_params.xs('item_category', level=1, drop_level=False).info()
df_event_params.xs('item_category', level=1, drop_level=False)['params_value.string_value'].value_counts()

Praise of the Lord                   188
Experience of Christ                 145
Consecration                          82
Worship of the Father                 76
Longings                              62
The Church                            51
Gospel                                50
Scriptures for Singing                47
Assurance and Joy of Salvation        39
Blessing of the Trinity               34
Service                               30
Hope of Glory                         22
Fulness of the Spirit                 20
Encouragement                         19
Ultimate Manifestation                17
Comfort in Trials                     16
Experience of God                      9
Union with Christ                      8
Spiritual Warfare                      8
Study of the Word                      7
Prayer                                 7
Preaching of the Gospel                4
Various Aspects of the Inner Life      3
The Kingdom                            3
The Word of God 

In [25]:
#df_event_params.xs('item_subcategory', level=1, drop_level=False).info()
df_event_params.xs('item_subcategory', level=1, drop_level=False)['params_value.string_value'].value_counts()

Loving Him                      45
Remembrance of Him              43
General                         32
His Love                        31
His Redemption                  25
                                ..
Functioning                      1
Christ as Our Burnt Offering     1
A Channel of Blessing            1
His Preciousness                 1
A Wonderful Savior               1
Name: params_value.string_value, Length: 196, dtype: int64

### Reformating and merging all item params_key into a single dataframe

In [26]:
df_event_params.head(10)

Unnamed: 0_level_0,Unnamed: 1_level_0,params_value.string_value,params_value.int_value,params_value.double_value
index1,params_key,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
0,firebase_event_origin,auto,,
0,ga_session_id,,1604802000.0,
0,engaged_session_event,,1.0,
0,session_engaged,,1.0,
0,ga_session_number,,8.0,
1,ga_session_number,,7.0,
1,firebase_event_origin,auto,,
1,session_engaged,,1.0,
1,engaged_session_event,,1.0,
1,ga_session_id,,1604786000.0,


In [27]:
# Cleaning and reformatting all item params_key
df_item_number = df_event_params.xs('item_number', level=1, drop_level=False)\
    .reset_index(level=[0,1], drop=False)\
    .drop(labels=['params_value.string_value','params_value.double_value','params_key'], axis=1)\
    .rename(columns={'params_value.int_value':'item_number'})
df_item_id = df_event_params.xs('item_id', level=1, drop_level=False)\
    .reset_index(level=[0,1], drop=False)\
    .drop(labels=['params_value.int_value','params_value.double_value','params_key'], axis=1)\
    .rename(columns={'params_value.string_value':'item_id'})
df_recommended = df_event_params.xs('recommended', level=1, drop_level=False)\
    .reset_index(level=[0,1], drop=False)\
    .drop(labels=['params_value.string_value','params_value.double_value','params_key'], axis=1)\
    .rename(columns={'params_value.int_value':'recommended'})
df_item_name = df_event_params.xs('item_name', level=1, drop_level=False)\
    .reset_index(level=[0,1], drop=False)\
    .drop(labels=['params_value.int_value','params_value.double_value','params_key'], axis=1)\
    .rename(columns={'params_value.string_value':'item_name'})
df_item_category = df_event_params.xs('item_category', level=1, drop_level=False)\
    .reset_index(level=[0,1], drop=False)\
    .drop(labels=['params_value.int_value','params_value.double_value','params_key'], axis=1)\
    .rename(columns={'params_value.string_value':'item_category'})
df_item_subcategory = df_event_params.xs('item_subcategory', level=1, drop_level=False)\
    .reset_index(level=[0,1], drop=False)\
    .drop(labels=['params_value.int_value','params_value.double_value','params_key'], axis=1)\
    .rename(columns={'params_value.string_value':'item_subcategory'})

In [28]:
# Checking uniformity of the dataframe
print('Shape of df_item_number', df_item_number.shape)
print('Shape of df_item_id', df_item_id.shape)
print('Shape of df_recommended', df_recommended.shape)
print('Shape of df_item_name', df_item_name.shape)
print('Shape of df_item_category', df_item_category.shape)
print('Shape of df_item_subcategory', df_item_subcategory.shape)

Shape of df_item_number (974, 2)
Shape of df_item_id (974, 2)
Shape of df_recommended (974, 2)
Shape of df_item_name (974, 2)
Shape of df_item_category (951, 2)
Shape of df_item_subcategory (904, 2)


In [29]:
# Merging all item params_key into 1 dataframe
df_item = df_item_number.merge(df_item_id, how='left')\
    .merge(df_recommended, how='left')\
    .merge(df_item_name, how='left')\
    .merge(df_item_category, how='left')\
    .merge(df_item_subcategory, how='left')
print(df_item.shape)
print(df_item.info())
df_item.head(3)

(974, 7)
<class 'pandas.core.frame.DataFrame'>
Int64Index: 974 entries, 0 to 973
Data columns (total 7 columns):
 #   Column            Non-Null Count  Dtype  
---  ------            --------------  -----  
 0   index1            974 non-null    int64  
 1   item_number       974 non-null    float64
 2   item_id           974 non-null    object 
 3   recommended       974 non-null    float64
 4   item_name         974 non-null    object 
 5   item_category     951 non-null    object 
 6   item_subcategory  904 non-null    object 
dtypes: float64(2), int64(1), object(4)
memory usage: 60.9+ KB
None


Unnamed: 0,index1,item_number,item_id,recommended,item_name,item_category,item_subcategory
0,14,475.0,-Kz11vltpzqTPrY8Knk1,0.0,"One with Thee, Thou Son eternal",Union with Christ,One with Him
1,37,643.0,-Kz12SGxpzqTPrY8Knk1,0.0,Take time to behold Him,Encouragement,For Fellowship with the Lord
2,38,643.0,-LCY45EBnrPPuNjAi5l_,0.0,"Take time to behold Him,",Encouragement,For Fellowship with the Lord


# Inspecting search params_key values

In [33]:
# Taking a look at search_term parameter
df_event_params.xs('search_term', level=1, drop_level=False)['params_value.string_value'].value_counts()

912                  14
984                  11
381                   9
1251                  9
203                   7
                     ..
lord thou art wo      1
draw me dear lord     1
i have fough          1
i must press on       1
now the lord          1
Name: params_value.string_value, Length: 543, dtype: int64

There are 543 unique search terms.

Here I can see a problem whereby the search_term consists of numerics and strings in the same column. 

In [34]:
# Taking a look at search_type paramater
df_event_params.xs('search_type', level=1, drop_level=False)['params_value.string_value'].value_counts()

alphabetical    520
numerical       282
Name: params_value.string_value, dtype: int64

In [35]:
# getting an idea of what values are in search_book parameter
df_event_params.xs('search_book', level=1, drop_level=False)['params_value.string_value'].value_counts()

All          764
Hymnal        22
New Songs      9
History        4
Updates        3
Name: params_value.string_value, dtype: int64

### Reformating and merging all search params_key into a single dataframe

In [36]:
# Cleaning and reformatting all search params_key
df_search_term = df_event_params.xs('search_term', level=1, drop_level=False)\
    .reset_index(level=[0,1], drop=False)\
    .drop(labels=['params_value.int_value','params_value.double_value','params_key'], axis=1)\
    .rename(columns={'params_value.string_value':'search_term'})
df_search_type = df_event_params.xs('search_type', level=1, drop_level=False)\
    .reset_index(level=[0,1], drop=False)\
    .drop(labels=['params_value.int_value','params_value.double_value','params_key'], axis=1)\
    .rename(columns={'params_value.string_value':'search_type'})
df_search_book = df_event_params.xs('search_book', level=1, drop_level=False)\
    .reset_index(level=[0,1], drop=False)\
    .drop(labels=['params_value.int_value','params_value.double_value','params_key'], axis=1)\
    .rename(columns={'params_value.string_value':'search_book'})

In [37]:
# Checking uniformity of the dataframe
print('Shape of df_search_term', df_search_term.shape)
print('Shape of df_search_type', df_search_type.shape)
print('Shape of df_search_book', df_search_book.shape)

Shape of df_search_term (802, 2)
Shape of df_search_type (802, 2)
Shape of df_search_book (802, 2)


In [38]:
# Merging all search params_key into 1 dataframe
df_search = df_search_term.merge(df_search_type, how='left').merge(df_search_book, how='left')
df_search

Unnamed: 0,index1,search_term,search_type,search_book
0,448,753,numerical,All
1,451,547,numerical,New Songs
2,455,547,numerical,All
3,457,547,numerical,New Songs
4,465,547,numerical,All
...,...,...,...,...
797,10623,132,numerical,All
798,10658,52,numerical,All
799,10721,thank you for,alphabetical,All
800,10751,before the throne of,alphabetical,All


In [39]:
# Taking a look at column dtypes and null values
df_search.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 802 entries, 0 to 801
Data columns (total 4 columns):
 #   Column       Non-Null Count  Dtype 
---  ------       --------------  ----- 
 0   index1       802 non-null    int64 
 1   search_term  802 non-null    object
 2   search_type  802 non-null    object
 3   search_book  802 non-null    object
dtypes: int64(1), object(3)
memory usage: 31.3+ KB


In [40]:
# Taking a look at the unique search_terms and distribution
print(df_search.search_term.value_counts())
df_search.search_term.value_counts(normalize=True).head()

912                  14
984                  11
381                   9
1251                  9
203                   7
                     ..
lord thou art wo      1
draw me dear lord     1
i have fough          1
i must press on       1
now the lord          1
Name: search_term, Length: 543, dtype: int64


912     0.017456
984     0.013716
381     0.011222
1251    0.011222
203     0.008728
Name: search_term, dtype: float64

# Taking a look into the search_(sub)category 

In [42]:
df_event_params.xs('search_subcategory', level=1, drop_level=False)['params_value.string_value'].value_counts()

All    59
Name: params_value.string_value, dtype: int64

In [43]:
df_event_params.xs('search_category', level=1, drop_level=False)['params_value.string_value'].value_counts()

All              58
Encouragement     1
Name: params_value.string_value, dtype: int64

Looking at the unique values for these 2 parameters suggests that most users uses the all category when searching. This suggests that searching through the all category is the most picked and convenient method. Hence, there would not be a need to add a subcategory search function.

# Firebase

### firebase_event_origin

In [44]:
df_event_params.xs('firebase_event_origin', level=1, drop_level=False).info()
df_event_params.xs('firebase_event_origin', level=1, drop_level=False)['params_value.string_value'].value_counts()

<class 'pandas.core.frame.DataFrame'>
MultiIndex: 10939 entries, (0, 'firebase_event_origin') to (10938, 'firebase_event_origin')
Data columns (total 3 columns):
 #   Column                     Non-Null Count  Dtype  
---  ------                     --------------  -----  
 0   params_value.string_value  10939 non-null  object 
 1   params_value.int_value     0 non-null      float64
 2   params_value.double_value  0 non-null      float64
dtypes: float64(2), object(1)
memory usage: 695.5+ KB


auto    9163
app     1776
Name: params_value.string_value, dtype: int64

### ga_session_number

In [45]:
df_event_params.xs('ga_session_number', level=1, drop_level=False).info()
df_event_params.xs('ga_session_number', level=1, drop_level=False)['params_value.int_value'].value_counts()

<class 'pandas.core.frame.DataFrame'>
MultiIndex: 10921 entries, (0, 'ga_session_number') to (10938, 'ga_session_number')
Data columns (total 3 columns):
 #   Column                     Non-Null Count  Dtype  
---  ------                     --------------  -----  
 0   params_value.string_value  0 non-null      object 
 1   params_value.int_value     10921 non-null  float64
 2   params_value.double_value  0 non-null      float64
dtypes: float64(2), object(1)
memory usage: 695.1+ KB


138.0    398
143.0    342
11.0     309
127.0    227
1.0      208
        ... 
54.0       2
142.0      2
393.0      2
415.0      2
130.0      1
Name: params_value.int_value, Length: 340, dtype: int64

### ga_session_id

In [46]:
df_event_params.xs('ga_session_id', level=1, drop_level=False).info()
df_event_params.xs('ga_session_id', level=1, drop_level=False)['params_value.int_value'].value_counts()

<class 'pandas.core.frame.DataFrame'>
MultiIndex: 10921 entries, (0, 'ga_session_id') to (10938, 'ga_session_id')
Data columns (total 3 columns):
 #   Column                     Non-Null Count  Dtype  
---  ------                     --------------  -----  
 0   params_value.string_value  0 non-null      object 
 1   params_value.int_value     10921 non-null  float64
 2   params_value.double_value  0 non-null      float64
dtypes: float64(2), object(1)
memory usage: 695.1+ KB


1.604747e+09    386
1.604747e+09    342
1.604774e+09    242
1.604794e+09    227
1.604791e+09    195
               ... 
1.604768e+09      1
1.604737e+09      1
1.604807e+09      1
1.596983e+09      1
1.604748e+09      1
Name: params_value.int_value, Length: 519, dtype: int64

### engaged_session_event

In [47]:
df_event_params.xs('engaged_session_event', level=1, drop_level=False).info()
df_event_params.xs('engaged_session_event', level=1, drop_level=False)['params_value.int_value'].value_counts()

<class 'pandas.core.frame.DataFrame'>
MultiIndex: 10645 entries, (0, 'engaged_session_event') to (10938, 'engaged_session_event')
Data columns (total 3 columns):
 #   Column                     Non-Null Count  Dtype  
---  ------                     --------------  -----  
 0   params_value.string_value  0 non-null      object 
 1   params_value.int_value     10645 non-null  float64
 2   params_value.double_value  0 non-null      float64
dtypes: float64(2), object(1)
memory usage: 687.8+ KB


1.0    10645
Name: params_value.int_value, dtype: int64

### firebase_screen_id

In [48]:
df_event_params.xs('firebase_screen_id', level=1, drop_level=False).info()
df_event_params.xs('firebase_screen_id', level=1, drop_level=False)['params_value.int_value'].value_counts()

<class 'pandas.core.frame.DataFrame'>
MultiIndex: 10486 entries, (2, 'firebase_screen_id') to (10938, 'firebase_screen_id')
Data columns (total 3 columns):
 #   Column                     Non-Null Count  Dtype  
---  ------                     --------------  -----  
 0   params_value.string_value  0 non-null      object 
 1   params_value.int_value     10486 non-null  float64
 2   params_value.double_value  0 non-null      float64
dtypes: float64(2), object(1)
memory usage: 683.6+ KB


 6.644756e+18    385
 8.490211e+18    344
 6.576743e+18    194
 6.287030e+17    180
-2.164390e+18    134
                ... 
-6.159770e+18      1
-8.695321e+18      1
-4.885996e+18      1
 5.353056e+18      1
 4.877089e+18      1
Name: params_value.int_value, Length: 483, dtype: int64

### firebase_screen_class

In [49]:
df_event_params.xs('firebase_screen_class', level=1, drop_level=False).info()
df_event_params.xs('firebase_screen_class', level=1, drop_level=False)['params_value.string_value'].value_counts()

<class 'pandas.core.frame.DataFrame'>
MultiIndex: 10486 entries, (2, 'firebase_screen_class') to (10938, 'firebase_screen_class')
Data columns (total 3 columns):
 #   Column                     Non-Null Count  Dtype  
---  ------                     --------------  -----  
 0   params_value.string_value  10486 non-null  object 
 1   params_value.int_value     0 non-null      float64
 2   params_value.double_value  0 non-null      float64
dtypes: float64(2), object(1)
memory usage: 683.6+ KB


HymnListViewController             3939
HymnViewController                 3886
UIAlertController                   570
StyledPreviewController             517
CenterViewController                479
MainActivity                        407
ViewerActivity                      328
MDCBottomSheetController            214
UpdateViewController                 29
UIActivityContentViewController      29
SongbookAlertController              26
SLComposeViewController              17
SubmissionsViewController             8
OrganizeViewController                7
UIActivityViewController              7
ContainerViewController               7
ReviewViewController                  6
MeetViewController                    4
SFAirDropViewController               2
ComposeViewController                 2
SongbookViewController                2
Name: params_value.string_value, dtype: int64

### engagement_time_msec

In [50]:
df_event_params.xs('engagement_time_msec', level=1, drop_level=False).info()
df_event_params.xs('engagement_time_msec', level=1, drop_level=False)['params_value.int_value'].value_counts()

<class 'pandas.core.frame.DataFrame'>
MultiIndex: 4707 entries, (2, 'engagement_time_msec') to (10937, 'engagement_time_msec')
Data columns (total 3 columns):
 #   Column                     Non-Null Count  Dtype  
---  ------                     --------------  -----  
 0   params_value.string_value  0 non-null      object 
 1   params_value.int_value     4707 non-null   float64
 2   params_value.double_value  0 non-null      float64
dtypes: float64(2), object(1)
memory usage: 531.2+ KB


1.0        56
2.0        18
3.0        16
0.0        13
1659.0      6
           ..
11301.0     1
2157.0      1
3503.0      1
7442.0      1
1219.0      1
Name: params_value.int_value, Length: 3905, dtype: int64

### firebase_previous_id

In [51]:
df_event_params.xs('firebase_previous_id', level=1, drop_level=False).info()
df_event_params.xs('firebase_previous_id', level=1, drop_level=False)['params_value.int_value'].value_counts()

<class 'pandas.core.frame.DataFrame'>
MultiIndex: 3629 entries, (4, 'firebase_previous_id') to (10932, 'firebase_previous_id')
Data columns (total 3 columns):
 #   Column                     Non-Null Count  Dtype  
---  ------                     --------------  -----  
 0   params_value.string_value  0 non-null      object 
 1   params_value.int_value     3629 non-null   float64
 2   params_value.double_value  0 non-null      float64
dtypes: float64(2), object(1)
memory usage: 502.8+ KB


 8.490211e+18    186
 6.644756e+18    160
 6.576743e+18    114
 6.287030e+17     72
 4.539133e+17     50
                ... 
 2.758248e+18      1
 4.113373e+18      1
-4.985425e+18      1
-2.097768e+18      1
-6.381400e+18      1
Name: params_value.int_value, Length: 449, dtype: int64

### firebase_previous_class

In [52]:
df_event_params.xs('firebase_previous_class', level=1, drop_level=False).info()
df_event_params.xs('firebase_previous_class', level=1, drop_level=False)['params_value.string_value'].value_counts()

<class 'pandas.core.frame.DataFrame'>
MultiIndex: 3629 entries, (4, 'firebase_previous_class') to (10932, 'firebase_previous_class')
Data columns (total 3 columns):
 #   Column                     Non-Null Count  Dtype  
---  ------                     --------------  -----  
 0   params_value.string_value  3629 non-null   object 
 1   params_value.int_value     0 non-null      float64
 2   params_value.double_value  0 non-null      float64
dtypes: float64(2), object(1)
memory usage: 502.8+ KB


HymnViewController                 1318
HymnListViewController              963
CenterViewController                458
UIAlertController                   325
StyledPreviewController             166
MainActivity                        115
MDCBottomSheetController            104
ViewerActivity                      101
UIActivityContentViewController      17
SongbookAlertController              14
UpdateViewController                 11
SLComposeViewController               8
UIActivityViewController              7
ContainerViewController               6
OrganizeViewController                4
SubmissionsViewController             4
ReviewViewController                  3
MeetViewController                    2
SFAirDropViewController               1
ComposeViewController                 1
SongbookViewController                1
Name: params_value.string_value, dtype: int64

# Merging dataframes on index1

In [53]:
df_firebase_event_origin = df_event_params.xs('firebase_event_origin', level=1, drop_level=False)\
    .reset_index(level=[0,1], drop=False)\
    .drop(labels=['params_value.int_value','params_value.double_value','params_key'], axis=1)\
    .rename(columns={'params_value.string_value':'firebase_event_origin'})

df_ga_session_number = df_event_params.xs('ga_session_number', level=1, drop_level=False)\
    .reset_index(level=[0,1], drop=False)\
    .drop(labels=['params_value.string_value','params_value.double_value','params_key'], axis=1)\
    .rename(columns={'params_value.int_value':'ga_session_number'})

df_ga_session_id = df_event_params.xs('ga_session_id', level=1, drop_level=False)\
    .reset_index(level=[0,1], drop=False)\
    .drop(labels=['params_value.string_value','params_value.double_value','params_key'], axis=1)\
    .rename(columns={'params_value.int_value':'ga_session_id'})

df_engaged_session_event = df_event_params.xs('engaged_session_event', level=1, drop_level=False)\
    .reset_index(level=[0,1], drop=False)\
    .drop(labels=['params_value.string_value','params_value.double_value','params_key'], axis=1)\
    .rename(columns={'params_value.int_value':'engaged_session_event'})

df_firebase_screen_id = df_event_params.xs('firebase_screen_id', level=1, drop_level=False)\
    .reset_index(level=[0,1], drop=False)\
    .drop(labels=['params_value.string_value','params_value.double_value','params_key'], axis=1)\
    .rename(columns={'params_value.int_value':'firebase_screen_id'})

df_firebase_screen_class = df_event_params.xs('firebase_screen_class', level=1, drop_level=False)\
    .reset_index(level=[0,1], drop=False)\
    .drop(labels=['params_value.int_value','params_value.double_value','params_key'], axis=1)\
    .rename(columns={'params_value.string_value':'firebase_screen_class'})

df_engagement_time_msec = df_event_params.xs('engagement_time_msec', level=1, drop_level=False)\
    .reset_index(level=[0,1], drop=False)\
    .drop(labels=['params_value.string_value','params_value.double_value','params_key'], axis=1)\
    .rename(columns={'params_value.int_value':'engagement_time_msec'})

df_firebase_previous_id = df_event_params.xs('firebase_previous_id', level=1, drop_level=False)\
    .reset_index(level=[0,1], drop=False)\
    .drop(labels=['params_value.string_value','params_value.double_value','params_key'], axis=1)\
    .rename(columns={'params_value.int_value':'firebase_previous_id'})

df_firebase_previous_class = df_event_params.xs('firebase_previous_class', level=1, drop_level=False)\
    .reset_index(level=[0,1], drop=False)\
    .drop(labels=['params_value.int_value','params_value.double_value','params_key'], axis=1)\
    .rename(columns={'params_value.string_value':'firebase_previous_class'})

In [54]:
# Merging all search params_key into 1 dataframe
df_firebase = df_firebase_event_origin.\
    merge(df_ga_session_number, how='left').\
    merge(df_ga_session_id, how='left').\
    merge(df_engaged_session_event, how='left').\
    merge(df_firebase_screen_id, how='left').\
    merge(df_firebase_screen_class, how='left').\
    merge(df_engagement_time_msec, how='left').\
    merge(df_firebase_previous_id, how='left').\
    merge(df_firebase_previous_class, how='left')

In [55]:
df_firebase.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 10939 entries, 0 to 10938
Data columns (total 10 columns):
 #   Column                   Non-Null Count  Dtype  
---  ------                   --------------  -----  
 0   index1                   10939 non-null  int64  
 1   firebase_event_origin    10939 non-null  object 
 2   ga_session_number        10921 non-null  float64
 3   ga_session_id            10921 non-null  float64
 4   engaged_session_event    10645 non-null  float64
 5   firebase_screen_id       10486 non-null  float64
 6   firebase_screen_class    10486 non-null  object 
 7   engagement_time_msec     4707 non-null   float64
 8   firebase_previous_id     3629 non-null   float64
 9   firebase_previous_class  3629 non-null   object 
dtypes: float64(6), int64(1), object(3)
memory usage: 940.1+ KB


# Further inspection on remainder params_key

### Entrances

In [56]:
df_event_params.xs('entrances', level=1, drop_level=False).info()
df_event_params.xs('entrances', level=1, drop_level=False)['params_value.int_value'].value_counts()

<class 'pandas.core.frame.DataFrame'>
MultiIndex: 473 entries, (2, 'entrances') to (10936, 'entrances')
Data columns (total 3 columns):
 #   Column                     Non-Null Count  Dtype  
---  ------                     --------------  -----  
 0   params_value.string_value  0 non-null      object 
 1   params_value.int_value     473 non-null    float64
 2   params_value.double_value  0 non-null      float64
dtypes: float64(2), object(1)
memory usage: 419.6+ KB


1.0    473
Name: params_value.int_value, dtype: int64

### Session_engaged

In [57]:
df_event_params.xs('session_engaged', level=1, drop_level=False).info()
df_event_params.xs('session_engaged', level=1, drop_level=False)['params_value.int_value'].value_counts()

<class 'pandas.core.frame.DataFrame'>
MultiIndex: 452 entries, (0, 'session_engaged') to (10934, 'session_engaged')
Data columns (total 3 columns):
 #   Column                     Non-Null Count  Dtype  
---  ------                     --------------  -----  
 0   params_value.string_value  0 non-null      object 
 1   params_value.int_value     452 non-null    float64
 2   params_value.double_value  0 non-null      float64
dtypes: float64(2), object(1)
memory usage: 419.0+ KB


1.0    452
Name: params_value.int_value, dtype: int64

### debug_event

In [58]:
df_event_params.xs('debug_event', level=1, drop_level=False).info()
df_event_params.xs('debug_event', level=1, drop_level=False)['params_value.int_value'].value_counts()

<class 'pandas.core.frame.DataFrame'>
MultiIndex: 242 entries, (43, 'debug_event') to (284, 'debug_event')
Data columns (total 3 columns):
 #   Column                     Non-Null Count  Dtype  
---  ------                     --------------  -----  
 0   params_value.string_value  0 non-null      object 
 1   params_value.int_value     242 non-null    float64
 2   params_value.double_value  0 non-null      float64
dtypes: float64(2), object(1)
memory usage: 413.5+ KB


1.0    242
Name: params_value.int_value, dtype: int64

### freeride

In [59]:
df_event_params.xs('freeride', level=1, drop_level=False).info()
df_event_params.xs('freeride', level=1, drop_level=False)['params_value.int_value'].value_counts()

<class 'pandas.core.frame.DataFrame'>
MultiIndex: 212 entries, (3, 'freeride') to (10931, 'freeride')
Data columns (total 3 columns):
 #   Column                     Non-Null Count  Dtype  
---  ------                     --------------  -----  
 0   params_value.string_value  0 non-null      object 
 1   params_value.int_value     212 non-null    float64
 2   params_value.double_value  0 non-null      float64
dtypes: float64(2), object(1)
memory usage: 412.7+ KB


1.0    212
Name: params_value.int_value, dtype: int64

### previous_os_version

In [60]:
df_event_params.xs('previous_os_version', level=1, drop_level=False).info()
df_event_params.xs('previous_os_version', level=1, drop_level=False)['params_value.string_value'].value_counts()

<class 'pandas.core.frame.DataFrame'>
MultiIndex: 57 entries, (317, 'previous_os_version') to (10502, 'previous_os_version')
Data columns (total 3 columns):
 #   Column                     Non-Null Count  Dtype  
---  ------                     --------------  -----  
 0   params_value.string_value  57 non-null     object 
 1   params_value.int_value     0 non-null      float64
 2   params_value.double_value  0 non-null      float64
dtypes: float64(2), object(1)
memory usage: 408.6+ KB


13.7      21
14.0.1    11
13.6.1     6
14.1       3
14.0       3
10         2
13.3       2
13.6       2
13.3.1     2
11.1.2     1
9          1
8.0.0      1
13.5.1     1
12.4.8     1
Name: params_value.string_value, dtype: int64

### timestamp

In [61]:
df_event_params.xs('timestamp', level=1, drop_level=False).info()
df_event_params.xs('timestamp', level=1, drop_level=False)['params_value.int_value'].value_counts()

<class 'pandas.core.frame.DataFrame'>
MultiIndex: 11 entries, (315, 'timestamp') to (8032, 'timestamp')
Data columns (total 3 columns):
 #   Column                     Non-Null Count  Dtype  
---  ------                     --------------  -----  
 0   params_value.string_value  0 non-null      object 
 1   params_value.int_value     3 non-null      float64
 2   params_value.double_value  8 non-null      float64
dtypes: float64(2), object(1)
memory usage: 407.4+ KB


1.604771e+09    1
1.604760e+09    1
1.604758e+09    1
Name: params_value.int_value, dtype: int64

In [62]:
df_event_params.xs('timestamp', level=1, drop_level=False)['params_value.double_value'].value_counts()

1.604803e+09    1
1.604737e+09    1
1.604750e+09    1
1.604800e+09    1
1.604802e+09    1
1.604788e+09    1
1.604801e+09    1
1.604745e+09    1
Name: params_value.double_value, dtype: int64

### update_with_analytics

In [63]:
df_event_params.xs('update_with_analytics', level=1, drop_level=False).info()
df_event_params.xs('update_with_analytics', level=1, drop_level=False)['params_value.int_value'].value_counts()

<class 'pandas.core.frame.DataFrame'>
MultiIndex: 10 entries, (418, 'update_with_analytics') to (10645, 'update_with_analytics')
Data columns (total 3 columns):
 #   Column                     Non-Null Count  Dtype  
---  ------                     --------------  -----  
 0   params_value.string_value  0 non-null      object 
 1   params_value.int_value     10 non-null     float64
 2   params_value.double_value  0 non-null      float64
dtypes: float64(2), object(1)
memory usage: 407.4+ KB


1.0    5
0.0    5
Name: params_value.int_value, dtype: int64

### firebase_conversion

In [64]:
df_event_params.xs('firebase_conversion', level=1, drop_level=False).info()
df_event_params.xs('firebase_conversion', level=1, drop_level=False)['params_value.int_value'].value_counts()

<class 'pandas.core.frame.DataFrame'>
MultiIndex: 10 entries, (418, 'firebase_conversion') to (10645, 'firebase_conversion')
Data columns (total 3 columns):
 #   Column                     Non-Null Count  Dtype  
---  ------                     --------------  -----  
 0   params_value.string_value  0 non-null      object 
 1   params_value.int_value     10 non-null     float64
 2   params_value.double_value  0 non-null      float64
dtypes: float64(2), object(1)
memory usage: 407.4+ KB


1.0    10
Name: params_value.int_value, dtype: int64

### previous_first_open_count

In [65]:
df_event_params.xs('previous_first_open_count', level=1, drop_level=False).info()
df_event_params.xs('previous_first_open_count', level=1, drop_level=False)['params_value.int_value'].value_counts()

<class 'pandas.core.frame.DataFrame'>
MultiIndex: 10 entries, (418, 'previous_first_open_count') to (10645, 'previous_first_open_count')
Data columns (total 3 columns):
 #   Column                     Non-Null Count  Dtype  
---  ------                     --------------  -----  
 0   params_value.string_value  0 non-null      object 
 1   params_value.int_value     10 non-null     float64
 2   params_value.double_value  0 non-null      float64
dtypes: float64(2), object(1)
memory usage: 407.4+ KB


0.0    10
Name: params_value.int_value, dtype: int64

### system_app

In [66]:
df_event_params.xs('system_app', level=1, drop_level=False).info()
df_event_params.xs('system_app', level=1, drop_level=False)['params_value.int_value'].value_counts()

<class 'pandas.core.frame.DataFrame'>
MultiIndex: 3 entries, (418, 'system_app') to (4902, 'system_app')
Data columns (total 3 columns):
 #   Column                     Non-Null Count  Dtype  
---  ------                     --------------  -----  
 0   params_value.string_value  0 non-null      object 
 1   params_value.int_value     3 non-null      float64
 2   params_value.double_value  0 non-null      float64
dtypes: float64(2), object(1)
memory usage: 407.2+ KB


0.0    3
Name: params_value.int_value, dtype: int64

### medium

In [67]:
df_event_params.xs('medium', level=1, drop_level=False).info()
df_event_params.xs('medium', level=1, drop_level=False)['params_value.string_value'].value_counts()

<class 'pandas.core.frame.DataFrame'>
MultiIndex: 3 entries, (412, 'medium') to (4904, 'medium')
Data columns (total 3 columns):
 #   Column                     Non-Null Count  Dtype  
---  ------                     --------------  -----  
 0   params_value.string_value  3 non-null      object 
 1   params_value.int_value     0 non-null      float64
 2   params_value.double_value  0 non-null      float64
dtypes: float64(2), object(1)
memory usage: 407.2+ KB


organic    3
Name: params_value.string_value, dtype: int64

### source

In [68]:
df_event_params.xs('source', level=1, drop_level=False).info()
df_event_params.xs('source', level=1, drop_level=False)['params_value.string_value'].value_counts()

<class 'pandas.core.frame.DataFrame'>
MultiIndex: 3 entries, (412, 'source') to (4904, 'source')
Data columns (total 3 columns):
 #   Column                     Non-Null Count  Dtype  
---  ------                     --------------  -----  
 0   params_value.string_value  3 non-null      object 
 1   params_value.int_value     0 non-null      float64
 2   params_value.double_value  0 non-null      float64
dtypes: float64(2), object(1)
memory usage: 407.2+ KB


google-play    3
Name: params_value.string_value, dtype: int64

### campaign_info_source

In [69]:
df_event_params.xs('campaign_info_source', level=1, drop_level=False).info()
df_event_params.xs('campaign_info_source', level=1, drop_level=False)['params_value.string_value'].value_counts()

<class 'pandas.core.frame.DataFrame'>
MultiIndex: 3 entries, (412, 'campaign_info_source') to (4904, 'campaign_info_source')
Data columns (total 3 columns):
 #   Column                     Non-Null Count  Dtype  
---  ------                     --------------  -----  
 0   params_value.string_value  3 non-null      object 
 1   params_value.int_value     0 non-null      float64
 2   params_value.double_value  0 non-null      float64
dtypes: float64(2), object(1)
memory usage: 407.2+ KB


referrer API v2    3
Name: params_value.string_value, dtype: int64

### system_app_update

In [70]:
df_event_params.xs('system_app_update', level=1, drop_level=False).info()
df_event_params.xs('system_app_update', level=1, drop_level=False)['params_value.int_value'].value_counts()

<class 'pandas.core.frame.DataFrame'>
MultiIndex: 3 entries, (418, 'system_app_update') to (4902, 'system_app_update')
Data columns (total 3 columns):
 #   Column                     Non-Null Count  Dtype  
---  ------                     --------------  -----  
 0   params_value.string_value  0 non-null      object 
 1   params_value.int_value     3 non-null      float64
 2   params_value.double_value  0 non-null      float64
dtypes: float64(2), object(1)
memory usage: 407.2+ KB


0.0    3
Name: params_value.int_value, dtype: int64

### previous_app_version

In [71]:
df_event_params.xs('previous_app_version', level=1, drop_level=False).info()
df_event_params.xs('previous_app_version', level=1, drop_level=False)['params_value.string_value'].value_counts()

<class 'pandas.core.frame.DataFrame'>
MultiIndex: 2 entries, (8731, 'previous_app_version') to (9567, 'previous_app_version')
Data columns (total 3 columns):
 #   Column                     Non-Null Count  Dtype  
---  ------                     --------------  -----  
 0   params_value.string_value  2 non-null      object 
 1   params_value.int_value     0 non-null      float64
 2   params_value.double_value  0 non-null      float64
dtypes: float64(2), object(1)
memory usage: 407.1+ KB


1.5.3    1
1.5.6    1
Name: params_value.string_value, dtype: int64

Having look at all the remainder params_key, there is not enough data to do anything about it. Most values are either not useful or too scarce to be analyzed.

# Merging all dataframes

In [72]:
df_event_params_merged_small = df_firebase.\
    merge(df_item, how='left').\
    merge(df_search, how='left')