Parameters:Description
https://support.google.com/firebase/answer/7061705?hl=en

In [1]:
#Import pandas, matplotlib.pyplot, and seaborn 
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
import os
import json

from library.sb_utils import save_file

# Method 3

with open('./hymnal_data_export.json', encoding="utf8") as f:
    data = f.readlines()
    data = [json.loads(line) for line in data]

df_1 = pd.json_normalize(
    data, 
    record_path='event_params',
    record_prefix='event_params_',  
    meta=['event_date', 'event_timestamp', 'event_name','device']
    )
df_device = pd.json_normalize(df_1["device"]).add_prefix('device_')
df_1 = pd.concat([df_1,df_device], axis=1)
del df_1['device']
display(df_1)

df_session_id = df_1[df_1.event_params_key == 'ga_session_id']
df_session_id = df_session_id.iloc[:,[0,2,4,5,6,7,8,9,10,11,12,13]]
df_session_id

df_session_id.sort_values(['event_params_value.int_value'], ascending=False)

df_category_id = df_1[df_1.event_params_key == 'item_category']
df_category_id = df_category_id.iloc[:,[0,1,4,5,6,7,8,9,10,11,12,13,15,16]]
df_category_id

df_event_name = df_1[df_1.event_name == 'app_remove']
#df_event_name = df_event_name.iloc[:,[0,1,2,4,5,6,7,8,9,10,11,12,13,15,16]]
df_event_name.count()

df_event_name = df_1[df_1.event_name == 'first_open']
#df_event_name = df_event_name.iloc[:,[0,1,2,4,5,6,7,8,9,10,11,12,13,15,16]]
df_event_name.count()

# Reading each nested item as new dataframe

# Checking out the json file

In [2]:
# Exploring the nested json file
df_0 = pd.read_json('./hymnal_data_export.json', lines=True)
print(df_0.columns)
df_0.head(2).T

Index(['event_date', 'event_timestamp', 'event_name', 'event_params',
       'event_previous_timestamp', 'event_bundle_sequence_id',
       'event_server_timestamp_offset', 'user_pseudo_id', 'user_properties',
       'user_first_touch_timestamp', 'device', 'geo', 'app_info',
       'traffic_source', 'stream_id', 'platform', 'items', 'ecommerce'],
      dtype='object')


Unnamed: 0,0,1
event_date,20201107,20201107
event_timestamp,1604801718108000,1604785674744000
event_name,session_start,session_start
event_params,"[{'key': 'firebase_event_origin', 'value': {'s...","[{'key': 'ga_session_number', 'value': {'int_v..."
event_previous_timestamp,1.60479e+15,1.60468e+15
event_bundle_sequence_id,13,12
event_server_timestamp_offset,754702,718754
user_pseudo_id,cb052c8ce7b261aecf783ce043089fb3,cb052c8ce7b261aecf783ce043089fb3
user_properties,"[{'key': 'ga_session_id', 'value': {'int_value...","[{'key': 'ga_session_number', 'value': {'int_v..."
user_first_touch_timestamp,1562977643627000,1562977643627000


Looking at the dataframe above, I can see which columns are nested.

In [3]:
#Checking shape of dataframe
df_0.shape

(10939, 18)

This shows that the dataframe has 10939 rows and 18 columns

In [4]:
#Checking number of users by unique user_pseudo_id
df_0.user_pseudo_id.nunique()

336

# Checking out event_name column

In [5]:
#Checking unique values for event_name
df_0.event_name.unique()

array(['session_start', 'screen_view', 'user_engagement', 'view_item',
       'os_update', 'firebase_campaign', 'first_open', 'search',
       'app_remove', 'app_update'], dtype=object)

In [6]:
#Counting sum of the unique values from event_name column
df_0.event_name.value_counts()

user_engagement      4497
screen_view          4070
view_item             974
search                802
session_start         522
os_update              57
first_open             10
firebase_campaign       3
app_remove              2
app_update              2
Name: event_name, dtype: int64

# Checking out event_timestamp column

In [7]:
df_0.event_timestamp.value_counts().sum()

10939

In [8]:
#Checking for duplicated event_timestamp to see if I could use this column to sort my data
df_0.event_timestamp.duplicated().sum()

17

The event_timestamp column showed that there are 17 duplicated rows.

# Formatting firebase event_timestamp

"Firebase.ServerValue.TIMESTAMP is not actual timestamp it is constant that will be replaced with actual value in server if you have it set into some variable."

In [9]:
df_0['event_timestamp'] = pd.to_datetime(df_0['event_timestamp'], unit='us')
df_0['event_timestamp']

0       2020-11-08 02:15:18.108000
1       2020-11-07 21:47:54.744000
2       2020-11-07 21:47:56.284001
3       2020-11-07 21:48:45.805005
4       2020-11-07 21:48:45.964006
                   ...            
10934   2020-11-07 21:10:30.866000
10935   2020-11-07 21:17:57.078004
10936   2020-11-07 21:10:31.489001
10937   2020-11-07 21:16:01.687002
10938   2020-11-07 15:54:36.799002
Name: event_timestamp, Length: 10939, dtype: datetime64[ns]

# Checking relationship between stream_id and platform columns

In [10]:
#Checking unique values in stream_id column
df_0.stream_id.unique()

array([1440534155, 1080202923], dtype=int64)

Here I see that there are only 2 values, this could be because the dataframe only consist of 2 types of platforms

In [11]:
df_0.platform.unique()

array(['ANDROID', 'IOS'], dtype=object)

As hypothesized, the stream_id is based on the type of platform. Next, I want to know which stream_id belongs to which platform

In [12]:
df_0.stream_id.value_counts()

1080202923    10124
1440534155      815
Name: stream_id, dtype: int64

In [13]:
df_0.platform.value_counts()

IOS        10124
ANDROID      815
Name: platform, dtype: int64

Now, I can clearly see which stream_id represents which platform. This also shows us which platform is more dominantly used for this application.

# Investigating event _date column 

In [14]:
df_0.event_date.unique()

array([20201107], dtype=int64)

In [15]:
df_0.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 10939 entries, 0 to 10938
Data columns (total 18 columns):
 #   Column                         Non-Null Count  Dtype         
---  ------                         --------------  -----         
 0   event_date                     10939 non-null  int64         
 1   event_timestamp                10939 non-null  datetime64[ns]
 2   event_name                     10939 non-null  object        
 3   event_params                   10939 non-null  object        
 4   event_previous_timestamp       10858 non-null  float64       
 5   event_bundle_sequence_id       10939 non-null  int64         
 6   event_server_timestamp_offset  10939 non-null  int64         
 7   user_pseudo_id                 10939 non-null  object        
 8   user_properties                10939 non-null  object        
 9   user_first_touch_timestamp     10939 non-null  int64         
 10  device                         10939 non-null  object        
 11  geo            

In [17]:
df_0.isnull().sum()

event_date                          0
event_timestamp                     0
event_name                          0
event_params                        0
event_previous_timestamp           81
event_bundle_sequence_id            0
event_server_timestamp_offset       0
user_pseudo_id                      0
user_properties                     0
user_first_touch_timestamp          0
device                              0
geo                                 0
app_info                            0
traffic_source                      0
stream_id                           0
platform                            0
items                               0
ecommerce                        9965
dtype: int64

In [18]:
df_0.describe().T

Unnamed: 0,count,mean,std,min,25%,50%,75%,max
event_date,10939.0,20201110.0,0.0,20201110.0,20201110.0,20201110.0,20201110.0,20201110.0
event_previous_timestamp,10858.0,1604661000000000.0,872874700000.0,1570506000000000.0,1604748000000000.0,1604776000000000.0,1604799000000000.0,1604819000000000.0
event_bundle_sequence_id,10939.0,1029.789,1271.314,1.0,205.0,583.0,1292.0,6949.0
event_server_timestamp_offset,10939.0,1344692.0,11256450.0,89.0,144027.5,258026.0,541230.0,257000100.0
user_first_touch_timestamp,10939.0,1573392000000000.0,20396940000000.0,1511237000000000.0,1562038000000000.0,1574576000000000.0,1587438000000000.0,1604804000000000.0
stream_id,10939.0,1107049000.0,94623500.0,1080203000.0,1080203000.0,1080203000.0,1080203000.0,1440534000.0


# Making nested event_params column into separate dataframe

In [57]:
import json 
import pandas as pd 
from pandas.io.json import json_normalize

#load json object
with open('./hymnal_data_export.json') as f:
    lines = f.read().splitlines()
df_inter = pd.DataFrame(lines)
df_inter.columns = ['json_element']
df_inter['json_element'].apply(json.loads)
df_final = pd.json_normalize(df_inter['json_element'].apply(json.loads))

In [60]:
df_user_properties = df_final['user_properties']
df_user_properties
df_user_properties.to_csv('df_user_properties_nested.csv', encoding='utf-8', index=False)

In [62]:
def flatten_json(y):
    out = {}

    def flatten(x, name=''):
        if type(x) is dict:
            for a in x:
                flatten(x[a], name + a + '_')
        elif type(x) is list:
            i = 0
            for a in x:
                flatten(a, name + str(i) + '_')
                i += 1
        else:
            out[name[:-1]] = x

    flatten(y)
    return out

In [65]:
with open('./hymnal_data_export.json', encoding="utf8") as f:
    data = f.readlines()
    data = [json.loads(line) for line in data]
    
df_event_params = pd.json_normalize(
    data, 
    record_path='event_params',
    record_prefix='params_',  
    meta=['event_date', 'event_timestamp', 'event_name']
    )
print(df_event_params.params_key.unique())
df_event_params 

['firebase_event_origin' 'ga_session_id' 'engaged_session_event'
 'session_engaged' 'ga_session_number' 'firebase_screen_id'
 'firebase_screen_class' 'engagement_time_msec' 'entrances' 'freeride'
 'firebase_previous_id' 'firebase_previous_class' 'item_subcategory'
 'recommended' 'item_category' 'item_name' 'item_number' 'item_id'
 'debug_event' 'timestamp' 'previous_os_version' 'campaign_info_source'
 'medium' 'source' 'system_app' 'system_app_update'
 'update_with_analytics' 'firebase_conversion' 'previous_first_open_count'
 'search_term' 'search_book' 'search_type' 'search_subcategory'
 'search_category' 'previous_app_version']


Unnamed: 0,params_key,params_value.string_value,params_value.int_value,params_value.double_value,event_date,event_timestamp,event_name
0,firebase_event_origin,auto,,,20201107,1604801718108000,session_start
1,ga_session_id,,1604801718,,20201107,1604801718108000,session_start
2,engaged_session_event,,1,,20201107,1604801718108000,session_start
3,session_engaged,,1,,20201107,1604801718108000,session_start
4,ga_session_number,,8,,20201107,1604801718108000,session_start
...,...,...,...,...,...,...,...
86127,recommended,,0,,20201107,1604764476799002,view_item
86128,firebase_screen_class,MainActivity,,,20201107,1604764476799002,view_item
86129,item_subcategory,Life in Eternity,,,20201107,1604764476799002,view_item
86130,ga_session_number,,121,,20201107,1604764476799002,view_item


In [16]:
df_event_params.params_key.value_counts()

firebase_event_origin        10939
ga_session_id                10921
ga_session_number            10921
engaged_session_event        10645
firebase_screen_id           10486
firebase_screen_class        10486
engagement_time_msec          4707
firebase_previous_class       3629
firebase_previous_id          3629
item_id                        974
item_name                      974
item_number                    974
recommended                    974
item_category                  951
item_subcategory               904
search_type                    802
search_book                    802
search_term                    802
entrances                      473
session_engaged                452
debug_event                    242
freeride                       212
search_category                 59
search_subcategory              59
previous_os_version             57
timestamp                       11
previous_first_open_count       10
firebase_conversion             10
update_with_analytic

item_name:song_title

item_number: song_number


# Creating separate DataFrames for nested user_properties

In [207]:
df_test = pd.DataFrame(df_0.user_properties)
df_test

Unnamed: 0,user_properties
0,"[{'key': 'ga_session_id', 'value': {'int_value..."
1,"[{'key': 'ga_session_number', 'value': {'int_v..."
2,"[{'key': 'ga_session_id', 'value': {'int_value..."
3,"[{'key': 'ga_session_id', 'value': {'int_value..."
4,"[{'key': 'ga_session_id', 'value': {'int_value..."
...,...
10934,"[{'key': 'first_open_time', 'value': {'int_val..."
10935,"[{'key': 'first_open_time', 'value': {'int_val..."
10936,"[{'key': 'ga_session_id', 'value': {'int_value..."
10937,"[{'key': 'ga_session_number', 'value': {'int_v..."


In [17]:
df_user_properties = pd.json_normalize(
    data, 
    record_path='user_properties',
    record_prefix='user_',  
    meta=['event_date', 'event_timestamp', 'event_name']
    )
print(df_user_properties.user_key.unique())
df_user_properties

['ga_session_id' 'first_open_time' 'ga_session_number']


Unnamed: 0,user_key,user_value.int_value,user_value.set_timestamp_micros,event_date,event_timestamp,event_name
0,ga_session_id,1604801718,1604801718108000,20201107,1604801718108000,session_start
1,first_open_time,1562979600000,1562977643627000,20201107,1604801718108000,session_start
2,ga_session_number,8,1604801718108000,20201107,1604801718108000,session_start
3,ga_session_number,7,1604785674744000,20201107,1604785674744000,session_start
4,ga_session_id,1604785674,1604785674744000,20201107,1604785674744000,session_start
...,...,...,...,...,...,...
32776,ga_session_id,1604783430,1604783430866000,20201107,1604783761687002,user_engagement
32777,first_open_time,1578348000000,1578347879890000,20201107,1604783761687002,user_engagement
32778,ga_session_number,121,1604764472262000,20201107,1604764476799002,view_item
32779,first_open_time,1584248400000,1584248039606000,20201107,1604764476799002,view_item


In [18]:
df_user_properties.user_key.value_counts()

first_open_time      10939
ga_session_id        10921
ga_session_number    10921
Name: user_key, dtype: int64

Looking at the above count of each value, only first_open_time has the same number of rows as the original dataframe (df_0), while the other two values seemed to have missing values. 

In [19]:
df_user_properties['timestamp_count']=df_user_properties['event_timestamp'].map(df_user_properties['event_timestamp'].value_counts())
#df_user_properties['timestamp_bool'] = 
#df_user_properties['event_timestamp'].value_counts() <= 2
df_user_properties['timestamp_count'].value_counts()

3    32682
6       60
4       28
1       11
Name: timestamp_count, dtype: int64

Trying to figure out here the missing rows are for session_id and session_number

In [20]:
#Grouping user_properties dataframe by user_key
df_user_properties_grouped = df_user_properties.groupby(by='user_key')
df_user_properties_grouped.first()

Unnamed: 0_level_0,user_value.int_value,user_value.set_timestamp_micros,event_date,event_timestamp,event_name,timestamp_count
user_key,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
first_open_time,1562979600000,1562977643627000,20201107,1604801718108000,session_start,3
ga_session_id,1604801718,1604801718108000,20201107,1604801718108000,session_start,3
ga_session_number,8,1604801718108000,20201107,1604801718108000,session_start,3


In [174]:
#Splitting up each groupby groups into a separate dataframe with new column names
df_user_properties_grouped1 = df_user_properties_grouped.get_group('ga_session_id').reset_index().rename(columns = {'user_value.int_value':'ga_session_id.value', 'user_value.set_timestamp_micros':'ga_session_id.timestamp'})
df_user_properties_grouped2 = df_user_properties_grouped.get_group('ga_session_number').reset_index().rename(columns = {'user_value.int_value':'ga_session_number.value', 'user_value.set_timestamp_micros':'ga_session_number.timestamp'})
df_user_properties_grouped3 = df_user_properties_grouped.get_group('first_open_time').reset_index().rename(columns = {'user_value.int_value':'first_open_time.value', 'user_value.set_timestamp_micros':'first_open_time.timestamp'})

In [175]:
#Deleting index column
df_user_properties_grouped1.drop('index', 1, inplace=True)
df_user_properties_grouped2.drop('index', 1, inplace=True)
df_user_properties_grouped3.drop('index', 1, inplace=True)

In [176]:
print('Shape of grouped1', df_user_properties_grouped1.shape)
print('Shape of grouped2', df_user_properties_grouped2.shape)
print('Shape of grouped3', df_user_properties_grouped3.shape)
#Here I see that there are missing data in grouped1 and grouped2

Shape of grouped1 (10921, 6)
Shape of grouped2 (10921, 6)
Shape of grouped3 (10939, 6)


In [181]:
df_user_properties_grouped1.head()

Unnamed: 0,ga_session_id.value,ga_session_id.timestamp,event_date,event_timestamp,event_name,timestamp_count
0,1604801718,1604801718108000,20201107,1604801718108000,session_start,3
1,1604785674,1604785674744000,20201107,1604785674744000,session_start,3
2,1604785674,1604785674744000,20201107,1604785676284001,screen_view,3
3,1604785674,1604785674744000,20201107,1604785725805005,user_engagement,3
4,1604785674,1604785674744000,20201107,1604785725964006,screen_view,3


In [182]:
df_user_properties_grouped2.head()

Unnamed: 0,ga_session_number.value,ga_session_number.timestamp,event_date,event_timestamp,event_name,timestamp_count
0,8,1604801718108000,20201107,1604801718108000,session_start,3
1,7,1604785674744000,20201107,1604785674744000,session_start,3
2,7,1604785674744000,20201107,1604785676284001,screen_view,3
3,7,1604785674744000,20201107,1604785725805005,user_engagement,3
4,7,1604785674744000,20201107,1604785725964006,screen_view,3


# Checking for duplicated rows

In [180]:
#Checking for duplicated rows
df_user_properties_grouped1[df_user_properties_grouped1.duplicated(keep=False)]

Unnamed: 0,ga_session_id.value,ga_session_id.timestamp,event_date,event_timestamp,event_name,timestamp_count
1121,1604804764,1604804764715000,20201107,1604804795360000,user_engagement,6
1122,1604804764,1604804764715000,20201107,1604804795360000,user_engagement,6
3855,1604785413,1604785413882000,20201107,1604785438126003,screen_view,6
3865,1604785413,1604785413882000,20201107,1604785433137000,search,6
3866,1604785413,1604785413882000,20201107,1604785438125002,user_engagement,6
3867,1604785413,1604785413882000,20201107,1604785444724004,user_engagement,6
3868,1604785413,1604785413882000,20201107,1604785438125002,user_engagement,6
3888,1604785413,1604785413882000,20201107,1604785438126003,screen_view,6
3908,1604785413,1604785413882000,20201107,1604785444724004,user_engagement,6
3928,1604785413,1604785413882000,20201107,1604785433137000,search,6


In [169]:
#Checking for duplicated rows
df_user_properties_grouped2[df_user_properties_grouped2.duplicated(keep=False)]

Unnamed: 0,ga_session_number.value,ga_session_number.timestamp,event_date,event_timestamp,event_name,timestamp_count
1121,332,1604804765477000,20201107,1604804795360000,user_engagement,6
1122,332,1604804765477000,20201107,1604804795360000,user_engagement,6
3855,760,1604785414557000,20201107,1604785438126003,screen_view,6
3865,760,1604785414557000,20201107,1604785433137000,search,6
3866,760,1604785414557000,20201107,1604785438125002,user_engagement,6
3867,760,1604785414557000,20201107,1604785444724004,user_engagement,6
3868,760,1604785414557000,20201107,1604785438125002,user_engagement,6
3888,760,1604785414557000,20201107,1604785438126003,screen_view,6
3908,760,1604785414557000,20201107,1604785444724004,user_engagement,6
3928,760,1604785414557000,20201107,1604785433137000,search,6


In [191]:
#Checking for duplicated rows
df_user_properties_grouped3[df_user_properties_grouped3.duplicated(keep=False)]

Unnamed: 0,first_open_time.value,first_open_time.timestamp,event_date,event_timestamp,event_name,timestamp_count
1127,1549738800000,1549737714644000,20201107,1604804795360000,user_engagement,6
1128,1549738800000,1549737714644000,20201107,1604804795360000,user_engagement,6
3862,1560812400000,1560809397338000,20201107,1604785438126003,screen_view,6
3872,1560812400000,1560809397338000,20201107,1604785433137000,search,6
3873,1560812400000,1560809397338000,20201107,1604785438125002,user_engagement,6
3874,1560812400000,1560809397338000,20201107,1604785444724004,user_engagement,6
3875,1560812400000,1560809397338000,20201107,1604785438125002,user_engagement,6
3895,1560812400000,1560809397338000,20201107,1604785438126003,screen_view,6
3915,1560812400000,1560809397338000,20201107,1604785444724004,user_engagement,6
3935,1560812400000,1560809397338000,20201107,1604785433137000,search,6


# Merging User_properties sub dataframe

In [199]:
pd.merge(df_user_properties_grouped1,df_user_properties_grouped2, how='left', on=['event_date','event_timestamp','event_name', 'timestamp_count'])

Unnamed: 0,ga_session_id.value,ga_session_id.timestamp,event_date,event_timestamp,event_name,timestamp_count,ga_session_number.value,ga_session_number.timestamp
0,1604801718,1604801718108000,20201107,1604801718108000,session_start,3,8,1604801718108000
1,1604785674,1604785674744000,20201107,1604785674744000,session_start,3,7,1604785674744000
2,1604785674,1604785674744000,20201107,1604785676284001,screen_view,3,7,1604785674744000
3,1604785674,1604785674744000,20201107,1604785725805005,user_engagement,3,7,1604785674744000
4,1604785674,1604785674744000,20201107,1604785725964006,screen_view,3,7,1604785674744000
...,...,...,...,...,...,...,...,...
10936,1604783430,1604783430866000,20201107,1604783430866000,session_start,3,359,1604783430866000
10937,1604783430,1604783430866000,20201107,1604783877078004,user_engagement,3,359,1604783430866000
10938,1604783430,1604783430866000,20201107,1604783431489001,screen_view,3,359,1604783430866000
10939,1604783430,1604783430866000,20201107,1604783761687002,user_engagement,3,359,1604783430866000


In [192]:
#Merging all 3 grouped dataframes into grouped_all
from functools import reduce
df_user_properties_grouped_total = [df_user_properties_grouped1,df_user_properties_grouped2,df_user_properties_grouped3]
df_user_properties_grouped_all = reduce(lambda left,right: pd.merge(left,right,on=['event_date','event_timestamp','event_name']), df_user_properties_grouped_total)

"""#Removing unwanted columns
df_user_properties_grouped_all.drop(['index_x','index_y','index','event_date'],1, inplace=True)"""

"#Removing unwanted columns\ndf_user_properties_grouped_all.drop(['index_x','index_y','index','event_date'],1, inplace=True)"

In [193]:
#Rearranging columns in dataframe
df_user_properties_grouped_all = df_user_properties_grouped_all[['event_name','event_timestamp','first_open_time.value','ga_session_id.value','ga_session_number.value','first_open_time.timestamp','ga_session_id.timestamp','ga_session_number.timestamp']]
df_user_properties_grouped_all

Unnamed: 0,event_name,event_timestamp,first_open_time.value,ga_session_id.value,ga_session_number.value,first_open_time.timestamp,ga_session_id.timestamp,ga_session_number.timestamp
0,session_start,1604801718108000,1562979600000,1604801718,8,1562977643627000,1604801718108000,1604801718108000
1,session_start,1604785674744000,1562979600000,1604785674,7,1562977643627000,1604785674744000,1604785674744000
2,screen_view,1604785676284001,1562979600000,1604785674,7,1562977643627000,1604785674744000,1604785674744000
3,user_engagement,1604785725805005,1562979600000,1604785674,7,1562977643627000,1604785674744000,1604785674744000
4,screen_view,1604785725964006,1562979600000,1604785674,7,1562977643627000,1604785674744000,1604785674744000
...,...,...,...,...,...,...,...,...
10976,session_start,1604783430866000,1578348000000,1604783430,359,1578347879890000,1604783430866000,1604783430866000
10977,user_engagement,1604783877078004,1578348000000,1604783430,359,1578347879890000,1604783430866000,1604783430866000
10978,screen_view,1604783431489001,1578348000000,1604783430,359,1578347879890000,1604783430866000,1604783430866000
10979,user_engagement,1604783761687002,1578348000000,1604783430,359,1578347879890000,1604783430866000,1604783430866000


In [25]:
#Converting dtypes to int in order to match dtype in df_0 dataframe
df_user_properties_grouped_all['event_timestamp'] = pd.to_numeric(df_user_properties_grouped_all['event_timestamp'])
df_user_properties_grouped_all['first_open_time.value'] = pd.to_numeric(df_user_properties_grouped_all['first_open_time.value'])
df_user_properties_grouped_all['ga_session_id.value'] = pd.to_numeric(df_user_properties_grouped_all['ga_session_id.value'])
df_user_properties_grouped_all['ga_session_number.value'] = pd.to_numeric(df_user_properties_grouped_all['ga_session_number.value'])

df_user_properties_grouped_all.dtypes

event_name                     object
event_timestamp                 int64
first_open_time.value           int64
ga_session_id.value             int64
ga_session_number.value         int64
first_open_time.timestamp      object
ga_session_id.timestamp        object
ga_session_number.timestamp    object
dtype: object

# Merging user_properties dataframe with original dataframe df_0

In [76]:
reduce(lambda left,right: pd.merge(left,right,on=['event_timestamp','event_name']), [df_user_properties_grouped_all,df_0])

NameError: name 'reduce' is not defined

In [None]:
df_0.merge(df_userhow='left')

In [None]:
inner = pd.concat([df_0,df_user_properties_grouped_all],axis=1, join='inner')
inner

In [None]:
inner.isna().sum()

In [27]:
#Checking for similar values between columns
np.unique(np.where((df_user_properties_grouped_all['ga_session_number.timestamp'] == df_user_properties_grouped_all['event_timestamp']), True, False), return_counts=True)

(array([False]), array([10981], dtype=int64))

Here 10804 rows have non-matching columns

# Trying groupby method to rearrange user_properties dataframe to fit original dataframe

In [28]:
df_user_properties_unique=df_user_properties.groupby(['user_key','user_value.int_value']).size().reset_index().rename(columns={0:'count'})
df_user_properties_unique

Unnamed: 0,user_key,user_value.int_value,count
0,first_open_time,1511240400000,33
1,first_open_time,1513180800000,9
2,first_open_time,1514829600000,13
3,first_open_time,1517760000000,22
4,first_open_time,1517911200000,14
...,...,...,...
1178,ga_session_number,96,9
1179,ga_session_number,97,18
1180,ga_session_number,974,5
1181,ga_session_number,98,7


In [29]:
#Groupby dataframe sorted by first_open_time
df_user_properties_unique_sorted1  = df_user_properties_unique.loc[df_user_properties_unique['user_key'] == 'first_open_time'].sort_values('count')
df_user_properties_unique_sorted1

Unnamed: 0,user_key,user_value.int_value,count
174,first_open_time,1581080400000,1
229,first_open_time,1590825600000,1
244,first_open_time,1594576800000,1
163,first_open_time,1578844800000,1
65,first_open_time,1562810400000,2
...,...,...,...
35,first_open_time,1550714400000,203
27,first_open_time,1546412400000,227
311,first_open_time,1604185200000,242
215,first_open_time,1587441600000,352


In [30]:
#Here I see that the highest number of uses for a user is 386 time and the lowest at 1 with an average use of 34 times per user
df_user_properties_unique_sorted1.describe().T

Unnamed: 0,count,mean,std,min,25%,50%,75%,max
count,324.0,33.762346,46.185475,1.0,9.0,16.0,44.0,386.0


# Dataframe separated by nested items original matrix size

In [69]:
#Method 1
df_device_1 = df_0['device'].apply(pd.Series)

#Method 2
df_device = pd.json_normalize(df_0.device)
df_device

Unnamed: 0,category,mobile_brand_name,mobile_model_name,mobile_os_hardware_model,operating_system,operating_system_version,language,is_limited_ad_tracking,time_zone_offset_seconds,mobile_marketing_name,vendor_id
0,mobile,Google,Pixel,Pixel,ANDROID,10,en-us,No,-21600,,
1,mobile,Google,Pixel,Pixel,ANDROID,10,en-us,No,-21600,,
2,mobile,Google,Pixel,Pixel,ANDROID,10,en-us,No,-21600,,
3,mobile,Google,Pixel,Pixel,ANDROID,10,en-us,No,-21600,,
4,mobile,Google,Pixel,Pixel,ANDROID,10,en-us,No,-21600,,
...,...,...,...,...,...,...,...,...,...,...,...
10934,mobile,OnePlus,A6013,ONEPLUS A6013,ANDROID,10,en-us,No,-18000,6T,
10935,mobile,OnePlus,A6013,ONEPLUS A6013,ANDROID,10,en-us,No,-18000,6T,
10936,mobile,OnePlus,A6013,ONEPLUS A6013,ANDROID,10,en-us,No,-18000,6T,
10937,mobile,OnePlus,A6013,ONEPLUS A6013,ANDROID,10,en-us,No,-18000,6T,


In [32]:
df_geo = pd.json_normalize(df_0.geo)
df_geo

Unnamed: 0,continent,country,region,city,sub_continent,metro
0,Americas,United States,Texas,Austin,Northern America,(not set)
1,Americas,United States,Texas,Austin,Northern America,(not set)
2,Americas,United States,Texas,Austin,Northern America,(not set)
3,Americas,United States,Texas,Austin,Northern America,(not set)
4,Americas,United States,Texas,Austin,Northern America,(not set)
...,...,...,...,...,...,...
10934,Americas,Canada,Ontario,Toronto,Northern America,(not set)
10935,Americas,Canada,Ontario,Toronto,Northern America,(not set)
10936,Americas,Canada,Ontario,Toronto,Northern America,(not set)
10937,Americas,Canada,Ontario,Toronto,Northern America,(not set)


In [33]:
df_app_info = pd.json_normalize(df_0.app_info)
df_app_info

Unnamed: 0,id,version,firebase_app_id,install_source
0,org.livingletter.hymnal,1.1.7,1:76837103840:android:e1d753a7fbfeeaac,com.android.vending
1,org.livingletter.hymnal,1.1.7,1:76837103840:android:e1d753a7fbfeeaac,com.android.vending
2,org.livingletter.hymnal,1.1.7,1:76837103840:android:e1d753a7fbfeeaac,com.android.vending
3,org.livingletter.hymnal,1.1.7,1:76837103840:android:e1d753a7fbfeeaac,com.android.vending
4,org.livingletter.hymnal,1.1.7,1:76837103840:android:e1d753a7fbfeeaac,com.android.vending
...,...,...,...,...
10934,org.livingletter.hymnal,1.1.7,1:76837103840:android:e1d753a7fbfeeaac,com.android.vending
10935,org.livingletter.hymnal,1.1.7,1:76837103840:android:e1d753a7fbfeeaac,com.android.vending
10936,org.livingletter.hymnal,1.1.7,1:76837103840:android:e1d753a7fbfeeaac,com.android.vending
10937,org.livingletter.hymnal,1.1.7,1:76837103840:android:e1d753a7fbfeeaac,com.android.vending


In [34]:
df_traffic_source = pd.json_normalize(df_0.traffic_source)
df_traffic_source

Unnamed: 0,medium,source,name
0,organic,google-play,
1,organic,google-play,
2,organic,google-play,
3,organic,google-play,
4,organic,google-play,
...,...,...,...
10934,organic,google-play,
10935,organic,google-play,
10936,organic,google-play,
10937,organic,google-play,


event_date
event_timestamp

ga_session_id	int	Unique session identifier (based on the timestamp of the session_start event) associated with each event that occurs within a session
ga_session_number	int	Monotonically increasing identifier (starting with 1) of the ordinal position of a session as it relates to a user (e.g., a user's 1st or 5th session) associated with each event that occurs in a session


Using ga session id as the grouping element,order by 

List of dataframes by nested 

- df_device
- df_geo
- df_app_info
- df_traffic_source

# Saving sub DataFrames into csv files

In [200]:
# save the data to a new csv file
datapath = '../Project Hymnal/data'
save_file(df_device, 'device.csv', datapath)
save_file(df_geo, 'geo.csv', datapath)
save_file(df_app_info, 'app_info.csv', datapath)
save_file(df_traffic_source, 'traffic_source.csv', datapath)

A file already exists with this name.

Do you want to overwrite? (Y/N)y
Writing file.  "../Project Hymnal/data\device.csv"
Writing file.  "../Project Hymnal/data\geo.csv"
Writing file.  "../Project Hymnal/data\app_info.csv"
Writing file.  "../Project Hymnal/data\traffic_source.csv"
