###  Data Loading & Preprocessing

In [1]:
import pandas as pd
import numpy as np
from google.cloud import bigquery as bq

service_account_path = "/home/yusuf/DataScience/dream_games/ybektas20.json" 
client = bq.Client.from_service_account_json(service_account_path)

query = """
SELECT
  table_name,
  column_name,
  data_type,
  is_nullable,
  ordinal_position
FROM `casedreamgames.case_db.INFORMATION_SCHEMA.COLUMNS`
ORDER BY table_name, ordinal_position;
"""
db_info = client.query(query).result().to_dataframe()
db_info.groupby(["table_name"]).apply(lambda x: print(x), include_groups=False)



  column_name data_type is_nullable  ordinal_position
0        date      DATE         YES                 1
1     network    STRING         YES                 2
2     country    STRING         YES                 3
3    platform    STRING         YES                 4
4        cost   FLOAT64         YES                 5
  column_name  data_type is_nullable  ordinal_position
5  event_time  TIMESTAMP         YES                 1
6     user_id     STRING         YES                 2
7    platform     STRING         YES                 3
8     network     STRING         YES                 4
9     country     STRING         YES                 5
   column_name  data_type is_nullable  ordinal_position
10  event_time  TIMESTAMP         YES                 1
11     user_id     STRING         YES                 2
12    platform     STRING         YES                 3
13       level      INT64         YES                 4
14      status     STRING         YES                 5
15  time_s

In [2]:
# Define queries for each Q1 table
q1_tables_queries = {
    "q1_table_install": """
        SELECT
          COUNT(*) AS total_rows,
          COUNT(user_id) AS non_null_user_id,
          COUNT(platform) AS non_null_platform,
          COUNT(network) AS non_null_network,
          COUNT(country) AS non_null_country,
          COUNT(event_time) AS non_null_event_time
        FROM `casedreamgames.case_db.q1_table_install`;
    """,
    "q1_table_level_end": """
        SELECT
          COUNT(*) AS total_rows,
          COUNT(user_id) AS non_null_user_id,
          COUNT(platform) AS non_null_platform,
          COUNT(level) AS non_null_level,
          COUNT(status) AS non_null_status,
          COUNT(time_spent) AS non_null_time_spent,
          COUNT(moves_made) AS non_null_moves_made,
          COUNT(moves_left) AS non_null_moves_left,
          COUNT(event_time) AS non_null_event_time
        FROM `casedreamgames.case_db.q1_table_level_end`;
    """,
    "q1_table_session": """
        SELECT
          COUNT(*) AS total_rows,
          COUNT(user_id) AS non_null_user_id,
          COUNT(platform) AS non_null_platform,
          COUNT(coin_status) AS non_null_coin_status,
          COUNT(time_spent) AS non_null_time_spent,
          COUNT(level) AS non_null_level,
          COUNT(event_time) AS non_null_event_time
        FROM `casedreamgames.case_db.q1_table_session`;
    """,
    "q1_table_revenue": """
        SELECT
          COUNT(*) AS total_rows,
          COUNT(user_id) AS non_null_user_id,
          COUNT(platform) AS non_null_platform,
          COUNT(package_type) AS non_null_package_type,
          COUNT(revenue) AS non_null_revenue,
          COUNT(event_time) AS non_null_event_time
        FROM `casedreamgames.case_db.q1_table_revenue`;
    """,
    "q1_table_cost": """
        SELECT
          COUNT(*) AS total_rows,
          COUNT(date) AS non_null_date,
          COUNT(network) AS non_null_network,
          COUNT(platform) AS non_null_platform,
          COUNT(country) AS non_null_country,
          COUNT(cost) AS non_null_cost
        FROM `casedreamgames.case_db.q1_table_cost`;
    """
}

# Iterate through each query, run it, and print the shape and null counts per column
for table_name, query in q1_tables_queries.items():
    print(f"Results for {table_name}:")
    
    # Run the query; this assumes that 'client' is your BigQuery client (or use your analytics object)
    df = client.query(query).result().to_dataframe()
    
    # Extract total row count
    total_rows = df.loc[0, "total_rows"]
    print(f"Shape: ({total_rows} rows)")
    
    # For each column (ignoring the total_rows column), compute and print null counts
    for col in df.columns:
        if col != "total_rows":
            non_null_count = df.loc[0, col]
            null_count = total_rows - non_null_count
            # Extract original column name by removing "non_null_" prefix
            orig_col = col.replace("non_null_", "")
            print(f"Column '{orig_col}': non-null = {non_null_count}, null = {null_count}")
    
    print("\n" + "-"*50 + "\n")


Results for q1_table_install:




Shape: (217415 rows)
Column 'user_id': non-null = 217415, null = 0
Column 'platform': non-null = 217415, null = 0
Column 'network': non-null = 217415, null = 0
Column 'country': non-null = 217415, null = 0
Column 'event_time': non-null = 217415, null = 0

--------------------------------------------------

Results for q1_table_level_end:
Shape: (72044374 rows)
Column 'user_id': non-null = 72044374, null = 0
Column 'platform': non-null = 72044373, null = 1
Column 'level': non-null = 72044374, null = 0
Column 'status': non-null = 72044374, null = 0
Column 'time_spent': non-null = 72044370, null = 4
Column 'moves_made': non-null = 72044373, null = 1
Column 'moves_left': non-null = 72044370, null = 4
Column 'event_time': non-null = 72044374, null = 0

--------------------------------------------------

Results for q1_table_session:
Shape: (297358858 rows)
Column 'user_id': non-null = 297358858, null = 0
Column 'platform': non-null = 297358850, null = 8
Column 'coin_status': non-null = 2973

### User Acquisition & Daily Active Users (DAU)

In [3]:
query = """
SELECT 
    DATE(event_time) AS date,
    country,
    platform,
    network,
    COUNT(DISTINCT user_id) AS unique_users,
    COUNT(*) AS installs
FROM `casedreamgames.case_db.q1_table_install`
GROUP BY date, country, platform, network
ORDER BY date, country, platform, network;
"""

installs = client.query(query).result().to_dataframe()
installs['date'] = pd.to_datetime(installs['date'])
print(installs.head())
print(installs.tail())
installs.info()



        date  country platform  network  unique_users  installs
0 2021-04-30  Mercury  android     Buzz            85        85
1 2021-04-30  Mercury  android   Jessie             8         8
2 2021-04-30  Mercury  android  Organic            25        25
3 2021-04-30  Mercury  android      Sid             6         6
4 2021-04-30  Mercury  android    Woody            44        44
          date country platform  network  unique_users  installs
921 2021-05-31   Venus      ios     Buzz           510       510
922 2021-05-31   Venus      ios  Organic           133       133
923 2021-05-31   Venus      ios      Sid            95       124
924 2021-05-31   Venus      ios    Woody             9         9
925 2021-06-01   Venus      ios      Sid            10        14
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 926 entries, 0 to 925
Data columns (total 6 columns):
 #   Column        Non-Null Count  Dtype         
---  ------        --------------  -----         
 0   date          926

In [4]:
import plotly.express as px

# Define the grouping columns and titles for each plot
grouping_columns = [
    (["date"], "Daily Installation Counts", None),
    (["date", "country"], "Daily Installation Counts by Country", "country"),
    (["date", "platform"], "Daily Installation Counts by Platform", "platform"),
    (["date", "network"], "Daily Installation Counts by Network", "network")
]

# Loop through the grouping columns and create the plots
for group_cols, title, color in grouping_columns:
    grouped_data = installs.groupby(group_cols, as_index=False).agg({"installs": "sum"})
    
    fig = px.line(
        grouped_data,
        x="date",
        y="installs",
        color=color,
        title=title,
        markers=True
    )
    
    fig.update_layout(template="plotly_white", xaxis_title="Date", yaxis_title="Installation Count")
    fig.show()

In [5]:
query = """
WITH session_data AS (
    SELECT 
      DATE(event_time) AS date, 
      user_id
    FROM `casedreamgames.case_db.q1_table_session`
)
SELECT
    s.date,
    COALESCE(i.country, 'unknown') AS country,
    COALESCE(i.platform, 'unknown') AS platform,
    COALESCE(i.network, 'unknown') AS network,
    COUNT(DISTINCT s.user_id) AS dau
FROM session_data s
LEFT JOIN `casedreamgames.case_db.q1_table_install` i
    ON s.user_id = i.user_id
GROUP BY s.date, country, platform, network
ORDER BY s.date, country, platform, network;     
"""
dau = client.query(query).result().to_dataframe()
dau['date'] = pd.to_datetime(dau['date'])
print(dau.head())
print(dau.tail())
dau.info()



BigQuery Storage module not found, fetch data with the REST endpoint instead.



        date  country platform  network  dau
0 2021-04-30  Mercury  android     Buzz   11
1 2021-04-30  Mercury  android  Organic    8
2 2021-04-30  Mercury  android      Sid    1
3 2021-04-30  Mercury  android    Woody    7
4 2021-04-30  Mercury      ios     Buzz   20
           date country platform  network   dau
1809 2021-06-14   Venus      ios     Buzz  5646
1810 2021-06-14   Venus      ios   Jessie   115
1811 2021-06-14   Venus      ios  Organic  1461
1812 2021-06-14   Venus      ios      Sid  1294
1813 2021-06-14   Venus      ios    Woody   337
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1814 entries, 0 to 1813
Data columns (total 5 columns):
 #   Column    Non-Null Count  Dtype         
---  ------    --------------  -----         
 0   date      1814 non-null   datetime64[ns]
 1   country   1814 non-null   object        
 2   platform  1814 non-null   object        
 3   network   1814 non-null   object        
 4   dau       1814 non-null   Int64         
dtypes: Int64(

In [6]:
import pandas as pd
import plotly.express as px


grouping_columns = [
    (["date"], "Daily Active Users", None),
    (["date", "country"], "Daily Active Users by Country", "country"),
    (["date", "platform"], "Daily Active Users by Platform", "platform"),
    (["date", "network"], "Daily Active Users by Network", "network")
]

# Loop through the grouping columns and create the plots
for group_cols, title, color in grouping_columns:
    grouped_data = dau.groupby(group_cols, as_index=False).agg({"dau": "sum"})
    
    fig = px.line(
        grouped_data,
        x="date",
        y="dau",
        color=color,
        title=title,
        markers=True
    )
    
    fig.update_layout(template="plotly_white", xaxis_title="Date", yaxis_title="DAU")
    fig.show()

### User Engagement and Retention

In [7]:
query ="""
WITH installs AS (
  SELECT 
    user_id, 
    DATE(event_time) AS install_date,
    country,
    platform,
    network
  FROM `casedreamgames.case_db.q1_table_install`
),
sessions AS (
  SELECT 
    user_id, 
    DATE(event_time) AS session_date
  FROM `casedreamgames.case_db.q1_table_session`
)
SELECT
  DATE_ADD(i.install_date, INTERVAL 1 DAY) AS retention_date,
  i.country,
  i.platform,
  i.network,
  COUNT(DISTINCT i.user_id) AS installs_previous_day,
  COUNT(DISTINCT s.user_id) AS retained_today,
  SAFE_DIVIDE(COUNT(DISTINCT s.user_id), COUNT(DISTINCT i.user_id)) AS retention_rate
FROM installs i
LEFT JOIN sessions s
  ON i.user_id = s.user_id
  AND s.session_date = DATE_ADD(i.install_date, INTERVAL 1 DAY)
GROUP BY retention_date, i.country, i.platform, i.network
ORDER BY retention_date, i.country, i.platform, i.network;
"""

retention = client.query(query).result().to_dataframe()
retention['retention_date'] = pd.to_datetime(retention['retention_date'])
print(retention.head())
print(retention.tail())
retention.info()


BigQuery Storage module not found, fetch data with the REST endpoint instead.



  retention_date  country platform  network  installs_previous_day  \
0     2021-05-01  Mercury  android     Buzz                     85   
1     2021-05-01  Mercury  android   Jessie                      8   
2     2021-05-01  Mercury  android  Organic                     25   
3     2021-05-01  Mercury  android      Sid                      6   
4     2021-05-01  Mercury  android    Woody                     44   

   retained_today  retention_rate  
0              62        0.729412  
1               5        0.625000  
2              19        0.760000  
3               2        0.333333  
4              14        0.318182  
    retention_date country platform  network  installs_previous_day  \
921     2021-06-01   Venus      ios     Buzz                    510   
922     2021-06-01   Venus      ios  Organic                    133   
923     2021-06-01   Venus      ios      Sid                     95   
924     2021-06-01   Venus      ios    Woody                      9   
925     

In [8]:
import pandas as pd
import plotly.express as px

grouping_columns = [
    (["retention_date"], "Retention Rate", None),
    (["retention_date", "country"], "Retention Rate by Country", "country"),
    (["retention_date", "platform"], "Retention Rate by Platform", "platform"),
    (["retention_date", "network"], "Retention Rate by Network", "network")
]

for group_cols, title, color in grouping_columns:
    grouped_data = retention.groupby(group_cols, as_index=False).agg({"retained_today": "sum", "installs_previous_day": "sum"})
    grouped_data["retention_rate"] = grouped_data["retained_today"] / grouped_data["installs_previous_day"]
    
    fig = px.line(
        grouped_data.iloc[1:-1],  # Exclude the first and last rows
        x="retention_date",
        y="retention_rate",
        color=color,
        title=title,
        markers=True
    )
    
    fig.update_layout(template="plotly_white", xaxis_title="Date", yaxis_title="Retention Rate")
    fig.show()

### Game Play 

In [9]:
query = """
WITH user_hourly AS (
  SELECT
    DATE(s.event_time) AS date,
    EXTRACT(DAYOFWEEK FROM s.event_time) AS day_of_week,
    EXTRACT(HOUR FROM s.event_time) AS hour_of_day,
    i.country,
    i.platform,
    i.network,
    s.user_id,
    SUM(s.time_spent) AS total_time_spent
  FROM `casedreamgames.case_db.q1_table_session` s
  JOIN `casedreamgames.case_db.q1_table_install` i
    ON s.user_id = i.user_id
  GROUP BY 
    date,
    day_of_week,
    hour_of_day,
    i.country,
    i.platform,
    i.network,
    s.user_id
)
SELECT
  date,
  day_of_week,
  hour_of_day,
  country,
  platform,
  network,
  SUM(total_time_spent) AS total_time_spent,
  COUNT(DISTINCT user_id) AS user_count
FROM user_hourly
GROUP BY 
  date,
  day_of_week,
  hour_of_day,
  country,
  platform,
  network
ORDER BY 
  date,
  day_of_week,
  hour_of_day,
  country,
  platform,
  network;

"""

time_spent = client.query(query).result().to_dataframe()
time_spent['date'] = pd.to_datetime(time_spent['date'])
print(time_spent.head())
print(time_spent.tail())

time_spent.info()


BigQuery Storage module not found, fetch data with the REST endpoint instead.



        date  day_of_week  hour_of_day  country platform  network  \
0 2021-04-30            6           21  Mercury  android     Buzz   
1 2021-04-30            6           21   Pluton      ios     Buzz   
2 2021-04-30            6           21    Venus      ios     Buzz   
3 2021-04-30            6           22  Mercury      ios     Buzz   
4 2021-04-30            6           22  Mercury      ios  Organic   

   total_time_spent  user_count  
0                27           1  
1              1022           4  
2                20           1  
3                88           2  
4                66           1  
            date  day_of_week  hour_of_day country platform  network  \
34044 2021-06-14            2           20   Venus      ios     Buzz   
34045 2021-06-14            2           20   Venus      ios   Jessie   
34046 2021-06-14            2           20   Venus      ios  Organic   
34047 2021-06-14            2           20   Venus      ios      Sid   
34048 2021-06-14     

In [10]:
import pandas as pd
import plotly.express as px

# Define the grouping columns and titles for each plot
grouping_columns = [
    (["date"], "Average Time Spent by User per Day", None),
    (["date", "country"], "Average Time Spent by User per Day by Country", "country"),
    (["date", "platform"], "Average Time Spent by User per Day by Platform", "platform"),
    (["date", "network"], "Average Time Spent by User per Day by Network", "network")
]

# Loop through the grouping columns and create the plots
for group_cols, title, color in grouping_columns:
    grouped_data = time_spent.groupby(group_cols, as_index=False).agg({"total_time_spent": "sum", "user_count": "sum"})
    grouped_data['avg_time_spent'] = grouped_data['total_time_spent'] / grouped_data['user_count']
    
    fig = px.line(
        grouped_data,
        x="date",
        y="avg_time_spent",
        color=color,
        title=title,
        markers=True
    )
    
    fig.update_layout(template="plotly_white", xaxis_title="Date", yaxis_title="Average Time Spent")
    fig.show()

In [11]:
### average time spent by user per dayofweek per country
time_spent_by_dow = time_spent.groupby(["day_of_week"], as_index=False).agg({"total_time_spent": "sum", "user_count": "sum"})
time_spent_by_dow['avg_time_spent'] = time_spent_by_dow['total_time_spent'] / time_spent_by_dow['user_count']

fig_total = px.line(
    time_spent_by_dow,
    x="day_of_week",
    y="avg_time_spent",
    title="Average Time Spent by User per Day of Week",
    markers=True
)

fig_total.update_layout(template="plotly_white", xaxis_title="Day of Week", yaxis_title="Average Time Spent")
fig_total.show()

time_spent_by_dow_by_country = time_spent.groupby(["day_of_week", "country"], as_index=False).agg({"total_time_spent": "sum", "user_count": "sum"})
time_spent_by_dow_by_country['avg_time_spent'] = time_spent_by_dow_by_country['total_time_spent'] / time_spent_by_dow_by_country['user_count']

fig_country = px.line(
    time_spent_by_dow_by_country,
    x="day_of_week",
    y="avg_time_spent",
    color="country",
    title="Average Time Spent by User per Day of Week by Country",
    markers=True
)

fig_country.update_layout(template="plotly_white", xaxis_title="Day of Week", yaxis_title="Average Time Spent")
fig_country.show()


## average time spent by user by hour of day per country
time_spent_by_hour = time_spent.groupby(["hour_of_day"], as_index=False).agg({"total_time_spent": "sum", "user_count": "sum"})
time_spent_by_hour['avg_time_spent'] = time_spent_by_hour['total_time_spent'] / time_spent_by_hour['user_count']

fig_total = px.line(
    time_spent_by_hour,
    x="hour_of_day",
    y="avg_time_spent",
    title="Average Time Spent by User per Hour of Day",
    markers=True
)

fig_total.update_layout(template="plotly_white", xaxis_title="Hour of Day", yaxis_title="Average Time Spent")
fig_total.show()

time_spent_by_hour_by_country = time_spent.groupby(["hour_of_day", "country"], as_index=False).agg({"total_time_spent": "sum", "user_count": "sum"})
time_spent_by_hour_by_country['avg_time_spent'] = time_spent_by_hour_by_country['total_time_spent'] / time_spent_by_hour_by_country['user_count']

fig_country = px.line(
    time_spent_by_hour_by_country,
    x="hour_of_day",
    y="avg_time_spent",
    color="country",
    title="Average Time Spent by User per Hour of Day by Country",
    markers=True
)

fig_country.update_layout(template="plotly_white", xaxis_title="Hour of Day", yaxis_title="Average Time Spent")
fig_country.show()



 

In [12]:
query = """
SELECT level,
        COUNT(DISTINCT user_id) AS num_people_passed
FROM `casedreamgames.case_db.q1_table_level_end`
WHERE status = 'win'
GROUP BY level
ORDER BY level;
"""

num_users_passed_level = client.query(query).result().to_dataframe()
fig = px.bar(
    num_users_passed_level,
    x="level",
    y="num_people_passed",
    title="Number of Users Passing Each Level",
    labels={"num_people_passed": "Number of Users Passed"}
)

"""
!!!filter out users who have daily retention rate less than 0.5
"""
fig.update_layout(template="plotly_white", xaxis_title="Level", yaxis_title="Number of Users Passed")
fig.show()


BigQuery Storage module not found, fetch data with the REST endpoint instead.



In [13]:
query = """
WITH user_level_time AS (
    SELECT
    level,
    user_id,
    SUM(time_spent) AS total_time_spent
    FROM `casedreamgames.case_db.q1_table_level_end`
    GROUP BY level, user_id
)
SELECT
    level,
    AVG(total_time_spent) AS avg_total_time_spent
FROM user_level_time
GROUP BY level
ORDER BY level;
"""

avg_time_spent_per_level = client.query(query).result().to_dataframe()
fig = px.bar(
    avg_time_spent_per_level,
    x="level",
    y="avg_total_time_spent",
    title="Average Time Spent per Level",
    labels={"avg_total_time_spent": "Average Time Spent"}
)

fig.update_layout(template="plotly_white", xaxis_title="Level", yaxis_title="Average Time Spent")
fig.show()


BigQuery Storage module not found, fetch data with the REST endpoint instead.



In [14]:
query = """ 
SELECT DISTINCT level FROM `casedreamgames.case_db.q1_table_level_end`
ORDER BY level;
"""

"""
WARNING: there seems no level between 950 and 1007
"""

levels = client.query(query).result().to_dataframe()
levels


BigQuery Storage module not found, fetch data with the REST endpoint instead.



Unnamed: 0,level
0,1
1,2
2,3
3,4
4,5
...,...
949,950
950,1007
951,1008
952,1009


In [15]:
query = """
SELECT
  level,
  COUNT(*) AS num_attempts,
  COUNT(DISTINCT user_id) AS num_people_attempted,
  COUNT(DISTINCT (moves_made + moves_left)) AS distinct_total_moves,
  SUM(moves_made) AS total_moves_made,
  SUM(moves_left) AS total_moves_left,
  MIN(moves_made + moves_left) AS common_total_moves_min,
  MAX(moves_made + moves_left) AS common_total_moves_max
FROM `casedreamgames.case_db.q1_table_level_end`
GROUP BY level
ORDER BY level;
"""
moves_per_level = client.query(query).result().to_dataframe().set_index("level")
moves_per_level.distinct_total_moves.value_counts()


BigQuery Storage module not found, fetch data with the REST endpoint instead.



distinct_total_moves
5     182
4     171
6     161
7     112
3      99
8      69
9      37
2      35
1      31
10     18
12     11
13      7
14      6
11      6
18      4
19      3
15      1
20      1
Name: count, dtype: Int64

In [16]:
avg_time_spent_per_level[avg_time_spent_per_level.avg_total_time_spent <= avg_time_spent_per_level.avg_total_time_spent.quantile(0.05)]

Unnamed: 0,level,avg_total_time_spent
0,1,28.144198
1,2,44.045061
2,3,37.834667
3,4,45.850616
4,5,54.180378
5,6,65.762373
6,7,60.958733
8,9,65.171292
9,10,66.444657
12,13,68.282373


In [17]:
"""
!!! search for the best 3 user
"""

'\n!!! search for the best 3 user\n'

In [18]:
import plotly.express as px

px.line(moves_per_level, y=['common_total_moves_min', 'common_total_moves_max'], 
              labels={'value': 'Moves', 'variable': 'Type'}, 
              title='Common Total Moves Min and Max per Level').show()

px.line(moves_per_level, y='num_attempts', title='Number of Attempts per Level').show()

avg_attempts_per_level = moves_per_level['num_attempts'] / moves_per_level['num_people_attempted']
avg_attempts_per_level.name = 'avg_attempts'
px.line(avg_attempts_per_level, title='Average Number of Attempts per Level').show()

moves_per_level['moves_made_ratio'] = moves_per_level['total_moves_made'] / (moves_per_level['total_moves_made'] + moves_per_level['total_moves_left'])
px.line(moves_per_level, y='moves_made_ratio', title='Moves Made Ratio per Level').show()




### Cost Analysis

In [20]:
## cost per installation
query = """
WITH first_installs AS (
  SELECT 
    user_id,
    MIN(DATE(event_time)) AS first_install_date
  FROM `casedreamgames.case_db.q1_table_install`
  GROUP BY user_id
),
daily_installs AS (
  SELECT
    DATE(i.event_time) AS date,
    i.country,
    i.platform,
    i.network,
    COUNT(DISTINCT i.user_id) AS installs,
    COUNT(DISTINCT CASE 
      WHEN DATE(i.event_time) = fi.first_install_date THEN i.user_id 
    END) AS new_user
  FROM `casedreamgames.case_db.q1_table_install` AS i
  LEFT JOIN first_installs AS fi 
    ON i.user_id = fi.user_id
  GROUP BY date, i.country, i.platform, i.network
),
daily_cost AS (
  SELECT
    date,
    country,
    platform,
    network,
    SUM(cost) AS cost
  FROM `casedreamgames.case_db.q1_table_cost`
  GROUP BY date, country, platform, network
)
SELECT
  COALESCE(di.date, dc.date) AS date,
  COALESCE(di.country, dc.country) AS country,
  COALESCE(di.platform, dc.platform) AS platform,
  COALESCE(di.network, dc.network) AS network,
  COALESCE(di.installs, 0) AS installs,
  COALESCE(di.new_user, 0) AS new_user,
  COALESCE(dc.cost, 0) AS cost
FROM daily_installs AS di
FULL OUTER JOIN daily_cost AS dc
  ON di.date = dc.date
  AND di.country = dc.country
  AND di.platform = dc.platform
  AND di.network = dc.network
ORDER BY 
  date,
  country,
  platform,
  network;
"""

cost = client.query(query).result().to_dataframe()
cost['date'] = pd.to_datetime(cost['date'])
print(cost.head())
print(cost.tail())
cost.info()


BigQuery Storage module not found, fetch data with the REST endpoint instead.



        date  country platform  network  installs  new_user  cost
0 2021-04-30  Mercury  android     Buzz        85        85   0.0
1 2021-04-30  Mercury  android   Jessie         8         8   0.0
2 2021-04-30  Mercury  android  Organic        25        25   0.0
3 2021-04-30  Mercury  android      Sid         6         6   0.0
4 2021-04-30  Mercury  android    Woody        44        44   0.0
           date country platform  network  installs  new_user   cost
1381 2021-05-31   Venus      ios   Jessie         0         0    0.0
1382 2021-05-31   Venus      ios  Organic       133       133    0.0
1383 2021-05-31   Venus      ios      Sid        95        63  897.0
1384 2021-05-31   Venus      ios    Woody         9         9   49.0
1385 2021-06-01   Venus      ios      Sid        10         0    0.0
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1386 entries, 0 to 1385
Data columns (total 7 columns):
 #   Column    Non-Null Count  Dtype         
---  ------    --------------  -----  

In [21]:
import plotly.express as px

# Define the grouping columns and titles for each plot
grouping_columns = [
    (["date"], "Cost by Date", None),
    (["date", "country"], "Cost by Country", "country"),
    (["date", "platform"], "Cost by Platform", "platform"),
    (["date", "network"], "Cost by Network", "network")
]

# Loop through the grouping columns and create the plots
for group_cols, title, color in grouping_columns:
    grouped_data = cost.groupby(group_cols, as_index=False).agg({"cost": "sum"})
    
    fig = px.line(
        grouped_data,
        x="date",
        y="cost",
        color=color,
        title=title,
        markers=True
    )
    
    fig.update_layout(template="plotly_white", xaxis_title="Date", yaxis_title="Cost")
    fig.show()

In [22]:
import plotly.express as px

# Define the grouping columns and titles for each plot
grouping_columns = [
    (["date"], "Cost per New User by Date", None),
    (["date", "country"], "Cost per New User by Country", "country"),
    (["date", "platform"], "Cost per New User by Platform", "platform"),
    (["date", "network"], "Cost per New User by Network", "network")
]

# Loop through the grouping columns and create the plots
for group_cols, title, color in grouping_columns:
    grouped_data = cost.groupby(group_cols, as_index=False).agg({"cost": "sum", "new_user": "sum"})
    grouped_data["cost_per_user"] = grouped_data["cost"] / grouped_data["new_user"]
    
    fig = px.line(
        grouped_data,
        x="date",
        y="cost_per_user",
        color=color,
        title=title,
        markers=True
    )
    
    fig.update_layout(template="plotly_white", xaxis_title="Date", yaxis_title="Cost per New User")
    fig.show()

In [23]:
import plotly.express as px
import plotly.graph_objects as go
import statsmodels.api as sm
import pandas as pd

cost_per_new_user_by_date = cost.groupby("date", as_index=False).agg({"cost": "sum", "new_user": "sum"})
def run_regression(df):
    X = df["cost"]
    X = sm.add_constant(X)
    y = df["new_user"]
    model = sm.OLS(y, X).fit()
    return model

# Assuming cost_per_new_user_by_date is a DataFrame with columns: date, cost, installs
first_day = cost_per_new_user_by_date.loc[cost_per_new_user_by_date["date"] == cost_per_new_user_by_date["date"].min()]
last_day = cost_per_new_user_by_date.loc[cost_per_new_user_by_date["date"] == cost_per_new_user_by_date["date"].max()]

# Remove the first and last day
cost_per_new_user_by_date = cost_per_new_user_by_date.loc[
    (cost_per_new_user_by_date["date"] != first_day["date"].values[0]) & 
    (cost_per_new_user_by_date["date"] != last_day["date"].values[0])
]

model = run_regression(cost_per_new_user_by_date)
print(model.summary())

# Create a scatter plot with Plotly
fig = px.scatter(cost_per_new_user_by_date, x="cost", y="new_user", title="Regression of Installs on Total Cost")

# Add the regression line
fig.add_trace(go.Scatter(
    x=cost_per_new_user_by_date["cost"],
    y=model.predict(),
    mode='lines',
    name='Regression Line',
    line=dict(color='red')
))

fig.update_layout(
    xaxis_title="Total Cost",
    yaxis_title="Installs"
)

fig.show()

                            OLS Regression Results                            
Dep. Variable:               new_user   R-squared:                       0.309
Model:                            OLS   Adj. R-squared:                  0.285
Method:                 Least Squares   F-statistic:                     12.95
Date:                Sat, 22 Feb 2025   Prob (F-statistic):            0.00118
Time:                        01:10:43   Log-Likelihood:                -262.20
No. Observations:                  31   AIC:                             528.4
Df Residuals:                      29   BIC:                             531.3
Df Model:                           1                                         
Covariance Type:            nonrobust                                         
                 coef    std err          t      P>|t|      [0.025      0.975]
------------------------------------------------------------------------------
const       2128.0898   1344.013      1.583      0.1

In [24]:
### we will print the regression summary for each group of country, platform and network

cost_per_new_user_models = {}

for (c, p, n), group in cost.groupby(["country", "platform", "network"]):
    if group.shape[0] < 20:
        print(f"Skipping {c}, {p}, {n} due to insufficient data")
        continue
    if group["cost"].nunique() < 2:
        print(f"Skipping {c}, {p}, {n} due to constant cost")
        continue
    if group["new_user"].nunique() < 2:
        print(f"Skipping {c}, {p}, {n} due to constant installs")
        continue
    
    model = run_regression(group)
    cost_per_new_user_models[(c, p, n)] = model

### sortin the models by betas
sorted_models = sorted(cost_per_new_user_models.items(), key=lambda x: x[1].params["cost"], reverse=True)
for (c, p, n), model in sorted_models:
    print(f"Results for {c}, {p}, {n}:")
    print(model.summary())
    print("-"*50)    

Skipping Mercury, android, Organic due to constant cost
Skipping Mercury, android, Sid due to insufficient data
Skipping Mercury, ios, Organic due to constant cost
Skipping Pluton, android, Jessie due to insufficient data
Skipping Pluton, android, Organic due to constant cost
Skipping Pluton, android, Sid due to insufficient data
Skipping Pluton, ios, Jessie due to insufficient data
Skipping Pluton, ios, Organic due to constant cost
Skipping Saturn, android, Jessie due to insufficient data
Skipping Saturn, android, Organic due to constant cost
Skipping Saturn, android, Sid due to insufficient data
Skipping Saturn, android, Woody due to constant cost
Skipping Saturn, ios, Jessie due to insufficient data
Skipping Saturn, ios, Organic due to constant cost
Skipping Uranus, android, Jessie due to insufficient data
Skipping Uranus, android, Organic due to constant cost
Skipping Uranus, android, Sid due to insufficient data
Skipping Uranus, ios, Jessie due to insufficient data
Skipping Uranus

### Monetization

In [26]:
###  get the revenue by date, country, platform, network and package_type
query = """
SELECT
  DATE(r.event_time) AS date,
  i.country,
  r.platform,
  i.network,
  r.package_type,
  SUM(IFNULL(SAFE_CAST(r.revenue AS FLOAT64), 0)) AS total_revenue
FROM `casedreamgames.case_db.q1_table_revenue` r
LEFT JOIN `casedreamgames.case_db.q1_table_install` i
  ON r.user_id = i.user_id
GROUP BY date, i.country, r.platform, i.network, r.package_type
ORDER BY date, i.country, r.platform, i.network, r.package_type;
"""
revenue = client.query(query).result().to_dataframe()
revenue['date'] = pd.to_datetime(revenue['date'])
print(revenue.head())
print(revenue.tail())
revenue.info()


BigQuery Storage module not found, fetch data with the REST endpoint instead.



        date  country platform  network   package_type  total_revenue
0 2021-04-30  Mercury  android     Buzz   lovely_packs            2.0
1 2021-05-01  Mercury  android     Buzz  awesome_packs           10.0
2 2021-05-01  Mercury  android     Buzz   lovely_packs           36.0
3 2021-05-01  Mercury  android  Organic   lovely_packs           12.0
4 2021-05-01  Mercury  android    Woody   lovely_packs           22.0
           date country platform network   package_type  total_revenue
2367 2021-06-14   Venus      ios     Sid   pretty_packs           27.0
2368 2021-06-14   Venus      ios   Woody  awesome_packs           16.0
2369 2021-06-14   Venus      ios   Woody     cool_packs           10.0
2370 2021-06-14   Venus      ios   Woody   lovely_packs           56.0
2371 2021-06-14   Venus      ios   Woody   pretty_packs            8.0
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 2372 entries, 0 to 2371
Data columns (total 6 columns):
 #   Column         Non-Null Count  Dtype       

In [27]:
import plotly.express as px

# Define the grouping columns and titles for each plot
grouping_columns = [
    (["date"], "Revenue by Date", None),
    (["date", "country"], "Revenue by Country", "country"),
    (["date", "platform"], "Revenue by Platform", "platform"),
    (["date", "network"], "Revenue by Network", "network"),
    (["date", "package_type"], "Revenue by Package Type", "package_type")
]

# Loop through the grouping columns and create the plots
for group_cols, title, color in grouping_columns:
    grouped_data = revenue.groupby(group_cols, as_index=False).agg({"total_revenue": "sum"})
    
    fig = px.line(
        grouped_data,
        x="date",
        y="total_revenue",
        color=color,
        title=title,
        markers=True
    )
    
    fig.update_layout(template="plotly_white", xaxis_title="Date", yaxis_title="Revenue")
    fig.show()

In [28]:
profit = revenue.drop(columns=['package_type']).merge(
    cost[['date', 'country', 'platform', 'network','cost']],
    on=['date', 'country', 'platform', 'network'],
    how='outer'
)
import plotly.express as px
import pandas as pd

# Assuming 'profit' DataFrame is already created and processed
profit['date'] = pd.to_datetime(profit['date'])
profit = profit.sort_values(by='date').fillna(0)
profit["profit"] = profit["total_revenue"] - profit["cost"]

# Define the grouping columns and titles for each plot
grouping_columns = [
    (["date"], "Profit by Date", None),
    (["date", "country"], "Profit by Country", "country"),
    (["date", "platform"], "Profit by Platform", "platform"),
    (["date", "network"], "Profit by Network", "network")
]

# Loop through the grouping columns and create the plots
for group_cols, title, color in grouping_columns:
    grouped_data = profit.groupby(group_cols, as_index=False).agg({"profit": "sum"})
    
    fig = px.line(
        grouped_data,
        x="date",
        y="profit",
        color=color,
        title=title,
        markers=True
    )
    
    fig.update_layout(template="plotly_white", xaxis_title="Date", yaxis_title="Profit")
    fig.show()

# Plotting cumulative profit
fig = px.line(
    profit.groupby("date", as_index=True).agg({"profit": "sum"}).cumsum().reset_index(),
    x="date",
    y="profit",
    title="Cumulative Profit",
    markers=True
)
fig.update_layout(template="plotly_white", xaxis_title="Date", yaxis_title="Cumulative Profit")
fig.show()


In [29]:
### Arpu
arpu = revenue.merge(
    installs,
    on=['date', 'country', 'platform', 'network'],
    how='inner'
)
#arpu["arpu"] = arpu["total_revenue"] / arpu["unique_users"]

for col in ["country", "platform", "network", "package_type"]:
    total_rev = arpu.groupby(["date", col], as_index=False).agg({"total_revenue": "sum"})
    total_users = arpu.groupby(["date", col], as_index=False).agg({"unique_users": "sum"})
    arpu_col = total_rev.merge(total_users, on=["date", col], how="inner")
    arpu_col["arpu"] = arpu_col["total_revenue"].cumsum() / arpu_col["unique_users"].cumsum()

    fig = px.line(
        arpu_col,
        x="date",
        y="arpu",
        color=col,
        title=f"ARPU by {col}",
        markers=True
    )
    fig.update_layout(template="plotly_white", xaxis_title="Date", yaxis_title="ARPU")
    fig.show()
    



In [31]:
dau.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1814 entries, 0 to 1813
Data columns (total 5 columns):
 #   Column    Non-Null Count  Dtype         
---  ------    --------------  -----         
 0   date      1814 non-null   datetime64[ns]
 1   country   1814 non-null   object        
 2   platform  1814 non-null   object        
 3   network   1814 non-null   object        
 4   dau       1814 non-null   Int64         
dtypes: Int64(1), datetime64[ns](1), object(3)
memory usage: 72.8+ KB


In [None]:
### arpdau

arpdau = dau.merge(
    revenue,
    on=['date', 'country', 'platform', 'network'],
    how='inner'
)

for col in ["country", "platform", "network", "package_type"]:
    total_rev = arpdau.groupby(["date", col], as_index=False).agg({"total_revenue": "sum"})
    total_users = arpdau.groupby(["date", col], as_index=False).agg({"dau": "sum"})
    arpdau_col = total_rev.merge(total_users, on=["date", col], how="inner")
    arpdau_col["arpdau"] = arpdau_col["total_revenue"] / arpdau_col["dau"]

    fig = px.line(
        arpdau_col,
        x="date",
        y="arpdau",
        color=col,
        title=f"ARPDau by {col}",
        markers=True
    )
    fig.update_layout(template="plotly_white", xaxis_title="Date", yaxis_title="ARPDau")
    fig.show()

ValueError: You are trying to merge on datetime64[ns] and dbdate columns for key 'date'. If you wish to proceed you should use pd.concat