###  Data Loading & Preprocessing

In [1]:
import os

import numpy as np
import pandas as pd
from google.cloud import bigquery as bq

# Location of the service-account key. The standard GOOGLE_APPLICATION_CREDENTIALS
# environment variable wins when it is set, so the notebook is not tied to one
# machine; the original local path is kept as the fallback.
service_account_path = os.environ.get(
    "GOOGLE_APPLICATION_CREDENTIALS",
    "/home/yusuf/DataScience/dream_games/ybektas20.json",
)
client = bq.Client.from_service_account_json(service_account_path)

# Column catalogue of every table in the case_db dataset.
query = """
SELECT
  table_name,
  column_name,
  data_type,
  is_nullable,
  ordinal_position
FROM `casedreamgames.case_db.INFORMATION_SCHEMA.COLUMNS`
ORDER BY table_name, ordinal_position;
"""
db_info = client.query(query).result().to_dataframe()

# Print one schema listing per table, headed by the table name (the grouping key
# itself is dropped from the listing, as before).
for table_name, table_columns in db_info.groupby("table_name"):
    print(f"Table: {table_name}")
    print(table_columns.drop(columns="table_name"))
    print()



  column_name data_type is_nullable  ordinal_position
0        date      DATE         YES                 1
1     network    STRING         YES                 2
2     country    STRING         YES                 3
3    platform    STRING         YES                 4
4        cost   FLOAT64         YES                 5
  column_name  data_type is_nullable  ordinal_position
5  event_time  TIMESTAMP         YES                 1
6     user_id     STRING         YES                 2
7    platform     STRING         YES                 3
8     network     STRING         YES                 4
9     country     STRING         YES                 5
   column_name  data_type is_nullable  ordinal_position
10  event_time  TIMESTAMP         YES                 1
11     user_id     STRING         YES                 2
12    platform     STRING         YES                 3
13       level      INT64         YES                 4
14      status     STRING         YES                 5
15  time_s

In [2]:
# Columns audited for NULLs in each Q1 table. The order here is the order of the
# COUNT(...) expressions in the generated SQL and therefore of the printed report.
Q1_TABLE_COLUMNS = {
    "q1_table_install": ["user_id", "platform", "network", "country", "event_time"],
    "q1_table_level_end": [
        "user_id", "platform", "level", "status",
        "time_spent", "moves_made", "moves_left", "event_time",
    ],
    "q1_table_session": ["user_id", "platform", "coin_status", "time_spent", "level", "event_time"],
    "q1_table_revenue": ["user_id", "platform", "package_type", "revenue", "event_time"],
    "q1_table_cost": ["date", "network", "platform", "country", "cost"],
}


def _null_audit_query(table, columns):
    """Build the audit query for one table.

    The query returns a single row: COUNT(*) as `total_rows` plus one
    COUNT(<column>) AS non_null_<column> per entry of `columns`.
    """
    select_items = ["COUNT(*) AS total_rows"]
    select_items.extend(f"COUNT({column}) AS non_null_{column}" for column in columns)
    select_list = ",\n          ".join(select_items)
    return f"""
        SELECT
          {select_list}
        FROM `casedreamgames.case_db.{table}`;
    """


# Table name -> SQL text, one audit query per Q1 table.
q1_tables_queries = {
    table: _null_audit_query(table, columns)
    for table, columns in Q1_TABLE_COLUMNS.items()
}

# Run every audit query and report, per table, the row count and the NULL count of
# each audited column (NULLs = total rows - non-null rows).
separator = "\n" + "-"*50 + "\n"

for table_name, query in q1_tables_queries.items():
    print(f"Results for {table_name}:")

    # Each audit query returns exactly one row of counts.
    df = client.query(query).result().to_dataframe()

    total_rows = df.loc[0, "total_rows"]
    print(f"Shape: ({total_rows} rows)")

    count_columns = [name for name in df.columns if name != "total_rows"]
    for col in count_columns:
        non_null_count = df.loc[0, col]
        null_count = total_rows - non_null_count
        # Strip the alias prefix to get back the source column name.
        orig_col = col.replace("non_null_", "")
        print(f"Column '{orig_col}': non-null = {non_null_count}, null = {null_count}")

    print(separator)


Results for q1_table_install:




Shape: (217415 rows)
Column 'user_id': non-null = 217415, null = 0
Column 'platform': non-null = 217415, null = 0
Column 'network': non-null = 217415, null = 0
Column 'country': non-null = 217415, null = 0
Column 'event_time': non-null = 217415, null = 0

--------------------------------------------------

Results for q1_table_level_end:
Shape: (72044374 rows)
Column 'user_id': non-null = 72044374, null = 0
Column 'platform': non-null = 72044373, null = 1
Column 'level': non-null = 72044374, null = 0
Column 'status': non-null = 72044374, null = 0
Column 'time_spent': non-null = 72044370, null = 4
Column 'moves_made': non-null = 72044373, null = 1
Column 'moves_left': non-null = 72044370, null = 4
Column 'event_time': non-null = 72044374, null = 0

--------------------------------------------------

Results for q1_table_session:
Shape: (297358858 rows)
Column 'user_id': non-null = 297358858, null = 0
Column 'platform': non-null = 297358850, null = 8
Column 'coin_status': non-null = 2973

### User Acquisition & Daily Active Users (DAU)

In [3]:
# Daily active users: distinct session users per calendar date, split by the
# country / platform / network stored on the user's install row.
# The LEFT JOIN keeps sessions whose user_id has no install row; COALESCE labels
# their attributes as 'unknown'.
# NOTE(review): summing `dau` across segments downstream is only exact if
# q1_table_install holds at most one row per user_id - TODO confirm.
query = """
WITH session_data AS (
    SELECT 
      DATE(event_time) AS date, 
      user_id
    FROM `casedreamgames.case_db.q1_table_session`
)
SELECT
    s.date,
    COALESCE(i.country, 'unknown') AS country,
    COALESCE(i.platform, 'unknown') AS platform,
    COALESCE(i.network, 'unknown') AS network,
    COUNT(DISTINCT s.user_id) AS dau
FROM session_data s
LEFT JOIN `casedreamgames.case_db.q1_table_install` i
    ON s.user_id = i.user_id
GROUP BY s.date, country, platform, network
ORDER BY s.date, country, platform, network;     
"""
# One row per (date, country, platform, network) with its distinct-user count.
dau = client.query(query).result().to_dataframe()
dau.info()


<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1814 entries, 0 to 1813
Data columns (total 5 columns):
 #   Column    Non-Null Count  Dtype 
---  ------    --------------  ----- 
 0   date      1814 non-null   dbdate
 1   country   1814 non-null   object
 2   platform  1814 non-null   object
 3   network   1814 non-null   object
 4   dau       1814 non-null   Int64 
dtypes: Int64(1), dbdate(1), object(3)
memory usage: 72.8+ KB


In [7]:
import pandas as pd
import plotly.express as px


# The query returns `date` as BigQuery's dbdate dtype; convert it to pandas datetimes
# before plotting.
dau["date"] = pd.to_datetime(dau["date"])


def _dau_line_chart(breakdown, chart_title):
    """Sum DAU per date (and per `breakdown` value when given) and draw a line chart.

    `breakdown` is a column of `dau` used to colour the lines, or None for the
    overall total. Returns the Plotly figure without showing it.
    """
    keys = ["date"] if breakdown is None else ["date", breakdown]
    daily = dau.groupby(keys, as_index=False).agg({"dau": "sum"})
    figure = px.line(
        daily,
        x="date",
        y="dau",
        color=breakdown,
        markers=True,
        title=chart_title,
    )
    figure.update_layout(template="plotly_white", xaxis_title="Date", yaxis_title="DAU")
    return figure


fig_total = _dau_line_chart(None, "Daily Active Users")
fig_total.show()

fig_country = _dau_line_chart("country", "Daily Active Users by Country")
fig_country.show()

fig_platform = _dau_line_chart("platform", "Daily Active Users by Platform")
fig_platform.show()

fig_network = _dau_line_chart("network", "Daily Active Users by Network")
fig_network.show()


### User Engagement and Retention

In [8]:
# Day-1 retention per install cohort.
# For every (install date, country, platform, network) the query counts the
# installers and how many of them have a session exactly one calendar day after
# installing. Rows are keyed by that following day (`retention_date`).
# The LEFT JOIN keeps cohorts with no returning users, so they report 0 retained.
# SAFE_DIVIDE yields NULL rather than an error when the denominator is zero.
query ="""
WITH installs AS (
  SELECT 
    user_id, 
    DATE(event_time) AS install_date,
    country,
    platform,
    network
  FROM `casedreamgames.case_db.q1_table_install`
),
sessions AS (
  SELECT 
    user_id, 
    DATE(event_time) AS session_date
  FROM `casedreamgames.case_db.q1_table_session`
)
SELECT
  DATE_ADD(i.install_date, INTERVAL 1 DAY) AS retention_date,
  i.country,
  i.platform,
  i.network,
  COUNT(DISTINCT i.user_id) AS installs_previous_day,
  COUNT(DISTINCT s.user_id) AS retained_today,
  SAFE_DIVIDE(COUNT(DISTINCT s.user_id), COUNT(DISTINCT i.user_id)) AS retention_rate
FROM installs i
LEFT JOIN sessions s
  ON i.user_id = s.user_id
  AND s.session_date = DATE_ADD(i.install_date, INTERVAL 1 DAY)
GROUP BY retention_date, i.country, i.platform, i.network
ORDER BY retention_date, i.country, i.platform, i.network;
"""
# One row per (retention_date, country, platform, network).
retention = client.query(query).result().to_dataframe()
retention.info()


BigQuery Storage module not found, fetch data with the REST endpoint instead.



<class 'pandas.core.frame.DataFrame'>
RangeIndex: 926 entries, 0 to 925
Data columns (total 7 columns):
 #   Column                 Non-Null Count  Dtype  
---  ------                 --------------  -----  
 0   retention_date         926 non-null    dbdate 
 1   country                926 non-null    object 
 2   platform               926 non-null    object 
 3   network                926 non-null    object 
 4   installs_previous_day  926 non-null    Int64  
 5   retained_today         926 non-null    Int64  
 6   retention_rate         926 non-null    float64
dtypes: Int64(2), dbdate(1), float64(1), object(3)
memory usage: 52.6+ KB


In [12]:
### Day-1 retention rate over time: overall and by country, platform and network.


def _retention_chart(breakdown, chart_title):
    """Aggregate `retention` and chart the retention rate per retention date.

    Parameters
    ----------
    breakdown : str or None
        Column of `retention` used to split the lines; None gives the overall rate.
    chart_title : str
        Title of the figure.

    Returns
    -------
    tuple
        The aggregated frame (all dates) and the Plotly figure (boundary dates removed).
    """
    keys = ["retention_date"] if breakdown is None else ["retention_date", breakdown]
    rates = retention.groupby(keys, as_index=False).agg(
        {"retained_today": "sum", "installs_previous_day": "sum"}
    )
    # Rate is recomputed from the summed counts so larger cohorts weigh more.
    rates["retention_rate"] = rates["retained_today"] / rates["installs_previous_day"]

    # Drop the first and last retention date as whole days. Slicing with iloc[1:-1]
    # only does that when there is one row per date; with a breakdown it would
    # remove just a single segment row at each end.
    # NOTE(review): presumably the boundary days are excluded because they are only
    # partially observed - TODO confirm.
    first_date = rates["retention_date"].min()
    last_date = rates["retention_date"].max()
    inner_days = rates.loc[
        (rates["retention_date"] != first_date) & (rates["retention_date"] != last_date)
    ]

    figure = px.line(
        inner_days,
        x="retention_date",
        y="retention_rate",
        color=breakdown,
        title=chart_title,
        markers=True,
    )
    figure.update_layout(template="plotly_white", xaxis_title="Date", yaxis_title="Retention Rate")
    return rates, figure


retention_total, fig_total = _retention_chart(None, "Retention Rate")
fig_total.show()

retention_by_country, fig_country = _retention_chart("country", "Retention Rate by Country")
fig_country.show()

retention_by_platform, fig_platform = _retention_chart("platform", "Retention Rate by Platform")
fig_platform.show()

# The network breakdown was announced in the original header comment but never plotted.
retention_by_network, fig_network = _retention_chart("network", "Retention Rate by Network")
fig_network.show()




### Game Play 

In [15]:
# Session time per (date, day of week, hour of day, country, platform, network).
# The CTE totals time_spent per user inside each bucket; the outer query sums those
# totals and counts the distinct users active in the bucket.
# `user_count` is therefore distinct only within one hourly bucket: adding it up
# across hours or days counts a user once for every bucket they appear in.
# The inner JOIN drops sessions whose user_id has no row in q1_table_install.
# BigQuery's DAYOFWEEK runs from 1 (Sunday) to 7 (Saturday).
query = """
WITH user_hourly AS (
  SELECT
    DATE(s.event_time) AS date,
    EXTRACT(DAYOFWEEK FROM s.event_time) AS day_of_week,
    EXTRACT(HOUR FROM s.event_time) AS hour_of_day,
    i.country,
    i.platform,
    i.network,
    s.user_id,
    SUM(s.time_spent) AS total_time_spent
  FROM `casedreamgames.case_db.q1_table_session` s
  JOIN `casedreamgames.case_db.q1_table_install` i
    ON s.user_id = i.user_id
  GROUP BY 
    date,
    day_of_week,
    hour_of_day,
    i.country,
    i.platform,
    i.network,
    s.user_id
)
SELECT
  date,
  day_of_week,
  hour_of_day,
  country,
  platform,
  network,
  SUM(total_time_spent) AS total_time_spent,
  COUNT(DISTINCT user_id) AS user_count
FROM user_hourly
GROUP BY 
  date,
  day_of_week,
  hour_of_day,
  country,
  platform,
  network
ORDER BY 
  date,
  day_of_week,
  hour_of_day,
  country,
  platform,
  network;

"""

# One row per hourly bucket and segment.
time_spent = client.query(query).result().to_dataframe()
time_spent.info()


BigQuery Storage module not found, fetch data with the REST endpoint instead.



<class 'pandas.core.frame.DataFrame'>
RangeIndex: 34049 entries, 0 to 34048
Data columns (total 8 columns):
 #   Column            Non-Null Count  Dtype 
---  ------            --------------  ----- 
 0   date              34049 non-null  dbdate
 1   day_of_week       34049 non-null  Int64 
 2   hour_of_day       34049 non-null  Int64 
 3   country           34049 non-null  object
 4   platform          34049 non-null  object
 5   network           34049 non-null  object
 6   total_time_spent  34049 non-null  Int64 
 7   user_count        34049 non-null  Int64 
dtypes: Int64(4), dbdate(1), object(3)
memory usage: 2.2+ MB


In [23]:
### Average time spent per active user, by date: overall and by country, platform and network.
# NOTE(review): `user_count` is a distinct count per hourly bucket, so the summed
# denominator is user-hours rather than distinct users per day - confirm this is
# the intended definition of "per user per day".


def _avg_time_by_date(breakdown, chart_title):
    """Aggregate `time_spent` per date (and `breakdown`, if given) and chart the average.

    Returns the aggregated frame, with an `avg_time_spent` column added, and the figure.
    """
    keys = ["date"] if breakdown is None else ["date", breakdown]
    daily = time_spent.groupby(keys, as_index=False).agg(
        {"total_time_spent": "sum", "user_count": "sum"}
    )
    daily["avg_time_spent"] = daily["total_time_spent"] / daily["user_count"]

    figure = px.line(
        daily,
        x="date",
        y="avg_time_spent",
        color=breakdown,
        title=chart_title,
        markers=True,
    )
    figure.update_layout(
        template="plotly_white", xaxis_title="Date", yaxis_title="Average Time Spent"
    )
    return daily, figure


time_spent_by_date, fig_total = _avg_time_by_date(None, "Average Time Spent by User per Day")
fig_total.show()

time_spent_by_date_by_country, fig_country = _avg_time_by_date(
    "country", "Average Time Spent by User per Day by Country"
)
fig_country.show()

time_spent_by_date_by_platform, fig_platform = _avg_time_by_date(
    "platform", "Average Time Spent by User per Day by Platform"
)
fig_platform.show()

time_spent_by_date_by_network, fig_network = _avg_time_by_date(
    "network", "Average Time Spent by User per Day by Network"
)
fig_network.show()


In [25]:
### Average time spent per active user by day of week and by hour of day,
### each overall and split by country.
# NOTE(review): the denominator is the sum of hourly distinct-user counts, i.e. a
# user is counted once per hourly bucket they were active in - confirm intended.


def _avg_time_profile(x_col, x_title, chart_title, breakdown=None):
    """Aggregate `time_spent` over `x_col` (and `breakdown`) and chart the average.

    `x_col` is the `time_spent` column placed on the x axis and `x_title` its axis
    label. Returns the aggregated frame, with `avg_time_spent` added, and the figure.
    """
    keys = [x_col] if breakdown is None else [x_col, breakdown]
    profile = time_spent.groupby(keys, as_index=False).agg(
        {"total_time_spent": "sum", "user_count": "sum"}
    )
    profile["avg_time_spent"] = profile["total_time_spent"] / profile["user_count"]

    figure = px.line(
        profile,
        x=x_col,
        y="avg_time_spent",
        color=breakdown,
        title=chart_title,
        markers=True,
    )
    figure.update_layout(
        template="plotly_white", xaxis_title=x_title, yaxis_title="Average Time Spent"
    )
    return profile, figure


# Day-of-week profile.
time_spent_by_dow, fig_total = _avg_time_profile(
    "day_of_week", "Day of Week", "Average Time Spent by User per Day of Week"
)
fig_total.show()

time_spent_by_dow_by_country, fig_country = _avg_time_profile(
    "day_of_week",
    "Day of Week",
    "Average Time Spent by User per Day of Week by Country",
    breakdown="country",
)
fig_country.show()

# Hour-of-day profile.
time_spent_by_hour, fig_total = _avg_time_profile(
    "hour_of_day", "Hour of Day", "Average Time Spent by User per Hour of Day"
)
fig_total.show()

time_spent_by_hour_by_country, fig_country = _avg_time_profile(
    "hour_of_day",
    "Hour of Day",
    "Average Time Spent by User per Hour of Day by Country",
    breakdown="country",
)
fig_country.show()



 

In [28]:
# Distinct users with at least one level-end event of status 'win', per level.
query = """
SELECT level,
        COUNT(DISTINCT user_id) AS num_people_passed
FROM `casedreamgames.case_db.q1_table_level_end`
WHERE status = 'win'
GROUP BY level
ORDER BY level;
"""

level_pass_rows = client.query(query).result()
num_users_passed_level = level_pass_rows.to_dataframe()

# update_layout returns the figure it was called on, so the chart is built in one chain.
fig = px.bar(
    num_users_passed_level,
    x="level",
    y="num_people_passed",
    labels={"num_people_passed": "Number of Users Passed"},
    title="Number of Users Passing Each Level",
).update_layout(
    template="plotly_white",
    xaxis_title="Level",
    yaxis_title="Number of Users Passed",
)
fig.show()


BigQuery Storage module not found, fetch data with the REST endpoint instead.



In [29]:
# Average, over users, of the total time each user spent on a level
# (time_spent is summed over all of that user's level-end events for the level).
query = """
WITH user_level_time AS (
    SELECT
    level,
    user_id,
    SUM(time_spent) AS total_time_spent
    FROM `casedreamgames.case_db.q1_table_level_end`
    GROUP BY 1, 2
)
SELECT
    level,
    AVG(total_time_spent) AS avg_total_time_spent
FROM user_level_time
GROUP BY level
ORDER BY level;
"""

level_time_job = client.query(query)
avg_time_spent_per_level = level_time_job.result().to_dataframe()

bar_options = {
    "x": "level",
    "y": "avg_total_time_spent",
    "title": "Average Time Spent per Level",
    "labels": {"avg_total_time_spent": "Average Time Spent"},
}
fig = px.bar(avg_time_spent_per_level, **bar_options)
fig.update_layout(
    template="plotly_white",
    xaxis_title="Level",
    yaxis_title="Average Time Spent",
)
fig.show()


BigQuery Storage module not found, fetch data with the REST endpoint instead.



In [None]:
# Every level number that occurs in the level-end events, in ascending order.
query = """ 
SELECT DISTINCT level FROM `casedreamgames.case_db.q1_table_level_end`
ORDER BY level;
"""

# WARNING: the level numbering is not contiguous - the result jumps from level 950
# straight to level 1007, with nothing in between.
levels = client.query(query).result().to_dataframe()
levels


BigQuery Storage module not found, fetch data with the REST endpoint instead.



Unnamed: 0,level
0,1
1,2
2,3
3,4
4,5
...,...
949,950
950,1007
951,1008
952,1009


In [49]:
# Per-level attempt and move statistics from the level-end events.
# - num_attempts counts rows, num_people_attempted counts distinct users.
# - moves_made + moves_left is computed per row; distinct_total_moves is how many
#   different values of that sum occur on a level, and common_total_moves_min /
#   common_total_moves_max are the smallest and largest such sum.
# NOTE(review): presumably moves_made + moves_left is the level's move budget - confirm.
query = """
SELECT
  level,
  COUNT(*) AS num_attempts,
  COUNT(DISTINCT user_id) AS num_people_attempted,
  COUNT(DISTINCT (moves_made + moves_left)) AS distinct_total_moves,
  SUM(moves_made) AS total_moves_made,
  SUM(moves_left) AS total_moves_left,
  MIN(moves_made + moves_left) AS common_total_moves_min,
  MAX(moves_made + moves_left) AS common_total_moves_max
FROM `casedreamgames.case_db.q1_table_level_end`
GROUP BY level
ORDER BY level;
"""
# Indexed by level so later charts can use the index as the x axis.
moves_per_level = client.query(query).result().to_dataframe().set_index("level")
# How many levels have 1, 2, 3, ... distinct made+left totals.
moves_per_level.distinct_total_moves.value_counts()


BigQuery Storage module not found, fetch data with the REST endpoint instead.



distinct_total_moves
5     182
4     171
6     161
7     112
3      99
8      69
9      37
2      35
1      31
10     18
12     11
13      7
14      6
11      6
18      4
19      3
15      1
20      1
Name: count, dtype: Int64

In [51]:
import plotly.express as px

# Smallest and largest (moves_made + moves_left) seen on each level.
fig_moves_range = px.line(
    moves_per_level,
    y=['common_total_moves_min', 'common_total_moves_max'],
    title='Common Total Moves Min and Max per Level',
    labels={'value': 'Moves', 'variable': 'Type'},
)
fig_moves_range.show()

# Raw number of level-end events per level.
fig_attempts = px.line(moves_per_level, y='num_attempts', title='Number of Attempts per Level')
fig_attempts.show()

# Attempts divided by the distinct users who attempted the level.
avg_attempts_per_level = moves_per_level['num_attempts'] / moves_per_level['num_people_attempted']
fig_avg_attempts = px.line(avg_attempts_per_level, title='Average Number of Attempts per Level')
fig_avg_attempts.show()

# Share of all moves (made + left) that were actually played, stored as a new column.
total_moves = moves_per_level['total_moves_made'] + moves_per_level['total_moves_left']
moves_per_level['moves_made_ratio'] = moves_per_level['total_moves_made'] / total_moves
fig_moves_ratio = px.line(moves_per_level, y='moves_made_ratio', title='Moves Made Ratio per Level')
fig_moves_ratio.show()




### Cost Analysis

In [52]:
# Load the whole marketing cost table, sorted by date, platform, network and country.
query = """
SELECT * 
FROM `casedreamgames.case_db.q1_table_cost`
ORDER BY date, platform, network, country
"""

cost = client.query(query).result().to_dataframe()
cost.info()


BigQuery Storage module not found, fetch data with the REST endpoint instead.



<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1357 entries, 0 to 1356
Data columns (total 5 columns):
 #   Column    Non-Null Count  Dtype  
---  ------    --------------  -----  
 0   date      1357 non-null   dbdate 
 1   network   1357 non-null   object 
 2   country   1357 non-null   object 
 3   platform  1357 non-null   object 
 4   cost      1357 non-null   float64
dtypes: dbdate(1), float64(1), object(3)
memory usage: 53.1+ KB


In [53]:
## Daily cost: overall, then split by country, platform and network.
cost_chart_specs = (
    (None, "Cost by Date"),
    ("country", "Cost by Country"),
    ("platform", "Cost by Platform"),
    ("network", "Cost by Network"),
)

for cost_breakdown, cost_chart_title in cost_chart_specs:
    # Sum cost per date, and per breakdown value when one is given.
    cost_keys = "date" if cost_breakdown is None else ["date", cost_breakdown]
    daily_cost = cost.groupby(cost_keys, as_index=False).agg({"cost": "sum"})

    fig = px.line(
        daily_cost,
        x="date",
        y="cost",
        color=cost_breakdown,
        title=cost_chart_title,
        markers=True,
    )
    fig.update_layout(template="plotly_white", xaxis_title="Date", yaxis_title="Cost")
    fig.show()




In [64]:
## cost per installation
# Joins daily installs to daily cost on (date, country, platform, network).
# daily_installs counts distinct installing users per key; daily_cost sums cost per key.
# The LEFT JOIN starts from installs, so keys with installs but no cost row get
# total_cost = 0 via COALESCE.
# NOTE(review): cost rows whose (date, country, platform, network) has no installs
# are not returned, so aggregated cost may be understated - confirm whether a
# FULL OUTER JOIN is wanted.
query = """
WITH daily_installs AS (
    SELECT
        DATE(event_time) AS date,
        country,
        platform,
        network,
        COUNT(DISTINCT user_id) AS installs
    FROM `casedreamgames.case_db.q1_table_install`
    GROUP BY 1, 2, 3, 4
),        
daily_cost AS (
    SELECT
        date,
        country,
        platform,
        network,
        SUM(cost) AS total_cost
    FROM `casedreamgames.case_db.q1_table_cost`
    GROUP BY 1, 2, 3, 4
)       
SELECT
    di.date,
    di.country,
    di.platform,
    di.network,
    di.installs,
    COALESCE(dc.total_cost, 0) AS total_cost
FROM daily_installs di
LEFT JOIN daily_cost dc
    ON di.date = dc.date
    AND di.country = dc.country
    AND di.platform = dc.platform
    AND di.network = dc.network
ORDER BY 
    di.date,
    di.country,
    di.platform,
    di.network;       
"""

# One row per (date, country, platform, network) that has at least one install.
cost_per_install = client.query(query).result().to_dataframe()
cost_per_install.info()


BigQuery Storage module not found, fetch data with the REST endpoint instead.



<class 'pandas.core.frame.DataFrame'>
RangeIndex: 926 entries, 0 to 925
Data columns (total 6 columns):
 #   Column      Non-Null Count  Dtype  
---  ------      --------------  -----  
 0   date        926 non-null    dbdate 
 1   country     926 non-null    object 
 2   platform    926 non-null    object 
 3   network     926 non-null    object 
 4   installs    926 non-null    Int64  
 5   total_cost  926 non-null    float64
dtypes: Int64(1), dbdate(1), float64(1), object(3)
memory usage: 44.4+ KB


In [65]:
## Cost per installation over time: overall and by country, platform and network.


def _cost_per_install_chart(breakdown, chart_title):
    """Aggregate `cost_per_install` per date (and `breakdown`) and chart cost / installs.

    Returns the aggregated frame, with a `cost_per_install` column added, and the figure.
    """
    keys = "date" if breakdown is None else ["date", breakdown]
    daily = cost_per_install.groupby(keys, as_index=False).agg(
        {"total_cost": "sum", "installs": "sum"}
    )
    # Ratio of the summed totals, not a mean of per-segment ratios.
    daily["cost_per_install"] = daily["total_cost"] / daily["installs"]

    figure = px.line(
        daily,
        x="date",
        y="cost_per_install",
        color=breakdown,
        title=chart_title,
        markers=True,
    )
    figure.update_layout(
        template="plotly_white", xaxis_title="Date", yaxis_title="Cost per Installation"
    )
    return daily, figure


cost_per_install_by_date, fig = _cost_per_install_chart(None, "Cost per Installation by Date")
fig.show()

cost_per_install_by_country, fig = _cost_per_install_chart(
    "country", "Cost per Installation by Country"
)
fig.show()

cost_per_install_by_platform, fig = _cost_per_install_chart(
    "platform", "Cost per Installation by Platform"
)
fig.show()

cost_per_install_by_network, fig = _cost_per_install_chart(
    "network", "Cost per Installation by Network"
)
fig.show()


In [66]:
import plotly.express as px
import plotly.graph_objects as go
import statsmodels.api as sm
import pandas as pd

def run_regression(df):
    """Fit an ordinary-least-squares model of installs on total cost.

    `df` must provide `total_cost` and `installs` columns. An intercept term is
    added to the single regressor. Returns the fitted statsmodels results object.
    """
    design = sm.add_constant(df["total_cost"])
    response = df["installs"]
    return sm.OLS(response, design).fit()

# Daily totals are rebuilt from `cost_per_install` here instead of trimming
# `cost_per_install_by_date` in place. The original overwrote that frame with a
# trimmed copy of itself, so every re-run of the cell removed two more days.
daily_totals = cost_per_install.groupby("date", as_index=False).agg(
    {"total_cost": "sum", "installs": "sum"}
)
daily_totals["cost_per_install"] = daily_totals["total_cost"] / daily_totals["installs"]

first_date = daily_totals["date"].min()
last_date = daily_totals["date"].max()

# Rows of the two boundary days, kept for inspection.
first_day = daily_totals.loc[daily_totals["date"] == first_date]
last_day = daily_totals.loc[daily_totals["date"] == last_date]

# Remove the first and last day before fitting.
# NOTE(review): presumably these days are only partially covered - TODO confirm.
cost_per_install_by_date = daily_totals.loc[
    (daily_totals["date"] != first_date) & (daily_totals["date"] != last_date)
]

model = run_regression(cost_per_install_by_date)
print(model.summary())

# Scatter of the daily observations.
fig = px.scatter(cost_per_install_by_date, x="total_cost", y="installs", title="Regression of Installs on Total Cost")

# Overlay the fitted values; model.predict() with no arguments returns the in-sample fit.
fig.add_trace(go.Scatter(
    x=cost_per_install_by_date["total_cost"],
    y=model.predict(),
    mode='lines',
    name='Regression Line',
    line=dict(color='red')
))

fig.update_layout(
    xaxis_title="Total Cost",
    yaxis_title="Installs"
)

fig.show()

                            OLS Regression Results                            
Dep. Variable:               installs   R-squared:                       0.312
Model:                            OLS   Adj. R-squared:                  0.288
Method:                 Least Squares   F-statistic:                     13.14
Date:                Thu, 20 Feb 2025   Prob (F-statistic):            0.00110
Time:                        21:08:28   Log-Likelihood:                -262.07
No. Observations:                  31   AIC:                             528.1
Df Residuals:                      29   BIC:                             531.0
Df Model:                           1                                         
Covariance Type:            nonrobust                                         
                 coef    std err          t      P>|t|      [0.025      0.975]
------------------------------------------------------------------------------
const       2137.0158   1338.813      1.596      0.1

In [71]:
### Fit one regression per (country, platform, network) group and print each
### summary, ordered by the total_cost coefficient (largest first).

cost_per_install_models = {}

for (c, p, n), group in cost_per_install.groupby(["country", "platform", "network"]):
    # A group is fitted only when it passes all three checks; the first
    # failing check (in this order) is the one that gets reported.
    if len(group) < 20:
        print(f"Skipping {c}, {p}, {n} due to insufficient data")
    elif group["total_cost"].nunique() < 2:
        print(f"Skipping {c}, {p}, {n} due to constant cost")
    elif group["installs"].nunique() < 2:
        print(f"Skipping {c}, {p}, {n} due to constant installs")
    else:
        model = run_regression(group)
        cost_per_install_models[(c, p, n)] = model

### sorting the models by their total_cost beta, descending
sorted_models = list(cost_per_install_models.items())
sorted_models.sort(key=lambda item: item[1].params["total_cost"], reverse=True)

for (c, p, n), model in sorted_models:
    print(f"Results for {c}, {p}, {n}:")
    print(model.summary())
    print("-" * 50)

Skipping Mercury, android, Organic due to constant cost
Skipping Mercury, android, Sid due to insufficient data
Skipping Mercury, ios, Organic due to constant cost
Skipping Pluton, android, Jessie due to insufficient data
Skipping Pluton, android, Organic due to constant cost
Skipping Pluton, android, Woody due to insufficient data
Skipping Pluton, ios, Jessie due to insufficient data
Skipping Pluton, ios, Organic due to constant cost
Skipping Pluton, ios, Sid due to insufficient data
Skipping Pluton, ios, Woody due to insufficient data
Skipping Saturn, android, Buzz due to insufficient data
Skipping Saturn, android, Jessie due to insufficient data
Skipping Saturn, android, Organic due to constant cost
Skipping Saturn, android, Sid due to insufficient data
Skipping Saturn, android, Woody due to insufficient data
Skipping Saturn, ios, Buzz due to insufficient data
Skipping Saturn, ios, Organic due to constant cost
Skipping Saturn, ios, Sid due to insufficient data
Skipping Saturn, ios, 

### Monetization

In [75]:
### Daily revenue by country, platform, network and package_type.
# Revenue events are LEFT JOINed to installs on user_id: country and network
# are read from the install row, platform and package_type from the revenue
# row. Revenue values that are NULL or cannot be cast to FLOAT64 count as 0.
query = """
SELECT
  DATE(r.event_time) AS date,
  i.country,
  r.platform,
  i.network,
  r.package_type,
  SUM(IFNULL(SAFE_CAST(r.revenue AS FLOAT64), 0)) AS total_revenue
FROM `casedreamgames.case_db.q1_table_revenue` r
LEFT JOIN `casedreamgames.case_db.q1_table_install` i
  ON r.user_id = i.user_id
GROUP BY date, i.country, r.platform, i.network, r.package_type
ORDER BY date, i.country, r.platform, i.network, r.package_type;
"""
# Submit the query, wait for it to finish, then materialise the rows
revenue_job = client.query(query)
revenue = revenue_job.result().to_dataframe()

revenue.info()


BigQuery Storage module not found, fetch data with the REST endpoint instead.



<class 'pandas.core.frame.DataFrame'>
RangeIndex: 2372 entries, 0 to 2371
Data columns (total 6 columns):
 #   Column         Non-Null Count  Dtype  
---  ------         --------------  -----  
 0   date           2372 non-null   dbdate 
 1   country        2372 non-null   object 
 2   platform       2372 non-null   object 
 3   network        2372 non-null   object 
 4   package_type   2372 non-null   object 
 5   total_revenue  2372 non-null   float64
dtypes: dbdate(1), float64(1), object(4)
memory usage: 111.3+ KB


In [76]:
def _revenue_line_chart(data, title, color=None):
    """Build a line chart of summed ``total_revenue`` per date.

    Parameters
    ----------
    data : pandas.DataFrame
        Frame with a ``date`` column, a ``total_revenue`` column and, when
        ``color`` is given, a column of that name.
    title : str
        Figure title.
    color : str, optional
        Column to break the series down by (one line per distinct value).
        When omitted, a single overall line is drawn.

    Returns
    -------
    The Plotly figure, styled but not yet shown.
    """
    group_keys = ["date"] if color is None else ["date", color]
    daily_revenue = data.groupby(group_keys, as_index=False).agg({"total_revenue": "sum"})
    chart = px.line(
        daily_revenue,
        x="date",
        y="total_revenue",
        color=color,
        title=title,
        markers=True,
    )
    chart.update_layout(template="plotly_white", xaxis_title="Date", yaxis_title="Revenue")
    return chart


## plotting revenue by date, then broken down by each dimension in turn
revenue_charts = [
    ("Revenue by Date", None),
    ("Revenue by Country", "country"),
    ("Revenue by Platform", "platform"),
    ("Revenue by Network", "network"),
    ("Revenue by Package Type", "package_type"),
]

for chart_title, breakdown in revenue_charts:
    # Rebind the notebook-level `fig` each time, so it ends up holding the
    # last chart exactly as the hand-written version did.
    fig = _revenue_line_chart(revenue, chart_title, color=breakdown)
    fig.show()


In [85]:
### same for profit
profit_keys = ["date", "country", "platform", "network"]

# `revenue` holds one row per package_type inside each
# (date, country, platform, network) key. Merging it straight onto `cost`
# repeats every cost value once per package type, so the cost would be
# subtracted several times when profit is summed. Collapse both frames to one
# row per key before joining.
# NOTE(review): assumes `cost` has no package_type breakdown of its own --
# confirm against the cell that builds it.
revenue_by_segment = revenue.groupby(profit_keys, as_index=False, dropna=False).agg(
    {"total_revenue": "sum"}
)
cost_by_segment = cost.groupby(profit_keys, as_index=False, dropna=False).agg(
    {"cost": "sum"}
)

# Outer join: a segment with spend but no revenue is a loss, and one with
# revenue but no spend is pure profit. An inner join would drop both.
profit = pd.merge(revenue_by_segment, cost_by_segment, on=profit_keys, how="outer")
profit[["total_revenue", "cost"]] = profit[["total_revenue", "cost"]].fillna(0)
profit["profit"] = profit["total_revenue"] - profit["cost"]


def _profit_line_chart(data, title, color=None):
    """Build a line chart of summed ``profit`` per date.

    Parameters
    ----------
    data : pandas.DataFrame
        Frame with a ``date`` column, a ``profit`` column and, when ``color``
        is given, a column of that name.
    title : str
        Figure title.
    color : str, optional
        Column to break the series down by (one line per distinct value).
        When omitted, a single overall line is drawn.

    Returns
    -------
    The Plotly figure, styled but not yet shown.
    """
    group_keys = ["date"] if color is None else ["date", color]
    daily_profit = data.groupby(group_keys, as_index=False).agg({"profit": "sum"})
    chart = px.line(
        daily_profit,
        x="date",
        y="profit",
        color=color,
        title=title,
        markers=True,
    )
    chart.update_layout(template="plotly_white", xaxis_title="Date", yaxis_title="Profit")
    return chart


## plotting profit by date, then broken down by each dimension in turn
profit_charts = [
    ("Profit by Date", None),
    ("Profit by Country", "country"),
    ("Profit by Platform", "platform"),
    ("Profit by Network", "network"),
]

for chart_title, breakdown in profit_charts:
    fig = _profit_line_chart(profit, chart_title, color=breakdown)
    fig.show()

### cumulative profit
# Keep `date` as a column of the aggregated frame so the x values and the
# running total come from the same date-sorted rows, rather than pairing the
# groupby output with a separately derived list of dates.
cumulative_profit = profit.groupby("date", as_index=False).agg({"profit": "sum"})
cumulative_profit["profit"] = cumulative_profit["profit"].cumsum()

fig = px.line(
    cumulative_profit,
    x="date",
    y="profit",
    title="Cumulative Profit",
    markers=True,
)
fig.update_layout(template="plotly_white", xaxis_title="Date", yaxis_title="Cumulative Profit")
fig.show()

