In [5]:
import pandas as pd
import numpy as np
from google.cloud import bigquery as bq

# Path to the service-account JSON key file used to authenticate to BigQuery.
# NOTE(review): this is an absolute, machine-specific path; consider reading it
# from configuration or an environment variable before sharing the notebook.
service_account_path = "/home/yusuf/DataScience/dream_games/ybektas20.json" 
# BigQuery client built from that key file; used below to run the queries.
client = bq.Client.from_service_account_json(service_account_path)

# Define queries for AB test tables.
# Maps a table name to a SQL query that returns a single aggregate row for
# that table: `total_rows` (COUNT(*)) plus one `non_null_<column>` value per
# column. COUNT(<column>) counts only non-NULL values, so the number of NULLs
# in a column is `total_rows - non_null_<column>`.
ab_test_tables_queries = {
    # Test-entry table: entry/install timestamps, user, platform and group.
    "q2_table_ab_test_enter": """
        SELECT
          COUNT(*) AS total_rows,
          COUNT(test_entry_timestamp) AS non_null_test_entry_timestamp,
          COUNT(install_timestamp) AS non_null_install_timestamp,
          COUNT(user_id) AS non_null_user_id,
          COUNT(platform) AS non_null_platform,
          COUNT(group_id) AS non_null_group_id
        FROM `casedreamgames.case_db.q2_table_ab_test_enter`;
    """,
    # Revenue table: event timestamp, user, platform, package, level, amount.
    "q2_table_ab_test_revenue": """
        SELECT
          COUNT(*) AS total_rows,
          COUNT(event_timestamp) AS non_null_event_timestamp,
          COUNT(user_id) AS non_null_user_id,
          COUNT(platform) AS non_null_platform,
          COUNT(package_type) AS non_null_package_type,
          COUNT(level) AS non_null_level,
          COUNT(dollar_amount) AS non_null_dollar_amount
        FROM `casedreamgames.case_db.q2_table_ab_test_revenue`;
    """,
    # Session table: event timestamp, user, platform, time spent and level.
    "q2_table_ab_test_session": """
        SELECT
          COUNT(*) AS total_rows,
          COUNT(event_timestamp) AS non_null_event_timestamp,
          COUNT(user_id) AS non_null_user_id,
          COUNT(platform) AS non_null_platform,
          COUNT(time_spent) AS non_null_time_spent,
          COUNT(level) AS non_null_level
        FROM `casedreamgames.case_db.q2_table_ab_test_session`;
    """
}

# Run every data-quality query and print, per table, the total row count and
# the non-null / null count of each column.
#
# Each query aliases its per-column counts as "non_null_<column>"; only that
# leading prefix is stripped for display, so a column whose own name happens
# to contain "non_null_" elsewhere is still reported correctly.
alias_prefix = "non_null_"

for table_name, query in ab_test_tables_queries.items():
    print(f"Results for {table_name}:")

    # Run the query; assumes 'client' is your configured BigQuery client.
    # The queries are aggregates, so all values are read from row 0.
    df = client.query(query).result().to_dataframe()

    # Extract total row count.
    total_rows = df.loc[0, "total_rows"]
    print(f"Shape: ({total_rows} rows)")

    # Every column other than total_rows holds a non-null count.
    for col in df.columns:
        if col == "total_rows":
            continue

        non_null_count = df.loc[0, col]
        null_count = total_rows - non_null_count

        # Drop only the leading alias prefix to recover the source column name
        # (str.replace would also remove occurrences inside the name).
        orig_col = col[len(alias_prefix):] if col.startswith(alias_prefix) else col
        print(f"Column '{orig_col}': non-null = {non_null_count}, null = {null_count}")

    print("\n" + "-"*50 + "\n")


Results for q2_table_ab_test_enter:




Shape: (73450 rows)
Column 'test_entry_timestamp': non-null = 73450, null = 0
Column 'install_timestamp': non-null = 72249, null = 1201
Column 'user_id': non-null = 73450, null = 0
Column 'platform': non-null = 73450, null = 0
Column 'group_id': non-null = 73450, null = 0

--------------------------------------------------

Results for q2_table_ab_test_revenue:
Shape: (74929 rows)
Column 'event_timestamp': non-null = 74929, null = 0
Column 'user_id': non-null = 74929, null = 0
Column 'platform': non-null = 74929, null = 0
Column 'package_type': non-null = 74929, null = 0
Column 'level': non-null = 74929, null = 0
Column 'dollar_amount': non-null = 74929, null = 0

--------------------------------------------------

Results for q2_table_ab_test_session:
Shape: (224144299 rows)
Column 'event_timestamp': non-null = 224144299, null = 0
Column 'user_id': non-null = 224144299, null = 0
Column 'platform': non-null = 224144246, null = 53
Column 'time_spent': non-null = 224144296, null = 3
Colu