In [2]:
import pandas as pd
from google.cloud import bigquery

def save_bigquery_table_to_dataframe(project_id: str, dataset_id: str, table_id: str, sample_size: int) -> pd.DataFrame:
    """
    Fetch at most ``sample_size`` rows from a BigQuery table into a pandas DataFrame.

    The rows are selected with a plain ``LIMIT`` clause (no ``ORDER BY`` and no
    randomisation), so the result is simply the first ``sample_size`` rows the
    query returns, not a statistically random sample.

    :param project_id: GCP project ID (also used as the client's billing project)
    :param dataset_id: BigQuery dataset ID
    :param table_id: BigQuery table ID
    :param sample_size: Maximum number of rows to fetch; must be a non-negative integer
    :return: pandas DataFrame containing the fetched rows
    :raises TypeError: if ``sample_size`` is not an integer
    :raises ValueError: if ``sample_size`` is negative
    """
    import numbers

    # Validate before building the SQL text: the value is interpolated directly
    # into the query, so anything other than a non-negative integer would yield
    # either invalid SQL or an unintended statement.
    if isinstance(sample_size, bool) or not isinstance(sample_size, numbers.Integral):
        raise TypeError(f"sample_size must be an integer, got {type(sample_size).__name__}")
    if sample_size < 0:
        raise ValueError(f"sample_size must be non-negative, got {sample_size}")

    # Construct a BigQuery client object.
    client = bigquery.Client(project=project_id)

    # Backticks around the table path allow project IDs that contain dashes.
    query = f"""
    SELECT *
    FROM `{project_id}.{dataset_id}.{table_id}`
    LIMIT {int(sample_size)}
    """

    # Submit the query, wait for completion, and convert the rows to a DataFrame.
    query_job = client.query(query)
    result = query_job.result()
    return result.to_dataframe()

In [3]:
# Fully-qualified location of the table to pull.
project_id, dataset_id, table_id = 'analytics-147612', 'dev_gsokolov', 'session_length'

client = bigquery.Client(project=project_id)

# Unfiltered read: the query has no WHERE or LIMIT clause, so every row is requested.
query = f"""
SELECT *
FROM `{project_id}.{dataset_id}.{table_id}`
"""

# Submit the job, wait for it to finish, then materialise the rows as a DataFrame.
query_job = client.query(query)
result = query_job.result()
dataframe = result.to_dataframe()





In [4]:
# Show the loaded frame as the cell output (the rendering below is truncated to the first and last rows).
dataframe

Unnamed: 0,customer_user_id,session_id,session_length
0,38929408,304,"<DateOffset: days=0, microseconds=2302175000, ..."
1,27397556,338,"<DateOffset: days=0, microseconds=3158000000, ..."
2,33988111,139,"<DateOffset: days=0, microseconds=1036993000, ..."
3,34001467,29,"<DateOffset: days=0, microseconds=469000, mont..."
4,3770345,874,"<DateOffset: days=0, microseconds=3086000, mon..."
...,...,...,...
45334269,8189371,820,"<DateOffset: days=0, microseconds=0, months=0,..."
45334270,36291264,563,"<DateOffset: days=0, microseconds=0, months=0,..."
45334271,33497275,433,"<DateOffset: days=0, microseconds=0, months=0,..."
45334272,15424384,175,"<DateOffset: days=0, microseconds=0, months=0,..."


In [None]:
from google.cloud import bigquery

# Initialize the BigQuery client
client = bigquery.Client()

# Set the project for the BigQuery dataset and table
project_id = "your-project-id"
dataset_id = "your-dataset-id"
table_id = "your-table-id"

# Define the query with sampling.
# BigQuery's LIMIT/OFFSET accept only literals or query parameters, so a computed
# random OFFSET (RAND() times a COUNT(*) subquery) is rejected by the engine.
# Instead, shuffle the rows with ORDER BY RAND() and keep the first 10.
# The backticks let the table path contain dashes, as these placeholder IDs do.
# NOTE: ORDER BY RAND() scans the whole table; for very large tables consider
# TABLESAMPLE SYSTEM (... PERCENT) to reduce the bytes read.
query = f"SELECT * FROM `{project_id}.{dataset_id}.{table_id}` ORDER BY RAND() LIMIT 10"

# Run the query using BigQuery client and get the results as a DataFrame
df = client.query(query).to_dataframe()


In [5]:
import math
from scipy.stats import norm

def sample_size_for_difference(p1, p2, alpha=0.05, beta=0.2):
    """
    Calculate the per-group sample size needed to detect a difference between
    two proportions with a two-sided z-test.

    Uses the pooled-variance approximation

        n = (z_{1 - alpha/2} + z_{1 - beta})^2 * 2 * p * (1 - p) / (p1 - p2)^2

    where p = (p1 + p2) / 2.

    Parameters:
    p1 (float): Proportion 1, in [0, 1]
    p2 (float): Proportion 2, in [0, 1]; must differ from p1
    alpha (float): Significance level, i.e. type I error rate (default 0.05)
    beta (float): Type II error rate; the power of the test is 1 - beta
        (default 0.2, i.e. 80% power)

    Returns:
    int: Sample size per group, rounded up

    Raises:
    ValueError: if a proportion is outside [0, 1], if alpha or beta is outside
        (0, 1), or if p1 equals p2 (no difference to detect).
    """
    if not (0 <= p1 <= 1 and 0 <= p2 <= 1):
        raise ValueError("p1 and p2 must be proportions in the range [0, 1]")
    if not (0 < alpha < 1 and 0 < beta < 1):
        raise ValueError("alpha and beta must be in the open interval (0, 1)")
    if p1 == p2:
        # The effect size is zero, so the required sample size is unbounded.
        raise ValueError("p1 and p2 must differ to compute a sample size")

    # calculate the pooled proportion
    p_pooled = (p1 + p2) / 2

    # critical value for the two-sided significance level
    z_alpha = float(norm.ppf(1 - alpha / 2))

    # critical value corresponding to the desired power (1 - beta)
    z_beta = float(norm.ppf(1 - beta))

    # calculate the sample size
    n = (z_alpha + z_beta) ** 2 * (2 * p_pooled * (1 - p_pooled)) / (p1 - p2) ** 2

    return math.ceil(n)

# Example: sample size needed to tell a proportion of 1.2% apart from 1.5%.
p1, p2 = 0.012, 0.015  # the two proportions being compared
alpha = 0.05  # significance level (type I error rate)
beta = 0.2  # type II error rate, i.e. the test has 1 - beta = 80% power

n = sample_size_for_difference(p1, p2, alpha=alpha, beta=beta)
print(f"The required sample size per group for the A/B test is: {n}")


The required sample size per group for the A/B test is: 23229


0.01201716133113985