In [7]:
import pandas as pd
from google.cloud import bigquery


def save_bigquery_table_to_dataframe(
    project_id: str, dataset_id: str, table_id: str, sample_size: int
) -> pd.DataFrame:
    """
    Fetches a sample of rows from a BigQuery table and saves it to a pandas DataFrame.

    :param project_id: GCP project ID
    :param dataset_id: BigQuery dataset ID
    :param table_id: BigQuery table ID
    :param sample_size: Number of rows to sample
    :return: pandas DataFrame containing the sampled data
    """
    # Construct a BigQuery client object.
    client = bigquery.Client(project=project_id)

    # Construct the SQL query to fetch a sample of rows
    query = f"""
    SELECT *
    FROM `{project_id}.{dataset_id}.{table_id}`
    LIMIT {sample_size}
    """

    # Execute the query and convert the result to a pandas DataFrame
    query_job = client.query(query)
    result = query_job.result()
    dataframe = result.to_dataframe()

    return dataframe

In [3]:
project_id = "analytics-147612"
dataset_id = "dev_gsokolov"
table_id = "session_length"

client = bigquery.Client(project=project_id)

# Construct the SQL query to fetch a sample of rows
query = f"""
SELECT *
FROM `{project_id}.{dataset_id}.{table_id}`
"""

# Execute the query and convert the result to a pandas DataFrame
query_job = client.query(query)
result = query_job.result()
dataframe = result.to_dataframe()



In [4]:
dataframe

Unnamed: 0,customer_user_id,session_id,session_length
0,38929408,304,"<DateOffset: days=0, microseconds=2302175000, ..."
1,27397556,338,"<DateOffset: days=0, microseconds=3158000000, ..."
2,33988111,139,"<DateOffset: days=0, microseconds=1036993000, ..."
3,34001467,29,"<DateOffset: days=0, microseconds=469000, mont..."
4,3770345,874,"<DateOffset: days=0, microseconds=3086000, mon..."
...,...,...,...
45334269,8189371,820,"<DateOffset: days=0, microseconds=0, months=0,..."
45334270,36291264,563,"<DateOffset: days=0, microseconds=0, months=0,..."
45334271,33497275,433,"<DateOffset: days=0, microseconds=0, months=0,..."
45334272,15424384,175,"<DateOffset: days=0, microseconds=0, months=0,..."


In [None]:
from google.cloud import bigquery

# Initialize the BigQuery client
client = bigquery.Client()

# Set the project for the BigQuery dataset and table
project_id = "your-project-id"
dataset_id = "your-dataset-id"
table_id = "your-table-id"

# Define the query with sampling
query = f"SELECT * FROM {project_id}.{dataset_id}.{table_id} LIMIT 10 OFFSET FLOOR(RAND() * (SELECT COUNT(*) FROM {project_id}.{dataset_id}.{table_id}))"

# Run the query using BigQuery client and get the results as a DataFrame
df = client.query(query).to_dataframe()

In [5]:
import math
from scipy.stats import norm


def sample_size_for_difference(p1, p2, alpha=0.05, beta=0.2):
    """
    Calculate the sample size for a difference between two proportions.

    Parameters:
    p1 (float): Proportion 1
    p2 (float): Proportion 2
    alpha (float): Significance level (default 0.05)
    beta (float): Power of the test (default 0.2)

    Returns:
    int: Sample size per group
    """
    # calculate the pooled proportion
    p_pooled = (p1 + p2) / 2

    # calculate the standard deviation
    std_dev = math.sqrt(p_pooled * (1 - p_pooled) * ((1 / p1) + (1 / p2)))

    # calculate the z-score
    z_score = norm.ppf(1 - alpha / 2)

    # calculate the z-score for beta
    z_beta = norm.ppf(1 - beta)

    # calculate the sample size
    n = (z_score + z_beta) ** 2 * (2 * p_pooled * (1 - p_pooled)) / (p1 - p2) ** 2

    return math.ceil(n)


# usage
p1 = 0.012  # proportion 1
p2 = 0.015  # proportion 2
alpha = 0.05  # significance level
beta = 0.2  # power of the test

n = sample_size_for_difference(p1, p2, alpha, beta)
print(f"The required sample size per group for the A/B test is: {n}")

The required sample size per group for the A/B test is: 23229


In [9]:
users_trading = save_bigquery_table_to_dataframe(
    "analytics-147612", "dev_gsokolov", "users_segment_trading", 10000
)
users_trading.head()



Unnamed: 0,user_id,registered_ut,registered_dt,verification_status,is_email_verified,is_locked,lock_comment,last_login,birthdate,country_code,...,email_hash,email_hash_md5,firstname_hash,lastname_hash,phone_hash,phone_filled,has_token,has_consent,deal_count,total_volume
0,1017854,1562047096,2019-07-02 05:58:16+00:00,3,1,0,,1716502604,1991-06-05,CHL,...,2534303fee7d84e63236e07561515699a527a33f793988...,99203519feed135300c9fae79ff2707c,f94d542014cccfbc31406d27972e3dccd8db969c154f90...,346879cfc9032bf83442fec5fc07393a0691351c4ae77f...,be4b93a09980d61271453c2259b00270ad536cdc900abd...,1,True,True,71,1.35
1,1014769,1561964738,2019-07-01 07:05:38+00:00,3,1,0,,1716911419,1974-04-05,IND,...,5541e41baf47dd6f627def88659f1c31c2df8d4d98d2ad...,95193f2262a092383947e24a174ef5e7,cfdcd7c35d3ffbd605ec021a42b8c9a59e22042b84854a...,909453c008c3e766679ee5fda1bc84a6bbf0cae73936a8...,dc4aa5eddf0770a4fcb28226e9101995ad336c30b680fa...,1,False,True,13,0.13
2,1013913,1561948559,2019-07-01 02:35:59+00:00,3,1,0,,1704961337,1977-03-02,IND,...,2338ecf7d01f5d7da1867bd00dc676f4da369beff3c44c...,75d5da827369693f2c1a3f1c670e6ee8,917fe754aedf299775c71966c21dee69524b501534c9e3...,55887e06a97812fd4612f360e996272cb90f670de33cd0...,1a42c8aca326b367e2b9ea629b0f9a64a972824724e4ce...,1,True,True,94,1.01
3,1016884,1562018484,2019-07-01 22:01:24+00:00,3,1,0,,1716559680,1997-04-02,BOL,...,7b5936f7f12458d7f22d2d0a0328d6d529a10050077f96...,ac93cc442468b96bdb37f28eadc05852,c89f933155b560d8c41d007e8a7aa1be2e60ade66ff813...,4a26e9c660a61539000bff02e6e44d29b33271a4b84b7a...,cce84a170edc3184cf27a3909898634b7ed8e29faaa30e...,1,False,True,118,5.81
4,1015854,1561988264,2019-07-01 13:37:44+00:00,3,1,0,,1716860914,1998-07-08,BOL,...,17616c5e501ca42005f0508c8fedafc8beaa65c13a97af...,bd8efede12295cfd09460a0c478d4931,1fafec1934eb613c8a4d23247a8fca44be7986180442d3...,20527dbedd1d67c2c6f38dab72c4b3b162f2777b31bab2...,ec87b90119b713948aad46893cb28e4496350f7f48bae4...,1,True,True,543,9.78


In [10]:
# Import pandas library
import pandas as pd

# Assuming 'users_trading' is your DataFrame
# users_trading = pd.DataFrame()  # Replace this with your actual DataFrame

# Identify columns to drop
cols_to_drop = [col for col in users_trading.columns if "hash" in col]

# Drop the identified columns
users_trading.drop(cols_to_drop, axis=1, inplace=True)

In [11]:
users_trading

Unnamed: 0,user_id,registered_ut,registered_dt,verification_status,is_email_verified,is_locked,lock_comment,last_login,birthdate,country_code,...,language_id,support_language_id,is_withdraw_disabled,is_deposit_disabled,email_domain,phone_filled,has_token,has_consent,deal_count,total_volume
0,1017854,1562047096,2019-07-02 05:58:16+00:00,3,1,0,,1716502604,1991-06-05,CHL,...,8,,0,0,gmail.com,1,True,True,71,1.35
1,1014769,1561964738,2019-07-01 07:05:38+00:00,3,1,0,,1716911419,1974-04-05,IND,...,1,,0,0,gmail.com,1,False,True,13,0.13
2,1013913,1561948559,2019-07-01 02:35:59+00:00,3,1,0,,1704961337,1977-03-02,IND,...,1,,0,0,gmail.com,1,True,True,94,1.01
3,1016884,1562018484,2019-07-01 22:01:24+00:00,3,1,0,,1716559680,1997-04-02,BOL,...,1,,0,0,gmail.com,1,False,True,118,5.81
4,1015854,1561988264,2019-07-01 13:37:44+00:00,3,1,0,,1716860914,1998-07-08,BOL,...,8,,0,0,gmail.com,1,True,True,543,9.78
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
9995,750693,1549446630,2019-02-06 09:50:30+00:00,3,1,0,,1644412508,1979-01-10,MYS,...,1,,0,0,gmail.com,1,True,True,10,0.11
9996,743212,1548988640,2019-02-01 02:37:20+00:00,3,1,0,,1710483688,1975-09-15,MYS,...,3,,0,0,yahoo.com,1,True,True,855,50.16
9997,743599,1549013467,2019-02-01 09:31:07+00:00,3,1,0,,1716723450,1982-08-12,MYS,...,1,,0,0,gmail.com,1,True,True,1333,118.63
9998,747869,1549303997,2019-02-04 18:13:17+00:00,3,1,0,,1716940780,1992-10-18,MYS,...,1,,0,0,gmail.com,1,False,True,39,0.40


In [13]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.tree import DecisionTreeClassifier
from sklearn.metrics import accuracy_score
from sklearn.datasets import load_iris

# Load a sample dataset
data = users_trading
df = data
df["target"] = data.total_volume

# Split the data into features and target
X = df.drop("target", axis=1)
y = df["target"]

# Split the dataset into training and testing sets
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.3, random_state=42
)

# Create a decision tree classifier
clf = DecisionTreeClassifier()

# Train the classifier
clf.fit(X_train, y_train)

# Make predictions on the test set
y_pred = clf.predict(X_test)

# Evaluate the model
accuracy = accuracy_score(y_test, y_pred)
print(f"Accuracy: {accuracy * 100:.2f}%")

TypeError: Cannot cast DatetimeArray to dtype float32

In [None]:
import numpy as np
import pandas as pd
from sklearn.model_selection import train_test_split
    

np.random.seed(42)  # For reproducibility
X = np.random.rand(100, 1)  # 100 samples with 1 feature
y = np.random.choice(
    [0, 1], size=100, p=[0.3, 0.7]
)  # Class labels with 30% 0s and 70% 1s

# Convert to a pandas DataFrame for better visualization
df = pd.DataFrame(X, columns=["feature"])
df["class"] = y

# Perform stratified sampling to split the dataset into a training set and a test set
# The stratify parameter ensures that the proportion of classes in the splits is the same as the original dataset
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, stratify=y, random_state=42
)

# Verify the split by checking the proportions of classes in the original dataset and the splits
original_proportion = np.mean(y)
train_proportion = np.mean(y_train)
test_proportion = np.mean(y_test)

print(f"Original dataset proportion of class 1: {original_proportion}")
print(f"Training set proportion of class 1: {train_proportion}")
print(f"Test set proportion of class 1: {test_proportion}")

# The proportions should be similar, indicating that stratified sampling was successful