In [23]:
import pandas as pd
import scipy as sc
import numpy as np
import scipy.stats as stats
from scipy.stats import shapiro, kstest, ttest_ind, norm, mannwhitneyu, binom
import math
import statsmodels.api as sm
import statsmodels.stats.api as sms
import statsmodels.stats.proportion as proportion
import matplotlib.pyplot as plt
import statsmodels.stats.power as smp
from tqdm.auto import tqdm
from random import randint
import seaborn as sns
from itertools import product
from tqdm.notebook import tqdm
import warnings
from pyperclip import paste, copy
import ipywidgets as widgets
from itertools import product
import matplotlib.ticker as mtick

from IPython.display import Markdown as md

%load_ext google.cloud.bigquery
warnings.filterwarnings("ignore")

%matplotlib inline
%config InlineBackend.figure_format = 'retina'

pal = [
    "#be0707",
    "#dc3248",
    "#ef5a80",
    "#f982b2",
    "#fca9dd",
    "#ffcfff",
    "#eebdfb",
    "#daacf9",
    "#c19cf8",
    "#a48ef7",
    "#7e82f7",
]
sns.set_theme(style="white", palette=pal, rc={"figure.figsize": (12, 8)})
plt.rcParams["figure.figsize"] = (12, 8)
res = pd.DataFrame()

In [128]:
user_ids = pd.read_csv(
    "/Users/gsokolov/Library/CloudStorage/GoogleDrive-gsokolov@ourgapps.com/My Drive/Exported Data/BTC_recom_210224.csv"
)

In [None]:
params = {"user_ids": user_ids["user_id"].to_list()}

In [ ]:
%%sql
SELECT
    user_id
    , operation_id
    , symbol_name
    , cmd
    , volume
    , open_price
    , close_price
    , profit
    , open_time_dt
    , close_time_dt
FROM
    `analytics-147612.wh_raw.trading_real_raw`
WHERE
    user_id IN UNNEST(@user_ids)
    AND DATE(close_time_dt) BETWEEN '2024-01-01' AND '2024-02-26'
    AND cmd < 2

In [ ]:
%%sql
SELECT
    user_id
    , DATE(DATE_TRUNC(open_time_dt, WEEK (MONDAY))) AS trade_week
    , ARRAY_AGG(
        STRUCT(
            user_id
            , operation_id
            , symbol_name
            , cmd
            , volume
            , open_price
            , close_price
            , profit
            , open_time_dt
            , close_time_dt
        )
    ) AS deals
FROM
    `analytics-147612.wh_raw.trading_real_raw`
WHERE
    user_id IN UNNEST(@user_ids)
    AND (
        DATE(open_time_dt) BETWEEN '2024-01-01' AND '2024-02-23'
        AND DATE(close_time_dt) BETWEEN '2024-01-01' AND '2024-02-23'
    )
    AND cmd < 2
GROUP BY
    user_id
    , trade_week
ORDER BY
    user_id
    , trade_week

In [None]:
deals = res

In [None]:
deals["deals"].apply(lambda x: [i for i in x if "BTC" in i["symbol_name"]]).apply(
    lambda x: len(x)
).sum()

In [None]:
# Per user-week features derived from the aggregated `deals` struct arrays.
all_deals = deals["deals"]
deals["deals_cnt"] = all_deals.map(len)
deals["sum_vol"] = all_deals.map(lambda rows: sum(row["volume"] for row in rows))

# Keep only BTCUSD trades, then derive the BTC count and volume from that subset.
deals["btc_deals"] = all_deals.map(
    lambda rows: [row for row in rows if row["symbol_name"] == "BTCUSD"]
)
deals["btc_deals_cnt"] = deals["btc_deals"].map(len)
deals["sum_btc_vol"] = deals["btc_deals"].map(
    lambda rows: sum(row["volume"] for row in rows)
)

# Wide view: one row per user, one column per trade week, cells hold the deal lists.
deals_pivot = deals.pivot(index="user_id", columns="trade_week", values="deals")

In [None]:
deals["btc_deals_cnt"].sum()

In [None]:
def transform_list(data):
    """Convert a sequence of row mappings into a column-oriented DataFrame.

    The column set and order are taken from the keys of the first row; every
    row must provide those keys (a missing key raises ``KeyError``).

    Args:
        data: Sequence (list, tuple, array) of dict-like rows.

    Returns:
        pd.DataFrame with one column per key of the first row and one row per
        element of ``data``. An empty ``data`` yields an empty DataFrame.
    """
    # len() instead of truthiness so numpy arrays of rows are handled too.
    if len(data) == 0:
        return pd.DataFrame()
    columns = list(data[0])
    return pd.DataFrame({key: [row[key] for row in data] for key in columns})

In [None]:
# import operator
# list(map(operator.itemgetter('operation_id'), deals_0))

In [None]:
deals_agg = deals.groupby("trade_week")[
    ["deals_cnt", "sum_vol", "btc_deals_cnt", "sum_btc_vol"]
].sum()

In [None]:
# Take an explicit copy: adding a column to a column-slice of `deals` is
# chained assignment and triggers pandas' SettingWithCopy behaviour.
deals_num = deals[["user_id", "trade_week", "btc_deals_cnt", "deals_cnt"]].copy()
# Share of each user-week's deals that were BTCUSD deals.
deals_num["btc_deal_cr"] = deals_num["btc_deals_cnt"] / deals_num["deals_cnt"]

In [None]:
deals_num["btc_deals_cnt"].sum() / deals_num["deals_cnt"].sum()

In [None]:
deals_agg["sum_btc_vol"] / deals_agg["sum_vol"]
deals_agg["btc_deals_cnt"] / deals_agg["deals_cnt"]

In [None]:
def g(x):
    """Format a number as thousands with one decimal place and a ``k`` suffix."""
    thousands = x / 1000
    return format(thousands, ".1f") + "k"


deals_agg["deals_cnt"] = deals_agg["deals_cnt"].apply(lambda x: f"{x/1000:.1f}k")
deals_agg

In [None]:
deals_agg = deals.groupby("trade_week")[
    ["deals_cnt", "btc_deals_cnt", "sum_vol", "sum_btc_vol"]
].sum()

deals_agg["traders_cnt"] = deals.groupby("trade_week")["user_id"].size()
deals_agg["btc_vol %"] = (
    (deals_agg["sum_btc_vol"] / deals_agg["sum_vol"]) * 100
).apply(lambda x: f"{x:.2f}%")
deals_agg["btc_deals %"] = (
    (deals_agg["btc_deals_cnt"] / deals_agg["deals_cnt"]) * 100
).apply(lambda x: f"{x:.2f}%")

deals_agg["deals_cnt"] = deals_agg["deals_cnt"].apply(lambda x: f"{x/1000:.1f}k")
deals_agg["btc_deals_cnt"] = deals_agg["btc_deals_cnt"].apply(
    lambda x: f"{x/1000:.1f}k"
)

deals_agg = deals_agg.rename(
    {"deals_cnt": "total_deals_count", "sum_vol": "total_volume"}, axis=1
)

In [None]:
deals["btc_deals_cnt"].sum() / deals["deals_cnt"].sum()

In [None]:
data = deals_agg.reset_index()
import matplotlib.pyplot as plt
import seaborn as sns

# Convert 'trade_week' to datetime
# data['trade_week'] = pd.to_datetime(data['trade_week'])

# Convert 'total_deals_count' and 'btc_deals_cnt' from formatted strings to numeric
data["total_deals_count"] = (
    data["total_deals_count"].str.replace("k", "e3").astype(float)
)
data["btc_deals_cnt"] = data["btc_deals_cnt"].str.replace("k", "e3").astype(float)

# Plotting
plt.figure(figsize=(14, 10))

# Total deals count and Bitcoin deals count
plt.subplot(3, 1, 1)
plt.plot(
    data["trade_week"], data["total_deals_count"], label="Total Deals Count", marker="o"
)
plt.plot(
    data["trade_week"], data["btc_deals_cnt"], label="Bitcoin Deals Count", marker="x"
)
plt.title("Total and Bitcoin Deals Count Over Time")
plt.xlabel("Trade Week")
plt.ylabel("Deals Count")
plt.legend()

# Total volume and Bitcoin volume
plt.subplot(3, 1, 2)
plt.plot(data["trade_week"], data["total_volume"], label="Total Volume", marker="o")
plt.plot(data["trade_week"], data["sum_btc_vol"], label="Bitcoin Volume", marker="x")
plt.title("Total and Bitcoin Volume Over Time")
plt.xlabel("Trade Week")
plt.ylabel("Volume")
plt.legend()

# Traders count
plt.subplot(3, 1, 3)
plt.plot(
    data["trade_week"],
    data["traders_cnt"],
    label="Traders Count",
    color="green",
    marker="o",
)
plt.title("Traders Count Over Time")
plt.xlabel("Trade Week")
plt.ylabel("Traders Count")
plt.legend()

plt.tight_layout()
plt.show()

In [None]:
p = sns.color_palette("Blues", n_colors=2)

In [None]:
# Preparing the data for a stacked bar chart
data["non_btc_deals_cnt"] = data["total_deals_count"] - data["btc_deals_cnt"]

# Melting the dataframe to long format for seaborn plotting
long_format = data.melt(
    id_vars=["trade_week"],
    value_vars=["btc_deals_cnt", "non_btc_deals_cnt"],
    var_name="Deal Type",
    value_name="Deals Count",
)

# Renaming for clarity in the plot
long_format["Deal Type"] = long_format["Deal Type"].map(
    {"btc_deals_cnt": "Bitcoin Deals", "non_btc_deals_cnt": "Non-Bitcoin Deals"}
)
# Plotting
plt.figure(figsize=(12, 6))
sns.barplot(x="trade_week", y="Deals Count", hue="Deal Type", data=long_format)
plt.title("Total Deals Count vs. Bitcoin Deals Count")
plt.xlabel("Trade Week")
plt.ylabel("Deals Count")
plt.xticks(rotation=45)
plt.legend(title="Deal Type")
plt.show()

In [None]:
deals["btc_deals_cnt"].sum() / deals["deals_cnt"].sum()

In [None]:
deals["sum_btc_vol"].mean()

In [None]:
effect_size = smp.tt_ind_solve_power(
    alpha=0.05, power=0.8, nobs1=11000, ratio=1, alternative="two-sided"
)
effect_size

In [None]:
smp.tt_ind_solve_power(effect_size=0.05, alpha=0.05, power=0.8, nobs1=None)

In [None]:
import statsmodels.stats.effect_size as sme

In [None]:
mu_0 = deals["sum_btc_vol"].mean()
d = 0.05
sigma = deals["sum_btc_vol"].std()

mu_1 = d * sigma + mu_0
mu_1

In [None]:
from sklearn.model_selection import train_test_split

user_id_part1, user_id_part2 = train_test_split(
    user_ids["user_id"], test_size=0.5, random_state=42
)

In [None]:
deals["deals"].apply(
    lambda x: " ".join({i["symbol_name"] for i in x if "BTC" in i["symbol_name"]})
).unique()

In [None]:
deals_test, deals_control = (
    deals[deals.user_id.isin(user_id_part1)],
    deals[deals.user_id.isin(user_id_part2)],
)

In [None]:
from numpy import var, mean, sqrt


def cohend(d1, d2):
    """Cohen's d for two independent samples, using the pooled standard deviation.

    Args:
        d1: First sample (sequence of numbers).
        d2: Second sample (sequence of numbers).

    Returns:
        ``(mean(d1) - mean(d2)) / pooled_sd`` where the pooled SD combines the
        two ddof=1 sample variances weighted by their degrees of freedom.
    """
    size_a, size_b = len(d1), len(d2)
    # Sample variances (ddof=1), each weighted by its degrees of freedom.
    weighted_var = (size_a - 1) * np.var(d1, ddof=1) + (size_b - 1) * np.var(
        d2, ddof=1
    )
    pooled_sd = np.sqrt(weighted_var / (size_a + size_b - 2))
    mean_gap = np.mean(d1) - np.mean(d2)
    return mean_gap / pooled_sd

In [None]:
from scipy.stats import ttest_ind
import pingouin as pg


def calculate_sample_size(effect_size, power=0.8, alpha=0.05):
    """Solve for the per-group sample size of a two-sided independent t-test.

    Args:
        effect_size: Standardized effect size passed to the power solver.
        power: Target power of the test.
        alpha: Significance level of the test.

    Returns:
        Tuple ``(n, n_rounded)``: the solved per-group size truncated to an
        int, and the same size rounded up to the next multiple of 100.
    """
    # The argument is used as given; it was previously overwritten with a
    # hard-coded 0.05, which made the parameter a no-op.
    sample_size = smp.TTestIndPower().solve_power(
        effect_size=effect_size,
        power=power,
        alpha=alpha,
        ratio=1,  # equal-sized groups
        alternative="two-sided",
    )

    return int(sample_size), math.ceil(sample_size / 100) * 100


# pg.compute_effsize([0.15], [0.2])
calculate_sample_size(0.05)

In [None]:
def calculate_tpr(df, alpha=0.05):
    """Summarize the share of simulation runs whose p-value is below ``alpha``.

    Args:
        df: DataFrame with a ``pvalue`` column, one row per simulation run.
        alpha: Significance threshold a p-value must be below to count as a
            detection. Defaults to 0.05.

    Returns:
        pd.Series with ``tpr`` (detection rate in percent) and ``CI`` (a
        normal-approximation interval formatted as ``"[lower, upper]"``).
        Note that ``CI`` is on the 0-1 proportion scale while ``tpr`` is in
        percent.
    """
    n_runs = len(df)
    tpr = (df["pvalue"] < alpha).sum() / n_runs

    # Normal-approximation interval for a proportion; 1.96 is the two-sided
    # 95% critical value of the standard normal.
    z_95 = 1.96
    se = np.sqrt(tpr * (1 - tpr) / n_runs)
    lower = tpr - z_95 * se
    upper = tpr + z_95 * se
    return pd.Series({"tpr": tpr * 100, "CI": f"[{lower:.2f}, {upper:.2f}]"})

In [None]:
def run_simulation(params):
    """Run one simulated A/B comparison and return its t-test summary.

    Args:
        params: Tuple ``(lift, n, data)``: the amount added to the baseline
            mean for the test arm, the number of rows to sample, and a
            DataFrame that has a ``sum_btc_vol`` column.

    Returns:
        dict with keys ``lift``, ``n``, ``pvalue``, ``stat``, ``control_mean``,
        ``test_mean`` and ``power`` (power solved for the observed effect
        size with ``nobs1`` set to the control-arm size at alpha = 0.05).
    """
    from scipy.stats import ttest_ind

    lift, n, data = params
    sampled_data = data.sample(n)
    # Fair coin flip per sampled row: 1 -> control arm, 0 -> test arm.
    sampled_data["is_control"] = np.random.binomial(1, 0.5, n)

    # Explicit copies: the test arm has a column overwritten below, and
    # writing into a boolean-filtered slice is chained assignment in pandas.
    control_data = sampled_data[sampled_data["is_control"] == 1].copy()
    test_data = sampled_data[sampled_data["is_control"] == 0].copy()

    # NOTE(review): the test-arm metric is replaced by Bernoulli draws with
    # p = mean(sum_btc_vol) + lift, so p must stay within [0, 1] and the test
    # arm is binary while the control arm keeps the observed volumes.
    # Confirm this is the intended treatment model.
    test_data["sum_btc_vol"] = binom.rvs(
        1, data["sum_btc_vol"].mean() + lift, size=len(test_data)
    )
    stat, p_value = ttest_ind(control_data["sum_btc_vol"], test_data["sum_btc_vol"])
    return {
        "lift": lift,
        "n": n,
        "pvalue": p_value,
        "stat": stat,
        "control_mean": np.mean(control_data["sum_btc_vol"]),
        "test_mean": np.mean(test_data["sum_btc_vol"]),
        # power=None asks the solver for the power given the other arguments.
        "power": smp.TTestIndPower().solve_power(
            effect_size=pg.compute_effsize(
                test_data["sum_btc_vol"], control_data["sum_btc_vol"]
            ),
            power=None,
            alpha=0.05,
            nobs1=len(control_data),
            alternative="two-sided",
        ),
    }


def ab_simulation(data, num_simulations, lift, rounded, alpha):
    """Run a grid of simulated A/B tests and tabulate the detection rate.

    The grid is every combination of lifts from ``np.arange(0.03, 0.1, 0.01)``
    and ten sample sizes spaced evenly from ``rounded`` to ``3 * rounded``;
    each combination is simulated ``num_simulations`` times with
    ``run_simulation`` and summarized with ``calculate_tpr``.

    Args:
        data: DataFrame handed unchanged to every ``run_simulation`` call.
        num_simulations: Number of repetitions of the whole grid.
        lift: Accepted for interface compatibility; not used by the body.
        rounded: Smallest sample size of the grid.
        alpha: Accepted for interface compatibility; not used by the body.

    Returns:
        DataFrame sorted by detection rate (descending) with the columns
        renamed to "Detected Effect", "Sample Size" and "True Positive (%)".
        The same table is also written to ``ab_simulation_results.csv``.
    """
    lift_grid = np.arange(0.03, 0.1, 0.01)
    size_grid = np.linspace(rounded, 3 * rounded, 10).astype(int)

    # One scenario per (lift, size) pair; the whole grid is then repeated.
    scenarios = [(step, size, data) for step, size in product(lift_grid, size_grid)]
    params = scenarios * num_simulations

    results = list(tqdm(map(run_simulation, params), total=len(params)))
    sim_results = pd.DataFrame(results)

    tpr_by_scenario = sim_results.groupby(["lift", "n"]).apply(calculate_tpr)
    display_names = {
        "tpr": "True Positive (%)",
        "lift": "Detected Effect",
        "n": "Sample Size",
    }
    final_results = (
        tpr_by_scenario.reset_index()
        .sort_values("tpr", ascending=False)
        .rename(columns=display_names)
    )
    final_results.to_csv("ab_simulation_results.csv", index=False)
    return final_results

In [None]:
res = ab_simulation(
    data=deals[deals["trade_week"].astype("str") == "2024-02-19"],
    num_simulations=1000,
    lift=0.01,
    rounded=5000,
    alpha=0.05,
)

In [None]:
res[res["True Positive (%)"] > 85]

In [None]:
heatmap_data = res.pivot(
    index="Sample Size", columns="Detected Effect", values="True Positive (%)"
)

In [None]:
deals[deals["trade_week"].astype("str") == "2024-02-19"].to_json()

In [None]:
from sklearn.utils import resample
import itertools


def bootstrap_statistic(
    data, statistic_func=np.mean, n_bootstraps=1000, random_state=None
):
    """Bootstrap the sampling distribution of a statistic.

    Each of the ``n_bootstraps`` iterations draws a fresh resample of ``data``
    (same length, with replacement) and evaluates ``statistic_func`` on it.

    Args:
        data: 1-D sample (pandas Series, numpy array or list).
        statistic_func: Callable mapping a 1-D array to a scalar statistic.
        n_bootstraps: Number of bootstrap resamples to draw.
        random_state: Optional seed (or numpy ``Generator``) for reproducible
            resampling; ``None`` draws fresh entropy.

    Returns:
        np.ndarray of length ``n_bootstraps`` holding one statistic per resample.
    """
    values = np.asarray(data)
    rng = np.random.default_rng(random_state)
    # Draw a NEW resample on every iteration. Repeating a single resample
    # n_bootstraps times yields identical statistics and a zero spread.
    bootstrap_statistics = [
        statistic_func(rng.choice(values, size=len(values), replace=True))
        for _ in range(n_bootstraps)
    ]
    return np.array(bootstrap_statistics)


mean_bootstrap_distribution = bootstrap_statistic(deals["sum_btc_vol"], np.mean, 1000)

mean_bootstrap_distribution_stats = {
    "Mean": np.mean(mean_bootstrap_distribution),
    "STD": np.std(mean_bootstrap_distribution),
    "Median": np.median(mean_bootstrap_distribution),
}

In [None]:
plt.hist(mean_bootstrap_distribution)

In [None]:
from pandasai.llm.openai import OpenAI
from pandasai import SmartDataframe, Agent

import os

# Read the OpenAI key from the environment so the secret never lives in the
# notebook. A key that was ever committed in plain text should be treated as
# compromised and revoked.
llm = OpenAI(api_token=os.environ["OPENAI_API_KEY"])

In [2]:
%%sql
select * from 
             dev_gsokolov.user_deals

Unnamed: 0,user_id,variant,operation_id,symbol_name,cmd,volume,open_price,close_price,profit,open_time_dt,close_time_dt,trade_day
0,15622635,Variant A,2065040315,BTCUSD,1,0.03,56419.22,56852.14,12.99,2024-02-27 16:11:05.000000,2024-02-27 16:22:13.000000,2024-02-27
1,35738797,Control Group,141513863,BTCUSD,0,0.02,55865.68,56341.07,-9.51,2024-02-27 07:35:49.000000,2024-02-27 08:13:30.000000,2024-02-27
2,35738797,Control Group,141524114,BTCUSD,0,0.02,56376.66,56359.70,0.34,2024-02-27 08:50:13.000000,2024-02-27 09:03:16.000000,2024-02-27
3,26311632,Variant A,3138619720,BTCUSD,0,0.01,56375.89,56262.07,1.14,2024-02-27 08:30:03.000000,2024-02-27 09:09:45.000000,2024-02-27
4,26311632,Variant A,3138589914,BTCUSD,1,0.01,55983.04,56006.63,0.24,2024-02-27 07:28:09.000000,2024-02-27 07:29:23.000000,2024-02-27
...,...,...,...,...,...,...,...,...,...,...,...,...
95,2037258,Control Group,141599584,BTCUSD,0,0.01,56858.49,56940.70,-0.82,2024-02-27 14:52:55.000000,2024-02-27 15:51:38.000000,2024-02-27
96,2037258,Control Group,141617081,BTCUSD,0,0.01,56838.27,56854.71,-0.16,2024-02-27 16:20:49.000000,2024-02-27 17:39:17.000000,2024-02-27
97,2037258,Control Group,141630190,BTCUSD,1,0.01,57197.08,57232.66,0.36,2024-02-27 19:33:20.000000,2024-02-27 19:41:27.000000,2024-02-27
98,2037258,Control Group,141638492,BTCUSD,1,0.01,56955.46,56956.16,0.01,2024-02-27 20:40:10.000000,2024-02-27 22:58:52.000000,2024-02-27


In [5]:
import sys

sys.path.append("scripts")

In [10]:
# noinspection PyUnresolvedReferences
from get_schema import generate_table_creation_sql

generate_table_creation_sql(df_sql4, "deals")

'CREATE TABLE deals (\n    user_id INT,\n    variant VARCHAR(255),\n    operation_id INT,\n    symbol_name VARCHAR(255),\n    cmd INT,\n    volume DECIMAL,\n    open_price DECIMAL,\n    close_price DECIMAL,\n    profit DECIMAL,\n    open_time_dt VARCHAR(255),\n    close_time_dt VARCHAR(255),\n    trade_day VARCHAR(255)\n);'

In [97]:
%%sql
SELECT * 
FROM
    dev_gsokolov.user_deals
WHERE 
DATE(open_time_dt) BETWEEN '2024-02-26' AND '2024-03-04'
AND DATE(close_time_dt) BETWEEN '2024-02-26' AND '2024-03-04'

Unnamed: 0,user_id,variant,operation_id,symbol_name,cmd,volume,open_price,close_price,profit,open_time_dt,close_time_dt,trade_day
0,34852488,Variant A,19109432,BTCUSD,0,0.02,63143.20000,63074.60000,1.37,2024-03-04 00:01:29.051100,2024-03-04 00:03:28.035800,2024-03-04
1,36329468,Variant A,19110591,XRPUSD,0,0.01,0.62667,0.63000,-1.67,2024-03-03 23:32:21.058600,2024-03-04 00:19:37.060400,2024-03-03
2,37928291,Control Group,19109783,XAUUSD,0,0.03,2083.34000,2082.61000,2.19,2024-03-01 17:20:09.497800,2024-03-04 00:09:58.676300,2024-03-01
3,37928291,Control Group,19110393,GBPUSD,0,0.02,1.26570,1.26645,-1.50,2024-03-01 17:37:30.571300,2024-03-04 00:17:04.545000,2024-03-01
4,36858258,Variant A,19110756,BTCUSD,0,0.01,62741.60000,63502.00000,-7.60,2024-03-03 17:17:23.655500,2024-03-04 00:21:16.342900,2024-03-03
...,...,...,...,...,...,...,...,...,...,...,...,...
778445,32218984,Variant A,18567292,GBPAUD,0,0.02,1.94002,1.93901,1.32,2024-02-26 18:20:44.666200,2024-02-27 04:43:43.691100,2024-02-26
778446,38174604,Control Group,9909999,BTCUSD,0,0.01,56858.40000,56739.80000,1.19,2024-02-27 20:41:39.971900,2024-02-27 21:52:48.984100,2024-02-27
778447,18785595,Control Group,9875585,XAUUSD,0,0.01,2038.60000,2035.83000,2.77,2024-02-27 08:38:30.720700,2024-02-27 10:12:28.178600,2024-02-27
778448,37002381,Variant A,21345271,ETHUSD,1,0.01,3216.09000,3224.42000,0.83,2024-02-27 04:18:34.926600,2024-02-27 07:03:33.681000,2024-02-27


In [139]:
%%sql
SELECT
    user_id
     , variant
--      , DATE_TRUNC(open_time_dt, DAY) AS deal_day
--      , symbol_name
     , COUNT(DISTINCT operation_id) AS deals_cnt
     , SUM(volume) AS symbol_volume
     , SUM(CASE WHEN symbol_name = 'BTCUSD' THEN volume ELSE 0 END) as btc_vol
     , MIN(CASE WHEN symbol_name = 'BTCUSD' THEN open_time_dt ELSE NULL END) as first_btc_date
     , SUM(CASE WHEN symbol_name = 'BTCUSD' THEN 1 ELSE 0 END) as btc_deals_cnt
     , IF(SUM(CASE WHEN symbol_name = 'BTCUSD' THEN 1 ELSE 0 END) > 0, 1, 0) AS converted
FROM
    dev_gsokolov.user_deals
WHERE
    DATE(open_time_dt) BETWEEN '2024-02-26' AND '2024-03-04'
AND DATE(close_time_dt) BETWEEN '2024-02-26' AND '2024-03-04'
GROUP BY
    user_id, variant

Unnamed: 0,user_id,variant,deal_day,deals_cnt,symbol_volume,btc_vol,first_btc_date,btc_deals_cnt,converted
0,36329468,Variant A,2024-03-03 00:00:00.000000,43,0.71,0.00,,0,0
1,29507880,Variant A,2024-03-03 00:00:00.000000,36,1.45,0.00,,0,0
2,22695241,Control Group,2024-03-01 00:00:00.000000,35,0.48,0.03,2024-03-01 10:31:05.000000,2,1
3,15065184,Variant A,2024-03-01 00:00:00.000000,37,1.27,0.06,2024-03-01 10:33:38.000000,2,1
4,29255870,Control Group,2024-02-29 00:00:00.000000,45,1.34,0.00,,0,0
...,...,...,...,...,...,...,...,...,...
62674,20477519,Variant A,2024-03-03 00:00:00.000000,28,0.28,0.28,2024-03-03 03:19:08.000000,28,1
62675,11570037,Variant A,2024-03-03 00:00:00.000000,28,1.45,1.45,2024-03-03 00:23:56.000000,28,1
62676,7363206,Variant A,2024-03-03 00:00:00.000000,28,1.17,1.17,2024-03-03 02:38:33.000000,28,1
62677,37442962,Variant A,2024-02-27 00:00:00.000000,28,0.36,0.00,,0,0


In [134]:
# Calculations grouped by variant
# One summary row per experiment variant, built with named aggregation
# (output column = (source column, aggregation)).
stats_by_variant = ab_stats.groupby("variant").agg(
    user_count=("user_id", "nunique"),
    avg_vol_btcusd=("btc_vol", "mean"),
    std_volume_btcusd=("btc_vol", lambda vols: vols.std()),
    total_deals_btc=("btc_deals_cnt", "sum"),
    vol_btc=("btc_vol", "sum"),
    total_vol=("symbol_volume", "sum"),
    total_converted=("converted", "sum"),
    total_deals=("deals_cnt", "sum"),
)
grouped_stats = stats_by_variant.round(2)

display(grouped_stats)

Unnamed: 0_level_0,user_count,avg_vol_btcusd,std_volume_btcusd,total_deals_btc,vol_btc,total_vol,total_converted,total_deals
variant,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1
Control Group,8146,0.42,5.21,79446,3461.18,13495.73,1863,393598
Variant A,8125,0.31,2.29,75672,2543.2,11016.76,1923,384852


In [119]:
%%sql
SELECT
    user_id
     , variant
--      , symbol_name
     , COUNT(DISTINCT operation_id) AS deals_cnt
     , SUM(volume) AS symbol_volume
     , SUM(CASE WHEN symbol_name = 'BTCUSD' THEN volume ELSE 0 END) as btc_vol
     , SUM(CASE WHEN symbol_name = 'BTCUSD' THEN 1 ELSE 0 END) as btc_deals_cnt
     , IF(SUM(CASE WHEN symbol_name = 'BTCUSD' THEN 1 ELSE 0 END) > 0, 1, 0) AS converted
FROM
    dev_gsokolov.user_deals
WHERE
    DATE(open_time_dt) < '2024-02-26'
GROUP BY
    user_id, variant

Unnamed: 0,user_id,variant,deals_cnt,symbol_volume,btc_vol,btc_deals_cnt,converted
0,20903638,Control Group,325,4.32,0.75,43,1
1,36778814,Control Group,914,19.05,4.21,155,1
2,1228529,Variant A,500,5.62,0.53,39,1
3,27577479,Control Group,1427,36.03,10.73,137,1
4,37157383,Variant A,2149,37.48,11.58,752,1
...,...,...,...,...,...,...,...
21100,2822953,Variant A,109,1.09,0.10,10,1
21101,36582531,Control Group,345,4.41,0.17,10,1
21102,156190,Control Group,217,4.65,0.60,10,1
21103,84498,Control Group,106,1.91,0.15,10,1


In [135]:
print(user_stats_historical["btc_vol"].std())
print(user_stats_historical["btc_vol"].mean())
print(user_stats_historical["converted"].mean())

5.891829347503155
0.7703965884861406
0.2913527600094764


https://www.analytics-toolkit.com/ab-testing-calculator/apiendpoint.php
b8ebd6e23f6979c0a5b037116a5a7e88b350d27a470b082c8beed5b55a52361e
12343

In [123]:
to_send = ab_stats[["user_id", "variant", "btc_vol"]].groupby("variant")

In [124]:
control_data = (
    to_send.get_group("Control Group")["btc_vol"].round(2).astype("str").to_list()
)
test_data = to_send.get_group("Variant A")["btc_vol"].round(2).astype("str").to_list()
data = {"d0": control_data, "d1": test_data}

In [130]:
import requests, json

import os

url = "https://www.analytics-toolkit.com/ab-testing-calculator/apiendpoint.php"

body = {
    # The auth token is a secret: read it from the environment instead of
    # storing it in the notebook.
    "auth": os.environ["ANALYTICS_TOOLKIT_API_KEY"],
    "testId": 12349,
    "dataType": 2,
}
# `body` is sent as query parameters and the samples as a form field.
# The timeout keeps the cell from hanging forever if the service stalls.
response = requests.post(
    url, params=body, data={"testData": json.dumps(data)}, timeout=60
)
response.text



In [157]:
%%sql
SELECT
    user_id
     , variant
--      , DATE(DATE_TRUNC(open_time_dt, DAY)) AS deal_day
     , DATE(MIN(CASE WHEN symbol_name = 'BTCUSD' THEN open_time_dt ELSE NULL END)) as first_btc_date
     , IF(SUM(CASE WHEN symbol_name = 'BTCUSD' THEN 1 ELSE 0 END) > 0, 1, 0) AS converted
FROM
    dev_gsokolov.user_deals
WHERE
    DATE(open_time_dt) BETWEEN '2024-02-26' AND '2024-03-04'
AND DATE(close_time_dt) BETWEEN '2024-02-26' AND '2024-03-04'
GROUP BY
    user_id, variant
order by 
    first_btc_date

Unnamed: 0,user_id,variant,first_btc_date,converted
0,1214376,Control Group,,0
1,13797465,Control Group,,0
2,9936287,Variant A,,0
3,61431,Control Group,,0
4,2815325,Control Group,,0
...,...,...,...,...
16266,35451276,Control Group,2024-03-03,1
16267,32492736,Variant A,2024-03-03,1
16268,37828307,Control Group,2024-03-04,1
16269,28665398,Variant A,2024-03-04,1


In [151]:
dates = df_sql9["deal_day"].unique().tolist()

In [154]:
d = df_sql9.groupby("deal_day")
for date in dates:
    g = d.get_group(date)
    g.groupby("user_id")

<pandas.core.groupby.generic.DataFrameGroupBy object at 0x2b6f0bb50>

In [158]:
%%sql
select * from dev_gsokolov.ab_users

Unnamed: 0,user_id,variant
0,31866284.0,Variant A
1,5344254.0,Variant A
2,28182549.0,Variant A
3,12310922.0,Variant A
4,22268842.0,Variant A
...,...,...
21121,35231603.0,Control Group
21122,33468363.0,Control Group
21123,124910.0,Control Group
21124,38274991.0,Control Group
