In [None]:
import pickle
import datetime
import pandas as pd
import altair as alt

import bok.dask_infra

In [None]:
# Multiplier that converts Indonesian rupiah (IDR) amounts into US dollars.
# The figure is the global market exchange rate at the time of writing.
_IDR_PER_USD = 14150
IDR_TO_USD_RATE = 1.0 / _IDR_PER_USD

## Extracting data from the transactions file

In [None]:
# Read the cleaned transactions dataset and materialize the lazy result with
# .compute() so that every later cell can filter and group it directly.
transactions = (
    bok.dask_infra
    .read_parquet("data/clean/transactions_TM")
    .compute()
)

## Each user's total amount of data purchased directly.

In [None]:
# Per-user totals (currency spent and bytes bought), counting only rows whose
# kind is "purchase".
purchases = transactions[transactions["kind"].eq("purchase")]
aggregated_purchases = purchases.groupby("user")

plots_frame = (
    aggregated_purchases[["amount_idr", "amount_bytes"]]
    .sum()
    .reset_index()
)

# Express the byte totals in binary gibibytes (1024**3 bytes) for plotting.
plots_frame["amount_GiB"] = plots_frame["amount_bytes"] * 1.0 / (1024 ** 3)
print(plots_frame)

In [None]:
# Bar chart of GiB purchased per user on a logarithmic y axis, with users
# ordered from the largest byte total to the smallest.
users_by_bytes_desc = alt.SortField(field="amount_bytes", order="descending")

purchased_gib_chart = alt.Chart(plots_frame).mark_bar().encode(
    x=alt.X("user", sort=users_by_bytes_desc),
    y=alt.Y("amount_GiB", scale=alt.Scale(type="log")),
)
purchased_gib_chart.display()

## Each user's count of purchase transactions

In [None]:
# Count how many purchase transactions each user made.
purchases = transactions[transactions["kind"].eq("purchase")]
purchase_counts = purchases.groupby("user")["timestamp"].count()

# Counting the "timestamp" column leaves the count under that name; give it a
# descriptive label before plotting.
plot_frame = purchase_counts.reset_index().rename(
    columns={"timestamp": "purchase_count"}
)

# One bar per user, most frequent purchasers first, on a logarithmic y axis.
users_by_count_desc = alt.SortField(field="purchase_count", order="descending")
purchase_count_chart = alt.Chart(plot_frame).mark_bar().encode(
    x=alt.X("user", sort=users_by_count_desc),
    y=alt.Y("purchase_count", scale=alt.Scale(type="log")),
)
purchase_count_chart.display()


## Each user's count of transfer transactions

In [None]:
# Count user-to-user transfers per user, separately for the "user" column
# (labelled src below) and the "dest_user" column (labelled dst below).
user_transfers = transactions[transactions["kind"].eq("user_transfer")]

# Transfers keyed by the "user" column.
transfer_counts = user_transfers.groupby("user")["timestamp"].count()
plot_frame = transfer_counts.reset_index().rename(
    columns={"timestamp": "transfer_src_count"}
)

# Transfers keyed by "dest_user"; the key is renamed to "user" so that both
# frames share a single join column.
transfer_dst_counts = (
    user_transfers.groupby("dest_user")["timestamp"]
    .count()
    .reset_index()
    .rename(columns={"timestamp": "transfer_dst_count", "dest_user": "user"})
)

# The outer merge keeps users that appear in only one of the two frames; the
# melt then produces one row per (user, direction) for a colour-coded chart.
plot_frame = (
    plot_frame
    .merge(transfer_dst_counts, how="outer")
    .melt(
        id_vars=["user"],
        value_vars=["transfer_src_count", "transfer_dst_count"],
        var_name="direction",
        value_name="count",
    )
)

# Overlaid (unstacked) semi-transparent bars, one colour per direction.
users_by_count_desc = alt.SortField(field="count", order="descending")
transfer_count_chart = alt.Chart(plot_frame).mark_bar(opacity=0.7).encode(
    x=alt.X("user", sort=users_by_count_desc),
    y=alt.Y("count", scale=alt.Scale(type="log"), stack=False),
    color="direction",
)
transfer_count_chart.display()


## Each user's amount of currency transferred

In [None]:
# Sum the currency moved by user-to-user transfers per user, separately for
# the "user" column (src) and the "dest_user" column (dst).
user_transfers = transactions[transactions["kind"].eq("user_transfer")]

# NOTE: despite the "counts" in these names, both hold IDR sums.
transfer_counts = user_transfers.groupby("user")["amount_idr"].sum()
plot_frame = transfer_counts.reset_index().rename(
    columns={"amount_idr": "transfer_src_sum"}
)

# Sums keyed by "dest_user", renamed to "user" so the frames can be merged.
transfer_dst_counts = (
    user_transfers.groupby("dest_user")["amount_idr"]
    .sum()
    .reset_index()
    .rename(columns={"amount_idr": "transfer_dst_sum", "dest_user": "user"})
)

# Keep users present on either side, then reshape to one row per
# (user, direction).
plot_frame = (
    plot_frame
    .merge(transfer_dst_counts, how="outer")
    .melt(
        id_vars=["user"],
        value_vars=["transfer_src_sum", "transfer_dst_sum"],
        var_name="direction",
        value_name="idr",
    )
)

# Convert to USD
plot_frame["usd"] = plot_frame["idr"] * IDR_TO_USD_RATE

# Overlaid (unstacked) semi-transparent bars on a logarithmic y axis.
users_by_usd_desc = alt.SortField(field="usd", order="descending")
transfer_sum_chart = alt.Chart(plot_frame).mark_bar(opacity=0.7).encode(
    x=alt.X("user", sort=users_by_usd_desc),
    y=alt.Y("usd", scale=alt.Scale(type="log"), stack=False),
    color="direction",
)
transfer_sum_chart.display()

## Graph network revenue and expenses vs time

In [None]:
# Running total of admin top-ups, in IDR and converted to USD, plotted
# against time.
# NOTE(review): the cumulative sum follows the row order of `transactions`;
# this assumes the rows are already in chronological order -- TODO confirm.
topups = (
    transactions[transactions["kind"].eq("admin_topup")]
    .set_index("timestamp")[["dest_user", "amount_idr"]]
    .assign(idr_cumulative=lambda frame: frame["amount_idr"].cumsum())
    # Convert to USD
    .assign(usd_cumulative=lambda frame: frame["idr_cumulative"] * IDR_TO_USD_RATE)
    .reset_index()
)

cumulative_topup_chart = alt.Chart(topups).mark_line().encode(
    x=alt.X("timestamp", type="temporal"),
    y=alt.Y("usd_cumulative"),
)
cumulative_topup_chart.display()

## Relatively constant data purchases across days of the week?

Market day is Saturday, and Saturday shows the highest mean daily purchase volume.

In [None]:
# Distribution of the total data purchased per calendar day, broken down by
# day of the week. Days on which no purchase happened do not appear in the
# grouped result, so they contribute no data point (rather than a zero).
days = ['Monday', 'Tuesday', 'Wednesday', 'Thursday', 'Friday', 'Saturday', 'Sunday']
purchases = transactions.loc[transactions["kind"] == "purchase"]

# Group on the purchases' own timestamps, floored to the day. Taking the key
# from the filtered frame itself avoids relying on index alignment against
# the full, unfiltered `transactions` frame.
purchases = purchases.groupby(purchases["timestamp"].dt.floor("d"))["amount_bytes"].sum()

purchases = pd.DataFrame(purchases)

# 1 GiB is 1024**3 bytes. (Dividing by 1024**2 would yield MiB while the
# column and the axis are labelled GiB.)
purchases["GiB"] = purchases["amount_bytes"] / 1024**3
purchases = purchases.reset_index()
purchases["day"] = purchases["timestamp"].dt.day_name()

# Keep the chart object in `bars` and display it separately; `.display()`
# returns None, so chaining it onto the assignment would discard the chart.
bars = alt.Chart(purchases).mark_boxplot().encode(
    x=alt.X('day:N',
            sort=days,
            title="Day of Week"
            ),
    y=alt.Y('GiB:Q',
            title="GiB Per Day Purchased"
            ),
)
bars.display()


## Conclusion Needed, User Spend Variance vs. Mean

## From the transactions data, a graph of the number of transactions of different costs.

## From the transactions data, a graph of the number of transactions of different costs.

In [None]:
# For every distinct purchase size: how many purchases were made, and how
# much data that size accounts for in total.
purchases = (
    transactions[transactions["kind"].eq("purchase")]
    .groupby("amount_bytes")["timestamp"]
    .count()
    .reset_index()
    .rename(columns={"timestamp": "count"})
)
# Decimal units here: 1 MB = 1000**2 bytes, 1 GB = 1000 MB.
purchases["amount_MB"] = purchases["amount_bytes"] * 1.0 / 1000**2
purchases["total_GB"] = purchases["amount_MB"] * purchases["count"] * 1.0 / 1000

print(purchases)

# Number of purchases per package size, with the count written on each bar.
bars = alt.Chart(purchases).mark_bar().encode(
    x=alt.X("amount_MB", type="ordinal"),
    y="count",
)
text = bars.mark_text(align="left", baseline="bottom").encode(text="count:Q")
bars = text + bars
bars.display()

# Total volume sold per package size.
total_volume_chart = alt.Chart(purchases).mark_bar().encode(
    x=alt.X("amount_MB", type="ordinal"),
    y="total_GB",
)
total_volume_chart.display()

## Explore which package most users select

In [None]:
# Number of purchases per (user, package size) pair.
user_packages = (
    transactions[transactions["kind"].eq("purchase")]
    .groupby(["user", "amount_bytes"])["timestamp"]
    .count()
    .reset_index()
    .rename(columns={"timestamp": "count"})
)
# Package size in decimal megabytes (1000**2 bytes), used as the colour key.
user_packages["amount_MB"] = user_packages["amount_bytes"] * 1.0 / 1000**2

# Draw the same stacked bar chart twice: first normalized (each user's share
# per package size), then with absolute counts.
for stack_mode in ("normalize", "zero"):
    alt.Chart(user_packages).mark_bar(opacity=0.7).encode(
        x=alt.X("user", sort=alt.SortField(field="count", order="descending")),
        y=alt.Y("count", scale=alt.Scale(type="linear"), stack=stack_mode),
        color="amount_MB:N",
    ).display()

## Explore the amount of bytes sold at each package level

In [None]:
# Total bytes bought per (user, package size): purchase count times package
# size. `.assign` returns a new frame, so `user_packages` from the earlier
# cell is left untouched; a plain `bytes_per_package = user_packages` would
# only alias it and silently add the column to that frame as well.
bytes_per_package = user_packages.assign(
    bytes_per_package=user_packages["count"] * user_packages["amount_bytes"]
)

print(bytes_per_package)

# Draw the same stacked bar chart twice: first normalized (each user's share
# of bytes per package size), then with absolute byte totals.
for stack_mode in ("normalize", "zero"):
    alt.Chart(bytes_per_package).mark_bar(opacity=0.7).encode(
        x=alt.X('user',
                sort=alt.SortField(field="bytes_per_package",
                                   order="descending"
                                   ),
                ),
        y=alt.Y('bytes_per_package',
                scale=alt.Scale(type="linear"),
                stack=stack_mode,
                ),
        color="amount_MB:N"
    ).display()


## inter-topup time per user

