# Generate Data for Tableau

In [34]:
import sys
import os
from  pathlib import Path

import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns

sys.path.insert(0, r"C:\Users\vynde\PycharmProjects\dataanalysis")
import databridger as da

In [3]:
# Data directories, relative to the notebook's working directory:
# raw inputs are read from ../data/raw, prepared outputs are written to ../data/prepared.
raw_data_folder = Path('..', 'data', 'raw')
prepared_data_folder = Path('..', 'data', 'prepared')

In [4]:
# Load every raw CSV into its own DataFrame. The file names and the target
# variables below are listed in the same order, one file per variable.
(
    customers_df,
    geolocation_df,
    order_items_df,
    order_payments_df,
    order_reviews_df,
    orders_df,
    products_df,
    sellers_df,
    category_translation_df,
) = [
    pd.read_csv(raw_data_folder / csv_name)
    for csv_name in (
        'olist_customers_dataset.csv',
        'olist_geolocation_dataset.csv',
        'olist_order_items_dataset.csv',
        'olist_order_payments_dataset.csv',
        'olist_order_reviews_dataset.csv',
        'olist_orders_dataset.csv',
        'olist_products_dataset.csv',
        'olist_sellers_dataset.csv',
        'product_category_name_translation.csv',
    )
]

In [5]:
# Convert object columns that look like "YYYY-MM-DD HH:MM:SS" timestamps to datetime, in place.
# A column is converted only if every non-missing value starts with that pattern;
# missing values count as a match (na=True), so partially empty columns are converted too.
timestamp_pattern = r"\d{4}-\d{2}-\d{2} \d{2}:\d{2}:\d{2}"

for table in (order_items_df, order_reviews_df, orders_df):
    for column in table.columns:
        values = table[column]
        if values.dtype != 'object':
            continue
        if not values.str.match(timestamp_pattern, na=True).all():
            continue
        table[column] = pd.to_datetime(values)
        print(f"converted:    / {column:30} from object to {table[column].dtype}")

converted:    / shipping_limit_date            from object to datetime64[ns]
converted:    / review_creation_date           from object to datetime64[ns]
converted:    / review_answer_timestamp        from object to datetime64[ns]
converted:    / order_purchase_timestamp       from object to datetime64[ns]
converted:    / order_approved_at              from object to datetime64[ns]
converted:    / order_delivered_carrier_date   from object to datetime64[ns]
converted:    / order_delivered_customer_date  from object to datetime64[ns]
converted:    / order_estimated_delivery_date  from object to datetime64[ns]


In [None]:
# Creates a groupby object on 'zipcode_prefix'; the result is neither assigned nor aggregated,
# so this line has no effect on any data written later.
# NOTE(review): this cell has no execution count, and nothing visible here confirms that
# geolocation_df has a column named 'zipcode_prefix' -- verify the column name before relying on it.
geolocation_df.groupby('zipcode_prefix')

In [19]:
# Build one wide table by outer-joining every entity table onto the orders table.
# Outer joins keep rows that have no match on either side.
# NOTE(review): items, reviews and payments are all joined on order_id, so an order with
# several rows in more than one of those tables is expanded into every combination of them.
join_plan = [
    (customers_df, 'customer_id'),
    (order_items_df, 'order_id'),
    (order_reviews_df, 'order_id'),
    (order_payments_df, 'order_id'),
    (sellers_df, 'seller_id'),
    (products_df, 'product_id'),
    (category_translation_df, 'product_category_name'),
]

combined_df = orders_df
for right_table, join_key in join_plan:
    combined_df = combined_df.merge(right_table, on=join_key, how='outer')

combined_df.to_csv(prepared_data_folder / 'combined.csv', index=False)

In [20]:

# Headline entity counts, written as a two-column (Entity, Totals) table.
total_customers = customers_df['customer_unique_id'].nunique()
total_orders = orders_df['order_id'].nunique()
# An order item is identified by the (order_id, order_item_id) pair.
total_order_items = len(order_items_df[['order_id', 'order_item_id']].drop_duplicates())
total_products = products_df['product_id'].nunique()
total_sellers = sellers_df['seller_id'].nunique()

df_totals = pd.DataFrame(
    [
        ('Customers', total_customers),
        ('Orders', total_orders),
        ('Items', total_order_items),
        ('Products', total_products),
        ('Sellers', total_sellers),
    ],
    columns=["Entity", "Totals"],
)

df_totals.to_csv(prepared_data_folder / 'totals.csv', index=False)

In [32]:
# Daily payment totals, bucketed by order purchase date.
#
# The totals are built from orders + payments directly rather than from combined_df:
# combined_df also joins order items and reviews on order_id, so each payment row is
# repeated once per item/review of its order, and summing payment_value there over-counts.
# NOTE(review): assumes orders_df has one row per order_id -- confirm.
order_payment_values = orders_df[["order_id", "order_purchase_timestamp"]].merge(
    order_payments_df[["order_id", "payment_value"]],
    on="order_id",
    how="left",  # keep orders without payments so the daily range still spans every purchase date
)

payments_daily = (
    order_payment_values.set_index("order_purchase_timestamp")[["payment_value"]]
    .resample('D')
    .sum()
    .reset_index()
)

payments_daily.to_csv(prepared_data_folder / 'daily_payments.csv', index=False)

In [22]:
# Attach the translated category name to each product.
# No `how` is given, so this is pandas' default inner join: products whose
# product_category_name has no row in the translation table are dropped.
products_eng_df = pd.merge(
    products_df,
    category_translation_df,
    on='product_category_name',
)

products_eng_df.to_csv(prepared_data_folder / 'products_eng.csv', index=False)

In [23]:
# One row per order item, enriched with its (translated) product attributes.
# Default inner join: only products that appear in order_items_df are kept.
products_sales_df = pd.merge(
    products_eng_df,
    order_items_df,
    on='product_id',
)

products_sales_df.to_csv(prepared_data_folder / 'products_sales.csv', index=False)

In [37]:
# Extend a copy of the combined table with derived per-order / per-product columns
# and write it out as combined_extended.csv. Columns are added in a fixed order,
# which is also the column order of the CSV.
df = combined_df.copy()

# Track the count of internal ids per order: the highest item / payment sequence number
# seen for the order (presumably the ids are 1-based and consecutive -- confirm).
for count_col, sequence_col in (
    ("order_numitems", "order_item_id"),
    ("order_numpayments", "payment_sequential"),
):
    df[count_col] = df.groupby("order_id")[sequence_col].transform("max")

# products
# NOTE(review): the grouping keys include order_item_id and payment_sequential, so the
# size and mean below are per (product, item slot, payment slot), not per product alone
# -- confirm this is intended.
product_keys = ["product_id", "order_item_id", "payment_sequential"]

df["total_products_sold"] = df.groupby(product_keys).transform("size")
# create_flag comes from the project-local databridger package; presumably it bins `col`
# by `limits` into `labels` and stores the result in `flag_col` -- see its implementation.
da.wrangle.create_flag(
    df,
    col="total_products_sold",
    flag_col="total_products_sold_cat",
    limits=[10, 200, 400],
    labels=["rarely_sold", "commonly_sold", "frequently_sold", "top_seller"],
)

df["avg_products_price"] = df.groupby(product_keys)["price"].transform("mean")
da.wrangle.create_flag(
    df,
    col="avg_products_price",
    flag_col="avg_products_price_cat",
    limits=[1, 100, 1000],
    labels=["cheap", "low_priced", "high_priced", "expensive"],
)

# payment: payment_value summed over all rows sharing an (order, item) pair
order_item_keys = ["order_id", "order_item_id"]
df["order_payment"] = df.groupby(order_item_keys)["payment_value"].transform("sum")

# order items: price and freight summed over all rows sharing an (order, payment) pair
order_payment_keys = ["order_id", "payment_sequential"]
for total_col, value_col in (
    ("order_price", "price"),
    ("order_freight", "freight_value"),
):
    df[total_col] = df.groupby(order_payment_keys)[value_col].transform("sum")

# orders: whole days elapsed between the purchase timestamp and each later event
for elapsed_col, event_col in (
    ("time_to_approve", "order_approved_at"),
    ("time_to_carrier", "order_delivered_carrier_date"),
    ("time_to_customer", "order_delivered_customer_date"),
):
    df[elapsed_col] = (df[event_col] - df["order_purchase_timestamp"]).dt.days

da.wrangle.create_flag(
    df,
    col="time_to_customer",
    flag_col="time_to_customer_cat",
    limits=[1, 10, 30],
    labels=["max1day", "max10days", "max30days", "upto209"],
)

df["time_to_customer_est"] = (
    df["order_estimated_delivery_date"] - df["order_purchase_timestamp"]
).dt.days

df.to_csv(prepared_data_folder / 'combined_extended.csv', index=False)