# Project week 8: Simulation

In [None]:
import numpy as np
import pandas as pd

## Import data

In [None]:
# Weekday files to load; the position in this list determines the
# customer-number offset applied to each day below
days = ["monday", "tuesday", "wednesday", "thursday", "friday"]

# Read one csv file per day and collect the frames in a list.
# Concatenating once at the end avoids re-copying all previously loaded rows
# on every iteration, and avoids seeding the result with an empty
# object-dtype frame (which newer pandas versions take into account when
# determining the result dtypes).
daily_frames = []
for i, day in enumerate(days):
    data = pd.read_csv(f"data/{day}.csv", delimiter=";", parse_dates=[0])

    # Add 10,000 per day to customer number to separate days
    # (monday -> +10000, tuesday -> +20000, ...)
    data["customer_no"] = data["customer_no"] + (i + 1) * 10000

    daily_frames.append(data)

# Stack all days; each file's own row index is kept (it is reset later)
# NOTE(review): assumes every file provides the columns
# timestamp, customer_no and location -- confirm against the csv files
df_raw = pd.concat(daily_frames)

In [None]:
# Working copy of the raw data in chronological order, with the rows
# renumbered from 0 (sort_values already returns a new DataFrame, so df_raw
# itself is left untouched)
df = df_raw.sort_values("timestamp", ignore_index=True)

## For every customer, add a line for entrance

In [None]:
# Offset between the synthetic entrance row and the customer's first record
one_minute = pd.Timedelta(minutes=1)

# One row per customer holding the first (non-null) value of every column
first_entry = df.groupby("customer_no").first().reset_index()

# Turn those rows into "entrance" rows dated one minute before the first record
first_entry["location"] = "entrance"
first_entry["timestamp"] = first_entry["timestamp"] - one_minute

# Append the entrance rows, then restore chronological order and a clean index
df = (
    pd.concat([df, first_entry])
    .sort_values("timestamp")
    .reset_index(drop=True)
)

## Find customers with missing checkout

In [None]:
# One row per customer holding the last (non-null) value of every column
last_locations = df.groupby("customer_no").last().reset_index()

# Keep only the customers whose last recorded location is not the checkout.
# The copy is taken *after* filtering so that the column assignments below
# operate on an independent frame rather than on a slice of the grouped
# result (which triggers pandas' SettingWithCopyWarning).
last_locations = last_locations[last_locations["location"] != "checkout"].copy()

# Turn those rows into synthetic "checkout" rows one minute after the
# customer's last recorded timestamp
last_locations["timestamp"] = last_locations["timestamp"] + pd.Timedelta(minutes=1)
last_locations["location"] = "checkout"

# Add dataframe as rows to the old dataframe
df = pd.concat([df, last_locations])

# Sort by timestamp and reset index
df = df.sort_values("timestamp").reset_index(drop=True)

## Fill missing datetimes

In [None]:
# See entries for one customer before
# df[df["customer_no"] == 10434]

In [None]:
# TODO: Quite slow, improve performance
# Group the location column by customer, indexed by timestamp, so that each
# customer's records can be resampled independently
visits_by_customer = (
    df.set_index("timestamp").groupby("customer_no")[["location"]]
)

# Upsample every customer to one row per minute; minutes without a record
# carry the previous location forward. reset_index turns customer_no and
# timestamp back into ordinary columns.
df = visits_by_customer.resample(rule="1min").ffill().reset_index()

In [None]:
# See entries for one customer after
# df[df["customer_no"] == 10434]

In [None]:
# Save the minute-by-minute data; with to_csv's defaults the row index is
# written as an additional first column
df.to_csv("data/data_clean.csv")

## Calculate transition probabilities 

In [None]:
# Location column grouped per customer, so shifting never leaks a value
# from one customer into another
locations_by_customer = df.groupby("customer_no")["location"]

# Location one row later / one row earlier within the same customer.
# A customer's last row has no next location and the first row has no
# previous one; these stay NaN (they are deliberately not filled).
df["loc_next"] = locations_by_customer.shift(-1)
df["loc_last"] = locations_by_customer.shift(1)

In [None]:
# Transition matrix: rows are the current location, columns the next
# location; normalize="index" turns the counts of each row into fractions
# of that row's total
P = pd.crosstab(df["location"], df["loc_next"], normalize="index")

# Add checkout row: from checkout the only possible next state is checkout
checkout = pd.DataFrame(
    [[1, 0, 0, 0, 0]],
    index=["checkout"],
    columns=["checkout", "dairy", "drinks", "fruit", "spices"],
)
P = pd.concat([P, checkout])

# Add entrance column: no state transitions into the entrance
P["entrance"] = 0.0

# Fix the column order of the matrix
P = P.loc[:, ["entrance", "dairy", "drinks", "fruit", "spices", "checkout"]]

In [None]:
# Show the transition matrix as the cell output
P

In [None]:
# Save the transition matrix; the row labels (current location) are written
# as the first column
P.to_csv("data/transition_probabilities.csv")