In [None]:
import polars as pl
import numpy as np
import pandas as pd
from PIL import Image
from exec_funcs import preprocess_dataset, bucketize_dataset, get_val_df, calculate_aggregates_baseline, calculate_aggregates_pim, sort_pim_aggs
from vis_funcs import generate_pixel_data, find_and_display_aggs, get_group
from operator import itemgetter

%load_ext autoreload
%autoreload 2

# EDIT
# Specify the path of the file containing the dataset to visualize.
# The file is read as a CSV into a polars DataFrame, with date parsing enabled.
# Any other source works as long as `df` ends up as a polars DataFrame, because
# later cells apply polars expressions to it.
file_path = "taxi_data_2019_joined.csv"
df = pl.read_csv(file_path, try_parse_dates=True)

In [None]:
# (Optional) EDIT
# Set to True if you want to display the first few rows of the dataset.
SHOW_SAMPLE_ROWS = True
if SHOW_SAMPLE_ROWS:
	# head() limits the output to a handful of rows, matching the intent above.
	display(df.head())

# (Optional) EDIT
# Set to True if you want to view the column names of the dataset.
# This is useful when you want to inspect which columns would be interesting to visualize.
SHOW_COLUMN_NAMES = True
if SHOW_COLUMN_NAMES:
	display(df.columns)

In [None]:
# EDIT
# Specify columns to inspect during visualization.
# Each inner list describes one inspected dimension: the first entry is the column to inspect,
# any further entries are secondary attributes used when sorting that column.
# e.g. If you want to look at column A and column B during visualization, you would write
# columns_to_inspect = [['A'], ['B']]
# e.g. If you want to look at column A and column B during visualization, and use column C as a secondary
# attribute when sorting column A, you would write
# columns_to_inspect = [['A', 'C'], ['B']]
columns_to_inspect = [['pickup_time'], ['trip_distance'], ['PUBorough', 'PUservice_zone', 'PULat', 'PUZone'], ['passenger_count'], ['tip_amount']]
#columns_to_inspect = [[df.columns[i]] for i in range(8)]

# Specify aggregation value columns with their respective data type,
# as (column name, polars dtype) pairs.
# NOTE(review): the first pair uses None instead of a column name — presumably a plain
# row count; confirm against get_val_df.
val_columns_and_types = [(None, pl.Int32), ('passenger_count', pl.Int32), ('fare_amount', pl.Float32), ('trip_distance', pl.Float64)]

# Preprocess dataset.
df_preprocessed = preprocess_dataset(df, columns_to_inspect)

In [21]:
# Get subset of preprocessed dataset. This is user defined.
# As written, the whole preprocessed dataset is analyzed; assign a narrower frame
# here to restrict the analysis.
df_analyze = df_preprocessed

In [None]:
# EDIT
# Number of buckets used for visualization.
bucket_num = 128

# NOTE(review): the last argument is the row count of the full `df`, not of `df_analyze`;
# confirm this is what bucketize_dataset expects when the two frames differ.
bucket_df = bucketize_dataset(df_analyze, bucket_num, columns_to_inspect, len(df))
# Value columns are taken from the same analysis subset, using the
# (column, dtype) pairs in val_columns_and_types.
val_df = get_val_df(df_analyze, val_columns_and_types)

In [None]:
# Settings for the baseline (non-PIM) path; only exec_method is passed on in this cell.
exec_method = 2
# NOTE(review): threads and divide are not used by any call in this cell —
# confirm whether they are still needed.
threads = 20
divide = 2

# Use the first column of val_df as the aggregation values.
vals = val_df.select(pl.nth(0))

# Settings for the PIM path; they are only read when pim is True.
pim = False
buffer_size = 327680
dpu_num = 64
async_mode = False
simulator = False
correctness_check = False


# Both paths return the same pair: agg_info and the aggregate values.
if pim:
	agg_info, aggs = calculate_aggregates_pim(bucket_df, vals, bucket_num, buffer_size, dpu_num, async_mode, simulator, correctness_check)
else:
	agg_info, aggs = calculate_aggregates_baseline(exec_method, bucket_df, vals, bucket_num)

In [24]:
# View the aggregates as one bucket_num^3 cube per entry of agg_info.
num_aggs = len(agg_info)
aggs = aggs.reshape([num_aggs, bucket_num, bucket_num, bucket_num])

agg_info, aggs = sort_pim_aggs(aggs, agg_info)

# Label every aggregate with its three column groups: the columns inside a group
# are joined with '+', the three groups with '-'.
col_names = [
	"-".join('+'.join(columns_to_inspect[agg[axis]]) for axis in (0, 1, 2))
	for agg in agg_info
]

In [None]:
from PIL import Image

frames = 4
buckets_per_frame = bucket_num // frames
for a in range(num_aggs):
	print(col_names[a])
	# Flatten the two trailing axes so that every row holds one bucket of the leading axis.
	flat_agg = aggs[a].reshape(bucket_num, bucket_num ** 2)
	for f in range(frames):
		first_bucket = buckets_per_frame * f
		last_bucket = first_bucket + buckets_per_frame
		# Sum this frame's slice of leading-axis buckets into a single 2D grid.
		frame_agg = flat_agg[first_bucket:last_bucket].sum(axis=0).reshape(bucket_num, bucket_num)

		# Reverse frame agg on axis 0 so that y axis is in increasing order
		frame_agg = frame_agg[::-1]
		img = Image.fromarray(generate_pixel_data(frame_agg))
		#img.save('thumbnails/' + col_names[a] + '-frame' + str(f) + '.jpeg')
		display(img)



In [None]:
# List every inspected column group together with its index in columns_to_inspect.
for idx, column_group in enumerate(columns_to_inspect):
	print(f"{idx} {column_group}")

In [None]:
# Print individual aggregates
# The axis selections are integer indices — presumably positions in columns_to_inspect
# (as listed by the previous cell); confirm against find_and_display_aggs.
x_col = 1
y_col = 4
z_col = 2

frames = 4

# The indices are passed in z, y, x order.
find_and_display_aggs([z_col, y_col, x_col], aggs, agg_info, frames)

In [None]:
# Put images in groups that share x and y axes
# Each entry of info is [x, y, group_ppms, group_scores, total_score].
info = []

frames = 4
mode = 2

n_inspected = len(columns_to_inspect)
for x in range(n_inspected - 1):
  for y in range(x + 1, n_inspected):
    print(x, y)
    group_ppms = []
    group_scores = []
    # Every remaining column group takes a turn as the third (z) axis.
    for z in range(n_inspected):
      if z in (x, y):
        continue
      ppms, scores = get_group([z, y, x], aggs, agg_info, frames, mode)
      group_ppms.extend(ppms)
      group_scores.extend(scores)
    info.append([x, y, group_ppms, group_scores, np.sum(group_scores)])

# Highest total score first.
info = sorted(info, key=itemgetter(4), reverse=True)

In [None]:
# Present images according to score
top_n_image = 10
top_n_groups = 5

# Groups are shown in the order they appear in info; only the first top_n_groups are shown.
for g_index, group in enumerate(info[:top_n_groups]):
  x_idx, y_idx, group_ppms, group_scores = group[0], group[1], group[2], group[3]
  print(g_index)
  print('X: ' + '+'.join(columns_to_inspect[x_idx]))
  print('Y: ' + '+'.join(columns_to_inspect[y_idx]))
  # Highest score first; equal scores are ordered by the summed pixel data.
  ranked = sorted(
    zip(group_scores, group_ppms),
    key=lambda pair: (pair[0], np.sum(pair[1])),
    reverse=True,
  )
  for s, ppm in ranked[:top_n_image]:
    print(s)
    display(Image.fromarray(ppm))

In [212]:
# Define subset of data to analyze for equidistance binning
# e.g. do data cleaning
col_name = "pickup_time"
# Express the pickup time as minutes since midnight (hour * 60 + minute).
minutes_since_midnight = (
    pl.col(col_name).dt.hour().cast(pl.Int32) * 60 + pl.col(col_name).dt.minute()
)
# Keep rows with 0 <= trip_distance < 20 and 0 <= tip_amount < 10.
df_analyze = (
    df.with_columns(minutes_since_midnight.alias(col_name))
    .filter((pl.col("trip_distance") < 20) & (pl.col("trip_distance") >= 0))
    .filter((pl.col("tip_amount") < 10) & (pl.col("tip_amount") >= 0))
)

In [213]:
# Column groups used for the equidistance-binning run.
# NOTE(review): this list uses 'PULong' where the earlier columns_to_inspect used 'PULat' —
# confirm the difference is intentional.
columns_to_inspect = [['pickup_time'], ['trip_distance'], ['PUBorough', 'PUservice_zone', 'PULong', 'PUZone'], ['passenger_count'], ['tip_amount']]

In [214]:
from exec_funcs import equidistance_bin
# Number of bins handed to equidistance_bin; change it here only.
bins = 128
binned_df = equidistance_bin(df_analyze, columns_to_inspect, bins)

In [215]:
import copy
def calculate_aggregate_equidistance(binned_df, cols):
	"""Count the rows of ``binned_df`` for every combination of bin indices in ``cols``.

	Args:
		binned_df: Frame exposing ``binned_df[c].max()`` and
			``group_by(cols).len().to_numpy()`` (a polars DataFrame in this
			notebook). The columns in ``cols`` are assumed to hold non-negative
			integer bin indices.
		cols: Names of the columns to aggregate over. Any number of columns is
			supported; the notebook passes three.

	Returns:
		A ``np.uint32`` array with one axis per entry of ``cols``. Axis ``d`` has
		length ``binned_df[cols[d]].max() + 1`` and the element at
		``[b0, b1, ...]`` is the number of rows whose bins are ``(b0, b1, ...)``.
	"""
	agg_shapes = [binned_df[c].max() + 1 for c in cols]
	aggs = np.zeros(agg_shapes, dtype=np.uint32)
	# One row per distinct bin combination: the bin indices followed by the row count.
	group_rows = binned_df.group_by(cols).len().to_numpy()
	# Transposing the index columns gives one index array per axis, so the scatter
	# works for any number of columns instead of exactly three.
	bin_indices = tuple(group_rows[:, :-1].astype(np.intp).T)
	counts = group_rows[:, -1].astype(np.uint32)
	np.add.at(aggs, bin_indices, counts)
	return aggs

from itertools import combinations

# Build one count array for every unordered triple of binned columns.
# agg_info[n] holds the three column indices that aggs[n] was computed from.
binned_cols = binned_df.columns
col_num = len(binned_cols)
aggs = []
agg_info = []
# combinations() yields index triples with i < j < k in lexicographic order.
for i, j, k in combinations(range(col_num), 3):
	aggs.append(calculate_aggregate_equidistance(binned_df, [binned_cols[i], binned_cols[j], binned_cols[k]]))
	agg_info.append([i, j, k])

In [None]:
# List every inspected column group together with its index in columns_to_inspect.
for idx, column_group in enumerate(columns_to_inspect):
	print(f"{idx} {column_group}")

In [None]:
# Print individual aggregates
# The axis selections are integer indices — presumably positions in columns_to_inspect
# (as listed by the previous cell); confirm against find_and_display_aggs.
x_col = 2
y_col = 1
z_col = 0

frames = 4

# The indices are passed in z, y, x order.
# NOTE(review): aggs is a Python list of arrays at this point —
# confirm find_and_display_aggs accepts that form.
find_and_display_aggs([z_col, y_col, x_col], aggs, agg_info, frames)