# Imports

In [1]:
import numpy as np
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt
import fitz as fitz
from enum import Enum
import io as io

## Classes and Enums

In [2]:
# using as a value tracker

class RoomOrder(Enum):
	"""Ordered combinations of the rooms 18a, 18b and 18.

	Each member name spells a sequence using one letter per room
	(A = 18a, B = 18b, F = 18, as the trailing comments show); the integer
	value is only an identifier. Members cover every ordering of three, two
	and one of the rooms.

	NOTE(review): presumably this is the order in which a visitor entered the
	rooms (cf. the ``room_order`` column) - confirm.
	"""
	ABF = 0 # 18a, 18b, 18
	BAF = 1 # 18b, 18a, 18
	AFB = 2 # 18a, 18, 18b
	BFA = 3 # 18b, 18, 18a
	FAB = 4 # 18, 18a, 18b
	FBA = 5 # 18, 18b, 18a
	BA = 6 # 18b, 18a
	BF = 7 # 18b, 18
	FB = 8 # 18, 18b
	FA = 9 # 18, 18a
	AB = 10 # 18a, 18b
	AF = 11 # 18a, 18
	A = 12 # 18a
	B = 13 # 18b
	F = 14 # 18

class GroupComp(Enum):
	"""Group composition: a single individual or a group.

	NOTE(review): presumably the coding of the ``group_comp`` column - confirm.
	"""
	INDIVIDUAL = 0
	GROUP = 1

class LecternsViewed(Enum):
	"""Combinations of the West, Middle and East lecterns.

	Member names are letter codes (W/M/E, N for none); member values are the
	human-readable labels for those codes.
	"""
	N = "None"
	W = "West"
	M = "Middle"
	E = "East"
	WM = "West, Middle"
	WE = "West, East"
	ME = "Middle, East"
	WME = "West, Middle, East"

class VisitorType(Enum):
	"""Visitor classification with four categories.

	NOTE(review): presumably the coding of the ``visitor_type`` column; the
	criteria separating the categories are not defined in this notebook.
	"""
	BROWSER = 0
	FOLLOWER = 1
	SEARCHER = 2
	RESEARCHER = 3

class TurnDirection(Enum):
	"""Direction of a turn, with an explicit NONE member for "no turn"."""
	LEFT = 0
	MIDDLE = 1
	RIGHT = 2
	NONE = 3

class TeamMember(Enum):
	"""Members of the project team, keyed by first name.

	NOTE(review): presumably the people recorded in the ``observer`` and
	``tracker`` columns - confirm.
	"""
	Courtney = 0
	Jerry = 1
	Owen = 2
	Ritvik = 3
	Sofia = 4

class Gender(Enum):
	"""Gender coding with two categories.

	NOTE(review): presumably the coding of the ``gender`` column - confirm.
	"""
	FEMALE = 0
	MALE = 1

class FirstTurnDirection(Enum):
	"""Direction of the first turn.

	Has the same LEFT/MIDDLE/RIGHT values as ``TurnDirection`` but no NONE
	member.
	"""
	LEFT = 0
	MIDDLE = 1
	RIGHT = 2

class DayOfWeek(Enum):
	"""Days of the week numbered from Sunday = 0 to Saturday = 6.

	Note that ``datetime.date.weekday()`` numbers Monday as 0, so these values
	are not interchangeable with it.
	"""
	Sunday = 0
	Monday = 1
	Tuesday = 2
	Wednesday = 3
	Thursday = 4
	Friday = 5
	Saturday = 6

# Code

## Constants

In [3]:
# TODO: update all values accordingly
# Input workbooks. The paths are relative, so they resolve against the
# directory the notebook kernel is running in.
visitor_xlsx_path: str = "../assets/excel_files/observation_tables.xlsx"
survey_xlsx_path: str = "../assets/excel_files/survey_responses.xlsx"

# Worksheet of the observation workbook that holds the visitor table.
main_sheet_name: str = "main data"

# Destination PDFs for the generated graphs, one file per data set.
observation_export_path: str = "../assets/output_files/visitor_data_graphs.pdf"
survey_export_path: str = "../assets/output_files/survey_data_graphs.pdf"

## Loading XLSX

In [4]:
# Each workbook is opened in a `with` block so its file handle is released as
# soon as the sheet has been read; otherwise the .xlsx files stay open for
# the lifetime of the kernel. The `visitor_xlsx` / `survey_xlsx` names remain
# bound afterwards, but refer to closed workbooks.
with pd.ExcelFile(visitor_xlsx_path) as visitor_xlsx:
	# The first column of the "main data" sheet becomes the row index.
	visitor_df: pd.DataFrame = pd.read_excel(visitor_xlsx, sheet_name=main_sheet_name, index_col=0)

with pd.ExcelFile(survey_xlsx_path) as survey_xlsx:
	# No sheet name is given, so pandas reads the first sheet; the ninth
	# column (index_col=8) becomes the row index.
	# The first data row is dropped. `.iloc` makes the slice explicitly
	# positional regardless of the index dtype.
	# NOTE(review): presumably that row is a secondary header / question-text
	# row from the survey export - confirm.
	survey_df: pd.DataFrame = pd.read_excel(survey_xlsx, index_col=8).iloc[1:]

## Operations

### Support Functions

In [5]:
# Observation-table columns that are passed to plot_data as categorical
# columns (one count plot per column).
OBSERVATION_CATEGORICAL: list[str] = [
	"observer",
	"tracker",
	"group_comp",
	"gender",
	"room_order",
	"lecterns_visited",
	"visitor_type",
	"chatted_with_visitors",
	"chatted_with_staff",
	"sat_on_bench",
	"split_from_group",
	"used_phone",
	"used_museum_guide",
	"used_headphones",
	"first_turn_direction",
	"18a_took_photos",
	"18a_took_videos",
	"18a_viewed_labels",
	"18b_touched_casts",
	"18b_took_photos",
	"18b_viewed_labels",
]

# Observation-table columns that are passed to plot_data as numerical
# columns (binned into ranges before counting).
OBSERVATION_NUMERICAL: list[str] = [
	"total_time", # timedelta
	"18a_total_time", # timedelta
	"18b_total_time", # timedelta
]

# Survey question columns plotted as categorical count plots.
# NOTE(review): Q1, Q2, Q15 and Q18 are not listed - presumably metadata or
# free-text questions; confirm the omission is intentional.
SURVEY_CATEGORICAL: list[str] = [
	"Q3",
	"Q4",
	"Q5",
	"Q6",
	"Q7",
	"Q8",
	"Q9",
	"Q10",
	"Q11",
	"Q12",
	"Q13",
	"Q14",
	"Q16",
	"Q17",
	"Q19",
]

# No survey columns are plotted as numerical ranges.
SURVEY_NUMERICAL: list[str] = []

# Enum classes (not instances) whose member names are replaced by their
# values when the observation data is plotted.
ENUM_CLASSES: list[type[Enum]] = [
	LecternsViewed
]

In [6]:
def _build_label_map(enum_classes: list) -> dict[str, str]:
	"""Flatten enum classes into a ``member name -> str(member value)`` lookup.

	Classes are processed in list order, so when two enums share a member
	name the later class wins.
	"""
	return {
		member.name: str(member.value)
		for enum_cls in enum_classes
		for member in enum_cls
	}


def _draw_placeholder(ax: "plt.Axes", col: str) -> None:
	"""Write a 'no data' notice on the axes of a column that has no values."""
	ax.text(0.5, 0.5, "No data for column '" + col + "'", ha="center", va="center", transform=ax.transAxes)


def _plot_categorical(ax: "plt.Axes", df: pd.DataFrame, col: str, label_map: dict) -> None:
	"""Draw a count plot of one categorical column, bars sorted by label."""
	# Values that match an enum member name are replaced by that member's
	# value; everything else (including NaN) passes through unchanged.
	labelled: pd.Series = df[col].map(lambda value: label_map.get(value, value))
	present: pd.Series = labelled.dropna()

	if present.empty:
		_draw_placeholder(ax, col)
	else:
		# Sorted as strings so mixed-type columns cannot break the sort.
		sorted_order: list[str] = sorted(present.unique().astype(str))
		sns.countplot(
			data=pd.DataFrame({col: labelled}),
			x=col,
			hue=col,
			order=sorted_order,
			palette='viridis',
			legend=False,
			ax=ax
		)
	ax.set_title("Categorical Distribution: " + col)


def _plot_numerical(ax: "plt.Axes", df: pd.DataFrame, col: str, n_bins: int) -> None:
	"""Draw a count plot of one numerical column binned into equal-width ranges."""
	col_range_name: str = f'{col}_range'
	data: pd.DataFrame = df[[col]].dropna().copy()

	if data.empty:
		# pd.cut cannot bin an empty column; emit a placeholder instead of
		# aborting the whole export.
		_draw_placeholder(ax, col)
	else:
		bins: pd.Series = pd.cut(data[col], bins=n_bins)
		# Every bin is listed, so empty ranges still get a slot on the axis.
		all_possible_bins: list[str] = [str(b) for b in bins.cat.categories]
		data[col_range_name] = bins.astype(str)

		sns.countplot(
			data=data,
			x=col_range_name,
			# Colour by range, not by raw value: using the raw column as hue
			# splits every range into one bar per distinct value.
			hue=col_range_name,
			order=all_possible_bins,
			hue_order=all_possible_bins,
			palette='magma',
			legend=False,
			ax=ax
		)
	ax.set_title("Numerical Ranges: " + col)


def _figure_to_png(fig: "plt.Figure", dpi: int = 150) -> bytes:
	"""Render a figure to PNG and return the encoded bytes."""
	buffer: io.BytesIO = io.BytesIO()
	fig.savefig(buffer, format="png", dpi=dpi)
	return buffer.getvalue()


def plot_data(df: pd.DataFrame, categorical_cols: list[str], numerical_cols: list[str], enum_classes: list[type[Enum]], export_path: str, show_graphs: bool, n_bins: int = 5) -> None:
	"""Plot one count chart per column and export them to a PDF, two per page.

	Categorical columns are counted per distinct value; numerical columns are
	first cut into ``n_bins`` equal-width ranges. Each chart is rendered to
	PNG and placed in the top or bottom half of a page with a caption.

	Note: the name -> value lookup built from ``enum_classes`` is applied to
	every categorical column, so a value equal to a member name (for example
	``"M"``) is relabelled in whichever column it occurs.

	Args:
		df: Source data; it is not modified.
		categorical_cols: Columns plotted as categorical distributions.
		numerical_cols: Columns plotted as binned ranges.
		enum_classes: Enum classes whose member names are replaced by
			``str(member.value)`` in categorical columns.
		export_path: Path of the PDF file to write.
		show_graphs: If true, also display each figure via ``plt.show()``.
		n_bins: Number of equal-width ranges for numerical columns.

	Raises:
		KeyError: If a requested column is missing from ``df``. This is
			checked before any page is created.
	"""
	# Page size in PDF points (595 x 842 is A4 portrait).
	page_width: int = 595
	page_height: int = 842

	margin: int = 50
	midpoint: float = page_height / 2

	# Pair each column with its kind up front so the loop needs no list
	# membership tests.
	columns: list[tuple[str, bool]] = (
		[(col, True) for col in categorical_cols]
		+ [(col, False) for col in numerical_cols]
	)
	missing: list[str] = [col for col, _ in columns if col not in df.columns]
	if missing:
		raise KeyError(f"Columns not found in DataFrame: {missing}")

	label_map: dict[str, str] = _build_label_map(enum_classes)

	doc: fitz.Document = fitz.open()
	try:
		page: fitz.Page = None
		for i, (col, is_categorical) in enumerate(columns):
			top_half: bool = i % 2 == 0
			if top_half:
				page = doc.new_page(width=page_width, height=page_height)

			fig: plt.Figure
			ax: plt.Axes
			fig, ax = plt.subplots(figsize=(8, 5))
			try:
				if is_categorical:
					_plot_categorical(ax, df, col, label_map)
				else:
					_plot_numerical(ax, df, col, n_bins)

				ax.tick_params(axis="x", labelrotation=45)
				fig.subplots_adjust(bottom=0.2, top=0.9)
				# Render before showing: some backends destroy the figure
				# once it has been shown.
				png_bytes: bytes = _figure_to_png(fig)
				if show_graphs:
					plt.show()
			finally:
				# Always release the figure, even if plotting failed.
				plt.close(fig)

			image_rect: fitz.Rect
			text_y: float
			if top_half:
				image_rect = fitz.Rect(margin, margin, page_width - margin, midpoint - 40)
				text_y = midpoint - 20
			else:
				image_rect = fitz.Rect(margin, midpoint + 20, page_width - margin, page_height - margin - 40)
				text_y = page_height - 30

			page.insert_image(image_rect, stream=png_bytes)
			page.insert_text((margin, text_y), "Analysis for column '" + col + "'", fontsize=12)
		doc.save(export_path)
	finally:
		# Close the document whether or not the export succeeded.
		doc.close()

### Execution

In [7]:
# Export the observation graphs to PDF without displaying them inline.
plot_data(
	df=visitor_df,
	categorical_cols=OBSERVATION_CATEGORICAL,
	numerical_cols=OBSERVATION_NUMERICAL,
	enum_classes=ENUM_CLASSES,
	export_path=observation_export_path,
	show_graphs=False,
)

In [None]:
# Export the survey graphs to PDF; no enum relabelling is applied.
plot_data(
	df=survey_df,
	categorical_cols=SURVEY_CATEGORICAL,
	numerical_cols=SURVEY_NUMERICAL,
	enum_classes=[],
	export_path=survey_export_path,
	show_graphs=False,
)