## Library Import

In [2]:
import numpy as np
import pandas as pd
import plotly.express as px

from utils.config_file import CHART_LAYOUT
from utils.plotly_theme import custom_colors

## Data import

In [3]:
df = pd.read_csv("data/lit-metadata.csv")

df["exp-id"] = df["pub-id"].astype(str)

unique_studies = df.drop_duplicates(subset=["exp-id"])

print(df.columns)

df.head(5)

Index(['pub-id', 'exp-id', 'paper', 'authors', 'institutes', 'city', 'country',
       'climate-class', 'pub-year', 'data-avail',
       ...
       'part-meta-health-level', 'part-meta-morningness', 'part-meta-bmr',
       'part-meta-alcohol-use', 'part-meta-mens-timing',
       'part-meta-contraceptive-type', 'part-meta-muscle',
       'part-meta-reg-coffee', 'part-meta-reg-sleep-time',
       'part-meta-reg-work time'],
      dtype='object', length=172)


Unnamed: 0,pub-id,exp-id,paper,authors,institutes,city,country,climate-class,pub-year,data-avail,...,part-meta-health-level,part-meta-morningness,part-meta-bmr,part-meta-alcohol-use,part-meta-mens-timing,part-meta-contraceptive-type,part-meta-muscle,part-meta-reg-coffee,part-meta-reg-sleep-time,part-meta-reg-work time
0,1,1,Effect of elevated air temperature and air vel...,"Chao Cen, Siyu Cheng, Nyuk Hien Wong",National University of Singapore (NUS),Queenstown,Singapore,,2023.0,Upon request,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1,1,1,Effect of elevated air temperature and air vel...,"Chao Cen, Siyu Cheng, Nyuk Hien Wong",National University of Singapore (NUS),Queenstown,Singapore,,2023.0,Upon request,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2,1,1,Effect of elevated air temperature and air vel...,"Chao Cen, Siyu Cheng, Nyuk Hien Wong",National University of Singapore (NUS),Queenstown,Singapore,,2023.0,Upon request,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3,1,1,Effect of elevated air temperature and air vel...,"Chao Cen, Siyu Cheng, Nyuk Hien Wong",National University of Singapore (NUS),Queenstown,Singapore,,2023.0,Upon request,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
4,2,2,Evaluation of cognitive performance in high te...,"Yuyan Chen, Zheng Wang, Xiaoyu Tian, Weiwei Liu","Central South University, Hunan University",Changsha,China,,2023.0,Not available for sharing,...,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0


## PARAMETER

In [4]:
year_counts = unique_studies["pub-year"].value_counts().sort_index().reset_index()
year_counts.columns = ["pub-year", "count"]

fig = px.line(
    year_counts,
    x="pub-year",
    y="count",
    title="Number of Studies Published Each Year",
    width=800,
    height=400,
    template=CHART_LAYOUT.template.value,
)

fig.show()

In [5]:
fig = px.bar(
    year_counts,
    x="pub-year",
    y="count",
    width=800,
    height=400,
    template=CHART_LAYOUT.template.value
)

fig.show()

In [6]:
country_counts = unique_studies["country"].value_counts().reset_index()
country_counts.columns = ["country", "count"]

fig = px.choropleth(
    country_counts,
    locations="country",
    locationmode="country names",
    color="count",
    width=800,
    height=600,
    hover_name="country",
)

fig.show()

In [7]:
function_counts = df["function"].value_counts().reset_index()
function_counts.columns = ["function", "count"]

fig = px.pie(
    function_counts,
    names="function",
    values="count",
    width=600,
    height=400,
    template=CHART_LAYOUT.template.value,
)

fig.show()

## Measurement

In [8]:
parameter_counts = df.groupby("physio-parameter")["exp-id"].nunique()
total_experiments = df["exp-id"].nunique()

parameter_counts = parameter_counts.reset_index(name="exp-id count")
parameter_counts["exp-id percentage"] = (
    parameter_counts["exp-id count"] / total_experiments
) * 100
parameter_counts_sorted = parameter_counts.round(1).sort_values(
    "exp-id percentage", ascending=True
)

fig = px.bar(
    parameter_counts_sorted,
    x="exp-id percentage",
    y="physio-parameter",
    labels={
        "physio-parameter": "Physiological Parameter [-]",
        "exp-id percentage": "Percentage of Studies",
    },
    orientation='h'
)

fig.update_layout(
    xaxis_title="Percentage of Studies [%]",
    yaxis_title="Physiological Parameter [-]",
    template=CHART_LAYOUT.template.value,
)

fig.show()

In [9]:
import plotly.graph_objects as go

testing_location_counts = pd.DataFrame(
    {
        "physio-body-site": [
            "Forehead",
            "Nose",
            "Cheek",
            "Neck",
            "Chest",
            "Back",
            "Upper arm",
            "Abdomen",
            "Lumbar",
            "Forearm",
            "Buttock",
            "Wrist",
            "Finger",
            "Thigh",
            "Shin",
            "Calf",
            "Ankle",
            "Foot",
            "Sole",
        ],
        "count": [10, 15, 7, 20, 5, 8, 12, 14, 9, 11, 6, 13, 4, 16, 3, 18, 2, 17, 1],
    }
)

location_coordinates = pd.DataFrame(
    {
        # Format: 'physio-body-site':     [x, y]
        "Forehead": [-147.5, 262.5],
        "Nose": [-147.5, 240],
        "Cheek": [-160, 225],
        "Neck": [28, 203],
        "Chest": [-147.5, 157.5],
        "Back": [28, 152.5],
        "Upper arm": [-195, 140],
        "Abdomen": [-147.5, 75],
        "Lumbar": [28, 85],
        "Forearm": [-195, 75],
        "Buttock": [45, 55],
        "Wrist": [-205, 45],
        "Finger": [-215, -5],
        "Thigh": [28, -22.5],
        "Shin": [-170, -110],
        "Calf": [12.5, -110],
        "Ankle": [17.5, -165],
        "Foot": [-150, -190],
        "Sole": [17.5, -190],
    }
)

location_coordinates = location_coordinates.transpose().rename(
    columns={0: "x-coordinate", 1: "y-coordinate"}
)
location_coordinates.reset_index(inplace=True)
location_coordinates.rename(columns={"index": "physio-body-site"}, inplace=True)

merged_df = pd.merge(
    location_coordinates, testing_location_counts, on="physio-body-site"
)


fig = go.Figure()

fig.add_layout_image(
    dict(
        source="assets/img/body-site-chart-background.png",
        xref="x",
        yref="y",
        x=10,
        y=10,
        sizex=800,
        sizey=640,
        xanchor="center",
        yanchor="middle",
        # sizing="stretch",
        opacity=1.0,
        layer="below",
    )
)

fig.add_trace(
    go.Scatter(
        x=merged_df["x-coordinate"],
        y=merged_df["y-coordinate"],
        mode="markers",
        marker=dict(size=merged_df["count"] * 2.5, color="#db1492", opacity=0.8),
        text=merged_df["count"],  # Add the counts to the hover text
        hoverinfo="text",
    )
)

fig.update_xaxes(showgrid=False, showticklabels=False, range=[-350, 350])

fig.update_yaxes(showgrid=False, showticklabels=False, range=[-320, 320])

fig.update_layout(
    plot_bgcolor="rgba(0,0,0,0)",
    xaxis=dict(showgrid=False, zeroline=False),
    yaxis=dict(showgrid=False, zeroline=False),
    width=800,
    height=640,
    margin=dict(l=10, r=10, t=10, b=10),
)


fig.show()

In [10]:
skin_temperature = df.loc[df["physio-parameter"] == "Skin temperature"]
location_counts = skin_temperature["physio-body-site"].value_counts()

fig = px.bar(
    location_counts,
)

fig.update_layout(
    xaxis_title="Measurement location [-]",
    yaxis_title="Percentage of Studies [%]",
    template=CHART_LAYOUT.template.value,
)

fig.show()

In [11]:
data = (
    df[["exp-id", "physio-parameter", "physio-sensor-type", "physio-sensor-brand"]]
    .dropna()
    .drop_duplicates(subset=["exp-id", "physio-parameter"])
)

fig = px.sunburst(
    data, path=["physio-parameter", "physio-sensor-type", "physio-sensor-brand"], width=800, height=600, template=CHART_LAYOUT.template.value
)

fig.show()

## Protocol

In [12]:
# filter columns that start with "env-"
env_columns = [col for col in unique_studies.columns if col.startswith("env-")]
env_df = unique_studies[env_columns]

env_coverage = env_df.notna().mean() * 100
env_coverage_df = env_coverage.round(1).reset_index()
env_coverage_df.columns = ["environment_parameter", "coverage_percentage"]

env_coverage_df = env_coverage_df.sort_values("coverage_percentage", ascending=False)

fig = px.bar(
    env_coverage_df,
    y="coverage_percentage",
    x="environment_parameter",
    labels={
        "coverage_percentage": "Coverage Percentage [%]",
        "environment_parameter": "Environmental Parameter"
    },
    template=CHART_LAYOUT.template.value
)

fig.show()

In [13]:
df = unique_studies

df["session-length"] = pd.to_numeric(df["session-length"], errors='coerce')

fig = px.box(
    df,
    y="session-length",
    labels={
        "session-length": "Session length [min]",
    },
    template=CHART_LAYOUT.template.value
)

fig.show()

In [14]:
df = unique_studies
df["normalisation-length"] = pd.to_numeric(df["normalisation-length"], errors='coerce')


fig = px.box(
    df,
    y="normalisation-length",
    width=800,
    height=400,
    template=CHART_LAYOUT.template.value,
)

fig.update_layout(
    xaxis_title=None,
    yaxis_title="Acclimatization time [min]",
)

fig.show()

## Questionnaires

In [15]:
# Filter columns that start with "ques-"
ques_columns = [col for col in unique_studies.columns if col.startswith("ques-")]
ques_df = unique_studies[ques_columns]

value_counts_list = []

for col in ques_columns:

    non_nan_percentage = ques_df[col].notna().mean() * 100
    
    value_counts = ques_df[col].dropna().value_counts(normalize=True) * non_nan_percentage
    value_counts = value_counts.reset_index()
    value_counts.columns = ["response_value", "percentage"]
    value_counts["questionnaire"] = col
    value_counts["total_coverage_percentage"] = non_nan_percentage
    value_counts_list.append(value_counts)

ques_value_counts_df = pd.concat(value_counts_list, ignore_index=True)

ques_value_counts_df = ques_value_counts_df.sort_values(
    "total_coverage_percentage", ascending=False
)

fig = px.bar(
    ques_value_counts_df,
    x="questionnaire",
    y="percentage",
    color="response_value",
    labels={
        "percentage": "Coverage Percentage [%]",
        "questionnaire": "Comfort Questionnaire",
        "response_value": "Response Value"
    },
    template=CHART_LAYOUT.template.value,
    title="Coverage Percentage by Comfort Questionnaire (Stacked by Response Value)",
    category_orders={"questionnaire": ques_value_counts_df["questionnaire"].unique()}
)

fig.show()

TK: Here a filtering option would be nice, so that user can only look into a specific type of questionnaire. Could also be a facet_grid type of plot, where each domain is plotted separately on a grid.

## Participants

In [16]:
fig = px.pie(
    unique_studies,
    names="age-group",
    title="Age Group Distribution",
    width=800,
    height=400,
    template=CHART_LAYOUT.template.value,
)

fig.show()

#### Female-Male Ratio

In [17]:
#! for now, converting percentage string to ratio between 0 and 1: FIX THIS
df = unique_studies
df = df.loc[df["fem-total-ratio"] != "NR"]
df["fem-total-ratio"] = pd.to_numeric(df["fem-total-ratio"].str.replace('%', '')) / 100

fig = px.violin(
    df,
    y="fem-total-ratio",
    # box=True,
    # points="all",
    title="Gender Ratio in Studies",
    width=400,
    height=400,
    template=CHART_LAYOUT.template.value,
)

fig.show()



A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy



#### No. Participants

In [18]:
df = unique_studies
df = df.loc[df["part-no-tot"] != "NR"]
df["part-no-tot"] = pd.to_numeric(df["part-no-tot"], errors='coerce')

fig = px.box(
    df,
    y="part-no-tot",
    width=600,
    height=550,
    template=CHART_LAYOUT.template.value,
)

fig.show()



A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy



---

## Data Processing and Availability

#### Data availability

In [19]:
fig = px.bar(
    unique_studies,
    x="data-avail",
    width=800,
    height=400,
    title="Data Availability in Studies",
    category_orders={"data-avail": ["Available", "Upon request", "Not available for sharing", "NR"]},
    template=CHART_LAYOUT.template.value,
)

fig.show()

## Misc

In [20]:
df = unique_studies

# Replace NaN values with 0
df = df.fillna(0)

# Set 'exp-id' as the index
df.set_index("pub-id", inplace=True)

# Filter columns that start with 'protocol-'
protocol_columns = df.filter(like="protocol-")

# Transpose the DataFrame
protocol_columns = protocol_columns.transpose()

# Create the heatmap
fig = px.imshow(
    protocol_columns,
    labels=dict(x="Study ID", y="Criteria", color="Fulfilled"),
    x=protocol_columns.columns,
    y=protocol_columns.index,
    color_continuous_scale=["#636EFA", "#EF553B"],  # Colors for 0 and 1
)

# Update layout for better readability
fig.update_layout(
    xaxis_title="Study ID",
    yaxis_title="Criteria",
    xaxis=dict(tickmode="linear"),
    yaxis=dict(tickmode="linear"),
)

fig.show()