In [45]:
%%capture
from pathlib import Path

if Path.cwd().stem == "notebooks":
    %cd ..
%load_ext autoreload
%autoreload 2

In [46]:
import json
import logging
from pathlib import Path

import altair as alt
import holoviews as hv
import hvplot.polars  # noqa
import matplotlib.pyplot as plt
import numpy as np
import polars as pl
from polars import col

from src.data.database_manager import DatabaseManager
from src.log_config import configure_logging

logger = logging.getLogger(__name__.rsplit(".", maxsplit=1)[-1])
configure_logging(
    stream_level=logging.DEBUG,
    ignore_libs=["matplotlib", "Comm", "bokeh", "tornado", "param", "numba"],
)

pl.Config.set_tbl_rows(12)  # for the 12 trials
hv.output(widget_location="bottom", size=130)

In [47]:
"er" in "fgfzftrertt"

True

In [48]:
logger.fatal("Anonymize participants ids before sharing the data")

15:46:48 | [1m[91m[4mCRITICAL[0m| __main__ | Anonymize participants ids before sharing the data


In [49]:
REPORTS_DIR = Path("reports")
PAPER_DIR = Path("/Users/visser/Dropbox/PhD/Papers/PAIN/data")
DIRS = [REPORTS_DIR, PAPER_DIR]

FILE_NAME = "participants.json"

In [50]:
db = DatabaseManager()

In [51]:
with db:
    all_participants = db.get_table(
        "Questionnaire_General", exclude_trials_with_measurement_problems=False
    )
    included_participants = db.get_table(
        "Questionnaire_General", exclude_trials_with_measurement_problems=True
    )

## BMI check for all participants

In [52]:
# Cast height string to float
all_participants = all_participants.with_columns(
    pl.col("height").str.replace(",", ".").cast(pl.Float64)
)
included_participants = included_participants.with_columns(
    pl.col("height").str.replace(",", ".").cast(pl.Float64)
)

# Add BMI column
all_participants = all_participants.with_columns(
    (col("weight") / col("height") ** 2).round().cast(pl.Int8).alias("bmi")
)
included_participants = included_participants.with_columns(
    (col("weight") / col("height") ** 2).round().cast(pl.Int8).alias("bmi")
)


In [54]:
bmi_too_low = all_participants.filter(col("bmi") < 18)["participant_id"].to_list()
bmi_too_high = all_participants.filter(col("bmi") > 30)["participant_id"].to_list()
if bmi_too_low:
    logger.fatal(f"Participant IDs with BMI too low: {bmi_too_low}, has to be excluded")
if bmi_too_high:
    logger.fatal(
        f"Participant IDs with BMI too high: {bmi_too_high}, has to be excluded"
    )

## Stats for included participants

In [55]:
participants_stats = {}

In [None]:
# Number of participants
participants_stats["n_participants"] = included_participants[
    "participant_id"
].n_unique()
participants_stats["n_female"] = included_participants.filter(
    col("gender") == "Female"
).height
participants_stats["n_male"] = included_participants.filter(
    col("gender") == "Male"
).height
participants_stats["n_included_participants"] = included_participants[
    "participant_id"
].n_unique()
participants_stats["n_excluded_participants"] = (
    participants_stats["n_participants"] - participants_stats["n_included_participants"]
)

# Age
participants_stats["mean_age"] = round(included_participants["age"].mean(), 1)
participants_stats["std_age"] = round(included_participants["age"].std(), 1)
participants_stats["min_age"] = included_participants["age"].min()
participants_stats["max_age"] = included_participants["age"].max()

# BMI
participants_stats["mean_bmi"] = round(included_participants["bmi"].mean(), 1)
participants_stats["std_bmi"] = round(included_participants["bmi"].std(), 1)
participants_stats["min_bmi"] = included_participants["bmi"].min()
participants_stats["max_bmi"] = included_participants["bmi"].max()

# Education
key_map = {
    "University degree": "n_university",
    "Trade/technical/vocational training": "n_trade",
    "Abitur (university entrance qualification)": "n_abitur",
    "Secondary school leaving certificate (Mittlere Reife)": "n_secondary",
}
value_count = dict(
    included_participants["education"].value_counts().rows_by_key("education")
)
value_count = {key_map.get(k, k): v[0] for k, v in value_count.items()}  # as int
participants_stats |= dict(value_count)

# Employment
key_map = {
    "Employed part-time": "n_employed_part_time",
    "Employed full-time": "n_employed_full_time",
    "Student": "n_student",
    "Retired": "n_retired",
    "Unemployed": "n_unemployed",
}
value_count = dict(
    included_participants["employment_status"]
    .value_counts()
    .rows_by_key("employment_status")
)
value_count = {key_map.get(k, k): v[0] for k, v in value_count.items()}  # as int
participants_stats |= dict(value_count)

# Physical activity
key_map = {
    "Daily": "n_daily",
    "Several times a week": "n_several_times_week",
    "Once a week": "n_once_week",
    "A few times a month": "n_few_times_month",
    "Rarely": "n_rarely",
    "Never": "n_never",
}
value_count = dict(
    included_participants["physical_activity"]
    .value_counts()
    .rows_by_key("physical_activity")
)
value_count = {key_map.get(k, k): v[0] for k, v in value_count.items()}  # as int
participants_stats |= dict(value_count)

# Medidation
participants_stats["n_meditation"] = included_participants.filter(
    col("meditation") == "Yes"
).height

In [57]:
# Save as json
for DIR in DIRS:
    with open(DIR / FILE_NAME, "w") as f:
        json.dump(participants_stats, f, indent=4)


In [61]:
participants_stats

{'n_participants': 42,
 'n_female': 23,
 'n_male': 19,
 'n_included_participants': 42,
 'n_excluded_participants': 0,
 'mean_age': 26.2,
 'std_age': 5.1,
 'min_age': 18,
 'max_age': 39,
 'mean_bmi': 23.4,
 'std_bmi': 3.0,
 'min_bmi': 19,
 'max_bmi': 30,
 'n_secondary': (1,),
 'n_trade': (4,),
 'n_university': (14,),
 'n_abitur': (23,),
 'n_employed_part_time': (2,),
 'n_student': (34,),
 'n_unemployed': (4,),
 'n_employed_full_time': (2,),
 'n_daily': (2,),
 'n_once_week': (7,),
 'n_few_times_month': (6,),
 'n_several_times_week': (27,),
 'n_meditation': 2}