# Introduction to data science
Author: Gérard Lichtert

## Introduction
This notebook cleans data from a CSV file: it removes unnecessary columns, computes means, and saves the processed data to a new CSV file in the output folder.

It also builds a new dataframe containing the averages per day per participant and saves it to a CSV file.

## Variables you can change
In the following code cell you can change the variables as needed.

In [3]:
# * This is a list of headers we want to delete (excluding the ones with _TZ, _RT and _TS)
# * You can concatenate this list with the result of the tz_rt_ts_headers function to
# * include the ones with _TZ, _RT, and _TS
HEADERS_TO_DELETE: list[str] = [
    "STUDY_ID",
    "STUDY_NAME",
    "STUDY_VERSION",
    "SURVEY_ID",
    "TRIGGER",
    "START_END",
    "RAND_PROB",
    "CONTROLLEVRAAG",
    "INTRO",
    "SLOT",
    "BEDANKT",
    "INLEIDING",
]

## Global constants and functions
First we declare a few global constants and functions. These load the data, manipulate it, and save it. Be very careful when changing them, as this might break crucial parts of the code.

In [4]:
from pathlib import Path
from pandas import DataFrame, read_csv, concat, to_numeric

def more_than_5_entries(df: DataFrame, id: str | None = None):
    """Report whether the activity-filtered frame holds at least 5 rows.

    When ``id`` is given, ``df`` is first narrowed with ``select_by_name``;
    the (possibly narrowed) frame is then passed through
    ``select_by_activity`` and its row count is checked.

    Note that, despite the function's name, exactly 5 rows already counts:
    the comparison is ``>= 5``.
    """
    subset = df if id is None else select_by_name(df, id)
    row_count = select_by_activity(subset).shape[0]
    return row_count >= 5


def add_daily_mean(df: DataFrame, columns: list[str]) -> DataFrame:
    """Convert ``columns`` to numeric and add their row-wise mean as ``MEAN``.

    The frame is modified in place: the listed columns are overwritten with
    their ``to_numeric`` conversion and a ``MEAN`` column is added (or
    replaced). The same frame object is returned for convenience.

    Args:
        df: Frame containing every column named in ``columns``.
        columns: Names of the columns to convert and average.

    Returns:
        ``df`` itself, after the in-place changes.
    """
    # Conversion happens column by column; with its default settings
    # to_numeric raises on a value it cannot parse.
    converted = df[columns].apply(to_numeric)
    df[columns] = converted

    # One mean per row, taken across the selected columns only.
    row_means = df[columns].mean(axis="columns")
    df["MEAN"] = row_means
    return df


def clean_and_add_mean(df: DataFrame) -> DataFrame:
    """Keep participants with enough entries and add a ``MEAN`` column.

    For every distinct ``PARTICIPANT_ID`` the participant's rows are kept only
    if ``more_than_5_entries`` accepts them. The kept rows are passed through
    ``select_by_activity``, stacked into a single frame with a fresh index,
    and handed to ``add_daily_mean`` for the columns ``"1"`` .. ``"10"``.

    Args:
        df: Frame with a ``PARTICIPANT_ID`` column and the columns ``"1"``
            through ``"10"``.

    Returns:
        A new frame holding the retained rows plus a ``MEAN`` column. When no
        participant qualifies, an empty frame with the columns of ``df`` and
        an empty ``MEAN`` column is returned instead of raising.
    """
    participant_ids = df["PARTICIPANT_ID"]
    per_participant = (
        df.loc[participant_ids == participant]
        for participant in participant_ids.unique()
    )
    kept = [
        select_by_activity(entries)
        for entries in per_participant
        if more_than_5_entries(entries)
    ]

    if not kept:
        # concat() rejects an empty list, and averaging columns that do not
        # exist would raise KeyError, so return an empty result directly.
        # NOTE(review): this reuses the input's columns; confirm that
        # select_by_activity does not add or drop columns.
        return df.iloc[0:0].assign(MEAN=float("nan"))

    # A single concat over all frames avoids re-copying the accumulated
    # result on every iteration and avoids concatenating onto an empty frame.
    res_df = concat(kept, ignore_index=True)
    return add_daily_mean(res_df, [str(number) for number in range(1, 11)])


def transform_means(df: DataFrame):
    """Print the header layout for a per-participant daily-means table.

    Unfinished: the function only builds an empty frame whose columns are
    ``PARTICIPANT_ID`` followed by ``DAG_1`` .. ``DAG_n`` and prints those
    column names. ``n`` is the largest number of rows any single
    ``PARTICIPANT_ID`` has in ``df``. Nothing is returned.
    """
    # The participant with the most rows determines how many day columns exist.
    most_rows_per_participant = df["PARTICIPANT_ID"].value_counts().max()
    day_headers = [
        f"DAG_{day}" for day in range(1, most_rows_per_participant + 1)
    ]
    daily_means_df = DataFrame(columns=["PARTICIPANT_ID", *day_headers])
    print(daily_means_df.columns.values)
    # TODO: Finish this


def process_obse(headers: list[str]) -> DataFrame | None:
    """Process every ``.csv`` file found directly inside ``IN_DIR``.

    Each file is read with ``read_data_from_csv``, stripped of ``headers``
    via ``delete_headers`` (called with ``tz_rt_ts_headers_allowed=False``),
    run through ``clean_and_add_mean`` and written out with
    ``save_data_to_csv`` under the same file name. The result of
    ``transform_means`` for the processed frame is printed afterwards.

    Args:
        headers: Column headers to hand to ``delete_headers``.

    Returns:
        Nothing is returned explicitly, so callers always receive ``None``
        despite the annotation.
    """
    for entry in IN_DIR.iterdir():
        # Skip directories and anything that is not a .csv file.
        if not entry.is_file() or entry.suffix != ".csv":
            continue

        raw_df = read_data_from_csv(entry.name)
        trimmed_df = delete_headers(
            raw_df, headers, tz_rt_ts_headers_allowed=False
        )
        processed_df = clean_and_add_mean(trimmed_df)
        save_data_to_csv(processed_df, entry.name)
        print(transform_means(processed_df))

In [1]:
from obse import obse
import polars as pl
# Load the export file through the project's `obse` module.
# NOTE(review): `csv` is also the name of a standard-library module; avoid
# importing that module under the same name in this notebook.
csv = obse.load_csv("exports_8gQqgGGHa_export_F005kvQNV.csv")

True True
..\resources\obse\data\exports_8gQqgGGHa_export_F005kvQNV.csv
['PARTICIPANT_TZ', 'EXPORT_TZ', 'CREATED_TS', 'SCHEDULED_TS', 'STARTED_TS', 'COMPLETED_TS', 'EXPIRED_TS', 'UPLOADED_TS', 'TOTAL_RT', 'ACTIVITEIT_RT', 'INLEIDING_RT', '1_RT', '2_RT', '3_RT', '4_RT', '5_RT', '6_RT', '7_RT', '8_RT', '9_RT', '10_RT', 'BEDANKT_RT', 'CONTROLLEVRAAG_RT', 'INTRO_RT', 'SLOT_RT']
