# Introduction to data science
Author: Gérard Lichtert

## Introduction
This notebook cleans data from a CSV file: it removes unnecessary columns, computes means, and saves the processed data to a new CSV file in the output folder.

It will also build a new DataFrame containing the averages per day per participant and save it to a CSV file.

## Variables you can change
In the following code cell you can change the variables as needed.

In [15]:
# * This is a list of headers we want to delete (excluding the ones with _TZ, _RT and _TS)
# * You can concatenate this list with the result of the tz_rt_ts_headers function to
# * include the ones with _TZ, _RT, and _TS
HEADERS_TO_DELETE: list[str] = [
    "STUDY_ID",
    "STUDY_NAME",
    "STUDY_VERSION",
    "SURVEY_ID",
    "TRIGGER",
    "START_END",
    "RAND_PROB",
    "CONTROLLEVRAAG",
    "INTRO",
    "SLOT",
    "BEDANKT",
    "INLEIDING",
]

## Global constants and functions
First we declare a few global constants and functions. These will load the data in as well as manipulate it and save it. Be very careful when changing this as this might break crucial parts of the code.

In [16]:
from pathlib import Path
from pandas import DataFrame, read_csv, concat, to_numeric

# ! GLOBAL CONSTANTS: DO NOT TOUCH
# Folder the input CSV files are read from (relative to the working directory).
IN_DIR = Path("../resources/obse/data")
# Folder the processed CSV files are written to (relative to the working directory).
OUT_DIR = Path("../resources/obse/out")
# Create both folders together with any missing parent folders.
# exist_ok=True makes re-running this cell harmless when they already exist.
IN_DIR.mkdir(parents=True, exist_ok=True)
OUT_DIR.mkdir(parents=True, exist_ok=True)

# ! GLOBAL FUNCTIONS: DO NOT TOUCH
def read_data_from_csv(filename: str) -> DataFrame:
    """Load the CSV file called ``filename`` from IN_DIR into a DataFrame.

    Args:
        filename: Name of the file, appended to the IN_DIR path.

    Returns:
        The DataFrame produced by ``read_csv`` for that file.
    """
    source_path = f"{IN_DIR.as_posix()}/{filename}"
    with open(source_path) as handle:
        frame = read_csv(handle)
    return frame


def save_data_to_csv(df: DataFrame, filename: str | Path) -> None:
    """Save a DataFrame as a CSV file inside OUT_DIR.

    Args:
        df: The DataFrame to write; its index is not written to the file.
        filename: Name of the output file. When a Path is given, only its
            final component (``filename.name``) is used; a string is appended
            to the OUT_DIR path as-is.
    """
    # parents=True recreates missing parent folders as well, and
    # exist_ok=True avoids the separate "does it exist?" check.
    OUT_DIR.mkdir(parents=True, exist_ok=True)
    target_name = filename.name if isinstance(filename, Path) else filename
    df.to_csv(f"{OUT_DIR.as_posix()}/{target_name}", index=False)


def tz_rt_ts_headers(df: DataFrame) -> list[str]:
    """Return every column header of ``df`` that contains _TZ, _RT or _TS.

    The headers are returned in column order. Feel free to edit the markers.
    """
    markers = ("_TZ", "_RT", "_TS")
    return [
        header
        for header in df.columns.values
        if any(marker in header for marker in markers)
    ]


def delete_headers(df: DataFrame, headers, tz_rt_ts_headers_allowed=True) -> DataFrame:
    """Return a copy of ``df`` without the columns named in ``headers``.

    When ``tz_rt_ts_headers_allowed`` is False, the columns reported by
    ``tz_rt_ts_headers`` (those containing _TZ, _RT or _TS) are dropped too.
    The input DataFrame itself is left untouched.
    """
    if tz_rt_ts_headers_allowed:
        columns_to_drop = headers
    else:
        columns_to_drop = headers + tz_rt_ts_headers(df)
    return df.drop(columns=columns_to_drop, axis=1)


def select_by_activity(df: DataFrame) -> DataFrame:
    """Return the records of ``df`` whose ACTIVITEIT value equals 1."""
    is_active = df["ACTIVITEIT"] == 1
    return df.loc[is_active]


def select_by_name(df: DataFrame, participant_id: str) -> DataFrame:
    """Return the records of ``df`` whose PARTICIPANT_ID equals ``participant_id``."""
    belongs_to_participant = df["PARTICIPANT_ID"] == participant_id
    return df.loc[belongs_to_participant]


def more_than_5_entries(df: DataFrame, id: str | None = None):
    """Report whether ``df`` holds at least 5 records with ACTIVITEIT equal to 1.

    When ``id`` is given, only the records of that PARTICIPANT_ID are counted.

    NOTE(review): despite the function name, the comparison is ``>= 5``, so
    exactly 5 matching records also count - confirm this is the intended
    threshold.
    """
    candidates = df if id is None else select_by_name(df, id)
    active_count = select_by_activity(candidates).shape[0]
    return active_count >= 5


def add_daily_mean(df: DataFrame, columns: list[str]) -> DataFrame:
    """Convert ``columns`` to numeric values and add their row-wise mean.

    The DataFrame is modified in place: the given columns are replaced by
    their ``to_numeric`` conversion and a MEAN column holding the mean of
    those columns for each row is added.

    Args:
        df: The DataFrame to update.
        columns: Names of the columns to convert and average.

    Returns:
        The same DataFrame object that was passed in.
    """
    converted = df[columns].apply(to_numeric)
    df[columns] = converted
    # axis=1 averages across the selected columns within each row.
    df["MEAN"] = df[columns].mean(axis=1)
    return df


def clean_and_add_mean(df: DataFrame) -> DataFrame:
    """Keep the active records of qualifying participants and add a MEAN column.

    For every distinct PARTICIPANT_ID (in order of first appearance) the
    participant's records are kept only when ``more_than_5_entries`` accepts
    them, and of those only the records returned by ``select_by_activity``
    are retained. The retained records are stacked with a fresh index and
    ``add_daily_mean`` adds the row-wise mean of the columns "1" to "10".

    Args:
        df: Input data. It needs a PARTICIPANT_ID column, the ACTIVITEIT
            column used by the selection helpers, and the columns "1"
            through "10".

    Returns:
        A new DataFrame with the retained records and a MEAN column. When no
        participant qualifies, an empty DataFrame with the input columns plus
        MEAN is returned instead of raising.
    """
    item_columns = [str(number) for number in range(1, 11)]

    # Collect the per-participant pieces first and concatenate once at the
    # end; concatenating inside the loop copies the accumulated data on every
    # iteration.
    kept_frames = []
    for participant in df["PARTICIPANT_ID"].unique():
        df_participant = df.loc[df["PARTICIPANT_ID"] == participant]
        if more_than_5_entries(df_participant):
            kept_frames.append(select_by_activity(df_participant))

    if not kept_frames:
        # Nothing qualified: keep the column layout so callers still get a
        # well-formed (empty) table.
        empty = df.iloc[0:0].copy()
        empty["MEAN"] = 0.0  # adds the column without adding any rows
        return empty

    res_df = concat(kept_frames, ignore_index=True)
    return add_daily_mean(res_df, item_columns)

def transform_means(df: DataFrame):
    """Prepare the per-participant daily-means table (work in progress).

    Builds an empty DataFrame with a PARTICIPANT_ID column followed by one
    DAG_<n> column for each of 1..N, where N is the largest number of records
    any single PARTICIPANT_ID has in ``df``, and prints its column names.

    Args:
        df: Data containing a PARTICIPANT_ID column.

    Returns:
        None - the table is not filled in or returned yet (see TODO).
    """
    counts = df["PARTICIPANT_ID"].value_counts()
    # With no records, max() yields NaN, which cannot be used as a range
    # bound; fall back to zero day columns in that case.
    max_means = int(counts.max()) if len(counts) > 0 else 0
    columns = ["PARTICIPANT_ID"] + ["DAG_" + str(day) for day in range(1, max_means + 1)]
    daily_means_df = DataFrame(columns=columns)
    print(daily_means_df.columns.values)
    # TODO: Finish this


def process_OBSE() -> DataFrame | None:
    """Run the cleaning pipeline on every ``.csv`` file found in IN_DIR.

    Each file is read, stripped of the HEADERS_TO_DELETE columns and of the
    _TZ/_RT/_TS columns, passed through ``clean_and_add_mean`` and saved
    under the same file name via ``save_data_to_csv``. The result of
    ``transform_means`` for the processed data is printed afterwards.

    Nothing is returned explicitly, so the call evaluates to None.
    """
    for entry in IN_DIR.iterdir():
        # Skip folders and anything that is not a .csv file.
        if not (entry.is_file() and entry.suffix == ".csv"):
            continue
        raw_df = read_data_from_csv(entry.name)
        trimmed_df = delete_headers(
            raw_df, HEADERS_TO_DELETE, tz_rt_ts_headers_allowed=False
        )
        processed_df = clean_and_add_mean(trimmed_df)
        save_data_to_csv(processed_df, entry.name)
        print(transform_means(processed_df))

            

In [17]:
# Run the pipeline for every CSV file currently in IN_DIR.
process_OBSE()
