In [76]:
import pandas as pd
import numpy as np
from skimpy import clean_columns

In [77]:
# Load the raw baseline export and normalise its column names in one step
survey = clean_columns(pd.read_csv("data_raw/baseline_raw_03_06_24.csv"))
# Drop the columns this notebook does not need: three named fields plus the
# contiguous label slice "location_ts" .. "context_weather_timezone" (inclusive)
survey = survey.drop(
    columns=[
        "identifier",
        "id",
        "started",
        *survey.loc[:, "location_ts":"context_weather_timezone"].columns,
    ]
)
# Drop the rows labelled 0 and 1
# NOTE(review): presumably non-response header rows of the export — confirm
survey = survey.drop(index=[0, 1])

In [78]:
# Parse the 'finished' timestamps (ISO 8601 strings) into datetime values
survey['finished'] = survey['finished'].pipe(pd.to_datetime, format='ISO8601')

In [79]:
# Define the complex transformation function
def transform_row(row):
    """Collapse the chronic-condition checkbox columns of one row into a label.

    Parameters
    ----------
    row : mapping or pandas.Series
        Must provide the keys ``registration_health_chronics_0`` through
        ``registration_health_chronics_4``. A non-missing value means the
        corresponding box was answered.

    Returns
    -------
    str or float
        * ``"None"`` when ``registration_health_chronics_0`` is answered
          (this takes precedence over every other box);
        * the condition name when exactly one of boxes 1-4 is answered;
        * ``"Multiple"`` when two or more of boxes 1-4 are answered;
        * ``np.nan`` when all five boxes are missing, so respondents who
          gave no answer are not counted as having multiple conditions.
    """
    prefix = 'registration_health_chronics_'

    # Box 0 wins regardless of what else is ticked.
    if not pd.isna(row[prefix + '0']):
        return "None"

    condition_labels = (
        ('1', "High blood pressure"),
        ('2', "Diabetes"),
        ('3', "Heart diseases"),
        ('4', "Immunosuppression"),
    )
    ticked = [
        label
        for suffix, label in condition_labels
        if not pd.isna(row[prefix + suffix])
    ]

    if not ticked:
        # Nothing answered at all: report as missing rather than "Multiple".
        return np.nan
    if len(ticked) == 1:
        return ticked[0]
    return "Multiple"

# Build the health_condition column by running transform_row over every row
survey['health_condition'] = survey.apply(transform_row, axis='columns')

In [None]:
def transform_registration_basics_gender(x):
    """Translate a raw gender code into its label.

    The string codes "0", "1" and "2" become "Male", "Female" and "Other".
    Any other value, including a missing one, yields ``np.nan``.
    """
    gender_labels = {"0": "Male", "1": "Female", "2": "Other"}
    return gender_labels.get(x, np.nan)
    
# Replace the raw gender codes with their labels, element by element
survey['registration_basics_gender'] = survey['registration_basics_gender'].map(transform_registration_basics_gender)

In [None]:
def transform_registration_health_smoking(x):
    """Translate a raw smoking code into its label.

    The string codes "0" to "4" become "Not smoking", "Daily", "Weekly",
    "Monthly" and "Former smoker". Any other value, including a missing
    one, yields ``np.nan``.
    """
    smoking_labels = {
        "0": "Not smoking",
        "1": "Daily",
        "2": "Weekly",
        "3": "Monthly",
        "4": "Former smoker",
    }
    return smoking_labels.get(x, np.nan)

# Replace the raw smoking codes with their labels, element by element
survey['registration_health_smoking'] = survey['registration_health_smoking'].map(transform_registration_health_smoking)

In [None]:
# Filter test data: drop responses whose travel purpose is the push-test marker.
# .copy() gives the filtered frame its own data, so the column assignment made
# later in this cell does not raise SettingWithCopyWarning on pandas versions
# without copy-on-write.
survey = survey[survey['registration_travel_travel_purpose'] != '4: testing push'].copy()
def transform_registration_travel_purpose(x):
    """Recode one raw travel-purpose answer into a harmonised category.

    Known codes ("0" to "3") and known free-text answers (prefixed "4: " or
    "5: ") are mapped to a category label. A missing value yields ``np.nan``.
    Any other value is returned unchanged.

    NOTE(review): matching is exact, including trailing spaces. As a result
    "4: conference " is classified as "Work" while "4: conference" is
    "Education", and "4: Voyage humanitaire" is "Work" while
    "4: Voyage humanitaire " is "Education" — confirm this is intended.
    """
    # Groups are tested top to bottom; the first group containing x decides.
    purpose_groups = (
        ("Leisure/tourist travellers", ("0",)),
        ("Business/corporate travellers", ("1",)),
        ("Visiting friends and relatives (VFR)", ("2",)),
        ("Mass gathering events (Hajj, Olympics, World Cup)", ("3", "4: festival")),
        ("Work", (
            "4: Cross border worker (and also family)",
            "4: work",
            "4: in einer Schule arbeiten (freiwillig)",
            "4: mission humanitaire",
            "4: conference ",
            "4: volunteer work in a school",
            "4: volunteerwork",
            "4: Volunteer ",
            "4: Dreharbeiten",
            "4: voluntaria",
            "4: Voyage humanitaire",
            "4: construction",
            "4: NGO project",
            "4: Weltwärts Dienst",
        )),
        ("Education", (
            "4: formation ",
            "4: intership",
            "4: Internship",
            "4: Research",
            "4: research exchange ",
            "4: stage en médecine",
            "4: stage médecine tropicale ",
            "4: studie",
            "4: stage médical ",
            "4: Uni (Austauschsemester?",
            "4: Auslandssemester",
            "4: Auslands Semester von der UZH",
            "4: forschungs Praktikum",
            "4: travail de Master en médecine",
            "4: Travail de terrain pour mon mémoire de master",
            "4: Austauschstudium",
            "4: Study",
            "4: Tourist und Ausbildung",
            "4: congres",
            "4: conference",
            "4: attending lectures",
            "4: Voyage humanitaire ",
        )),
        ("Other", (
            "4: Altro",
            "4: Other",
            "4: test",
            "5: moldova",
            "4: Autre",
            "4: Andere",
            "4: Militärdienst",
            "4: Freiwilligendienst",
            "4: Auswandern",
        )),
        ("Visiting friends and relatives (VFR)", (
            "4: Hochzeit unseres Sohnes in Indien",
        )),
        ("Leisure/tourist travellers", (
            "4: Tourist travelling and visiting friends",
            "5: Besuch Eishockeyturnier der Schweizer U16 Nationalmannschaft ",
            "4: Weihnachten/Neujahr",
            "4: Visiting",
            "4: Aufenthalt in einem Ashram ",
            "4: tourism, NGO",
        )),
        ("Refugee/Ukraine", (
            "5: Беженец / мигрант (пожалуйста, укажите страну происхождения)",
            "5: Біженець / мігрант (будь ласка, введіть країну походження)",
            "5: Ukraine",
            "5: Ukraine ",
            "5: Украина ",
            "5: Украина",
            "5: украина",
            "5: Швейцария ",
            "5: Refugee / Migrant (please enter country of origin)",
            "5: Flüchtling / Migrant (bitte Herkunftsland angeben)",
            "5: refugee/Ukraine ",
            "5: Україна",
            "5: Réfugié / Migrant (veuillez indiquer le pays d'origine)",
        )),
    )

    for label, raw_answers in purpose_groups:
        if x in raw_answers:
            return label

    # Not a known answer: missing stays missing, anything else passes through.
    return np.nan if pd.isna(x) else x
# Recode every travel-purpose answer, element by element
survey['registration_travel_travel_purpose'] = survey['registration_travel_travel_purpose'].map(transform_registration_travel_purpose)

In [24]:
# Print the full per-column summary (non-null counts and dtypes) of the frame.
# NOTE(review): this cell's execution count (In [24]) is out of sequence with the
# cells above, so the saved output may come from an earlier run — re-run the
# notebook top to bottom before relying on it.
survey.info(verbose=True)

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1346 entries, 2 to 1347
Data columns (total 17 columns):
 #   Column                                  Non-Null Count  Dtype         
---  ------                                  --------------  -----         
 0   user_id                                 1346 non-null   object        
 1   finished                                1346 non-null   datetime64[ns]
 2   registration_basics_age                 1346 non-null   object        
 3   registration_basics_gender              1346 non-null   object        
 4   registration_travel_country             1346 non-null   object        
 5   registration_travel_country_identifier  1346 non-null   object        
 6   registration_travel_travel_purpose      1346 non-null   object        
 7   registration_travel_travel_date         1346 non-null   object        
 8   registration_travel_travel_duration     1346 non-null   object        
 9   registration_health_smoking             1345 non-nul