In [131]:
import pandas as pd
import numpy as np
from skimpy import clean_columns
from datetime import datetime

In [132]:
# Load the raw follow-up survey export and pass it straight through
# skimpy's clean_columns so the column names are cleaned on arrival.
follow_survey = clean_columns(
    pd.read_csv("data_raw/follow_survey_raw_12_11_24.csv")
)


In [133]:
# Columns that are not needed for the analysis: a handful of individually named
# ones plus three contiguous label ranges (both end labels are inclusive).
named_drops = [
    "identifier", "id", "started",
    "survey_symptoms_after_return_skip", "survey_gastro_skip",
    "survey_resp_skip", "survey_skin_skip", "survey_body_skip",
    "survey_swelling_skip",
]
ranged_drops = [
    ("survey_symptoms_skip", "survey_day_1"),
    ("location_ts", "context_weather_timezone"),
    ("survey_contact_skip", "survey_contact_contact_phone"),
]
columns_to_drop = named_drops + [
    column
    for first_label, last_label in ranged_drops
    for column in follow_survey.loc[:, first_label:last_label].columns
]

# Drop the columns, then the rows labelled 0 and 1, which do not hold responses
follow_survey = follow_survey.drop(columns=columns_to_drop).drop(index=[0, 1])

In [None]:
# Step 1: The per-option diagnosis columns that get merged into one field
diagnosis_columns = [
    'survey_diagnosed_infection_medical_diagnosis_1',
    'survey_diagnosed_infection_medical_diagnosis_3',
    'survey_diagnosed_infection_medical_diagnosis_0',
    'survey_diagnosed_infection_medical_diagnosis_1_1',
    'survey_diagnosed_infection_medical_diagnosis_2',
    'survey_diagnosed_infection_medical_diagnosis_3_1',
    'survey_diagnosed_infection_medical_diagnosis_4',
    'survey_diagnosed_infection_medical_diagnosis_5',
    'survey_diagnosed_infection_medical_diagnosis_6',
    'survey_diagnosed_infection_medical_diagnosis_7',
    'survey_diagnosed_infection_medical_diagnosis_8',
]


def _join_diagnoses(row):
    """Join the non-missing entries of one row with ', '; None when all are missing."""
    present = row.dropna()
    if present.empty:
        return None
    return ', '.join(present.astype(str))


# Step 2: Collapse every row's diagnoses into a single comma-separated string
follow_survey['combined_diagnosis'] = follow_survey[diagnosis_columns].apply(
    _join_diagnoses, axis=1
)

# Step 3: Mapping from the raw answer text (after the "TRUE:" prefix has been
# removed) to a harmonised English condition label. Text that is not listed
# here is passed through unchanged by clean_conditions.
condition_mapping = {
    "SARS-COV-2 (Covid)": "COVID-19",
    "Campylobacter": "Campylobacter Infection",
    "Amöbeninfektion": "Amoebiasis",
    "Amöben": "Amoebiasis",
    "Borreliose (Lyme-Borreliose)": "Lyme Disease",
    "Ihre Diagnose oder wählen Sie unten": "Unknown Diagnosis",
    "Giardia": "Giardiasis",
    "atemwegsinfekt (mit Antibiotikum behandelt)": "Respiratory Infection",
    "Infektion am Fusszeh nach OP in Vietnam. Pseudomonas und Acytenobacter baumanii": "Foot Infection (Pseudomonas & Acinetobacter)",
    "Uw diagnose of kies hieronder": "Unknown Diagnosis",
    "Influenza (De griep)": "Influenza",
    "gastroenteritis": "Gastroenteritis",
    "post Cold virus bronchospasm": "Post-Cold Bronchospasm",
    "Lungenentzündung": "Pneumonia",
    "Tungiasis": "Tungiasis"
}

# Step 4: Function to clean and map conditions
def clean_conditions(cell):
    """Normalise one comma-separated diagnosis string.

    Parameters
    ----------
    cell : str or missing value
        Diagnoses joined by commas; each may carry a "TRUE:" prefix.

    Returns
    -------
    str or None
        The distinct mapped condition labels, sorted and joined with ", ".
        None when ``cell`` is missing or contains no non-empty diagnosis.
        "Unknown Diagnosis" is kept only when it is the sole distinct label.
    """
    if pd.isna(cell):
        return None  # Handle NaN values

    labels = []
    for fragment in cell.split(","):  # Split on comma for multiple conditions
        # Remove the prefix first and strip afterwards, so "TRUE: Giardia"
        # does not keep a leading space that would defeat the mapping lookup.
        raw = fragment.replace("TRUE:", "").strip()
        if not raw:
            continue  # ignore empty fragments such as "a, , b" or ""
        labels.append(condition_mapping.get(raw, raw))

    distinct = set(labels)
    # The placeholder label is only informative when nothing else was reported.
    if len(distinct) > 1:
        distinct.discard("Unknown Diagnosis")
    return ", ".join(sorted(distinct)) if distinct else None

# Step 5: Run the cleaning function over the combined column, then discard the
# per-option diagnosis columns it was built from.
follow_survey = follow_survey.assign(
    combined_diagnosis=follow_survey['combined_diagnosis'].apply(clean_conditions)
).drop(columns=diagnosis_columns)

# Step 6: Display the cleaned column
#print(follow_survey['combined_diagnosis'].value_counts(dropna=False))

combined_diagnosis
None                                            447
COVID-19                                          6
Campylobacter Infection                           2
Pneumonia                                         2
Amoebiasis, Lyme Disease                          1
Amoebiasis                                        1
Unknown Diagnosis                                 1
Giardiasis                                        1
Respiratory Infection                             1
Foot Infection (Pseudomonas & Acinetobacter)      1
Influenza                                         1
Gastroenteritis                                   1
COVID-19, Giardiasis                              1
Post-Cold Bronchospasm                            1
Tungiasis                                         1
Name: count, dtype: int64


In [77]:
# Parse the ISO 8601 completion timestamps so 'finished' becomes a datetime column
finished_parsed = pd.to_datetime(follow_survey['finished'], format='ISO8601')
follow_survey['finished'] = finished_parsed

In [78]:
# Replace missing answers with 0 across the whole label range from the first
# gastro item through the self-treatment column (both ends inclusive).
answer_span = slice("survey_gastro_gastro_0", "survey_self_treament")
follow_survey.loc[:, answer_span] = follow_survey.loc[:, answer_span].fillna(0)

  follow_survey.loc[:, "survey_gastro_gastro_0":"survey_self_treament"]=follow_survey.loc[:, "survey_gastro_gastro_0":"survey_self_treament"].fillna(0)


In [None]:
# Change character answers to 0/1 flags: 0 = nothing reported, 1 = something reported.
def _reported_flag(value):
    """Return 0 for an empty answer (numeric 0 or the string '0'), otherwise 1.

    Both spellings are accepted because filling missing values with ``fillna(0)``
    inserts the integer 0, not the string '0'; comparing against only one of
    them would flag every filled-in row as reported.
    """
    return 0 if value in (0, '0') else 1


swelling_point_span = slice("survey_swelling_swelling_points_0", "survey_swelling_swelling_points_7")
follow_survey.loc[:, 'survey_body_other'] = follow_survey.loc[:, 'survey_body_other'].apply(_reported_flag)
follow_survey.loc[:, swelling_point_span] = follow_survey.loc[:, swelling_point_span].map(_reported_flag)


In [80]:
# Convert every column in the gastro-to-self-treatment span to a numeric dtype.
# With errors='coerce', any value that cannot be parsed as a number becomes NaN.
# NOTE(review): this span also covers the diagnosis-date and free-text
# self-treatment columns, whose content would be coerced to NaN — confirm intended.
cols = follow_survey.loc[:, "survey_gastro_gastro_0":"survey_self_treament"].columns
follow_survey[cols] = follow_survey[cols].apply(
    lambda column: pd.to_numeric(column, errors='coerce')
)

In [81]:
# One 'Yes'/'No' indicator per symptom family: 'Yes' when the row-wise maximum
# over the family's columns (first..last label, inclusive) differs from 0.
symptom_families = {
    'gastro_any': ('survey_gastro_gastro_0', 'survey_gastro_gastro_4'),
    'respi_any': ('survey_resp_resp_0', 'survey_resp_resp_4'),
    'skin_any': ('survey_skin_skin_0', 'survey_skin_skin_4'),
    'body_any': ('survey_body_fever', 'survey_body_other'),
    'joint_any': ('survey_swelling_swelling_0', 'survey_swelling_swelling_1'),
}
for indicator, (first_col, last_col) in symptom_families.items():
    row_max = follow_survey.loc[:, first_col:last_col].max(axis=1, skipna=True)
    follow_survey[indicator] = np.where(row_max != 0, 'Yes', 'No')


In [83]:
# Inspect the columns from 'survey_consulted_doctor' through 'survey_self_treament' (inclusive)
follow_survey.loc[:, 'survey_consulted_doctor':'survey_self_treament']

Unnamed: 0,survey_consulted_doctor,survey_diagnosed_infection_skip,survey_diagnosed_infection_medical_diagnosis_1,survey_diagnosed_infection_medical_diagnosis_3,survey_diagnosed_infection_medical_diagnosis_0,survey_diagnosed_infection_medical_diagnosis_1_1,survey_diagnosed_infection_medical_diagnosis_2,survey_diagnosed_infection_medical_diagnosis_3_1,survey_diagnosed_infection_medical_diagnosis_4,survey_diagnosed_infection_medical_diagnosis_5,survey_diagnosed_infection_medical_diagnosis_6,survey_diagnosed_infection_medical_diagnosis_7,survey_diagnosed_infection_medical_diagnosis_8,survey_diagnosed_infection_diagnosis_date,survey_skip_self_treament,survey_self_treament
2,0.0,0.0,0,0,0,0,0,0,0,0,0,0,0,0.0,0.0,0.0
3,0.0,0.0,0,0,0,0,0,0,0,0,0,0,0,0.0,0.0,0.0
4,0.0,0.0,0,0,0,0,0,0,0,0,0,0,0,0.0,0.0,0.0
5,0.0,0.0,0,0,0,0,0,0,0,0,0,0,0,0.0,0.0,0.0
6,0.0,0.0,0,0,0,0,0,0,0,0,0,0,0,0.0,0.0,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
465,0.0,,0,0,0,0,0,0,0,0,0,0,0,0.0,,0.0
466,,,0,0,0,0,0,0,0,0,0,0,0,0.0,,0.0
467,0.0,,0,0,0,0,0,0,0,0,0,0,0,0.0,,0.0
468,0.0,,0,0,0,0,0,0,0,0,0,0,0,0.0,,0.0


In [None]:
# Create the new column with the swelling location.
# This works on follow_survey, the frame whose swelling-point columns were
# converted to 0/1 flags earlier in this notebook; the name `survey` is only
# assigned in a later cell, so referencing it here fails on a fresh kernel.
points = [
    'survey_swelling_swelling_points_0', 'survey_swelling_swelling_points_1', 'survey_swelling_swelling_points_2',
    'survey_swelling_swelling_points_3', 'survey_swelling_swelling_points_4', 'survey_swelling_swelling_points_5',
    'survey_swelling_swelling_points_6', 'survey_swelling_swelling_points_7'
]

# Labels in the same order as `points`, followed by the label for "no point ticked"
choices = ['Shoulder', 'Elbow', 'Wrist', 'Fingers', 'Hip', 'Knee', 'Ankle', 'Toes', 'None']

# One condition per point: that point equals 1 and every other point equals 0 ...
conditions = [
    follow_survey[point].eq(1)
    & follow_survey[[other for other in points if other != point]].eq(0).all(axis=1)
    for point in points
]
# ... plus a final condition for rows where no point is ticked at all.
conditions.append(follow_survey[points].eq(0).all(axis=1))

# Rows matching none of the conditions (e.g. several points ticked) get 'Multiple'
follow_survey['survey_swelling_location'] = np.select(conditions, choices, default='Multiple')

# Print the value counts for 'survey_swelling_location' to verify the results
# print(follow_survey['survey_swelling_location'].value_counts())

In [64]:
# List every column with its non-null count and dtype
follow_survey.info(verbose=True)

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 468 entries, 2 to 469
Data columns (total 61 columns):
 #   Column                                            Non-Null Count  Dtype         
---  ------                                            --------------  -----         
 0   baseline                                          467 non-null    object        
 1   user_id                                           468 non-null    object        
 2   finished                                          468 non-null    datetime64[ns]
 3   survey_gastro_gastro_0                            468 non-null    int64         
 4   survey_gastro_gastro_1                            468 non-null    int64         
 5   survey_gastro_gastro_2                            468 non-null    int64         
 6   survey_gastro_gastro_3                            468 non-null    int64         
 7   survey_gastro_gastro_4                            468 non-null    int64         
 8   survey_resp_resp_0            

In [52]:
# Frequency table (missing values included) for one diagnosis option column.
# NOTE(review): this column is in diagnosis_columns, which an earlier cell drops
# from follow_survey; in top-to-bottom order this lookup presumably fails — confirm or remove.
follow_survey.survey_diagnosed_infection_medical_diagnosis_4.value_counts(dropna=False)


survey_diagnosed_infection_medical_diagnosis_4
0                          461
TRUE:SARS-COV-2 (Covid)      7
Name: count, dtype: int64

In [20]:
# Display the frame.
# NOTE(review): the rendered output includes user_id values and latitude/longitude —
# consider showing .head() and clearing the output before sharing.
follow_survey

Unnamed: 0,baseline,user_id,finished,survey_gastro_gastro_0,survey_gastro_gastro_1,survey_gastro_gastro_2,survey_gastro_gastro_3,survey_gastro_gastro_4,survey_resp_resp_0,survey_resp_resp_1,...,survey_diagnosed_infection_medical_diagnosis_4,survey_diagnosed_infection_medical_diagnosis_5,survey_diagnosed_infection_medical_diagnosis_6,survey_diagnosed_infection_medical_diagnosis_7,survey_diagnosed_infection_medical_diagnosis_8,survey_diagnosed_infection_diagnosis_date,survey_skip_self_treament,survey_self_treament,latitude,longitude
0,,,NaT,Nausea,Vomiting,Stomach Pain,Diarrheoa,Constipation,Cough,Sore throat,...,SARS-COV-2 (Covid),Campylobacter,Giardia,Borreliosis (Lyme disease),Tick borne encephalitis (TBE),When were you diagnosed?,Did you self treat for any symptom/illness dur...,Please provide details on the illness and trea...,,
1,,,NaT,(0=none) (1=mild) (2=moderate) (3=bad) (4=very...,(0=none) (1=mild) (2=moderate) (3=bad) (4=very...,(0=none) (1=mild) (2=moderate) (3=bad) (4=very...,(0=none) (1=mild) (2=moderate) (3=bad) (4=very...,(0=none) (1=mild) (2=moderate) (3=bad) (4=very...,(0=none) (1=mild) (2=moderate) (3=bad) (4=very...,(0=none) (1=mild) (2=moderate) (3=bad) (4=very...,...,,,,,,,,,,
2,1f3vgd0kJTIpW9yvurtU,h9nh0rIXxuRhdlGYofwgWIn5N453,2022-07-30 11:01:36.683151,,,,,,,,...,,,,,,,,,47.409564,8.546910
3,FwXkQasmZjN48tEO3J6K,lZsYePM7SvbzhNLUUHxLX84eLjJ2,2022-08-26 17:46:27.256662,,,,,,,,...,,,,,,,,,47.696596,8.632211
4,4PI0Rkw6dKBk1PG1fAcm,K2uAmiD2jHNThUBCMv25ERcka0t1,2022-09-07 17:15:19.772829,,,,,,,,...,,,,,,,,,47.368746,8.521609
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
465,Kf8uuwdg02Y0OsK9a0Fz,HEAnmAtjAPf9GIAreBADkmAyIr62,2024-11-08 01:50:56.118888,,,,,,,,...,,,,,,,false,,13.724232,100.547347
466,EoBezTlaCFXtcbfcgR2M,WqWfFyTW2BMYDteDJHgQ60My4PA2,2024-11-08 17:44:03.237698,0,0,0,2,0,,,...,,,,,,,false,,47.412620,8.545788
467,IRGieJ6RJjjA5qGpd3JF,zEyvfC67WYa9gF5cImiHjITjfkL2,2024-11-10 15:59:27.812436,,,,,,,,...,,,,,,,false,,47.388397,8.515869
468,cAArQBE1k6Cij27yaQwh,pg3kS437HFPyMWHZcTMvRopTyuU2,2024-11-11 21:25:19.392435,,,,,,,,...,,,,,,,false,,47.384487,8.551998


In [25]:
# List every column with its non-null count and dtype
follow_survey.info(verbose=True)

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 470 entries, 0 to 469
Data columns (total 59 columns):
 #   Column                                            Non-Null Count  Dtype  
---  ------                                            --------------  -----  
 0   baseline                                          467 non-null    object 
 1   user_id                                           468 non-null    object 
 2   finished                                          468 non-null    object 
 3   survey_gastro_gastro_0                            32 non-null     object 
 4   survey_gastro_gastro_1                            32 non-null     object 
 5   survey_gastro_gastro_2                            32 non-null     object 
 6   survey_gastro_gastro_3                            32 non-null     object 
 7   survey_gastro_gastro_4                            32 non-null     object 
 8   survey_resp_resp_0                                45 non-null     object 
 9   survey_resp_resp_1   

In [None]:
# Display the survey frame.
# NOTE(review): in the visible part of this notebook `survey` is only assigned in
# the following cell, so this raises NameError on a fresh kernel — confirm cell order.
survey

In [None]:
# Load the raw survey export and clean its column names on arrival
survey = clean_columns(pd.read_csv("data_raw/survey_raw_12_11_24.csv"))

# Columns to discard: individually named ones first, then the contiguous label
# range from "location_ts" through the last air-quality recommendation column.
survey_columns_to_drop = [
    "identifier", "id", "started",
    "survey_symptoms_skip", "survey_gastro_skip", "survey_resp_skip",
    "survey_skin_skip", "survey_body_skip", "survey_swelling_skip",
]
survey_columns_to_drop += list(
    survey.loc[:, "location_ts":"context_air_quality_random_recommendations_sport"].columns
)

# Drop the columns, then the rows labelled 0 and 1
survey = survey.drop(columns=survey_columns_to_drop).drop(index=[0, 1])