In [713]:
import pandas as pd
import numpy as np
from skimpy import clean_columns
from datetime import datetime

In [714]:
# Load the raw follow-up survey export and clean its column names in one step
follow_survey = clean_columns(
    pd.read_csv("data_raw/follow_survey_raw_12_11_24.csv")
)

In [715]:
# Columns not wanted in the symptom table: a fixed list of identifier / skip
# columns plus every column positioned between "survey_symptoms_skip" and
# "context_weather_timezone" (inclusive).
symptom_drop_columns = [
    "identifier",
    "id",
    "started",
    "survey_symptoms_after_return_skip",
    "survey_gastro_skip",
    "survey_resp_skip",
    "survey_skin_skip",
    "survey_body_skip",
    "survey_swelling_skip",
]
symptom_drop_columns += list(
    follow_survey.loc[:, "survey_symptoms_skip":"context_weather_timezone"].columns
)

# Drop those columns, then drop the rows whose index labels are 0 and 1
follow_survey_symp = follow_survey.drop(columns=symptom_drop_columns).drop([0, 1])

In [716]:
# Replace missing values with 0 in every column from "survey_gastro_gastro_0"
# through "survey_day" (label-based, inclusive slice)
symptom_span = slice("survey_gastro_gastro_0", "survey_day")
follow_survey_symp.loc[:, symptom_span] = follow_survey_symp.loc[:, symptom_span].fillna(0)

  follow_survey_symp.loc[:, "survey_gastro_gastro_0":"survey_day"]=follow_survey_symp.loc[:, "survey_gastro_gastro_0":"survey_day"].fillna(0)


In [717]:
# Recode answers to 0/1 flags.
# 'survey_body_other': both the integer 0 and the string '0' mean "nothing
# reported". The integer form is what the earlier fillna(0) step writes into
# missing cells, so comparing against the string '0' alone would turn every
# missing answer into 1.
follow_survey_symp.loc[:, 'survey_body_other'] = follow_survey_symp.loc[:, 'survey_body_other'].apply(lambda x: 0 if x in (0, '0') else 1)
# Swelling-point columns: 0 stays 0, any other value becomes 1
follow_survey_symp.loc[:, "survey_swelling_swelling_points_0":"survey_swelling_swelling_points_7"] = follow_survey_symp.loc[:, "survey_swelling_swelling_points_0":"survey_swelling_swelling_points_7"].map(lambda x: 0 if x == 0 else 1)

In [718]:
# Convert every column from "survey_gastro_gastro_0" through "survey_day" to a
# numeric dtype; values that cannot be parsed become NaN (errors='coerce')
cols = follow_survey_symp.loc[:, "survey_gastro_gastro_0":"survey_day"].columns
follow_survey_symp[cols] = follow_survey_symp[cols].apply(
    lambda column: pd.to_numeric(column, errors='coerce')
)

In [719]:
# For each symptom group, add a 'Yes'/'No' flag column: 'Yes' when the row-wise
# maximum over the group's columns (first..last, inclusive) is different from 0.
symptom_groups = {
    'gastro_any': ('survey_gastro_gastro_0', 'survey_gastro_gastro_4'),
    'respi_any': ('survey_resp_resp_0', 'survey_resp_resp_4'),
    'skin_any': ('survey_skin_skin_0', 'survey_skin_skin_4'),
    'body_any': ('survey_body_fever', 'survey_body_other'),
    'joint_any': ('survey_swelling_swelling_0', 'survey_swelling_swelling_1'),
}

for flag_column, (first_column, last_column) in symptom_groups.items():
    group_max = follow_survey_symp.loc[:, first_column:last_column].max(axis=1, skipna=True)
    follow_survey_symp[flag_column] = np.where(group_max != 0, 'Yes', 'No')

In [720]:
# Create the new column with the swelling location
points = [f'survey_swelling_swelling_points_{i}' for i in range(8)]

# Label for each point column (same order as `points`), followed by the label
# used when no point is selected
choices = ['Shoulder', 'Elbow', 'Wrist', 'Fingers', 'Hip', 'Knee', 'Ankle', 'Toes', 'None']

# True where a point column equals 0
point_is_zero = follow_survey_symp.loc[:, points].eq(0)

# One condition per point: that point equals 1 and every other point equals 0 ...
conditions = [
    follow_survey_symp[point].eq(1) & point_is_zero.drop(columns=point).all(axis=1)
    for point in points
]
# ... plus a final condition for rows where all points equal 0
conditions.append(point_is_zero.all(axis=1))

# Rows matching none of the conditions get the default 'Multiple'
follow_survey_symp['survey_swelling_location'] = np.select(conditions, choices, default='Multiple')

# To verify the result:
# follow_survey_symp['survey_swelling_location'].value_counts()

In [721]:
# Transform numeric to intensity
def transform_value(x):
    """Return the intensity label for a numeric code.

    Codes 0-5 map, in order, to "none", "mild", "moderate", "bad",
    "very bad" and "medical". Any other value yields NaN.
    """
    intensity_labels = ("none", "mild", "moderate", "bad", "very bad", "medical")
    for code, label in enumerate(intensity_labels):
        if x == code:
            return label
    return np.nan

# Collect the columns to relabel from three label-based, inclusive column ranges
intensity_ranges = [
    ("survey_gastro_gastro_0", "survey_skin_skin_4"),
    ("survey_body_body_0", "survey_body_body_5"),
    ("survey_swelling_swelling_0", "survey_swelling_swelling_1"),
]
columns_to_transform = []
for first_column, last_column in intensity_ranges:
    columns_to_transform.extend(follow_survey_symp.loc[:, first_column:last_column].columns.tolist())

# Switch the columns to object dtype first so they can hold string labels
follow_survey_symp[columns_to_transform] = follow_survey_symp[columns_to_transform].astype(object)

# Replace each numeric code with its intensity label, element by element
follow_survey_symp[columns_to_transform] = follow_survey_symp[columns_to_transform].map(transform_value)

In [722]:
# Transform specific column
def transform_survey_body_fever(x):
    """Return the fever label for codes 0-4; any other value yields NaN."""
    # NOTE(review): "over 37.5" overlaps with the two labels that follow it -
    # confirm the wording against the questionnaire.
    fever_labels = ("none", "not mesured", "over 37.5", "between 37.5 and 39", "over 39")
    for code, label in enumerate(fever_labels):
        if x == code:
            return label
    return np.nan

def transform_survey_impact(x):
    """Return the impact description for codes 0-6; any other value yields NaN."""
    impact_labels = (
        "Did not affect my activities",
        "Had a slight negative impact",
        "Had a moderate negative impact",
        "Had a major negative impact",
        "I couldnt do my daily activities due to my symptoms",
        "I had to seek medical attention for my symptoms",
        "I was hospitalised",
    )
    for code, label in enumerate(impact_labels):
        if x == code:
            return label
    return np.nan

def transform_survey_day(x):
    """Return the day-rating description for codes 0-4; any other value yields NaN."""
    day_labels = (
        "It was a great day",
        "It was a good day",
        "It was an okay day",
        "It was quite a bad day",
        "It was a really bad day",
    )
    for code, label in enumerate(day_labels):
        if x == code:
            return label
    return np.nan

def transform_survey_body_other(x):
    """Return "No" for 0 and "Yes" for 1; any other value yields NaN."""
    for code, label in enumerate(("No", "Yes")):
        if x == code:
            return label
    return np.nan

# Replace the numeric codes in each of these columns with their text labels,
# using the recoding function that belongs to the column
column_recoders = {
    'survey_body_fever': transform_survey_body_fever,
    'survey_impact': transform_survey_impact,
    'survey_day': transform_survey_day,
    'survey_body_other': transform_survey_body_other,
}
for column_name, recode in column_recoders.items():
    follow_survey_symp[column_name] = follow_survey_symp[column_name].apply(recode)

In [723]:
# Mapping from raw survey column names to the final, readable names.
# Keys that are absent from the frame are simply ignored by rename().
rename_columns = {
    'baseline': 'trip_id',
    # gastro-intestinal
    'survey_gastro_gastro_0': 'nausea',
    'survey_gastro_gastro_1': 'vomiting',
    'survey_gastro_gastro_2': 'stomach_pain',
    'survey_gastro_gastro_3': 'diarrhea',
    'survey_gastro_gastro_4': 'constipation',
    # respiratory
    'survey_resp_resp_0': 'cough',
    'survey_resp_resp_1': 'sore_throat',
    'survey_resp_resp_2': 'runny_nose',
    'survey_resp_resp_3': 'out_of_breath_resting',
    'survey_resp_resp_4': 'out_of_breath_running',
    # skin
    'survey_skin_skin_0': 'rash',
    'survey_skin_skin_1': 'itchy_insect_bite',
    'survey_skin_skin_2': 'itchy_other',
    'survey_skin_skin_3': 'sunburn',
    'survey_skin_skin_4': 'itchy_red_eyes',
    # body
    'survey_body_fever': 'fever',
    'survey_body_body_0': 'dizziness',
    'survey_body_body_1': 'ear_ache',
    'survey_body_body_2': 'headache',
    'survey_body_body_3': 'pain_eyes',
    'survey_body_body_4': 'musle_pain',
    'survey_body_body_5': 'aching_limbs',
    'survey_body_other': 'body_other',
    # joints
    'survey_swelling_swelling_0': 'pain_joint',
    'survey_swelling_swelling_1': 'swelling_joint',
    'survey_swelling_location': 'location_swelling',
    # overall
    'survey_impact': 'impact',
    'survey_day': 'rating_day',
    'longitude': 'survey_longitude',
    'latitude': 'survey_latitude',
}

# Columns kept in the final symptom table, in output order
selected_columns = [
    'trip_id', 'user_id', 'finished',
    'gastro_any', 'nausea', 'vomiting', 'stomach_pain', 'diarrhea', 'constipation',
    'respi_any', 'cough', 'sore_throat', 'runny_nose', 'out_of_breath_resting', 'out_of_breath_running',
    'skin_any', 'rash', 'itchy_insect_bite', 'itchy_other', 'sunburn', 'itchy_red_eyes',
    'body_any', 'fever', 'dizziness', 'ear_ache', 'headache', 'pain_eyes', 'musle_pain', 'aching_limbs', 'body_other',
    'joint_any', 'pain_joint', 'swelling_joint', 'location_swelling',
    'impact', 'rating_day',
]

# Rename, then keep only the selected columns in the specified order
follow_survey_symp = follow_survey_symp.rename(columns=rename_columns)[selected_columns]

In [724]:
# Columns not wanted in the "extra" table: a fixed list plus two label-based,
# inclusive column ranges taken from the raw frame
extra_drop_columns = [
    "identifier",
    "id",
    "started",
    "survey_contact_skip",
    "survey_contact_contact_email",
    "survey_contact_contact_phone",
    'survey_skip_self_treament',
    "survey_diagnosed_infection_skip",
]
extra_drop_columns += list(follow_survey.loc[:, "survey_symptoms_after_return_skip":"survey_day_1"].columns)
extra_drop_columns += list(follow_survey.loc[:, "latitude":"context_weather_timezone"].columns)

# Drop those columns, then drop the rows whose index labels are 0 and 1
follow_survey_extra = follow_survey.drop(columns=extra_drop_columns).drop([0, 1])

In [725]:
def transform_survey_consulted_doctor(x):
    """Map the raw "consulted a doctor" answer to "Yes" or "No".

    Any spelling of true gives "Yes": the strings "true", "TRUE", " True "
    and a real boolean True all count. Everything else, including "false"
    and missing values, gives "No".
    """
    # Compare on the normalised text so the result does not depend on whether
    # the column was read as strings or as booleans, or on letter case.
    return "Yes" if str(x).strip().lower() == "true" else "No"
# Derive the Yes/No 'consulted_doctor' column from the raw answer column
consulted_raw = follow_survey_extra['survey_consulted_doctor']
follow_survey_extra['consulted_doctor'] = consulted_raw.apply(transform_survey_consulted_doctor)

In [726]:
# Treatment categories and the lowercase keywords that trigger each one.
# Categories are checked, and reported, in this order.
_TREATMENT_KEYWORDS = (
    ("Pain Medication/Antipyretics", (
        "ibuprofen", "paracetamol", "dafalgan", "painkiller", "headache tablet",
        "acetaminophen", "entzündungshemmende schmerzmittel", "antipyretic", "temperature",
        "fieber and headache treated with regular prescription free medication",
    )),
    ("Antibiotics", (
        "antibiotic", "antibiotika", "azithromycin", "metronidazole", "streptoquin",
        "antinal", "antibiotischen teopfen", "metronidazol",
    )),
    ("Antidiarrheal", (
        "imodium", "loperamide", "diarrhea", "antidiarrheal", "immodium",
        "kohletabletten", "loperamid",
    )),
    ("Electrolytes/Hydration", (
        "electrolyte", "hydration", "rehydration", "fluid", "iso-brausetabletten",
        "elektrolyten", "elektrolytpulver",
    )),
    ("Topical Treatments", (
        "fenistil", "cream", "ointment", "fungizyd", "sun lotion", "topical",
        "after sun", "vaseline", "topische therapie",
    )),
    ("Nasal/Throat Sprays", (
        "nasal spray", "throat spray", "xylometazolin", "decongestant", "lozenges",
        "nasenspray", "nose spray", "lutschtabletten", "hustensaft", "hustensirup",
        "halswehtabletten",
    )),
    ("Antihistamines", (
        "antihistamine", "cetirizine", "allergy", "itch", "fenistil", "antihistamin",
    )),
    ("Natural Remedies", (
        "herbal", "tea", "rest", "natural", "warm", "home remedy", "thymian öl",
        "vitamin d und c", "zink",
    )),
    ("Other", (
        "bioflorin", "over the counter", "motilium", "primperan", "lactoferment", "symbicort",
    )),
)


def group_treatment_with_raw(value):
    """Classify a free-text self-treatment answer into treatment categories.

    The text is lowercased and each category's keywords are searched as plain
    substrings. All matching categories are returned as one ", "-joined
    string, in the order of the category table. Missing values and texts
    that match no keyword both return the string "None".
    """
    if pd.isna(value):
        return "None"

    text = str(value).lower()
    matched = [
        category
        for category, keywords in _TREATMENT_KEYWORDS
        if any(keyword in text for keyword in keywords)
    ]
    if not matched:
        return "None"
    return ", ".join(matched)

# Derive the grouped 'self_treament' column from the raw free-text answers
raw_self_treatment = follow_survey_extra['survey_self_treament']
follow_survey_extra['self_treament'] = raw_self_treatment.apply(group_treatment_with_raw)

# To inspect the result:
# follow_survey_extra['self_treament'].value_counts()

In [727]:
# Step 1: the diagnosis columns, in the order their values are concatenated
diagnosis_columns = [
    'survey_diagnosed_infection_medical_diagnosis_1',
    'survey_diagnosed_infection_medical_diagnosis_3',
    'survey_diagnosed_infection_medical_diagnosis_0',
    'survey_diagnosed_infection_medical_diagnosis_1_1',
    'survey_diagnosed_infection_medical_diagnosis_2',
    'survey_diagnosed_infection_medical_diagnosis_3_1',
    'survey_diagnosed_infection_medical_diagnosis_4',
    'survey_diagnosed_infection_medical_diagnosis_5',
    'survey_diagnosed_infection_medical_diagnosis_6',
    'survey_diagnosed_infection_medical_diagnosis_7',
    'survey_diagnosed_infection_medical_diagnosis_8',
]


def _join_diagnoses(row):
    """Join the non-missing values of one row with ', '; None if all are missing."""
    present = row.dropna()
    if present.empty:
        return None
    return ', '.join(present.astype(str))


# Step 2: combine all diagnoses of a row into a single string column
follow_survey_extra['combined_diagnosis'] = follow_survey_extra[diagnosis_columns].apply(_join_diagnoses, axis=1)

# Step 3: Mapping from raw diagnosis text to a cleaned diagnosis name
condition_mapping = {
    "SARS-COV-2 (Covid)": "COVID-19",
    "Campylobacter": "Campylobacter Infection",
    "Amöbeninfektion": "Amoebiasis",
    "Amöben": "Amoebiasis",
    "Borreliose (Lyme-Borreliose)": "Lyme Disease",
    "Ihre Diagnose oder wählen Sie unten": "Unknown Diagnosis",
    "Giardia": "Giardiasis",
    "atemwegsinfekt (mit Antibiotikum behandelt)": "Respiratory Infection",
    "Infektion am Fusszeh nach OP in Vietnam. Pseudomonas und Acytenobacter baumanii": "Foot Infection (Pseudomonas & Acinetobacter)",
    "Uw diagnose of kies hieronder": "Unknown Diagnosis",
    "Influenza (De griep)": "Influenza",
    "gastroenteritis": "Gastroenteritis",
    "post Cold virus bronchospasm": "Post-Cold Bronchospasm",
    "Lungenentzündung": "Pneumonia",
    "Tungiasis": "Tungiasis"
}

# Step 4: Function to clean and map conditions
def clean_conditions(cell):
    """Clean a comma-separated diagnosis string.

    Each comma-separated fragment has its "TRUE:" prefix removed, is stripped
    of surrounding whitespace and is translated through `condition_mapping`
    (fragments without a mapping entry are kept as they are). Empty fragments
    are dropped. Duplicates are removed and the result is sorted.
    "Unknown Diagnosis" is kept only when it is the sole distinct diagnosis.

    Returns the cleaned names joined with ", ", or None when the input is
    missing or nothing remains after cleaning.
    """
    if pd.isna(cell):
        return None  # Handle NaN values

    conditions = []
    for part in cell.split(","):
        # Remove the prefix BEFORE stripping, so "TRUE: Giardia" (space after
        # the colon) still resolves to the mapping key "Giardia".
        name = part.replace("TRUE:", "").strip()
        if not name:
            # A bare "TRUE:" or a stray comma carries no diagnosis text.
            continue
        conditions.append(condition_mapping.get(name, name))

    distinct = set(conditions)
    # The placeholder is only informative when no real diagnosis is present.
    if len(distinct) > 1:
        distinct.discard("Unknown Diagnosis")
    return ", ".join(sorted(distinct)) if distinct else None

# Step 5: derive the cleaned 'diagnosis' column from the combined string
combined_diagnosis = follow_survey_extra['combined_diagnosis']
follow_survey_extra['diagnosis'] = combined_diagnosis.apply(clean_conditions)

# The individual diagnosis columns are no longer needed
follow_survey_extra = follow_survey_extra.drop(columns=diagnosis_columns)

# Step 6: to inspect the result:
# follow_survey_extra['diagnosis'].value_counts(dropna=False)

In [728]:
# Parse the raw diagnosis date (ISO 8601 strings) into datetimes and store it
# back in the same column
diagnosis_timestamps = pd.to_datetime(
    follow_survey_extra['survey_diagnosed_infection_diagnosis_date'],
    format='ISO8601',
)
follow_survey_extra['survey_diagnosed_infection_diagnosis_date'] = diagnosis_timestamps

# 'diagnosis_date' holds only the date part of the parsed diagnosis datetime
follow_survey_extra['diagnosis_date'] = follow_survey_extra['survey_diagnosed_infection_diagnosis_date'].dt.date

In [729]:
# Drop the raw / intermediate columns that have been replaced by cleaned ones,
# and rename 'baseline' to 'trip_id'
intermediate_columns = [
    "combined_diagnosis",
    "survey_consulted_doctor",
    "survey_diagnosed_infection_diagnosis_date",
    "survey_self_treament",
]
follow_survey_extra = (
    follow_survey_extra
    .drop(columns=intermediate_columns)
    .rename(columns={'baseline': 'trip_id'})
)

In [730]:
# Left-join the extra columns onto the symptom table using the three shared keys.
# Note: this rebinds `follow_survey`, which until here held the raw frame.
join_keys = ['trip_id', 'user_id', "finished"]
follow_survey = pd.merge(
    left=follow_survey_symp,
    right=follow_survey_extra,
    how='left',
    on=join_keys,
)

In [731]:
# Keep only the rows that have a trip_id
required_columns = ['trip_id']
follow_survey = follow_survey.dropna(subset=required_columns)

In [732]:
# Save the cleaned follow-up survey as a pickle file
output_path = 'data_clean/follow_survey.pkl'
follow_survey.to_pickle(output_path)