In [None]:
import pandas as pd
import numpy as np
from skimpy import clean_columns
from datetime import datetime

In [None]:
# Load the raw export and normalise its headers with skimpy's clean_columns
# NOTE(review): "survez" may be a typo for "survey" - confirm the file name on disk
survey = clean_columns(pd.read_csv("data_raw/survez_raw_03_06_24.csv"))

# Columns removed before analysis: identifiers, the start timestamp, the *_skip
# columns, and every column positioned from location_ts up to and including
# context_air_quality_random_recommendations_sport.
unused_columns = [
    "identifier", "id", "started",
    "survey_symptoms_skip", "survey_gastro_skip", "survey_resp_skip",
    "survey_skin_skip", "survey_body_skip", "survey_swelling_skip",
]
unused_columns += survey.loc[:, "location_ts":"context_air_quality_random_recommendations_sport"].columns.tolist()
survey = survey.drop(columns=unused_columns)

# The rows labelled 0 and 1 are removed as well
survey = survey.drop(index=[0, 1])

# Show dtype and non-null count for every remaining column
survey.info(verbose=True)

In [None]:
# Parse the ISO 8601 strings in 'finished' into datetime values
survey['finished'] = pd.to_datetime(survey['finished'], format='ISO8601')

# Age of each response in days: (finished - now) is converted from seconds to days
# and negated, so timestamps in the past give positive numbers; then rounded to 0 decimals.
offset_from_now = survey['finished'] - pd.Timestamp.today()
days_since_finished = offset_from_now.dt.total_seconds() / (3600 * 24) * (-1)
survey['finished_day'] = days_since_finished.round(0)

In [None]:
# Define a function to apply the case_when logic including handling NaN values
def case_when(x):
    """Recode a single raw survey answer to a numeric code.

    Parameters
    ----------
    x : scalar
        Raw cell value: a number, a string holding a number, an empty string,
        or a missing value.

    Returns
    -------
    int or float
        0 for missing values and empty (or whitespace-only) strings, the value
        itself as a float when it is one of 0, 1, 2, 3, 4, 5, and NaN for
        anything else.
    """
    if pd.isna(x):
        return 0
    if isinstance(x, str):
        x = x.strip()
        if x == "":
            return 0
    # Values read from CSV may be strings such as "3"; comparing those to the
    # integers directly never matches, so convert to a number first.
    try:
        value = float(x)
    except (TypeError, ValueError):
        return np.nan
    # NaN and infinities fail the membership test and fall through to NaN.
    return value if value in (0, 1, 2, 3, 4, 5) else np.nan

# Apply the function across the specified range of columns.
# The result is assigned by column label instead of through `.loc[:, ...]`:
# since pandas 2.0 a `.loc` assignment writes into the existing columns in place
# and keeps their previous dtype, so the float conversion would not take effect.
# `Series.map` per column replaces the deprecated `DataFrame.applymap`.
recode_cols = survey.loc[:, "survey_gastro_gastro_0":"survey_day"].columns
survey[recode_cols] = survey[recode_cols].apply(lambda col: col.map(case_when)).astype(float)

In [76]:
# For each symptom family, flag the row 'Yes' when the largest value across the
# family's columns (first..last, inclusive, by position in the frame) differs from 0,
# otherwise 'No'. A row whose maximum is NaN also compares as != 0 and gets 'Yes'.
symptom_ranges = {
    'gastro_any': ('survey_gastro_gastro_0', 'survey_gastro_gastro_4'),
    'respi_any': ('survey_resp_resp_0', 'survey_resp_resp_4'),
    'skin_any': ('survey_skin_skin_0', 'survey_skin_skin_4'),
    'body_any': ('survey_body_fever', 'survey_body_other'),
    'joint_any': ('survey_swelling_swelling_0', 'survey_swelling_swelling_1'),
}
for flag_name, (first_col, last_col) in symptom_ranges.items():
    row_peak = survey.loc[:, first_col:last_col].max(axis=1, skipna=True)
    survey[flag_name] = np.where(row_peak != 0, 'Yes', 'No')

In [None]:
import pandas as pd
import numpy as np
from datetime import datetime
from pandas.api.types import CategoricalDtype

# Load the raw survey export
survey = pd.read_csv('data_raw/survey_raw_29_08.csv')

# Normalise headers: lower-case, with spaces replaced by underscores
survey.columns = survey.columns.str.lower().str.replace(' ', '_')

# Columns to discard.
# NOTE(review): only the two endpoint columns "location_ts" and
# "context_air_quality_random_recommendations_sport" are listed here, whereas the
# earlier cell drops every column between them too - confirm which is intended.
drop_cols = [
    "identifier", "id", "started",
    "survey_symptoms_skip", "survey_gastro_skip", "survey_resp_skip",
    "survey_skin_skip", "survey_body_skip", "survey_swelling_skip",
    "location_ts",
    "context_air_quality_random_recommendations_sport",
]

# Remove the rows labelled 0 and 1, renumber from 0, then drop the columns above
survey = (
    survey
    .drop(index=[0, 1])
    .reset_index(drop=True)
    .drop(columns=drop_cols)
)

# Coerce every column from survey_gastro_gastro_0 through survey_day to numeric;
# values that cannot be parsed become NaN.
answer_block = survey.loc[:, "survey_gastro_gastro_0":"survey_day"]
cols_to_convert = answer_block.columns
survey[cols_to_convert] = answer_block.apply(pd.to_numeric, errors='coerce')

# Calculate finished_day: days elapsed between the response and now, rounded to
# the nearest whole day. Past timestamps give positive values, matching the
# finished_day computed in the earlier cell. The previous expression negated
# (now - finished), which produced negative values, and `.dt.days` truncated
# rather than rounded. Unparseable timestamps become NaT and give NaN here.
survey['finished'] = pd.to_datetime(survey['finished'], errors='coerce')
elapsed = pd.Timestamp.now() - survey['finished']
survey['finished_day'] = (elapsed.dt.total_seconds() / (24 * 3600)).round(0)

# Create binary columns for symptoms.
# Each key is the flag column to create; each value lists the answer columns
# whose row-wise maximum decides the flag.
symptom_groups = {
    'gastro_any': [f"survey_gastro_gastro_{i}" for i in range(5)],
    'respi_any': [f"survey_resp_resp_{i}" for i in range(5)],
    'skin_any': [f"survey_skin_skin_{i}" for i in range(5)],
    # Includes survey_body_body_0..5, which the fever..other slice in the earlier
    # cell also covers; listing only fever and other ignored those answers.
    'body_any': ["survey_body_fever"]
                + [f"survey_body_body_{i}" for i in range(6)]
                + ["survey_body_other"],
    'joint_any': ["survey_swelling_swelling_0", "survey_swelling_swelling_1"],
}

for symptom, cols in symptom_groups.items():
    # `> 0` rather than `!= 0`: a row with no answers has a NaN maximum, and
    # NaN != 0 is True, which wrongly flagged unanswered sections as 'Yes'.
    has_symptom = survey[cols].max(axis=1, skipna=True) > 0
    survey[symptom] = np.where(has_symptom, 'Yes', 'No')

# Determine location of swelling
def determine_swelling_location(row):
    """Summarise the swelling-point answers of one survey row as a single label.

    Parameters
    ----------
    row : pandas.Series
        One row of the survey frame; must contain the eight columns
        survey_swelling_swelling_points_0 .. survey_swelling_swelling_points_7.

    Returns
    -------
    str
        "None" when no point equals 1 (including rows with only missing
        values), the matching location name when exactly one point equals 1,
        and "Multiple" when more than one point equals 1.
    """
    locations = ["Shoulder", "Elbow", "Wrist", "Fingers", "Hip", "Knee", "Ankle", "Toes"]
    point_cols = ["survey_swelling_swelling_points_" + str(i) for i in range(len(locations))]
    # Coerce so that string answers such as "1" are compared as numbers.
    swelling_points = pd.to_numeric(row.loc[point_cols], errors="coerce")
    # Collect every selected location; returning at the first hit made
    # "Multiple" unreachable when several points were selected.
    selected = [name for name, point in zip(locations, swelling_points) if point == 1]
    if not selected:
        return "None"
    if len(selected) == 1:
        return selected[0]
    return "Multiple"

# One swelling-location label per row
survey['survey_swelling_location'] = survey.apply(determine_swelling_location, axis=1)

# Categorize symptom severity: codes 0-5 become ordered categorical labels
severity_categories = ["none", "mild", "moderate", "bad", "very bad", "medical"]
severity_dtype = CategoricalDtype(categories=severity_categories, ordered=True)
severity_labels = dict(enumerate(severity_categories))  # position in the list == numeric code


def _label_severity(codes):
    """Map a column of 0-5 codes to the ordered severity categorical."""
    return codes.map(severity_labels).astype(severity_dtype)


severity_columns = survey.loc[:, "survey_gastro_gastro_0":"survey_skin_skin_4"].columns
survey[severity_columns] = survey[severity_columns].apply(_label_severity)

# Columns with their own answer scales; the list position of each label is its numeric code.
# NOTE(review): the fever labels "over 37.5" and "between 37.5 and 39" overlap - confirm wording.
answer_labels = {
    'survey_body_fever': ["none", "not measured", "over 37.5", "between 37.5 and 39", "over 39"],
    'survey_impact': ["Did not affect my activities", "Had a slight negative impact",
                      "Had a moderate negative impact", "Had a major negative impact",
                      "I couldn't do my daily activities due to my symptoms",
                      "I had to seek medical attention for my symptoms", "I was hospitalized"],
    'survey_day': ["It was a great day", "It was a good day", "It was an okay day",
                   "It was quite a bad day", "It was a really bad day"],
    'survey_body_other': ["No", "Yes"],
}
for column_name, labels in answer_labels.items():
    survey[column_name] = survey[column_name].map(dict(enumerate(labels)))

# Store the labelled and flag columns as pandas categoricals
factor_columns = ['survey_body_fever', 'survey_impact', 'survey_day', 'survey_body_other', 'gastro_any', 'respi_any', 'skin_any', 'body_any', 'joint_any', 'survey_swelling_location']
survey[factor_columns] = survey[factor_columns].astype('category')

# Select relevant columns for the final dataframe, in output order
final_columns = ["baseline", "user_id", "finished", "finished_day", "gastro_any", "survey_gastro_gastro_0", "survey_gastro_gastro_1",
                 "survey_gastro_gastro_2", "survey_gastro_gastro_3", "survey_gastro_gastro_4", "respi_any", "survey_resp_resp_0",
                 "survey_resp_resp_1", "survey_resp_resp_2", "survey_resp_resp_3", "survey_resp_resp_4", "skin_any", "survey_skin_skin_0",
                 "survey_skin_skin_1", "survey_skin_skin_2", "survey_skin_skin_3", "survey_skin_skin_4", "body_any", "survey_body_fever",
                 "survey_body_body_0", "survey_body_body_1", "survey_body_body_2", "survey_body_body_3", "survey_body_body_4",
                 "survey_body_body_5", "survey_body_other", "joint_any", "survey_swelling_swelling_0", "survey_swelling_swelling_1",
                 "survey_swelling_location", "survey_impact", "survey_day"]

# Raw column name -> name used in the cleaned file.
# NOTE(review): "musle_pain" looks like a typo for "muscle_pain"; kept unchanged
# because consumers of the cleaned file may already depend on it.
column_renames = {"baseline": "trip_id", "survey_gastro_gastro_0": "nausea", "survey_gastro_gastro_1": "vomiting",
                  "survey_gastro_gastro_2": "stomach_pain", "survey_gastro_gastro_3": "diarrhea",
                  "survey_gastro_gastro_4": "constipation", "survey_resp_resp_0": "cough",
                  "survey_resp_resp_1": "sore_throat", "survey_resp_resp_2": "runny_nose",
                  "survey_resp_resp_3": "out_of_breath_resting", "survey_resp_resp_4": "out_of_breath_running",
                  "survey_skin_skin_0": "rash", "survey_skin_skin_1": "itchy_insect_bite",
                  "survey_skin_skin_2": "itchy_other", "survey_skin_skin_3": "sunburn",
                  "survey_skin_skin_4": "itchy_red_eyes", "survey_body_body_0": "dizziness",
                  "survey_body_body_1": "ear_ache", "survey_body_body_2": "headache",
                  "survey_body_body_3": "pain_eyes", "survey_body_body_4": "musle_pain",
                  "survey_body_body_5": "aching_limbs", "survey_swelling_swelling_0": "pain_joint",
                  "survey_swelling_swelling_1": "swelling_joint", "survey_impact": "impact",
                  "survey_day": "rating_day"}

# Apply the selection before renaming. final_columns was previously built but
# never used, so every leftover raw column ended up in the saved file. Indexing
# with the list raises KeyError if an expected column is missing.
survey = survey[final_columns].rename(columns=column_renames)

# Save the cleaned version
survey.to_csv('data_clean/survey.csv', index=False)