In [362]:
import pandas as pd
import numpy as np
from skimpy import clean_columns
from datetime import datetime

In [363]:
# Load the raw survey export and normalise its column names in one step.
survey = clean_columns(pd.read_csv("data_raw/survez_raw_03_06_24.csv"))

# Columns that are not used further on: the explicitly listed ones plus every
# column in the label range "location_ts" ..
# "context_air_quality_random_recommendations_sport" (both ends included).
unused_columns = [
    "identifier", "id", "started",
    "survey_symptoms_skip", "survey_gastro_skip", "survey_resp_skip",
    "survey_skin_skip", "survey_body_skip", "survey_swelling_skip",
]
unused_columns += survey.loc[
    :, "location_ts":"context_air_quality_random_recommendations_sport"
].columns.tolist()

# Drop those columns, then the two rows whose index labels are 0 and 1.
survey = survey.drop(columns=unused_columns).drop(index=[0, 1])

  survey = pd.read_csv("data_raw/survez_raw_03_06_24.csv")


In [364]:
# Parse the ISO 8601 completion timestamps into datetime64 values.
survey['finished'] = pd.to_datetime(survey['finished'], format='ISO8601')

# Age of each response in whole days: (finished - now) is negative for past
# dates, so it is converted to days and multiplied by -1 before rounding.
# NOTE: the result depends on the moment this cell is executed.
survey['finished_day'] = (
    (survey['finished'] - pd.Timestamp.today())
    .dt.total_seconds()
    .div(3600 * 24)
    .mul(-1)
    .round(0)
)

In [365]:
# Replace missing answers with 0 in every column of the label range
# "survey_gastro_gastro_0" .. "survey_day" (both ends included).
answers_filled = survey.loc[:, "survey_gastro_gastro_0":"survey_day"].fillna(0)
survey.loc[:, "survey_gastro_gastro_0":"survey_day"] = answers_filled

In [366]:
# Collapse free-text / mixed-type answers into 0/1 flags.
def _to_binary_flag(value):
    """Return 0 for a zero-like cell and 1 for anything else.

    A cell is zero-like when it is the number 0 (int or float) or the text
    '0'. Both forms have to be accepted: ``fillna(0)`` inserts the integer 0
    into text columns, while values read from the CSV may arrive as the
    string '0'. Comparing against only one of the two forms would turn the
    other one into a spurious 1.
    """
    if isinstance(value, str):
        return 0 if value.strip() == '0' else 1
    return 0 if value == 0 else 1


# "Other symptom" answer: 0 when nothing was reported, 1 otherwise.
survey.loc[:, 'survey_body_other'] = survey.loc[:, 'survey_body_other'].apply(_to_binary_flag)

# Swelling location check-boxes: same 0/1 encoding for each of the 8 points.
survey.loc[:, "survey_swelling_swelling_points_0":"survey_swelling_swelling_points_7"] = (
    survey.loc[:, "survey_swelling_swelling_points_0":"survey_swelling_swelling_points_7"]
    .map(_to_binary_flag)
)

In [367]:
# Convert every answer column to a numeric dtype; values that cannot be
# parsed as numbers become NaN (errors='coerce').
cols = survey.loc[:, "survey_gastro_gastro_0":"survey_day"].columns
survey[cols] = survey[cols].apply(
    lambda answers: pd.to_numeric(answers, errors='coerce')
)

In [368]:
# One summary flag per symptom family: 'Yes' when the row-wise maximum over
# the family's column range differs from 0, 'No' otherwise.
any_flag_ranges = {
    'gastro_any': ('survey_gastro_gastro_0', 'survey_gastro_gastro_4'),
    'respi_any': ('survey_resp_resp_0', 'survey_resp_resp_4'),
    'skin_any': ('survey_skin_skin_0', 'survey_skin_skin_4'),
    'body_any': ('survey_body_fever', 'survey_body_other'),
    'joint_any': ('survey_swelling_swelling_0', 'survey_swelling_swelling_1'),
}

for flag_name, (first_col, last_col) in any_flag_ranges.items():
    family_max = survey.loc[:, first_col:last_col].max(axis=1, skipna=True)
    survey[flag_name] = np.where(family_max != 0, 'Yes', 'No')


In [369]:
# Create the new column with the swelling location
points = [f'survey_swelling_swelling_points_{i}' for i in range(8)]

# Body part associated with each swelling point, in the same order as `points`,
# followed by the label used when no point is selected.
choices = ['Shoulder', 'Elbow', 'Wrist', 'Fingers', 'Hip', 'Knee', 'Ankle', 'Toes', 'None']

# Boolean views of the point columns, computed once and reused below.
point_is_zero = survey.loc[:, points].eq(0)
point_is_one = survey.loc[:, points].eq(1)

# One condition per point: that point equals 1 and every other point equals 0.
conditions = [
    point_is_one[point] & point_is_zero.drop(columns=point).all(axis=1)
    for point in points
]
# Final condition: all points equal 0 -> 'None'.
conditions.append(point_is_zero.all(axis=1))

# Rows matching none of the conditions (e.g. several points set) get 'Multiple'.
survey['survey_swelling_location'] = np.select(conditions, choices, default='Multiple')

# Print the value counts for 'survey_swelling_location' to verify the results
# print(survey['survey_swelling_location'].value_counts())

survey_swelling_location
None        7062
Multiple      24
Knee          19
Ankle          8
Shoulder       5
Wrist          2
Hip            2
Fingers        1
Name: count, dtype: int64


In [370]:
# Transform numeric to intensity
def transform_value(x):
    """Map a numeric intensity code (0-5) to its text label.

    Codes are matched with ``==`` in ascending order, so 2 and 2.0 both give
    "moderate". Any value equal to none of the codes yields ``np.nan``.
    """
    intensity_labels = ("none", "mild", "moderate", "bad", "very bad", "medical")
    for code, label in enumerate(intensity_labels):
        if x == code:
            return label
    return np.nan

# Label ranges (first column, last column; both included) whose numeric codes
# are replaced by intensity labels.
intensity_ranges = [
    ("survey_gastro_gastro_0", "survey_skin_skin_4"),
    ("survey_body_body_0", "survey_body_body_5"),
    ("survey_swelling_swelling_0", "survey_swelling_swelling_1"),
]
columns_to_transform = [
    column
    for first_col, last_col in intensity_ranges
    for column in survey.loc[:, first_col:last_col].columns
]

# Switch the columns to object dtype first so that they can hold strings.
survey[columns_to_transform] = survey[columns_to_transform].astype(object)

# Replace every code by its label, cell by cell.
survey[columns_to_transform] = survey[columns_to_transform].map(transform_value)

In [371]:

# Transform specific column
def transform_survey_body_fever(x):
    """Map a fever answer code (0-4) to its text label.

    Codes are matched with ``==`` in ascending order; a value equal to none of
    them yields ``np.nan``.
    """
    # NOTE(review): "not mesured" is kept verbatim (spelling included) because
    # it is a data value that other code may match on.
    fever_labels = (
        "none",
        "not mesured",
        "over 37.5",
        "between 37.5 and 39",
        "over 39",
    )
    for code, label in enumerate(fever_labels):
        if x == code:
            return label
    return np.nan

def transform_survey_impact(x):
    """Map an impact answer code (0-6) to its text label.

    Codes are matched with ``==`` in ascending order; a value equal to none of
    them yields ``np.nan``.
    """
    impact_labels = (
        "Did not affect my activities",
        "Had a slight negative impact",
        "Had a moderate negative impact",
        "Had a major negative impact",
        "I couldnt do my daily activities due to my symptoms",
        "I had to seek medical attention for my symptoms",
        "I was hospitalised",
    )
    for code, label in enumerate(impact_labels):
        if x == code:
            return label
    return np.nan

def transform_survey_day(x):
    """Map a day-rating code (0-4) to its text label.

    Codes are matched with ``==`` in ascending order; a value equal to none of
    them yields ``np.nan``.
    """
    day_labels = (
        "It was a great day",
        "It was a good day",
        "It was an okay day",
        "It was quite a bad day",
        "It was a really bad day",
    )
    for code, label in enumerate(day_labels):
        if x == code:
            return label
    return np.nan

def transform_survey_body_other(x):
    """Map the 0/1 "other symptom" flag to "No"/"Yes".

    Values are matched with ``==`` (0 first, then 1); anything else yields
    ``np.nan``.
    """
    for code, label in enumerate(("No", "Yes")):
        if x == code:
            return label
    return np.nan

# Column -> function converting that column's numeric codes to text labels.
column_transforms = {
    'survey_body_fever': transform_survey_body_fever,
    'survey_impact': transform_survey_impact,
    'survey_day': transform_survey_day,
    'survey_body_other': transform_survey_body_other,
}

# Relabel each column element-wise with its own function.
for column_name, to_label in column_transforms.items():
    survey[column_name] = survey[column_name].apply(to_label)

In [372]:
# Final symptom names, grouped by family and listed in the order of the
# numbered raw columns they replace (index 0 first).
gastro_symptoms = ['nausea', 'vomiting', 'stomach_pain', 'diarrhea', 'constipation']
respi_symptoms = ['cough', 'sore_throat', 'runny_nose', 'out_of_breath_resting', 'out_of_breath_running']
skin_symptoms = ['rash', 'itchy_insect_bite', 'itchy_other', 'sunburn', 'itchy_red_eyes']
body_symptoms = ['dizziness', 'ear_ache', 'headache', 'pain_eyes', 'musle_pain', 'aching_limbs']
joint_symptoms = ['pain_joint', 'swelling_joint']

# Raw column name -> final column name.
rename_columns = {'baseline': 'trip_id'}
rename_columns.update(
    {f'survey_gastro_gastro_{i}': name for i, name in enumerate(gastro_symptoms)}
)
rename_columns.update(
    {f'survey_resp_resp_{i}': name for i, name in enumerate(respi_symptoms)}
)
rename_columns.update(
    {f'survey_skin_skin_{i}': name for i, name in enumerate(skin_symptoms)}
)
rename_columns['survey_body_fever'] = 'fever'
rename_columns.update(
    {f'survey_body_body_{i}': name for i, name in enumerate(body_symptoms)}
)
rename_columns['survey_body_other'] = 'body_other'
rename_columns.update(
    {f'survey_swelling_swelling_{i}': name for i, name in enumerate(joint_symptoms)}
)
rename_columns.update({
    'survey_swelling_location': 'location_swelling',
    'survey_impact': 'impact',
    'survey_day': 'rating_day',
    'longitude': 'survey_longitude',
    'latitude': 'survey_latitude',
})

# Rename columns
survey = survey.rename(columns=rename_columns)

# Every column positioned from 'survey_latitude' up to and including
# 'context_weather_timezone' is carried over unchanged.
# NOTE(review): 'survey_longitude' is only kept if it lies inside this
# positional slice - confirm against the column order of the export.
latitude_index = survey.columns.get_loc('survey_latitude')
context_weather_timezone_index = survey.columns.get_loc('context_weather_timezone')
between_columns = survey.columns[latitude_index:context_weather_timezone_index + 1].tolist()

# Output order: identifiers and timing first, then each family's summary flag
# followed by its symptoms, then impact / day rating, then the context columns.
selected_columns = [
    'trip_id', 'user_id', 'finished', 'finished_day',
    'gastro_any', *gastro_symptoms,
    'respi_any', *respi_symptoms,
    'skin_any', *skin_symptoms,
    'body_any', 'fever', *body_symptoms, 'body_other',
    'joint_any', *joint_symptoms,
    'location_swelling', 'impact', 'rating_day',
] + between_columns

# Keep only the selected columns, in that order.
survey = survey[selected_columns]

In [373]:
# for col in survey.columns:
#     print(f"Value counts for {col}:")
#     print(survey[col].value_counts())
#     print()

Value counts for trip_id:
trip_id
PzPC8wRJGcpsWAThz9Ii    140
tcrLFAUKqNUfAuctjS56    126
odjGNtAuYS9gnKGke7pm     86
WbsHN9ezAB9cW6LY8Oll     75
F0IwfLPLiGOLViNNs6I6     74
                       ... 
HCmAWDSCDUCfsvX3X282      1
2y3Kd4R2412rRIr4542t      1
OOVy7B71lrVcmOKfl5hh      1
OoTwgVw1Sc34a2sCjJf4      1
O6tgHObMIli19jA2GNAp      1
Name: count, Length: 827, dtype: int64

Value counts for user_id:
user_id
YJGqDTCqAONjhLyD95zyYmI7ZGg1    228
9LCCJ0mRQbPQGs4l4QJRbI7cy4D2    184
AhbpHrPr9OMWbOz2atWAptk9mhy2    140
GJoPYrjavQgzrgVv5YXv7keeazr2    126
Mgg6qw7QNZNboJd2r90H3dAzS4q2    114
                               ... 
ilOLorijeeN7G36ev9tPNhHus4x1      1
oXP9FdvQpvWDuJuTFtEg3rdUnwD3      1
4Xj7izlLWiV69XMKygtBMMuVzhD2      1
COMQ3JVx6NToxaY2z4KjzI2qI162      1
3fde9JTLRnSPZspHSNQYe4TL9ba2      1
Name: count, Length: 723, dtype: int64

Value counts for finished:
finished
2021-12-01 11:48:06.263630    1
2023-08-10 20:00:37.521631    1
2023-08-11 19:59:58.910080    1
2023-08-11 19:31