In [1]:
import pandas as pd
import numpy as np
from skimpy import clean_columns
from datetime import datetime
import json
import pickle

In [2]:
# Load the raw survey export and normalise its column names in one step
survey = clean_columns(pd.read_csv("data_raw/survey_raw_12_11_24.csv"))

# Columns that are not used downstream: bookkeeping fields, the "skip" flags,
# and the contiguous run from location_ts to ..._recommendations_sport
unused_columns = [
    "identifier",
    "id",
    "started",
    "survey_symptoms_skip",
    "survey_gastro_skip",
    "survey_resp_skip",
    "survey_skin_skip",
    "survey_body_skip",
    "survey_swelling_skip",
]
unused_columns += survey.loc[:, "location_ts":"context_air_quality_random_recommendations_sport"].columns.tolist()

# Drop those columns, then the rows labelled 0 and 1
# (presumably non-response rows at the top of the export -- confirm)
survey = survey.drop(columns=unused_columns).drop(index=[0, 1])

  survey = pd.read_csv("data_raw/survey_raw_12_11_24.csv")


In [3]:
# Parse the ISO 8601 strings in 'finished' into pandas datetimes
finished_parsed = pd.to_datetime(survey['finished'], format='ISO8601')
survey['finished'] = finished_parsed

In [4]:
# Replace missing answers with 0 across the whole answer range
# (every column from survey_gastro_gastro_0 through survey_day)
filled_answers = survey.loc[:, "survey_gastro_gastro_0":"survey_day"].fillna(0)
survey.loc[:, "survey_gastro_gastro_0":"survey_day"] = filled_answers

In [5]:
# Turn the free-text / marker columns into 0/1 indicators.
#
# survey_body_other: "no answer" can show up either as the string '0' or as the
# integer 0 that the earlier fillna(0) step writes for missing values (assuming
# this column lies inside the filled range). Both must map to 0; comparing with
# '0' alone would count every filled-in blank as a reported symptom.
survey.loc[:, 'survey_body_other'] = survey.loc[:, 'survey_body_other'].apply(
    lambda x: 0 if x in (0, '0') else 1
)
# Swelling points: anything other than 0 counts as a selected location.
survey.loc[:, "survey_swelling_swelling_points_0":"survey_swelling_swelling_points_7"] = survey.loc[:, "survey_swelling_swelling_points_0":"survey_swelling_swelling_points_7"].map(lambda x: 0 if x == 0 else 1)

In [6]:
# Convert every answer column to a numeric dtype; values that cannot be parsed become NaN
cols = survey.loc[:, "survey_gastro_gastro_0":"survey_day"].columns
survey[cols] = survey[cols].apply(lambda column: pd.to_numeric(column, errors='coerce'))

In [7]:
# One 'Yes'/'No' flag per symptom group: 'Yes' when the largest answer in the
# group's column range differs from 0 for that row.
symptom_groups = {
    'gastro_any': ('survey_gastro_gastro_0', 'survey_gastro_gastro_4'),
    'respi_any': ('survey_resp_resp_0', 'survey_resp_resp_4'),
    'skin_any': ('survey_skin_skin_0', 'survey_skin_skin_4'),
    'body_any': ('survey_body_fever', 'survey_body_other'),
    'joint_any': ('survey_swelling_swelling_0', 'survey_swelling_swelling_1'),
}

for flag_name, (first_col, last_col) in symptom_groups.items():
    strongest_answer = survey.loc[:, first_col:last_col].max(axis=1, skipna=True)
    survey[flag_name] = np.where(strongest_answer != 0, 'Yes', 'No')


In [8]:
# Derive a single swelling-location label from the eight swelling-point columns
points = [
    'survey_swelling_swelling_points_0', 'survey_swelling_swelling_points_1', 'survey_swelling_swelling_points_2',
    'survey_swelling_swelling_points_3', 'survey_swelling_swelling_points_4', 'survey_swelling_swelling_points_5',
    'survey_swelling_swelling_points_6', 'survey_swelling_swelling_points_7'
]

# Label for each entry of `points`, in the same order
location_labels = ['Shoulder', 'Elbow', 'Wrist', 'Fingers', 'Hip', 'Knee', 'Ankle', 'Toes']

# A location is selected when its own point equals 1 and every other point equals 0
point_is_zero = survey.loc[:, points].eq(0)
conditions = [
    point_is_zero.drop(columns=point).all(axis=1) & (survey[point] == 1)
    for point in points
]
# Last condition: all eight points equal 0
conditions.append(point_is_zero.all(axis=1))

choices = location_labels + ['None']

# Rows that match none of the conditions above are labelled 'Multiple'
survey['survey_swelling_location'] = np.select(conditions, choices, default='Multiple')

# Print the value counts for 'survey_swelling_location' to verify the results
# print(survey['survey_swelling_location'].value_counts())

In [9]:
# Transform numeric to intensity
def transform_value(x):
    """Translate a numeric symptom code into its intensity label.

    Codes 0 through 5 map, in order, to "none", "mild", "moderate", "bad",
    "very bad" and "medical". Any other value yields NaN.
    """
    intensity_labels = ("none", "mild", "moderate", "bad", "very bad", "medical")
    for code, label in enumerate(intensity_labels):
        if x == code:
            return label
    return np.nan

# Column ranges (first, last) whose values are intensity codes
intensity_ranges = [
    ("survey_gastro_gastro_0", "survey_skin_skin_4"),
    ("survey_body_body_0", "survey_body_body_5"),
    ("survey_swelling_swelling_0", "survey_swelling_swelling_1"),
]

# Flatten the ranges into one ordered list of column names
columns_to_transform = [
    column_name
    for first_col, last_col in intensity_ranges
    for column_name in survey.loc[:, first_col:last_col].columns
]

# Cast to object first so the columns can hold label strings, then swap each code for its label
survey[columns_to_transform] = survey[columns_to_transform].astype(object).map(transform_value)

In [10]:
# Transform specific column
def transform_survey_body_fever(x):
    """Translate a fever answer code (0-4) into its label; other values give NaN.

    NOTE(review): the labels for codes 2 and 3 overlap ("over 37.5" vs
    "between 37.5 and 39") and "mesured" is misspelled. Both are reproduced
    unchanged here -- confirm against the questionnaire before altering them.
    """
    fever_labels = {
        0: "none",
        1: "not mesured",
        2: "over 37.5",
        3: "between 37.5 and 39",
        4: "over 39",
    }
    for code, label in fever_labels.items():
        if x == code:
            return label
    return np.nan

def transform_survey_impact(x):
    """Translate an impact answer code (0-6) into its sentence; other values give NaN."""
    impact_labels = (
        "Did not affect my activities",
        "Had a slight negative impact",
        "Had a moderate negative impact",
        "Had a major negative impact",
        "I couldnt do my daily activities due to my symptoms",
        "I had to seek medical attention for my symptoms",
        "I was hospitalised",
    )
    # First label whose position equals the code, or NaN when none matches
    return next(
        (label for code, label in enumerate(impact_labels) if x == code),
        np.nan,
    )

def transform_survey_day(x):
    """Translate a day-rating code (0-4) into its sentence; other values give NaN."""
    day_labels = [
        (0, "It was a great day"),
        (1, "It was a good day"),
        (2, "It was an okay day"),
        (3, "It was quite a bad day"),
        (4, "It was a really bad day"),
    ]
    for code, label in day_labels:
        if x == code:
            return label
    return np.nan

def transform_survey_body_other(x):
    """Translate the 0/1 "other body symptom" indicator into "No"/"Yes"; other values give NaN."""
    return "No" if x == 0 else ("Yes" if x == 1 else np.nan)

# Decode each remaining coded column with its dedicated label function
column_decoders = {
    'survey_body_fever': transform_survey_body_fever,
    'survey_impact': transform_survey_impact,
    'survey_day': transform_survey_day,
    'survey_body_other': transform_survey_body_other,
}
for column_name, decoder in column_decoders.items():
    survey[column_name] = survey[column_name].apply(decoder)

In [11]:
# Mapping from raw survey column names to the analysis names
rename_columns = {
    'baseline': 'trip_id',
    # gastro-intestinal
    'survey_gastro_gastro_0': 'nausea',
    'survey_gastro_gastro_1': 'vomiting',
    'survey_gastro_gastro_2': 'stomach_pain',
    'survey_gastro_gastro_3': 'diarrhea',
    'survey_gastro_gastro_4': 'constipation',
    # respiratory
    'survey_resp_resp_0': 'cough',
    'survey_resp_resp_1': 'sore_throat',
    'survey_resp_resp_2': 'runny_nose',
    'survey_resp_resp_3': 'out_of_breath_resting',
    'survey_resp_resp_4': 'out_of_breath_running',
    # skin
    'survey_skin_skin_0': 'rash',
    'survey_skin_skin_1': 'itchy_insect_bite',
    'survey_skin_skin_2': 'itchy_other',
    'survey_skin_skin_3': 'sunburn',
    'survey_skin_skin_4': 'itchy_red_eyes',
    # body ('musle_pain' spelling is kept as-is; later code may rely on it)
    'survey_body_fever': 'fever',
    'survey_body_body_0': 'dizziness',
    'survey_body_body_1': 'ear_ache',
    'survey_body_body_2': 'headache',
    'survey_body_body_3': 'pain_eyes',
    'survey_body_body_4': 'musle_pain',
    'survey_body_body_5': 'aching_limbs',
    'survey_body_other': 'body_other',
    # joints
    'survey_swelling_swelling_0': 'pain_joint',
    'survey_swelling_swelling_1': 'swelling_joint',
    'survey_swelling_location': 'location_swelling',
    # overall
    'survey_impact': 'impact',
    'survey_day': 'rating_day',
    # position
    'longitude': 'survey_longitude',
    'latitude': 'survey_latitude',
}

survey = survey.rename(columns=rename_columns)

# Every column from 'survey_latitude' through 'context_weather_timezone', both ends included.
# NOTE(review): 'survey_longitude' is kept only if it sits inside this range -- confirm column order.
between_columns = survey.loc[:, 'survey_latitude':'context_weather_timezone'].columns.tolist()

# Final column order: identifiers, then each symptom group led by its *_any flag,
# then the overall ratings, then the location/context columns
selected_columns = [
    'trip_id', 'user_id', 'finished',
    'gastro_any', 'nausea', 'vomiting', 'stomach_pain', 'diarrhea', 'constipation',
    'respi_any', 'cough', 'sore_throat', 'runny_nose', 'out_of_breath_resting', 'out_of_breath_running',
    'skin_any', 'rash', 'itchy_insect_bite', 'itchy_other', 'sunburn', 'itchy_red_eyes',
    'body_any', 'fever', 'dizziness', 'ear_ache', 'headache', 'pain_eyes', 'musle_pain', 'aching_limbs', 'body_other',
    'joint_any', 'pain_joint', 'swelling_joint', 'location_swelling',
    'impact', 'rating_day',
]
selected_columns = selected_columns + between_columns

# Keep only the selected columns, in that order
survey = survey[selected_columns]

In [12]:
# Remove the two *_dt columns plus the whole run from
# context_open_weather_air_quality_message through context_weather_timezone
context_tail_columns = survey.loc[:, 'context_open_weather_air_quality_message':'context_weather_timezone'].columns.tolist()
survey = survey.drop(columns=['context_open_weather_air_quality_dt', 'context_open_weather_dt'] + context_tail_columns)

In [13]:
# Columns from context_open_weather_clouds through ..._air_quality_main_aqi get shorter names
columns_to_rename = survey.loc[:, 'context_open_weather_clouds':'context_open_weather_air_quality_main_aqi'].columns

# Build the old -> new mapping by removing the "context_open_weather_" substring
shortened_names = {}
for col in columns_to_rename:
    shortened_names[col] = col.replace("context_open_weather_", "")

survey = survey.rename(columns=shortened_names)

In [14]:
def extract_weather_details(weather):
    """Extract the lower-cased ``description`` and ``main`` fields from a weather JSON string.

    Parameters
    ----------
    weather : Any
        Expected to be a JSON string encoding a non-empty list whose first
        element is an object with ``description`` and ``main`` keys. Any other
        input is treated as missing.

    Returns
    -------
    pd.Series
        Two-element Series ``[description, main]``.

        * Both entries are None when the input is not a string, is not valid
          JSON, is not a non-empty list, or its first element is not an object.
        * A key that is absent yields an empty string.
        * A key whose value is not a string (e.g. JSON ``null``) yields None
          instead of raising ``AttributeError``.
    """
    missing = pd.Series([None, None])

    # Non-string cells (e.g. NaN) carry no weather information
    if not isinstance(weather, str):
        return missing

    try:
        weather_list = json.loads(weather)
    except json.JSONDecodeError:
        return missing

    # Only the first entry of the list is used
    if not isinstance(weather_list, list) or len(weather_list) == 0:
        return missing
    weather_dict = weather_list[0]
    if not isinstance(weather_dict, dict):
        return missing

    def _lowered(field):
        # Lower-case string values; anything else cannot be lower-cased and is reported as missing
        value = weather_dict.get(field, '')
        return value.lower() if isinstance(value, str) else None

    return pd.Series([_lowered('description'), _lowered('main')])

# Split the 'weather' JSON column into its description and main parts
weather_parts = survey['weather'].apply(extract_weather_details)
survey[['description_weather', 'main_weather']] = weather_parts

In [15]:
# Move the two derived weather columns so they sit right after 'weather'
weather_position = survey.columns.get_loc('weather')
derived_weather_columns = ['main_weather', 'description_weather']

leading_columns = survey.columns[:weather_position + 1].tolist()
# NOTE(review): Index.difference sorts its result by default, so the columns after
# 'weather' end up in alphabetical order rather than their previous order.
# Kept as-is here -- confirm whether that reordering is intended.
trailing_columns = survey.columns[weather_position + 1:].difference(derived_weather_columns).tolist()

survey = survey[leading_columns + derived_weather_columns + trailing_columns]

In [16]:
def transform_air_quality_main_aqi(x):
    """Translate an air-quality index code (1-5) into its label; other values give NaN."""
    aqi_labels = ("Good", "Fair", "Moderate", "Poor", "Very Poor")
    # Codes start at 1, so label positions are numbered from 1 as well
    for code, label in enumerate(aqi_labels, start=1):
        if x == code:
            return label
    return np.nan
    
# Add a labelled version of the numeric air-quality index
aqi_codes = survey['air_quality_main_aqi']
survey['air_quality_main'] = aqi_codes.apply(transform_air_quality_main_aqi)

In [17]:
# The raw weather / air-quality columns have been replaced by derived ones; remove them
raw_weather_columns = ['weather', 'air_quality_main_aqi', 'air_quality', 'air_quality_cod']
survey = survey.drop(columns=raw_weather_columns)

In [18]:
# Manual trip_id corrections, one row per window:
#   (user_id, lower bound on 'finished' or None, upper bound on 'finished', trip_id)
# Both bounds are inclusive. Rows are applied in order, so where two windows of the
# same user share a boundary the later row wins. A NaN trip_id makes the matching
# answers disappear in the dropna at the end.
trip_windows = [
    ("1WWbzEPDcCdvF5MYSecOg5jvC9i2", None, "2022-02-22", "iyqJJDPXtJGOm7w8C0e3"),
    ("71dWvTtxniTevm5Wrw7wazKjit93", None, "2022-04-10", "jyVTWhKB9JR0mgHrd3E8"),
    ("dPhQWSZVVFWrR1kWAGACyMLWXzW2", None, "2022-05-12", "hFQOdz2iXfbzZfdLV7W5"),
    ("hheBd0VfyLZLE43RaHbRPAidZmH2", None, "2022-03-15", "yYKMdTUC4ldMHvIIhpso"),
    ("hheBd0VfyLZLE43RaHbRPAidZmH2", "2022-05-16", "2022-05-20", "n6Ec93wurCFoe4RyARDi"),
    ("hheBd0VfyLZLE43RaHbRPAidZmH2", "2022-05-22", "2022-06-13", "J2JyYjbwPRjFLcV7ziPl"),
    ("RkLw5FTR8MhwINeTcgdgZ6hArTB3", None, "2022-03-20", "Q7TwsPMKyt36xrIaNDRH"),
    ("RkLw5FTR8MhwINeTcgdgZ6hArTB3", "2022-03-22", "2022-04-15", "XtQLWU0fmNWcg5r34gAp"),
    ("YJGqDTCqAONjhLyD95zyYmI7ZGg1", None, "2022-02-02", "uHxtXugIMW5MT6sgHr9L"),
    ("YJGqDTCqAONjhLyD95zyYmI7ZGg1", "2022-02-02", "2022-02-15", "cDMnNEILi40EuLC3OBNL"),
    ("YJGqDTCqAONjhLyD95zyYmI7ZGg1", "2022-02-15", "2022-02-20", "eA4w4WDx7fCH25Fy4VU2"),
    ("YJGqDTCqAONjhLyD95zyYmI7ZGg1", "2022-02-20", "2022-03-01", "A5zwFHqlRMbDJSlrEn8G"),
    ("YJGqDTCqAONjhLyD95zyYmI7ZGg1", "2022-06-07", "2022-06-18", "dny3jLpYlL4b8I4s6KaA"),
    ("oiAYJMPUzEYF2XBfXowW3ZHMZGC2", None, "2022-06-27", np.nan),
    ("o2P057Mx0MOJ6TbH1HEtWfeQ7V42", None, "2022-05-13", np.nan),
]


def _window_mask(user_id, start, end):
    """Boolean row mask: answers of `user_id` whose 'finished' lies in [start, end]."""
    mask = survey['user_id'] == user_id
    if start is not None:
        mask = mask & (survey['finished'] >= start)
    return mask & (survey['finished'] <= end)


# Row masks and the trip_id each one receives, in matching order
conditions = [_window_mask(user_id, start, end) for user_id, start, end, _ in trip_windows]
trip_ids = [window[3] for window in trip_windows]

# Overwrite trip_id wherever a window matches
for condition, trip_id in zip(conditions, trip_ids):
    survey.loc[condition, 'trip_id'] = trip_id

# Discard answers without a finished timestamp or without a trip_id
survey = survey.dropna(subset=['finished', 'trip_id'])


In [19]:
# Keep only the first answer per trip and calendar day.
# The calendar day is taken from 'finished' on a temporary frame, so no helper
# column has to be added to and later removed from `survey`.
is_repeat_answer = (
    survey
    .assign(finished_day=survey['finished'].dt.date)
    .duplicated(subset=['trip_id', 'finished_day'], keep='first')
)
survey = survey[~is_repeat_answer]

In [20]:
# Print the full per-column summary (non-null counts and dtypes) of the cleaned frame
survey.info(verbose=True)

<class 'pandas.core.frame.DataFrame'>
Index: 7420 entries, 105 to 8252
Data columns (total 64 columns):
 #   Column                         Non-Null Count  Dtype         
---  ------                         --------------  -----         
 0   trip_id                        7420 non-null   object        
 1   user_id                        7420 non-null   object        
 2   finished                       7420 non-null   datetime64[ns]
 3   gastro_any                     7420 non-null   object        
 4   nausea                         7420 non-null   object        
 5   vomiting                       7420 non-null   object        
 6   stomach_pain                   7420 non-null   object        
 7   diarrhea                       7420 non-null   object        
 8   constipation                   7420 non-null   object        
 9   respi_any                      7420 non-null   object        
 10  cough                          7420 non-null   object        
 11  sore_throat         

In [21]:
# Optional sanity check, left disabled: value counts for every column
# for col in survey.columns:
#     print(f"Value counts for {col}:")
#     print(survey[col].value_counts())
#     print()
# survey.info(verbose=True)
# Persist the cleaned `survey` DataFrame as a pickle file
survey.to_pickle('data_clean/survey_clean.pkl')