In [1]:
def clean_data(path="../data/raw/mxmh_survey_results.csv"):
    """Load the survey CSV and run it through every cleaning step in order.

    Args:
        path: Location of the raw CSV file. Defaults to the project's raw
            survey export so existing zero-argument calls keep working.

    Returns:
        The DataFrame produced by the last cleaning step.
    """
    data = pd.read_csv(path)

    # Order matters: make_time_of_day reads ``.hour`` from "Timestamp", so
    # make_data_time (which converts that column to datetimes) must run first.
    # Every step is expected to return the frame it was given (or a new one),
    # because its result is fed straight into the next step.
    pipeline = (
        handle_BPM_outliers,
        handle_NAs,
        make_frequency_cols_ordered,
        make_mental_health_levels,
        make_data_time,
        make_time_of_day,
        drop_permissions,
    )
    for step in pipeline:
        data = step(data)
    return data

In [2]:
def handle_BPM_outliers(data):
    """Return a copy of ``data`` without its most extreme "BPM" rows.

    The rows holding the 2 largest and the 5 smallest "BPM" values are
    removed by index label; the input frame itself is left untouched.
    """
    bpm = data["BPM"]
    highest_idx = bpm.nlargest(2).index
    lowest_idx = bpm.nsmallest(5).index
    return data.drop(index=highest_idx.union(lowest_idx))

In [3]:
def handle_NAs(data):
    """Fill gaps in "BPM" by linear interpolation, then drop incomplete rows.

    The frame is modified in place and the same object is returned.
    """
    bpm_filled = data["BPM"].interpolate(method="linear")
    data["BPM"] = bpm_filled
    # Any row that still has a missing value in any column is removed.
    data.dropna(axis=0, how="any", inplace=True)
    return data

In [4]:
def make_frequency_cols_ordered(data):
    """Turn every column whose name contains "Frequency" into an ordered categorical.

    The category order runs from "Never" up to "Very frequently". Values
    outside that list become missing. The frame is modified in place and
    returned.
    """
    levels = ["Never", "Rarely", "Sometimes", "Very frequently"]
    frequency_cols = [name for name in data.columns if "Frequency" in name]
    for name in frequency_cols:
        data[name] = pd.Categorical(data[name], categories=levels, ordered=True)
    return data

In [5]:
def make_mental_health_levels(data):
    """Add a Low/Medium/High level column for each mental-health score.

    For each of "Anxiety", "Depression", "Insomnia" and "OCD", a new
    ``<name>_Level`` column is created by binning the score:
    0 to 3 inclusive is "Low", above 3 up to 6 is "Medium", and above 6
    up to 10 is "High". Scores outside 0-10 become missing.

    Args:
        data: DataFrame containing the four score columns.

    Returns:
        The same DataFrame, modified in place, with the four level columns
        appended.
    """
    bin_edges = [0, 3, 6, 10]
    level_names = ["Low", "Medium", "High"]

    for condition in ("Anxiety", "Depression", "Insomnia", "OCD"):
        data[f"{condition}_Level"] = pd.cut(
            data[condition],
            bins=bin_edges,
            labels=level_names,
            include_lowest=True,  # keep a score of exactly 0 in "Low"
        )

    # Callers rebind their frame to this function's result, so it must
    # return the frame rather than fall through and yield None.
    return data

In [6]:
def make_data_time(data):
    """Parse "Timestamp" and derive separate "Date" and "Time" columns.

    "Timestamp" is converted to datetimes, "Date" holds the calendar date
    as a datetime, and "Time" holds the clock time as an "HH:MM:SS" string.
    The frame is modified in place and returned.
    """
    stamps = pd.to_datetime(data["Timestamp"])
    data["Timestamp"] = stamps
    data["Date"] = pd.to_datetime(stamps.dt.date)
    data["Time"] = stamps.dt.strftime("%H:%M:%S")
    return data

In [7]:
def make_time_of_day(data):
    """Add an ordered categorical "Time of Day" column derived from "Timestamp".

    Each timestamp is labelled by ``categorize_time_of_day``; the labels are
    stored as an ordered categorical running Morning, Afternoon, Evening,
    Night. The frame is modified in place and returned.
    """
    period_labels = data["Timestamp"].apply(categorize_time_of_day)
    data["Time of Day"] = pd.Categorical(
        period_labels,
        categories=["Morning", "Afternoon", "Evening", "Night"],
        ordered=True,
    )
    return data

In [8]:
def categorize_time_of_day(timestamp):
    """Map a timestamp's hour to "Morning", "Afternoon", "Evening" or "Night".

    Hours 6-11 are Morning, 12-17 Afternoon, 18-23 Evening; anything else
    falls through to Night.
    """
    bands = (
        (6, 12, "Morning"),
        (12, 18, "Afternoon"),
        (18, 24, "Evening"),
    )
    hour = timestamp.hour
    for start, stop, label in bands:
        if start <= hour < stop:
            return label
    return "Night"

In [9]:
def drop_permissions(data):
    """Return a copy of ``data`` without the "Permissions" column.

    The input frame is not modified.
    """
    unwanted = ["Permissions"]
    return data.drop(columns=unwanted)