In [1]:
# Import necessary libraries
import pandas as pd
import numpy as np

In [2]:
# Locations on disk: the cleaned dataset written at the end of the notebook,
# and the raw migraine CSV that is read at the start.
final_output_path = "/data/caysar9/results/final_minimum_migraine.csv"
data_migraine_path = "/data/caysar9/data/short_migraine.csv"

In [3]:
# Read the migraine CSV into a DataFrame, then preview its first five rows
print("Loading migraine data...")
data_migraine = pd.read_csv(filepath_or_buffer=data_migraine_path)

data_migraine.head(n=5)

Loading migraine data...


Unnamed: 0,query_date,duration_in_secs,painintensity,reported_affected_work,reported_missing_work,all_mols,mol_outcome,acute_doses,nb_unique_acutes,otc_doses,...,acc_z_median,acc_z_q3,acc_z_stddev,sleep_duration,awake_time,nb_interruptions,gender,age,age_group,studyid
0,2019-10-14,54000.0,6.0,,,Acetaminophen,somewhathelpful,,0.0,2.0,...,,,,30120.0,2019-10-15 08:22:00,0.0,F,24,18-24,b4182bff4b3cf75f9e54f4990f9bd153c0c2973c
1,2019-10-14,,,,,,,,,,...,9.787086,9.790544,1.255858,,,,F,57,55-64,3fcfb99ec010d4a8ba364f43169465d91ca39ada
2,2019-10-14,,,,,,,,,,...,,,,30697.0,2019-10-15 06:55:09.724,0.0,F,47,45-54,49e3d046636e06b2d82ee046db8e6eb9a2e11e16
3,2019-10-14,86400.0,9.0,,t,,,,,,...,,,,,,,F,51,45-54,bc15c774dca4499ea6fb42da7d216ca54f8c697e
4,2019-10-14,,,,,,,,,,...,,,,25860.0,2019-10-15 08:11:00,0.0,F,29,25-34,cae91e45aed80f3a3fe285c3c8c1a7e78d82d473


In [4]:
# Print the index range, column names, non-null counts and dtypes of the loaded frame
data_migraine.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 914164 entries, 0 to 914163
Data columns (total 69 columns):
 #   Column                  Non-Null Count   Dtype  
---  ------                  --------------   -----  
 0   query_date              914164 non-null  object 
 1   duration_in_secs        210252 non-null  float64
 2   painintensity           199263 non-null  float64
 3   reported_affected_work  31607 non-null   object 
 4   reported_missing_work   13667 non-null   object 
 5   all_mols                71056 non-null   object 
 6   mol_outcome             71056 non-null   object 
 7   acute_doses             38631 non-null   float64
 8   nb_unique_acutes        71056 non-null   float64
 9   otc_doses               43875 non-null   float64
 10  other_doses             2664 non-null    float64
 11  triptans_doses          30331 non-null   float64
 12  opioids_doses           0 non-null       float64
 13  acute_pro               14150 non-null   float64
 14  otc_pro             

In [5]:
# Preprocess migraine dataset
#
# Steps performed by this cell, in order:
#   1. keep only the columns of interest and parse `query_date`;
#   2. derive calendar columns and the attack duration in days;
#   3. keep rows whose `age` is between 18 and 65 (inclusive);
#   4. keep (studyid, year_month) groups whose summed attack duration is
#      greater than 0 and at most 7 days;
#   5. keep attacks lasting 4 to 72 hours and sleep durations of at most 24 hours;
#   6. drop rows containing any NaN and write the result to `final_output_path`.
print("Preprocessing migraine dataset...")
print("Dataset starting shape: ", data_migraine.shape)

columns_of_interest = ['query_date','studyid',
    'all_locations', 'all_triggers', 'all_relieves', 'all_aff',
    'gender', 'age_group','duration_in_secs', 'painintensity', 'ua_still', 'ua_walking',
    'ua_cycling', 'ua_running', 'ua_wor','reported_anxiety','reported_depression',
    'sleep_duration', 'age']

# Take an explicit copy: new columns are assigned below, and assigning onto the
# result of a column selection triggers pandas' SettingWithCopyWarning.
data_migraine = data_migraine[columns_of_interest].copy()

# Convert to datetime
data_migraine["query_date"] = pd.to_datetime(data_migraine["query_date"], format="ISO8601")

# Create year, month and year_month columns
data_migraine["year"] = data_migraine["query_date"].dt.year
data_migraine["month"] = data_migraine["query_date"].dt.month
data_migraine["year_month"] = data_migraine["query_date"].dt.to_period("M")

# Filter age range: 18 <= age <= 65 (rows with a missing age are dropped).
# Copy again so the next column assignment targets an independent frame.
data_migraine = data_migraine[
    (data_migraine["age"] <= 65) & (data_migraine["age"] >= 18)
].copy()

# Attack duration expressed in days (seconds -> hours -> days)
data_migraine["attack_duration_days"] = data_migraine["duration_in_secs"] / 3600 / 24


# Keep every row belonging to a (studyid, year_month) group whose summed
# attack duration is strictly positive and at most 7 days. Groups whose
# durations are all missing sum to 0 and are therefore removed.
df = (
    data_migraine.groupby(["studyid", "year_month"])
    .filter(
        lambda x: (
            0 < x["attack_duration_days"].sum() <= 7
        )
    )
)


# Group by `studyid` and calculate the total `attack_duration_days` per user in the filtered data
total_attack_duration_days_per_user = df.groupby("studyid")["attack_duration_days"].sum()

# Get descriptive statistics for the total attack duration (in days) per user
describe_total_attack_duration_days = total_attack_duration_days_per_user.describe()

print("Descriptive statistics for total migraine days per user (filtered):")
print(describe_total_attack_duration_days)


# Longest single attack, in days. NOTE: this is computed on `data_migraine`
# (column- and age-filtered only), not on the monthly-filtered `df`.
print("Max attack_duration_days in data_migraine (after filtering):", data_migraine["attack_duration_days"].max())



# Keep attacks lasting between 4 and 72 hours, both bounds inclusive.
# `.assign` returns a new frame, so no column is written onto a filtered slice.
df = df.assign(duration_in_hours=df["duration_in_secs"] / 3600)
df = df[df["duration_in_hours"] <= 72]
df = df[df["duration_in_hours"] >= 4]

# Keep rows with a sleep duration of at most 24 hours (missing values are dropped)
df = df.assign(sleep_duration_hours=df["sleep_duration"] / 3600)
df = df[df["sleep_duration_hours"] <= 24]
print("Shape after filtering: ", df.shape)

# Drop every row that still contains a NaN in any column
minimum_migraine = df.dropna()

# Save the final dataset to a CSV file
minimum_migraine.to_csv(final_output_path, index=False)

# Print final migraine dataset summary
print("Final migraine dataset:")
print("Shape: ", minimum_migraine.shape)
print("Earliest date", minimum_migraine["query_date"].dt.to_period("D").min())
print("Latest date", minimum_migraine["query_date"].dt.to_period("D").max())
print("Unique users", minimum_migraine["studyid"].nunique())

Preprocessing migraine dataset...
Dataset starting shape:  (914164, 69)


Descriptive statistics for total migraine days per user (filtered):
count    22957.000000
mean         1.786892
std          1.634869
min          0.000694
25%          0.518056
50%          1.248611
75%          2.593750
max          7.000000
Name: attack_duration_days, dtype: float64
Max attack_duration_days in data_migraine (after filtering): 1.0
Shape after filtering:  (35260, 25)
Final migraine dataset:
Shape:  (14402, 25)
Earliest date 2019-10-01
Latest date 2019-10-31
Unique users 7119


In [6]:
# Print the index, column names, non-null counts and dtypes of the final cleaned frame
minimum_migraine.info()

<class 'pandas.core.frame.DataFrame'>
Index: 14402 entries, 115 to 914053
Data columns (total 25 columns):
 #   Column                Non-Null Count  Dtype         
---  ------                --------------  -----         
 0   query_date            14402 non-null  datetime64[ns]
 1   studyid               14402 non-null  object        
 2   all_locations         14402 non-null  object        
 3   all_triggers          14402 non-null  object        
 4   all_relieves          14402 non-null  object        
 5   all_aff               14402 non-null  object        
 6   gender                14402 non-null  object        
 7   age_group             14402 non-null  object        
 8   duration_in_secs      14402 non-null  float64       
 9   painintensity         14402 non-null  float64       
 10  ua_still              14402 non-null  float64       
 11  ua_walking            14402 non-null  float64       
 12  ua_cycling            14402 non-null  float64       
 13  ua_running        