In [1]:
# Import necessary libraries
import pandas as pd
import numpy as np

In [2]:
final_output_path = '/data/caysar9/results/final_short.csv'

In [3]:
#put the dataset that comes from short_dataset_preprocess.ipynb file
file_path = '/data/caysar9/results/final_minimum_migraine.csv'
# Load migraine data
print("Loading data...")
df = pd.read_csv(file_path)
df.head()

Loading data...


Unnamed: 0,query_date,studyid,all_locations,all_triggers,all_relieves,all_aff,gender,age_group,duration_in_secs,painintensity,...,reported_anxiety,reported_depression,sleep_duration,age,year,month,year_month,attack_duration_days,duration_in_hours,sleep_duration_hours
0,2019-10-14,75939fe8d410b193f4e8b4e5b153473c48db3dd0,Traveling,"MenstrualCycleStatus:NO,Allergies/Asthma,Skipp...","Rest/relax,Full meal,Sleep,Stay indoor",Not affected,F,25-34,23400.0,5.0,...,t,f,19080.0,29,2019,10,2019-10,0.270833,6.5,5.3
1,2019-10-14,d489170372ec75900dc619ccf5bca5fae5b0aaf9,Cricket,"Weather,MenstrualCycleStatus:NO","Dark room rest,Sleep",Missed family time,F,45-54,14400.0,9.0,...,f,f,26690.0,45,2019,10,2019-10,0.166667,4.0,7.413889
2,2019-10-14,47e23d3867c5c22f889fdc7595bdc42527058e52,In transit/commuting,"Physical exertion,Bright light,Anxiety,The wea...",Drink water,"Slow work,Pushed through",F,45-54,49200.0,5.0,...,t,f,41760.0,48,2019,10,2019-10,0.569444,13.666667,11.6
3,2019-10-14,800ecda24ab67e8319d66220670fda50e70e3ad7,Work,"Lack of sleep,Dehydrated,Anxiety",Exercise,Not affected,F,45-54,28140.0,4.0,...,t,f,25200.0,52,2019,10,2019-10,0.325694,7.816667,7.0
4,2019-10-12,be5fc3b68b752ca261e063fcaaef9500bc529be6,Show,"Physical exertion,Lack of sleep,Stress,Neck St...","Soda,Hot Bath w/ Epsom",Hard to drive,F,35-44,32400.0,7.0,...,f,t,37020.0,43,2019,10,2019-10,0.375,9.0,10.283333


In [4]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 14402 entries, 0 to 14401
Data columns (total 25 columns):
 #   Column                Non-Null Count  Dtype  
---  ------                --------------  -----  
 0   query_date            14402 non-null  object 
 1   studyid               14402 non-null  object 
 2   all_locations         14402 non-null  object 
 3   all_triggers          14402 non-null  object 
 4   all_relieves          14402 non-null  object 
 5   all_aff               14402 non-null  object 
 6   gender                14402 non-null  object 
 7   age_group             14402 non-null  object 
 8   duration_in_secs      14402 non-null  float64
 9   painintensity         14402 non-null  float64
 10  ua_still              14402 non-null  float64
 11  ua_walking            14402 non-null  float64
 12  ua_cycling            14402 non-null  float64
 13  ua_running            14402 non-null  float64
 14  ua_wor                14402 non-null  float64
 15  reported_anxiety   

In [5]:
# create migraine days per month (the number of unique migraine days)

# Ensure `query_date` is in datetime format, and then extract the day to define unique migraine days
df["query_date"] = pd.to_datetime(df["query_date"])
df["day"] = df["query_date"].dt.to_period("D") 

# Filter to include only rows with a positive `attack_duration_days` to count only actual migraine days
df_with_migraine_days = df[df["attack_duration_days"] > 0]

# Group by `studyid` and count unique migraine days in the month
migraine_days_per_month = df_with_migraine_days.groupby("studyid")["day"].nunique().reset_index()

# Rename the column to represent the count of unique migraine days per month
migraine_days_per_month.rename(columns={"day": "migraine_days_per_month"}, inplace=True)

df = df.merge(migraine_days_per_month, on="studyid", how="left")


In [6]:
# Convert the 'all_triggers' column to lowercase
df['all_triggers'] = df['all_triggers'].str.lower()

# keyword patterns for each category
# Broad sleep-related words for poor sleep quality
trigger_poor_sleep_keywords = ['sleep', 'nap', 'nightmare', 'restless', 'insomnia', 'vivid dream','interrupted']

# Broad words for lack of sleep
trigger_lack_sleep_keywords = ['lack of sleep', 'up late', 'late to bed', 'slept late', 'not enough sleep', 'insufficient sleep', 'no sleep', 'woke up early']

# Broad words for physical activity
trigger_physical_activity_keywords = ['physical','activity', 'exertion', 'exercise', 'running', 'walking', 'biking', 'yoga', 'work out', 'muscle']

# Broad words for lack of physical activity
trigger_lack_physical_activity_keywords = ['inactivity', 'no exercise', 'lack of exercise', 'sitting', 'skipped', 'missed work out', 'not enough movement','not enough exercice']

# Broad words for stress-related triggers
trigger_stress_keywords = ['stress', 'anxiety', 'overthinking', 'depression', 'emotional', 'worry', 'overwhelmed', 'nervous', 'mental', 'upset']

In [7]:
def create_flag_column(df, column, keywords):
    pattern = '|'.join(keywords)
    return df[column].str.contains(pattern, case=False, na=False).astype(int)

# Create trigger columns
df['trigger_poor_sleep'] = create_flag_column(df, 'all_triggers', trigger_poor_sleep_keywords)
df['trigger_lack_sleep'] = create_flag_column(df, 'all_triggers', trigger_lack_sleep_keywords)
df['trigger_physical_activity'] = create_flag_column(df, 'all_triggers', trigger_physical_activity_keywords)
df['trigger_lack_physical_activity'] = create_flag_column(df, 'all_triggers', trigger_lack_physical_activity_keywords)
df['trigger_stress'] = create_flag_column(df, 'all_triggers', trigger_stress_keywords)

In [8]:
# Convert the 'reliefs' column to lowercase
df['all_relieves'] = df['all_relieves'].str.lower()

# keyword patterns for each category

# Broad sleep-related words for sleep 
relief_sleep_keywords = ['sleep', 'nap', 'couch', 'bed','rest','lay']

# Broad words for physical activity
relief_physical_activity_keywords = ['physical','activity', 'exertion', 'exercise', 'running', 'walking', 'walk','biking','bike ride', 'yoga', 'work out', 'muscle','stretch','movement','moving']

# Broad words for lack of physical activity
relief_lack_physical_activity_keywords = ['no movement', 'stop activity', 'stay still', 'no exercice', 'stay motionless', 'minimal activity', 'limited movement']


In [9]:
# Create reliefs columns
df['relief_sleep'] = create_flag_column(df, 'all_relieves', relief_sleep_keywords)
df['relief_physical_activity'] = create_flag_column(df, 'all_relieves', relief_physical_activity_keywords)
df['relief_lack_physical_activity'] = create_flag_column(df, 'all_relieves', relief_lack_physical_activity_keywords)

In [10]:
df['all_aff'] = df['all_aff'].str.lower()
# Affected activities related to Quality of Life
quality_of_life_aff_activities_keywords = ['hard to concentrate','slower [at home]', 'missed work', 'missed family time', 'slower [at work]', 'missed social activity', 'missed school', 'concentration', 'tired', 'unproductive', 'not social', 'no energy', 'no appetite', 'slow/soft [house]', 'slow/soft [work]', 'family time', 'difficulties to concentrate', 'school missed']

In [11]:
# Create reliefs columns
df['affected_activity_QoL'] = create_flag_column(df, 'all_aff', quality_of_life_aff_activities_keywords)

In [12]:
# create activity groups based on user activity passive data

# calculate total activity per day 
df['total_physical_activity'] = df['ua_running'] + df['ua_walking'] + df['ua_cycling'] + df['ua_wor']

df['query_date'] = pd.to_datetime(df['query_date'])

# Calculate total activity per week for each user based on studyid
# Group by hashed_uid and week number, then sum the total activity
df['week'] = df['query_date'].dt.isocalendar().week  
weekly_activity = df.groupby(['studyid', 'week']).agg(
    total_activity_per_week=('total_physical_activity', 'sum')
).reset_index()

# Calculate the mean activity per user (over multiple weeks)
user_weekly_activity_means = weekly_activity.groupby('studyid').agg(
    mean_total_activity_per_week=('total_activity_per_week', 'mean')
).reset_index()

# Categorize users based on the mean weekly activity thresholds based on WHO's activity recommendations
# Less than 150 minutes per week = Sedentary
# 150–300 minutes per week = Active
# More than 300 minutes per week = Highly Active
user_weekly_activity_means['activity_group'] = pd.cut(user_weekly_activity_means['mean_total_activity_per_week'],
                                                      bins=[-np.inf, 150, 300, np.inf],
                                                      labels=['Sedentary', 'Active', 'Highly Active'])

df = pd.merge(df, user_weekly_activity_means[['studyid', 'activity_group']], on='studyid', how='left')



In [13]:
# create sleep groups 

# Group by hashed_uid to calculate the mean sleep duration for each user
user_sleep_means = df.groupby('studyid').agg(
    mean_sleep_duration=('sleep_duration_hours', 'mean')
).reset_index()

# Categorize users based on the mean sleep duration
# Short sleep: less than 7 hours
# Adequate sleep: between 7 and 9 hours
# Excessive sleep: more than 9 hours
user_sleep_means['sleep_group'] = pd.cut(user_sleep_means['mean_sleep_duration'],
                                         bins=[-np.inf, 7, 9, np.inf],
                                         labels=['Short Sleep', 'Adequate Sleep', 'Excessive Sleep'])

df = pd.merge(df, user_sleep_means[['studyid', 'sleep_group']], on='studyid', how='left')

In [14]:
#feature engineering - creating new columns to be used in the models

df = df.sort_values(by=['studyid', 'query_date'])

# Average for Sleep Duration (Past 7 Days)
df['sleep_duration_past_7_days'] = df.groupby('studyid')['sleep_duration_hours'].rolling(window=7, min_periods=1).mean().reset_index(0, drop=True)

# Create a count of migraine_attacks_past7days (number of migraine attacks over the past 7 days)
df['migraine_attacks_past7days'] = (df.assign(migraine_occurred=(df['duration_in_hours'] > 0).astype(int)).groupby('studyid')['migraine_occurred'].rolling(window=7, min_periods=1).sum().reset_index(0, drop=True))

# Create the 'mean_migraine_duration_past7days' column (avearage migraine duration of migraine attacks over the past 7 days)
df['mean_migraine_duration_past7days'] = (df.groupby('studyid')['duration_in_hours'].rolling(window=7, min_periods=1).mean().reset_index(0, drop=True))                                    

In [15]:
# Encoding 'reported_anxiety' and 'reported_depression' columns
df['reported_anxiety'] = df['reported_anxiety'].map({'t': 1, 'f': 0})
df['reported_depression'] = df['reported_depression'].map({'t': 1, 'f': 0})

In [16]:
# create next migraine window (7 days)

# Sort by user and date
df = df.sort_values(by=['studyid', 'query_date']).reset_index(drop=True)

# query_date is in datetime format
df['query_date'] = pd.to_datetime(df['query_date'])

# Define a 7-day time window
time_window = 7

# Function to check for a migraine occurrence within the next 'time_window' days, in my case: 7
def next_migraine_within_window(df, time_window):
    for idx in range(len(df)):
        current_day = df.iloc[idx]['query_date']
        future_rows = df[(df['query_date'] > current_day) & 
                         (df['query_date'] <= current_day + pd.Timedelta(days=time_window))]
        # If any of the future rows have 'duration_in_hours' > 0, set 'next_migraine_in_window' to 1
        if any(future_rows['duration_in_hours'] > 0):
            df.iloc[idx, df.columns.get_loc('next_migraine_in_window')] = 1  
        else:
            df.iloc[idx, df.columns.get_loc('next_migraine_in_window')] = 0
    return df

df['next_migraine_in_window'] = 0

# Apply the function for each user
df = df.groupby('studyid').apply(next_migraine_within_window, time_window=time_window)

df.reset_index(drop=True, inplace=True)


  df = df.groupby('studyid').apply(next_migraine_within_window, time_window=time_window)


In [17]:
df['next_migraine_in_window'].value_counts()

next_migraine_in_window
0    9340
1    5062
Name: count, dtype: int64

In [18]:
# create next migraine next day

# Sort by user and date
df = df.sort_values(by=['studyid', 'query_date']).reset_index(drop=True)

# Ensure query_date is in datetime format
df['query_date'] = pd.to_datetime(df['query_date'])

# Define a 1-day time window for the next day
time_window = 1

# Function to check for a migraine occurrence the next day
def next_migraine_next_day(df):
    for idx in range(len(df)):
        current_day = df.iloc[idx]['query_date']
        future_rows = df[(df['query_date'] == current_day + pd.Timedelta(days=time_window))]
        # If any of the future rows have 'duration_in_hours' > 0, set 'next_migraine_next_day' to 1
        if any(future_rows['duration_in_hours'] > 0):
            df.iloc[idx, df.columns.get_loc('next_migraine_next_day')] = 1  
        else:
            df.iloc[idx, df.columns.get_loc('next_migraine_next_day')] = 0
    return df

# Add a new column 'next_migraine_next_day'
df['next_migraine_next_day'] = 0

# Apply the function for each user
df = df.groupby('studyid').apply(next_migraine_next_day)

df.reset_index(drop=True, inplace=True)


  df = df.groupby('studyid').apply(next_migraine_next_day)


In [19]:
df['next_migraine_next_day'].value_counts()

next_migraine_next_day
0    12943
1     1459
Name: count, dtype: int64

In [20]:
# Defined thresholds for severe migraine
severe_pain_threshold = 7
severe_duration_threshold = 24  # in hours

# Create a new column 'severe_migraine' based on the thresholds
df['severe_migraine'] = (
    (df['painintensity'] >= severe_pain_threshold) &
    (df['duration_in_hours'] >= severe_duration_threshold)
).astype(int)


In [21]:
df['severe_migraine'].value_counts() # there are nearly no severe migraine events in the dataset

severe_migraine
0    14395
1        7
Name: count, dtype: int64

In [22]:
df['affected_activity_QoL'].value_counts()

affected_activity_QoL
0    8798
1    5604
Name: count, dtype: int64

In [23]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 14402 entries, 0 to 14401
Data columns (total 46 columns):
 #   Column                            Non-Null Count  Dtype         
---  ------                            --------------  -----         
 0   query_date                        14402 non-null  datetime64[ns]
 1   studyid                           14402 non-null  object        
 2   all_locations                     14402 non-null  object        
 3   all_triggers                      14402 non-null  object        
 4   all_relieves                      14402 non-null  object        
 5   all_aff                           14402 non-null  object        
 6   gender                            14402 non-null  object        
 7   age_group                         14402 non-null  object        
 8   duration_in_secs                  14402 non-null  float64       
 9   painintensity                     14402 non-null  float64       
 10  ua_still                          14402 non-nu

In [24]:
# Save the final dataset to a CSV file
df.to_csv(final_output_path, index=False)