# Merging data
Data from one patient was collected to create a core concept for the AI system: pre-meal blood glucose, carb intake, injected insulin doses (manual Excel & dietary app); physical activity (Health App/Apple Watch); and weather records (reputable provider). Collected data from diverse sources were consolidated into a single dataset.

In [1]:
import pandas as pd
import xml.etree.ElementTree as ET

### Preparing row data

#### Patient Records [Excel + Dietary app]

In [2]:
# Load patient's records
df_pr = pd.read_csv('PatientRecords.csv',parse_dates=['Date'])
df_pr

Unnamed: 0,Date,Interval,DV,BG1,BG2,bID,shID
0,2022-12-09,breakfast-lunch,1.756,4.7,4.7,3.0,3.5
1,2022-12-09,lunch-dinner,2.511,4.7,4.3,2.5,2.0
2,2022-12-09,dinner-breakfast,3.442,4.3,6.5,2.0,4.0
3,2022-12-10,breakfast-lunch,1.584,6.5,5.6,3.0,4.0
4,2022-12-10,lunch-dinner,2.562,5.6,6.2,2.5,2.0
...,...,...,...,...,...,...,...
233,2023-02-24,dinner-breakfast,2.389,6.2,5.1,3.0,2.5
234,2023-02-25,breakfast-lunch,1.706,5.1,6.7,3.0,3.0
235,2023-02-25,lunch-dinner,1.952,6.7,3.8,3.0,3.0
236,2023-02-25,dinner-breakfast,2.677,3.8,4.3,3.0,2.5


In [3]:
df_pr.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 238 entries, 0 to 237
Data columns (total 7 columns):
 #   Column    Non-Null Count  Dtype         
---  ------    --------------  -----         
 0   Date      238 non-null    datetime64[ns]
 1   Interval  238 non-null    object        
 2   DV        238 non-null    float64       
 3   BG1       238 non-null    float64       
 4   BG2       238 non-null    float64       
 5   bID       238 non-null    float64       
 6   shID      238 non-null    float64       
dtypes: datetime64[ns](1), float64(5), object(1)
memory usage: 13.1+ KB


#### Physical activity [Health App iOS]

In [5]:
# Load the Health App XML file
tree = ET.parse('HealthAppExport.xml')
root = tree.getroot()

In [6]:
# Find all step count data entries
step_count_data = root.findall(".//Record[@type='HKQuantityTypeIdentifierStepCount']")

In [7]:
# Extract step count and timestamp information
SC = []
Date = []
for entry in step_count_data:
    step_count = entry.attrib['value']
    SC.append(step_count)
    timestamp = entry.attrib['creationDate']
    Date.append(timestamp)

In [8]:
df_sc = pd.DataFrame({'Date':Date,'SC':SC})
df_sc

Unnamed: 0,Date,SC
0,2018-10-28 21:05:47 +1000,16
1,2018-10-28 21:33:21 +1000,8
2,2018-10-28 22:25:01 +1000,2
3,2018-10-28 22:28:22 +1000,11
4,2018-10-29 08:00:34 +1000,28
...,...,...
72614,2023-05-12 14:15:33 +1000,221
72615,2023-05-12 14:25:26 +1000,6
72616,2023-05-12 15:07:35 +1000,324
72617,2023-05-12 15:23:29 +1000,220


In [9]:
df_sc.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 72619 entries, 0 to 72618
Data columns (total 2 columns):
 #   Column  Non-Null Count  Dtype 
---  ------  --------------  ----- 
 0   Date    72619 non-null  object
 1   SC      72619 non-null  object
dtypes: object(2)
memory usage: 1.1+ MB


In [11]:
# Converting dates to datetime objects
df_sc['Date'] = pd.to_datetime(df_sc['Date'],format='%Y-%m-%d %H:%M:%S')

In [12]:
# Removing timezone
df_sc['Date'] = df_sc['Date'].dt.tz_localize(None)

In [13]:
# Converting step counts to integers
df_sc['SC'] = df_sc['SC'].astype(int)

In [14]:
df_sc.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 72619 entries, 0 to 72618
Data columns (total 2 columns):
 #   Column  Non-Null Count  Dtype         
---  ------  --------------  -----         
 0   Date    72619 non-null  datetime64[ns]
 1   SC      72619 non-null  int64         
dtypes: datetime64[ns](1), int64(1)
memory usage: 1.1 MB


In [15]:
# Patient records extracted earlier contain data from 09.12.2022 until 26.02.2023
# Keeping the step count within that date range
df_sc = df_sc[(df_sc['Date']>='2022-12-09')&(df_sc['Date']<='2023-02-27')]

In [16]:
# Resetting index
df_sc = df_sc.sort_values('Date').reset_index(drop=True)

In [17]:
df_sc

Unnamed: 0,Date,SC
0,2022-12-09 08:07:29,40
1,2022-12-09 10:33:30,970
2,2022-12-09 10:43:02,527
3,2022-12-09 11:12:18,216
4,2022-12-09 11:40:49,67
...,...,...
1752,2023-02-26 16:05:52,770
1753,2023-02-26 16:15:52,374
1754,2023-02-26 16:25:18,757
1755,2023-02-26 19:22:17,20


#### Weather Data [[Visual Crossing]](https://www.visualcrossing.com/)

In [18]:
# Load weather data
df_w = pd.read_csv('Weather.csv')
df_w

Unnamed: 0,datetime,temp,humidity
0,2022-12-09T00:00:00,-14.0,65.87
1,2022-12-09T01:00:00,-12.1,61.16
2,2022-12-09T02:00:00,-13.0,66.11
3,2022-12-09T03:00:00,-14.0,65.87
4,2022-12-09T04:00:00,-15.0,72.17
...,...,...,...
1915,2023-02-26T19:00:00,-8.6,45.25
1916,2023-02-26T20:00:00,-9.0,48.09
1917,2023-02-26T21:00:00,-11.0,56.31
1918,2023-02-26T22:00:00,-11.8,56.08


In [19]:
df_w.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1920 entries, 0 to 1919
Data columns (total 3 columns):
 #   Column    Non-Null Count  Dtype  
---  ------    --------------  -----  
 0   datetime  1920 non-null   object 
 1   temp      1920 non-null   float64
 2   humidity  1920 non-null   float64
dtypes: float64(2), object(1)
memory usage: 45.1+ KB


In [20]:
# Converting dates to datetime format
df_w['datetime'] = pd.to_datetime(df_w['datetime'],format='%Y-%m-%d %H:%M:%S')

In [21]:
df_w.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1920 entries, 0 to 1919
Data columns (total 3 columns):
 #   Column    Non-Null Count  Dtype         
---  ------    --------------  -----         
 0   datetime  1920 non-null   datetime64[ns]
 1   temp      1920 non-null   float64       
 2   humidity  1920 non-null   float64       
dtypes: datetime64[ns](1), float64(2)
memory usage: 45.1 KB


In [22]:
# Renaming columns to match the other dfs
df_w.rename(columns={'datetime':'Date','temp':'Temp','humidity':'Humid'},inplace=True)

In [23]:
df_w

Unnamed: 0,Date,Temp,Humid
0,2022-12-09 00:00:00,-14.0,65.87
1,2022-12-09 01:00:00,-12.1,61.16
2,2022-12-09 02:00:00,-13.0,66.11
3,2022-12-09 03:00:00,-14.0,65.87
4,2022-12-09 04:00:00,-15.0,72.17
...,...,...,...
1915,2023-02-26 19:00:00,-8.6,45.25
1916,2023-02-26 20:00:00,-9.0,48.09
1917,2023-02-26 21:00:00,-11.0,56.31
1918,2023-02-26 22:00:00,-11.8,56.08


### Merging

In [25]:
# Concatenate three dfs
df = pd.concat([df_pr,df_sc,df_w])
df

Unnamed: 0,Date,Interval,DV,BG1,BG2,bID,shID,SC,Temp,Humid
0,2022-12-09 00:00:00,breakfast-lunch,1.756,4.7,4.7,3.0,3.5,,,
1,2022-12-09 00:00:00,lunch-dinner,2.511,4.7,4.3,2.5,2.0,,,
2,2022-12-09 00:00:00,dinner-breakfast,3.442,4.3,6.5,2.0,4.0,,,
3,2022-12-10 00:00:00,breakfast-lunch,1.584,6.5,5.6,3.0,4.0,,,
4,2022-12-10 00:00:00,lunch-dinner,2.562,5.6,6.2,2.5,2.0,,,
...,...,...,...,...,...,...,...,...,...,...
1915,2023-02-26 19:00:00,,,,,,,,-8.6,45.25
1916,2023-02-26 20:00:00,,,,,,,,-9.0,48.09
1917,2023-02-26 21:00:00,,,,,,,,-11.0,56.31
1918,2023-02-26 22:00:00,,,,,,,,-11.8,56.08


In [29]:
df.to_csv('FullData_merged.csv',index=False)