# Data merging

The previously preprocessed data from four different sources were merged into a single data frame. To ensure that every feature is present on each day, any day on which at least one parameter had no recorded value at all was dropped.

In [1]:
import pandas as pd

In [2]:
# Load HealthData.csv; the 'Date' column is parsed into datetimes.
# NOTE(review): absolute, machine-specific path — the cell only runs where this file exists.
health_csv = '/Users/polzovatel/Desktop/Python/Portfolio/Diabetes/HealthAppData/HealthData.csv'
hd = pd.read_csv(health_csv, parse_dates=['Date'])

In [3]:
# Load PatientRecords_DA.csv; the 'Date' column is parsed into datetimes.
# NOTE(review): absolute, machine-specific path — the cell only runs where this file exists.
patient_records_da_csv = '/Users/polzovatel/Desktop/Python/Portfolio/Diabetes/PatientRecords/PatientRecords_DA.csv'
pr1 = pd.read_csv(patient_records_da_csv, parse_dates=['Date'])

In [4]:
# Load PatientRecords_E.csv; the 'Date' column is parsed into datetimes.
# NOTE(review): absolute, machine-specific path — the cell only runs where this file exists.
patient_records_e_csv = '/Users/polzovatel/Desktop/Python/Portfolio/Diabetes/PatientRecords/PatientRecords_E.csv'
pr2 = pd.read_csv(patient_records_e_csv, parse_dates=['Date'])

In [5]:
# Load WeatherData.csv; the 'Date' column is parsed into datetimes.
# NOTE(review): absolute, machine-specific path — the cell only runs where this file exists.
weather_csv = '/Users/polzovatel/Desktop/Python/Portfolio/Diabetes/WeatherData/WeatherData.csv'
wd = pd.read_csv(weather_csv, parse_dates=['Date'])

In [6]:
# Stack the four loaded frames row-wise; a column that a source does not
# provide is filled with NaN for that source's rows.
frames = [pr1, pr2, hd, wd]
df = pd.concat(frames)

In [7]:
# Preview the concatenated frame, before 'Date' becomes the index.
df

Unnamed: 0,Date,DV,BG,shID,bID,SC,HR,Temp,Humid
0,2017-08-25 06:00:00,0.00,5.3,1.0,0.0,,,,
1,2017-08-25 08:00:00,6.62,6.7,4.0,0.0,,,,
2,2017-08-25 12:00:00,4.46,5.5,4.0,0.0,,,,
3,2017-08-25 15:00:00,1.15,3.5,1.0,0.0,,,,
4,2017-08-25 18:00:00,1.00,4.9,1.0,0.0,,,,
...,...,...,...,...,...,...,...,...,...
26299,2021-12-31 19:00:00,,,,,,,-25.2,63.06
26300,2021-12-31 20:00:00,,,,,,,-25.0,69.27
26301,2021-12-31 21:00:00,,,,,,,-24.0,76.22
26302,2021-12-31 22:00:00,,,,,,,-23.0,69.98


In [8]:
# Use the 'Date' column as the index and order the rows by it.
df = df.set_index('Date').sort_index()

In [9]:
# Preview the frame after indexing by 'Date' and sorting.
df

Unnamed: 0_level_0,DV,BG,shID,bID,SC,HR,Temp,Humid
Date,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1
2017-08-25 06:00:00,0.00,5.3,1.0,0.0,,,,
2017-08-25 08:00:00,6.62,6.7,4.0,0.0,,,,
2017-08-25 12:00:00,4.46,5.5,4.0,0.0,,,,
2017-08-25 15:00:00,1.15,3.5,1.0,0.0,,,,
2017-08-25 18:00:00,1.00,4.9,1.0,0.0,,,,
...,...,...,...,...,...,...,...,...
2022-10-19 09:09:51,,,,,16.0,,,
2022-10-19 10:42:18,,,,,2.0,,,
2022-10-19 10:58:40,,,,,611.0,,,
2022-10-19 11:13:04,,,,,348.0,,,


In [10]:
#create a list of unique dates
unique_dates = df.index.map(lambda i: str(i.date())).unique()
unique_dates

Index(['2017-08-25', '2017-08-26', '2017-08-27', '2017-08-28', '2017-08-29',
       '2017-08-30', '2017-08-31', '2017-10-02', '2017-10-03', '2017-10-04',
       ...
       '2022-10-10', '2022-10-11', '2022-10-12', '2022-10-13', '2022-10-14',
       '2022-10-15', '2022-10-16', '2022-10-17', '2022-10-18', '2022-10-19'],
      dtype='object', name='Date', length=1475)

In [11]:
# Find every day on which at least one column has no recorded value at all
# (the column is NaN in every row of that day), then collect the timestamps
# of all rows belonging to such days so the whole day can be dropped.
# This is done in one grouped pass over the frame rather than with a
# separate .loc lookup for each day and each column.
row_day = df.index.normalize()  # each row's timestamp truncated to midnight

# day x column table: True where the column is NaN in every row of that day
column_empty_on_day = df.isna().groupby(row_day).all()

# days that have at least one completely empty column
incomplete_days = column_empty_on_day.index[column_empty_on_day.any(axis=1)]

# index labels of every row that falls on an incomplete day, in index order
to_drop_dates = df.index[row_day.isin(incomplete_days)].tolist()
to_drop_dates

[Timestamp('2017-08-25 06:00:00'),
 Timestamp('2017-08-25 08:00:00'),
 Timestamp('2017-08-25 12:00:00'),
 Timestamp('2017-08-25 15:00:00'),
 Timestamp('2017-08-25 18:00:00'),
 Timestamp('2017-08-25 19:00:00'),
 Timestamp('2017-08-25 22:00:00'),
 Timestamp('2017-08-26 07:00:00'),
 Timestamp('2017-08-26 08:00:00'),
 Timestamp('2017-08-26 11:00:00'),
 Timestamp('2017-08-26 13:00:00'),
 Timestamp('2017-08-26 17:00:00'),
 Timestamp('2017-08-26 19:00:00'),
 Timestamp('2017-08-26 22:00:00'),
 Timestamp('2017-08-26 23:00:00'),
 Timestamp('2017-08-27 07:00:00'),
 Timestamp('2017-08-27 08:00:00'),
 Timestamp('2017-08-27 11:00:00'),
 Timestamp('2017-08-27 19:00:00'),
 Timestamp('2017-08-27 22:00:00'),
 Timestamp('2017-08-28 07:00:00'),
 Timestamp('2017-08-28 12:00:00'),
 Timestamp('2017-08-28 16:00:00'),
 Timestamp('2017-08-28 19:00:00'),
 Timestamp('2017-08-28 22:00:00'),
 Timestamp('2017-08-29 06:00:00'),
 Timestamp('2017-08-29 07:30:00'),
 Timestamp('2017-08-29 10:00:00'),
 Timestamp('2017-08-

In [12]:
# Remove every row whose timestamp was collected in to_drop_dates.
# A boolean mask is used instead of an in-place label drop so that re-running
# this cell is harmless: dropping labels that are already gone raises KeyError.
df = df[~df.index.isin(to_drop_dates)]

In [13]:
# Preview the frame after the rows listed in to_drop_dates were removed.
df

Unnamed: 0_level_0,DV,BG,shID,bID,SC,HR,Temp,Humid
Date,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1
2019-01-08 00:00:00,,,,,,,-17.0,59.64
2019-01-08 01:00:00,,,,,,,-19.3,71.06
2019-01-08 02:00:00,,,,,,,-17.0,65.13
2019-01-08 03:00:00,,,,,,,-17.0,59.64
2019-01-08 03:30:00,0.0,3.7,0.0,2.0,,,,
...,...,...,...,...,...,...,...,...
2021-06-23 21:00:00,,,,,,,21.0,60.23
2021-06-23 22:00:00,0.0,0.0,0.0,0.0,,,,
2021-06-23 22:00:00,0.0,0.0,0.0,0.0,,,,
2021-06-23 22:00:00,,,,,,,20.2,58.80


In [14]:
# Rearrange the columns into the order used for the exported file.
column_order = ['bID','BG','DV','shID','SC','HR','Temp','Humid']
df = df.loc[:, column_order]

In [15]:
# Preview the frame with the reordered columns.
df

Unnamed: 0_level_0,bID,BG,DV,shID,SC,HR,Temp,Humid
Date,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1
2019-01-08 00:00:00,,,,,,,-17.0,59.64
2019-01-08 01:00:00,,,,,,,-19.3,71.06
2019-01-08 02:00:00,,,,,,,-17.0,65.13
2019-01-08 03:00:00,,,,,,,-17.0,59.64
2019-01-08 03:30:00,2.0,3.7,0.0,0.0,,,,
...,...,...,...,...,...,...,...,...
2021-06-23 21:00:00,,,,,,,21.0,60.23
2021-06-23 22:00:00,0.0,0.0,0.0,0.0,,,,
2021-06-23 22:00:00,0.0,0.0,0.0,0.0,,,,
2021-06-23 22:00:00,,,,,,,20.2,58.80


In [16]:
# Write the merged frame, including its 'Date' index, to the working directory.
output_csv = 'FullData.csv'
df.to_csv(output_csv)