# Patient records from the Diabet app: data preprocessing

The data from an app called Diabet, which tracked carbs intake, blood glucose levels and insulin deliveries, was pre-processed in order to join it with the other records.

In [1]:
import pandas as pd
import numpy as np
import re

### 1. Data exploration

Conversion of dates and times recorded in an inappropriate format into datetime objects posed a major challenge in the data preprocessing stage.

In [2]:
# Load the raw Diabet export; the path is relative to the notebook's working directory.
pr = pd.read_csv('PatientRecords_DiabetApp.csv')

In [3]:
# Raw frame: date headers (no measurements) and "meal (HH:MM)" entries share the 'Time' column.
pr

Unnamed: 0,Time,DV,BG,shID,bID
0,25 August 2017 (Friday),,,,
1,завтрак (06:00),0.00,5.3,1.0,0.0
2,завтрак (08:00),6.62,6.7,4.0,0.0
3,обед (12:00),4.46,5.5,4.0,0.0
4,обед (15:00),1.15,3.5,1.0,0.0
...,...,...,...,...,...
1039,обед (17:00),34.38,4.0,4.0,0.0
1040,29 January 2022 (Saturday),,,,
1041,завтрак (11:30),49.78,5.2,5.5,0.0
1042,31 January 2022 (Monday),,,,


### 2. Data cleaning

The Regular Expression module (`re`) was used to locate the dates in the column, followed by the string split() method to remove the days of the week that had been appended to the dates in brackets. Similarly, the column was edited to remove the names of the meals, leaving only the times. Dates were affixed to the times and the column values were transformed into datetime objects.

In [4]:
#searching for dates and removing days of the week
def datesearch(date):
    """Strip the bracketed weekday from a date entry.

    An entry containing a ``DD Month YYYY`` pattern is cut at its first
    opening parenthesis, so ``'25 August 2017 (Friday)'`` becomes
    ``'25 August 2017 '`` (the space before the bracket is kept).
    Any other entry is returned unchanged.
    """
    looks_like_date = re.search(r'\d{2}\s\w+\s\d{4}', date) is not None
    if not looks_like_date:
        return date
    # Everything before the first '(' — the whole string when there is none.
    before_bracket, _, _ = date.partition('(')
    return before_bracket

In [5]:
# Remove the bracketed weekday from every date header in 'Time'.
pr = pr.assign(Time=pr['Time'].map(datesearch))

In [6]:
# Check that the bracketed weekdays are gone from the date headers.
pr

Unnamed: 0,Time,DV,BG,shID,bID
0,25 August 2017,,,,
1,завтрак (06:00),0.00,5.3,1.0,0.0
2,завтрак (08:00),6.62,6.7,4.0,0.0
3,обед (12:00),4.46,5.5,4.0,0.0
4,обед (15:00),1.15,3.5,1.0,0.0
...,...,...,...,...,...
1039,обед (17:00),34.38,4.0,4.0,0.0
1040,29 January 2022,,,,
1041,завтрак (11:30),49.78,5.2,5.5,0.0
1042,31 January 2022,,,,


In [7]:
#searching for times and removing the names of the meals
def timesearch(time):
    """Reduce a meal entry such as ``'meal (12:00)'`` to its time ``'12:00'``.

    An entry is rewritten only when it contains an ``HH:MM`` pattern and
    still ends with a closing bracket: the text after the last ``'('`` is
    kept and the final ``')'`` is dropped. Every other entry is returned
    unchanged.

    The closing-bracket check makes the function safe to apply more than
    once: without it, an already cleaned value like ``'06:00'`` would lose
    its last character on a second pass.
    """
    if re.search(r'\d{2}:\d{2}', time) and time.endswith(')'):
        time = time.split('(')[-1][:-1]
    return time

In [8]:
# Replace each "meal (HH:MM)" entry in 'Time' with just the time.
pr = pr.assign(Time=pr['Time'].map(timesearch))

In [9]:
# Check that the meal names are gone and only the times remain.
pr

Unnamed: 0,Time,DV,BG,shID,bID
0,25 August 2017,,,,
1,06:00,0.00,5.3,1.0,0.0
2,08:00,6.62,6.7,4.0,0.0
3,12:00,4.46,5.5,4.0,0.0
4,15:00,1.15,3.5,1.0,0.0
...,...,...,...,...,...
1039,17:00,34.38,4.0,4.0,0.0
1040,29 January 2022,,,,
1041,11:30,49.78,5.2,5.5,0.0
1042,31 January 2022,,,,


In [10]:
#affixing the dates 
# A row is a date header when it contains "DD Month YYYY"; every other row
# holds a time of day that belongs to the nearest header above it.
# Missing 'Time' values count as non-headers and stay missing.
is_date = pr['Time'].str.contains(r'\d{2}\s\w+\s\d{4}', regex=True, na=False)

# Carry each header down over the rows that follow it.
current_date = pr['Time'].where(is_date).ffill()
if current_date.isna().any():
    # A row-by-row loop would reach such rows with no date set yet
    # (or with a stale one left over from an earlier run).
    raise ValueError("'Time' has entries before the first date header")

# Headers keep their text; time rows become "<date><time>". The date string
# already ends with a space, so no separator is added here.
pr['Time'] = pr['Time'].where(is_date, current_date + pr['Time'])

In [11]:
# Check that every time row now carries the date of the header above it.
pr

Unnamed: 0,Time,DV,BG,shID,bID
0,25 August 2017,,,,
1,25 August 2017 06:00,0.00,5.3,1.0,0.0
2,25 August 2017 08:00,6.62,6.7,4.0,0.0
3,25 August 2017 12:00,4.46,5.5,4.0,0.0
4,25 August 2017 15:00,1.15,3.5,1.0,0.0
...,...,...,...,...,...
1039,28 January 2022 17:00,34.38,4.0,4.0,0.0
1040,29 January 2022,,,,
1041,29 January 2022 11:30,49.78,5.2,5.5,0.0
1042,31 January 2022,,,,


In [12]:
# The column now holds full date-time strings, so name it accordingly.
pr = pr.rename(columns={'Time': 'Date'})

In [13]:
# Drop every row that has a missing value in any column. The date-header rows
# carry no measurements, so this removes them.
# NOTE(review): a measurement row with any single missing field is dropped too — confirm that is intended.
pr.dropna(axis=0,inplace=True)

In [14]:
# Parse the "DD Month YYYY HH:MM" strings into datetime values.
pr = pr.assign(Date=pd.to_datetime(pr['Date']))

In [17]:
# Renumber the rows from 0 after the drop; the old index is discarded, not kept as a column.
pr = pr.reset_index(drop=True)

In [18]:
# Final frame: 'Date' holds datetimes and the index is consecutive again.
pr

Unnamed: 0,Date,DV,BG,shID,bID
0,2017-08-25 06:00:00,0.00,5.3,1.0,0.0
1,2017-08-25 08:00:00,6.62,6.7,4.0,0.0
2,2017-08-25 12:00:00,4.46,5.5,4.0,0.0
3,2017-08-25 15:00:00,1.15,3.5,1.0,0.0
4,2017-08-25 18:00:00,1.00,4.9,1.0,0.0
...,...,...,...,...,...
877,2022-01-28 11:00:00,30.31,4.8,6.0,0.0
878,2022-01-28 13:45:00,0.00,3.9,0.0,2.5
879,2022-01-28 17:00:00,34.38,4.0,4.0,0.0
880,2022-01-29 11:30:00,49.78,5.2,5.5,0.0


In [19]:
# Save the cleaned records; index=False keeps the row index out of the file.
pr.to_csv('PatientRecords_DA.csv',index=False)