In [5]:
import pandas as pd
import numpy as np
import os
import pathlib
from pdfminer.high_level import extract_text
from PyPDF2 import PdfFileMerger
from datetime import datetime

# User-specified inputs

In [18]:
# Recording to process: patient label plus the start/stop labels of the interrogation
# window. In this notebook these three values are only used to build the output file names.
patient_id = 'PR04'
start = '20231218_1845'
stop = '20231219_0726'

# Input and output folders. Keep the trailing '/': paths are formed elsewhere by plain
# string concatenation (f'{OUTDIR}{fn_merged}').
# NOTE: both point at the same folder, so the merged PDF is written next to the inputs.
INDIR = '/Users/daniela/Documents/Presidio/scripts/datasets/interrogation/'
OUTDIR = '/Users/daniela/Documents/Presidio/scripts/datasets/interrogation/'

# Output file names: <patient>_<start>-<stop>_merged.pdf / _parsed.csv
fn_merged = '{}_{}-{}_merged.pdf'.format(patient_id, start, stop)
fn_parsed = '{}_{}-{}_parsed.csv'.format(patient_id, start, stop)

In [3]:
#Get list of files specifying folder path and file format as string
def GetFilePaths(FileDirectory, FileFormat):
    """Return the sorted paths of the files in a folder that have the given format.

    Parameters
    ----------
    FileDirectory : str
        Folder to list. A trailing path separator is optional.
    FileFormat : str
        File-name suffix to keep, e.g. ``'pdf'`` or ``'.pdf'`` (case-insensitive).

    Returns
    -------
    list of str
        ``FileDirectory`` joined with each matching file name, sorted by file name.
        Empty when nothing matches.

    Raises
    ------
    OSError
        Propagated from ``os.listdir`` when the folder cannot be listed.
    """
    suffix = FileFormat.lower()
    # Match on the end of the name only: a plain substring test would also pick up
    # files such as 'pdf_notes.txt'.
    FileNames = sorted(name for name in os.listdir(FileDirectory)
                       if name.lower().endswith(suffix))
    # os.path.join copes with FileDirectory given with or without a trailing separator.
    return [os.path.join(FileDirectory, name) for name in FileNames]

In [13]:
# Collect the individual interrogation reports to merge.
# The merged PDF is written to f'{OUTDIR}{fn_merged}', which can be inside INDIR; skip it so
# that re-running the notebook does not merge a previous result back in (duplicating events).
_merged_output = os.path.abspath(f'{OUTDIR}{fn_merged}')
file_paths = [path for path in GetFilePaths(INDIR, 'pdf')
              if os.path.abspath(path) != _merged_output]

# Merge files

In [19]:
#Merge interrogation reports in one file for parsing

# Path of the merged PDF; it is read back when the text is extracted
merged_pdf_path = f'{OUTDIR}{fn_merged}'

#create an instance of PdfFileMerger() class
merger = PdfFileMerger()
try:
    #append every report, in the order given by file_paths
    for file in file_paths:
        merger.append(file)
    #write out the merged PDF
    merger.write(merged_pdf_path)
finally:
    #always close the merger, even when a report fails to load or the write fails
    merger.close()

# Extract text

In [17]:
# Pull the raw text out of the merged PDF
text = extract_text(merged_pdf_path)

# Break the text into a list with one string per '\n'-terminated line
lines = text.split(sep='\n')

In [24]:
# Pick out the three kinds of line that describe an event: timestamp, duration and type.
# Each list is filtered independently from `lines`, by substring match.

# A weekday abbreviation followed by a comma marks a timestamp line
_weekday_tags = ('Mon,', 'Tue,', 'Wed,', 'Thu,', 'Fri,', 'Sat,', 'Sun,')

# Note: 'second' also matches every line that contains 'seconds'
_duration_tags = ('seconds', 'second')

# Note: 'Pattern A1' / 'Pattern B1' already match 'Pattern A1A2' / 'Pattern B1B2' lines
_type_tags = ('Pattern A2',
              'Pattern A1',
              'Pattern B1',
              'Pattern B2',
              'Pattern A1A2',
              'Pattern B1B2',
              'Magnet applied')

EventTimestamp = [entry for entry in lines if any(tag in entry for tag in _weekday_tags)]

EventDuration = [entry for entry in lines if any(tag in entry for tag in _duration_tags)]

EventType = [entry for entry in lines if any(tag in entry for tag in _type_tags)]

In [25]:
#Make sure the list sizes coincide otherwise 
#there might be a missing or additional set of strings (edit cell above to correct)
print(len(EventTimestamp), len(EventDuration), len(EventType), sep='\n')

# The three lists are paired up by position when the dataframe is built, so a size
# mismatch would combine values from different events. Stop here instead of continuing.
assert len(EventTimestamp) == len(EventDuration) == len(EventType), (
    'EventTimestamp, EventDuration and EventType must have the same length: '
    f'{len(EventTimestamp)}, {len(EventDuration)}, {len(EventType)}'
)

3948
3948
3948


# Create initial df

In [27]:
#Create list of lists that will be used as input data for final dataframe columns

# Rows are built by position, so the three lists must line up one-to-one. Refuse to
# continue on a mismatch rather than pairing values that belong to different events.
if not (len(EventTimestamp) == len(EventDuration) == len(EventType)):
    raise ValueError(
        'Event lists differ in length: '
        f'{len(EventTimestamp)} timestamps, {len(EventDuration)} durations, {len(EventType)} types'
    )

# One [timestamp, duration, type] row per event
data_for_df = [list(event) for event in zip(EventTimestamp, EventDuration, EventType)]

In [28]:
#Create initial dataframe
to_df = pd.DataFrame(data_for_df, columns = ['EventTimestamp', 'EventDuration', 'EventType'])

# Drop the unit ('second' or 'seconds') together with the whitespace around it, then convert
# to numbers so the column really holds numerical data instead of strings.
# pd.to_numeric raises ValueError if anything other than a number is left in a cell.
to_df['EventDuration'] = pd.to_numeric(
    to_df['EventDuration'].str.replace(r'\s*seconds?\s*', '', regex=True)
)

In [29]:
#Convert timestamps to datetime objects
# Text layout of a timestamp line, e.g. 'Mon, Dec 18, 2023 11:09:23'
timestamp_format = '%a, %b %d, %Y %H:%M:%S'

# Vectorised conversion of the whole column. Unlike a strptime loop this also accepts a
# column that is already datetime, so the cell can be re-run safely.
to_df['EventTimestamp'] = pd.to_datetime(to_df['EventTimestamp'], format=timestamp_format)

#order the events chronologically
to_df = to_df.sort_values(by='EventTimestamp')

In [30]:
# Display the events table after timestamp conversion and chronological sorting
to_df

Unnamed: 0,EventTimestamp,EventDuration,EventType
0,2023-12-18 11:09:23,9.5,Pattern A1 ; 1 Responsive Therapy; Episode End...
1,2023-12-18 11:09:48,3.0,Pattern A1 Therapy Delivery Inhibited by Post-...
2,2023-12-18 11:09:53,3.0,Pattern A1 Therapy Delivery Inhibited by Post-...
3,2023-12-18 11:10:14,9.5,Pattern A1 ; 1 Responsive Therapy; Episode End...
4,2023-12-18 11:10:44,3.0,Pattern A1 Therapy Delivery Inhibited by Post-...
...,...,...,...
3943,2023-12-19 07:24:04,3.0,Pattern A1 Therapy Delivery Inhibited by Post-...
3944,2023-12-19 07:24:29,9.5,Pattern A1 ; 1 Responsive Therapy; Episode End...
3945,2023-12-19 07:25:07,3.0,Pattern A1 Therapy Delivery Inhibited by Post-...
3946,2023-12-19 07:25:12,3.0,Pattern A1 Therapy Delivery Inhibited by Post-...


# Add detection and stim counts

In [32]:
# Work on a copy so the count columns are added to parsed_df while to_df stays unchanged
parsed_df = to_df.copy()

In [33]:
#Assign detections for pattern A1

# Substring tests on the event description. np.select uses the first condition that is
# True for a row. The '...; Insufficient Charge;' variants contain the shorter text, so a
# single test per therapy count covers both.
event_type = to_df['EventType']
conditions = [
    event_type.str.contains('Pattern A1 ; 1 Responsive Therapy', regex=False),
    event_type.str.contains('Pattern A1 ; 2 Responsive Therapies', regex=False),
    event_type.str.contains('Pattern A1 ; 3 Responsive Therapies', regex=False),
    event_type.str.contains('Pattern A1 ; 4 Responsive Therapies', regex=False),
    event_type.str.contains('Pattern A1 ; 5 Responsive Therapies', regex=False),
    event_type.str.contains('Pattern A1 Therapy Delivery Inhibited by', regex=False), #will consider events inhibited by cap limit or PEI
    event_type == 'Magnet applied',
    ]

# value assigned for each condition, in the same order
values = ['1', '2', '3', '4', '5', '1', '0']

# Rows matching no condition get '0'. The default is passed as a string because the choices
# are strings: np.select's implicit integer default 0 cannot be combined with string choices
# on recent NumPy versions (TypeError).
parsed_df['Pattern_A1'] = np.select(conditions, values, default='0')

In [34]:
#Assign detections for pattern A2

# Substring tests on the event description. np.select uses the first condition that is
# True for a row. The '...; Insufficient Charge;' variants contain the shorter text, so a
# single test per therapy count covers both.
event_type = to_df['EventType']
conditions = [
    event_type.str.contains('Pattern A2 ; 1 Responsive Therapy', regex=False),
    event_type.str.contains('Pattern A2 ; 2 Responsive Therapies', regex=False),
    event_type.str.contains('Pattern A2 ; 3 Responsive Therapies', regex=False),
    event_type.str.contains('Pattern A2 ; 4 Responsive Therapies', regex=False),
    event_type.str.contains('Pattern A2 ; 5 Responsive Therapies', regex=False),
    event_type.str.contains('Pattern A2 Therapy Delivery Inhibited by', regex=False),
    event_type == 'Magnet applied',
    ]

# value assigned for each condition, in the same order
values = ['1', '2', '3', '4', '5', '1', '0']

# Rows matching no condition get '0'. The default is passed as a string because the choices
# are strings: np.select's implicit integer default 0 cannot be combined with string choices
# on recent NumPy versions (TypeError).
parsed_df['Pattern_A2'] = np.select(conditions, values, default='0')

In [35]:
#Assign detections for pattern B1

# Substring tests on the event description. np.select uses the first condition that is
# True for a row. The '...; Insufficient Charge;' variants contain the shorter text, so a
# single test per therapy count covers both.
event_type = to_df['EventType']
conditions = [
    event_type.str.contains('Pattern B1 ; 1 Responsive Therapy', regex=False),
    event_type.str.contains('Pattern B1 ; 2 Responsive Therapies', regex=False),
    event_type.str.contains('Pattern B1 ; 3 Responsive Therapies', regex=False),
    event_type.str.contains('Pattern B1 ; 4 Responsive Therapies', regex=False),
    event_type.str.contains('Pattern B1 ; 5 Responsive Therapies', regex=False),
    event_type.str.contains('Pattern B1 Therapy Delivery Inhibited by', regex=False),
    event_type == 'Magnet applied',
    ]

# value assigned for each condition, in the same order
values = ['1', '2', '3', '4', '5', '1', '0']

# Rows matching no condition get '0'. The default is passed as a string because the choices
# are strings: np.select's implicit integer default 0 cannot be combined with string choices
# on recent NumPy versions (TypeError).
parsed_df['Pattern_B1'] = np.select(conditions, values, default='0')

In [36]:
#Assign detections for pattern B2

# Substring tests on the event description. np.select uses the first condition that is
# True for a row. The '...; Insufficient Charge;' variants contain the shorter text, so a
# single test per therapy count covers both.
event_type = to_df['EventType']
conditions = [
    event_type.str.contains('Pattern B2 ; 1 Responsive Therapy', regex=False),
    event_type.str.contains('Pattern B2 ; 2 Responsive Therapies', regex=False),
    event_type.str.contains('Pattern B2 ; 3 Responsive Therapies', regex=False),
    event_type.str.contains('Pattern B2 ; 4 Responsive Therapies', regex=False),
    event_type.str.contains('Pattern B2 ; 5 Responsive Therapies', regex=False),
    event_type.str.contains('Pattern B2 Therapy Delivery Inhibited by', regex=False),
    event_type == 'Magnet applied',
    ]

# value assigned for each condition, in the same order
values = ['1', '2', '3', '4', '5', '1', '0']

# Rows matching no condition get '0'. The default is passed as a string because the choices
# are strings: np.select's implicit integer default 0 cannot be combined with string choices
# on recent NumPy versions (TypeError).
parsed_df['Pattern_B2'] = np.select(conditions, values, default='0')

In [37]:
#Assign therapies count

# Substring tests on the event description. np.select uses the first condition that is
# True for a row. The '...; Insufficient Charge;' variants contain the shorter text, so a
# single test per therapy count covers both.
# NOTE(review): only Pattern A1 and Pattern A2 events are counted here; Pattern B1/B2 rows
# match no condition and therefore get '0' therapies - confirm this is intended.
event_type = to_df['EventType']
conditions = [
    event_type.str.contains('Pattern A1 ; 1 Responsive Therapy', regex=False),
    event_type.str.contains('Pattern A1 ; 2 Responsive Therapies', regex=False),
    event_type.str.contains('Pattern A1 ; 3 Responsive Therapies', regex=False),
    event_type.str.contains('Pattern A1 ; 4 Responsive Therapies', regex=False),
    event_type.str.contains('Pattern A1 ; 5 Responsive Therapies', regex=False),
    event_type.str.contains('Pattern A1 Therapy Delivery Inhibited by', regex=False),
    event_type.str.contains('Pattern A2 ; 1 Responsive Therapy', regex=False),
    event_type.str.contains('Pattern A2 ; 2 Responsive Therapies', regex=False),
    event_type.str.contains('Pattern A2 ; 3 Responsive Therapies', regex=False),
    event_type.str.contains('Pattern A2 ; 4 Responsive Therapies', regex=False),
    event_type.str.contains('Pattern A2 ; 5 Responsive Therapies', regex=False),
    event_type.str.contains('Pattern A2 Therapy Delivery Inhibited by', regex=False),
    event_type == 'Magnet applied',
    ]

# value assigned for each condition, in the same order (inhibited deliveries count as 0 therapies)
values = ['1', '2', '3', '4', '5', '0', '1', '2', '3', '4', '5', '0', '0']

# Rows matching no condition get '0'. The default is passed as a string because the choices
# are strings: np.select's implicit integer default 0 cannot be combined with string choices
# on recent NumPy versions (TypeError).
parsed_df['Therapies'] = np.select(conditions, values, default='0')

In [38]:
#Check tabulated data: the events plus the Pattern_A1/A2/B1/B2 and Therapies columns added above
parsed_df

Unnamed: 0,EventTimestamp,EventDuration,EventType,Pattern_A1,Pattern_A2,Pattern_B1,Pattern_B2,Therapies
0,2023-12-18 11:09:23,9.5,Pattern A1 ; 1 Responsive Therapy; Episode End...,1,0,0,0,1
1,2023-12-18 11:09:48,3.0,Pattern A1 Therapy Delivery Inhibited by Post-...,1,0,0,0,0
2,2023-12-18 11:09:53,3.0,Pattern A1 Therapy Delivery Inhibited by Post-...,1,0,0,0,0
3,2023-12-18 11:10:14,9.5,Pattern A1 ; 1 Responsive Therapy; Episode End...,1,0,0,0,1
4,2023-12-18 11:10:44,3.0,Pattern A1 Therapy Delivery Inhibited by Post-...,1,0,0,0,0
...,...,...,...,...,...,...,...,...
3943,2023-12-19 07:24:04,3.0,Pattern A1 Therapy Delivery Inhibited by Post-...,1,0,0,0,0
3944,2023-12-19 07:24:29,9.5,Pattern A1 ; 1 Responsive Therapy; Episode End...,1,0,0,0,1
3945,2023-12-19 07:25:07,3.0,Pattern A1 Therapy Delivery Inhibited by Post-...,1,0,0,0,0
3946,2023-12-19 07:25:12,3.0,Pattern A1 Therapy Delivery Inhibited by Post-...,1,0,0,0,0


# Save final df

In [18]:
# Write the parsed table to OUTDIR as CSV. The DataFrame index is written as well (to_csv
# default), so it appears as an unnamed first column when the file is read back.
parsed_df.to_csv(f'{OUTDIR}{fn_parsed}')