In [1]:
import pandas as pd
import numpy as np
import os
import pathlib
import pdfminer
from pdfminer.high_level import extract_text
from PyPDF2 import PdfFileMerger
from datetime import datetime

# User-specified inputs

In [2]:
# Patient the interrogation reports belong to
patient_id = 'PR05'

# First and last time tags used to label the output files
start = '20231218_1845'
stop = '20231219_0726'

# Folder holding the downloaded report PDFs, and folder receiving the outputs.
# OUTDIR is concatenated directly with the file names below, so keep its trailing slash.
INDIR = '/userdata/dastudillo/patient_data/tmp/'
OUTDIR = '/userdata/dastudillo/patient_data/tmp/'

# Names of the merged PDF and of the parsed event table
fn_merged = f'{patient_id}_{start}-{stop}_merged.pdf'
fn_parsed = f'{patient_id}_{start}-{stop}_parsed.csv'

In [3]:
#Get list of files specifying folder path and file format as string
def GetFilePaths(FileDirectory, FileFormat):
    """Return the sorted paths of the files in a folder that end with a given format.

    Parameters
    ----------
    FileDirectory : str
        Folder to scan (top level only). A trailing path separator is optional.
    FileFormat : str
        File-name ending to keep, e.g. 'pdf' or '.pdf'. Compared case-insensitively.

    Returns
    -------
    list of str
        One path per matching file, ordered by file name. Empty if nothing matches.
    """
    wanted_ending = FileFormat.lower()
    # Match on the END of the name: a plain substring test would also keep files such as
    # 'market_pdf_report.txt' that merely contain the format somewhere in their name.
    FileNames = sorted(name for name in os.listdir(FileDirectory)
                       if name.lower().endswith(wanted_ending))
    # os.path.join inserts the separator only when FileDirectory does not already end with one
    return [os.path.join(FileDirectory, name) for name in FileNames]

In [4]:
# Collect the report PDFs to merge. The merged PDF is written under the name fn_merged;
# when INDIR and OUTDIR are the same folder, a re-run would find the previous merged file
# here and merge it into itself, so it is skipped by name.
file_paths = [path for path in GetFilePaths(INDIR, 'pdf')
              if os.path.basename(path) != fn_merged]

In [5]:
# Display the report paths; they are merged below in this order
file_paths

['/userdata/dastudillo/patient_data/tmp/market_pdf_report_xmlfile_70.php-3.pdf',
 '/userdata/dastudillo/patient_data/tmp/market_pdf_report_xmlfile_70.php-4.pdf']

# Merge files

In [6]:
#Merge interrogation reports in one file for parsing

# Path of the merged PDF (built once; the text extraction step reads it back from here)
merged_pdf_path = f'{OUTDIR}{fn_merged}'

# Fail early with a clear message instead of writing an empty merged PDF
if not file_paths:
    raise FileNotFoundError(f'No PDF reports found in {INDIR}')

#create an instance of PdfFileMerger() class
# NOTE(review): PdfFileMerger is the legacy PyPDF2 class name; newer PyPDF2 releases
# expose it as PdfMerger - confirm against the installed version.
merger = PdfFileMerger()
try:
    #iterate over the list of file paths
    for file in file_paths:
        #Append PDF files
        merger.append(file)
    #write out the merged PDF
    merger.write(merged_pdf_path)
finally:
    # Always close the merger, even if a report fails to load or the write fails
    merger.close()

# Extract text

In [7]:
#Extract all text from the merged PDF as a single string
text = extract_text(merged_pdf_path)

#Create list with one string per line of extracted text (split on '\n' only)
lines = text.split('\n')

In [8]:
#Select timestamp, duration and type of event

# Substrings that mark each kind of line in the extracted text.
# A line is kept when it contains at least one token of the group.
weekday_tokens = ('Mon,', 'Tue,', 'Wed,', 'Thu,', 'Fri,', 'Sat,', 'Sun,')
duration_tokens = ('seconds', 'second')
event_tokens = ('Pattern A2', 'Pattern A1', 'Pattern B1', 'Pattern B2',
                'Pattern A1A2', 'Pattern B1B2', 'Magnet applied')

# Timestamp lines carry an abbreviated weekday followed by a comma
EventTimestamp = [line for line in lines
                  if any(token in line for token in weekday_tokens)]

# Duration lines mention the unit
EventDuration = [line for line in lines
                 if any(token in line for token in duration_tokens)]

# Event type lines name a detection pattern or a magnet swipe
EventType = [line for line in lines
             if any(token in line for token in event_tokens)]

In [9]:
#Make sure the list sizes coincide otherwise
#there might be a missing or additional set of strings (edit cell above to correct)
event_counts = {
    'EventTimestamp': len(EventTimestamp),
    'EventDuration': len(EventDuration),
    'EventType': len(EventType),
}
print(*event_counts.values(), sep='\n')

# The three lists are paired by position in the next step, so stop here when they
# disagree instead of building a misaligned table.
assert len(set(event_counts.values())) == 1, f'Event lists differ in length: {event_counts}'

406
406
406


# Create initial DataFrame

In [10]:
#Create list of lists that will be used as input data for final dataframe columns
# One [timestamp, duration, type] row per event, paired by position in the three lists
data_for_df = [
    [EventTimestamp[row], EventDuration[row], EventType[row]]
    for row in range(len(EventTimestamp))
]

In [11]:
#Create initial dataframe
to_df = pd.DataFrame(data_for_df, columns = ['EventTimestamp', 'EventDuration', 'EventType'])

# Drop the unit from the duration text. Duration lines are selected on 'second', so the
# singular form ('1 second') has to be removed as well as 'seconds'.
# NOTE: the column still holds text after this step; apply pd.to_numeric where real
# numbers are required.
to_df['EventDuration'] = to_df['EventDuration'].str.replace(r'seconds?', '', regex=True)

In [12]:
#Convert timestamps to datetime object
# Layout of the timestamp text: weekday, month, day, year, then 24-hour time
timestamp_format = '%a, %b %d, %Y %H:%M:%S'
timestamps_list = to_df['EventTimestamp'].tolist()
timestamps_datetime = [datetime.strptime(stamp, timestamp_format) for stamp in timestamps_list]

#update timestamps in dataframe and order the events chronologically
to_df = (
    to_df
    .assign(EventTimestamp=timestamps_datetime)
    .sort_values(by='EventTimestamp')
)

# Add detection and stim counts

In [14]:
# Work on an independent copy so the count columns are not added to to_df
parsed_df = to_df.copy(deep=True)

In [15]:
#Assign detections for pattern A1

event_type = to_df['EventType']

# (text searched for in EventType, value assigned). np.select uses the first condition
# that matches. The '; Insufficient Charge;' variants contain the shorter text, so they
# are covered by the same rule.
rules = [
    ('Pattern A1 ; 1 Responsive Therapy', '1'),
    ('Pattern A1 ; 2 Responsive Therapies', '2'),
    ('Pattern A1 ; 3 Responsive Therapies', '3'),
    ('Pattern A1 ; 4 Responsive Therapies', '4'),
    ('Pattern A1 ; 5 Responsive Therapies', '5'),
    ('Pattern A1 Therapy Delivery Inhibited by', '1'), #will consider events inhibited by cap limit or PEI
]

# Plain-text (non-regex) substring match per rule, plus the exact magnet event
conditions = [event_type.str.contains(text, regex=False, na=False) for text, _ in rules]
conditions.append(event_type == 'Magnet applied')

# create a list of the values we want to assign for each condition
values = [value for _, value in rules] + ['0']

# create a new column and use np.select to assign values to it using our lists as arguments.
# default='0' keeps unmatched rows as the string '0'; the implicit integer default 0 cannot
# be combined with string choices on newer NumPy versions (TypeError).
parsed_df['Pattern_A1'] = np.select(conditions, values, default='0')

In [16]:
#Assign detections for pattern A2

event_type = to_df['EventType']

# (text searched for in EventType, value assigned). np.select uses the first condition
# that matches. The '; Insufficient Charge;' variants contain the shorter text, so they
# are covered by the same rule.
rules = [
    ('Pattern A2 ; 1 Responsive Therapy', '1'),
    ('Pattern A2 ; 2 Responsive Therapies', '2'),
    ('Pattern A2 ; 3 Responsive Therapies', '3'),
    ('Pattern A2 ; 4 Responsive Therapies', '4'),
    ('Pattern A2 ; 5 Responsive Therapies', '5'),
    ('Pattern A2 Therapy Delivery Inhibited by', '1'),
]

# Plain-text (non-regex) substring match per rule, plus the exact magnet event
conditions = [event_type.str.contains(text, regex=False, na=False) for text, _ in rules]
conditions.append(event_type == 'Magnet applied')

# create a list of the values we want to assign for each condition
values = [value for _, value in rules] + ['0']

# create a new column and use np.select to assign values to it using our lists as arguments.
# default='0' keeps unmatched rows as the string '0'; the implicit integer default 0 cannot
# be combined with string choices on newer NumPy versions (TypeError).
parsed_df['Pattern_A2'] = np.select(conditions, values, default='0')

In [17]:
#Assign detections for pattern B1

event_type = to_df['EventType']

# (text searched for in EventType, value assigned). np.select uses the first condition
# that matches. The '; Insufficient Charge;' variants contain the shorter text, so they
# are covered by the same rule.
rules = [
    ('Pattern B1 ; 1 Responsive Therapy', '1'),
    ('Pattern B1 ; 2 Responsive Therapies', '2'),
    ('Pattern B1 ; 3 Responsive Therapies', '3'),
    ('Pattern B1 ; 4 Responsive Therapies', '4'),
    ('Pattern B1 ; 5 Responsive Therapies', '5'),
    ('Pattern B1 Therapy Delivery Inhibited by', '1'),
]

# Plain-text (non-regex) substring match per rule, plus the exact magnet event
conditions = [event_type.str.contains(text, regex=False, na=False) for text, _ in rules]
conditions.append(event_type == 'Magnet applied')

# create a list of the values we want to assign for each condition
values = [value for _, value in rules] + ['0']

# create a new column and use np.select to assign values to it using our lists as arguments.
# default='0' keeps unmatched rows as the string '0'; the implicit integer default 0 cannot
# be combined with string choices on newer NumPy versions (TypeError).
parsed_df['Pattern_B1'] = np.select(conditions, values, default='0')

In [18]:
#Assign detections for pattern B2

event_type = to_df['EventType']

# (text searched for in EventType, value assigned). np.select uses the first condition
# that matches. The '; Insufficient Charge;' variants contain the shorter text, so they
# are covered by the same rule.
rules = [
    ('Pattern B2 ; 1 Responsive Therapy', '1'),
    ('Pattern B2 ; 2 Responsive Therapies', '2'),
    ('Pattern B2 ; 3 Responsive Therapies', '3'),
    ('Pattern B2 ; 4 Responsive Therapies', '4'),
    ('Pattern B2 ; 5 Responsive Therapies', '5'),
    ('Pattern B2 Therapy Delivery Inhibited by', '1'),
]

# Plain-text (non-regex) substring match per rule, plus the exact magnet event
conditions = [event_type.str.contains(text, regex=False, na=False) for text, _ in rules]
conditions.append(event_type == 'Magnet applied')

# create a list of the values we want to assign for each condition
values = [value for _, value in rules] + ['0']

# create a new column and use np.select to assign values to it using our lists as arguments.
# default='0' keeps unmatched rows as the string '0'; the implicit integer default 0 cannot
# be combined with string choices on newer NumPy versions (TypeError).
parsed_df['Pattern_B2'] = np.select(conditions, values, default='0')

In [19]:
#Assign therapies count

event_type = to_df['EventType']

conditions = []
values = []

# Same rules for every detection pattern. B1 and B2 are included so that their therapy
# events are counted too; without them those rows would fall through to 0.
for pattern in ('Pattern A1', 'Pattern A2', 'Pattern B1', 'Pattern B2'):
    # (text searched for in EventType, therapies delivered). The '; Insufficient Charge;'
    # variants contain the shorter text, so they are covered by the same rule.
    rules = [
        (f'{pattern} ; 1 Responsive Therapy', '1'),
        (f'{pattern} ; 2 Responsive Therapies', '2'),
        (f'{pattern} ; 3 Responsive Therapies', '3'),
        (f'{pattern} ; 4 Responsive Therapies', '4'),
        (f'{pattern} ; 5 Responsive Therapies', '5'),
        (f'{pattern} Therapy Delivery Inhibited by', '0'), # inhibited: no therapy delivered
    ]
    for text, value in rules:
        conditions.append(event_type.str.contains(text, regex=False, na=False))
        values.append(value)

# Magnet swipes deliver no therapy
conditions.append(event_type == 'Magnet applied')
values.append('0')

# create a new column and use np.select to assign values to it using our lists as arguments.
# default='0' keeps unmatched rows as the string '0'; the implicit integer default 0 cannot
# be combined with string choices on newer NumPy versions (TypeError).
parsed_df['Therapies'] = np.select(conditions, values, default='0')

In [20]:
#Check tabulated data
# The frame was sorted by timestamp earlier without renumbering, so rebuild a
# 0..n-1 index (dropping the old one) before the table is displayed and saved.
parsed_df = parsed_df.reset_index(drop=True)
parsed_df

Unnamed: 0,EventTimestamp,EventDuration,EventType,Pattern_A1,Pattern_A2,Pattern_B1,Pattern_B2,Therapies
0,2024-02-21 13:02:10,2.5,Pattern A1 Therapy Delivery Inhibited by Thera...,1,0,0,0,0
1,2024-02-21 13:03:09,3.0,Pattern A1 Therapy Delivery Inhibited by Thera...,1,0,0,0,0
2,2024-02-21 13:05:12,2.5,Pattern A1 Therapy Delivery Inhibited by Thera...,1,0,0,0,0
3,2024-02-21 13:05:46,2.5,Pattern A1 Therapy Delivery Inhibited by Thera...,1,0,0,0,0
4,2024-02-21 13:06:35,2.5,Pattern A1 Therapy Delivery Inhibited by Thera...,1,0,0,0,0
...,...,...,...,...,...,...,...,...
401,2024-02-22 00:57:14,2.5,Pattern A1 Therapy Delivery Inhibited by Thera...,1,0,0,0,0
402,2024-02-22 00:57:51,2.5,Pattern A1 Therapy Delivery Inhibited by Thera...,1,0,0,0,0
403,2024-02-22 01:02:36,0,Magnet applied,0,0,0,0,0
404,2024-02-22 01:03:48,3.0,Pattern A1 Therapy Delivery Inhibited by Thera...,1,0,0,0,0


# Save final df

In [21]:
# Write the parsed event table next to the merged PDF; the row index is saved as the first column
parsed_df.to_csv(OUTDIR + fn_parsed)