In [1]:
"""
Preamble for most code and jupyter notebooks
@author: tobinsouth
@notebook date: 26 May 2021
"""

import numpy as np, pandas as pd

import matplotlib.pyplot as plt, seaborn as sns
import matplotlib as mpl

import math, string, re, pickle, json, time, os, sys, datetime, itertools

from collections import Counter
from tqdm import tqdm

# Set panda's options
pd.set_option("display.max_rows", 40)
pd.set_option("display.max_columns", 120)

# Better graphics
%config InlineBackend.figure_formats = ['retina']
params = {
    'axes.labelsize': 10,
    'font.size': 10,
    'legend.fontsize': 8,
    'xtick.labelsize': 8,
    'ytick.labelsize': 8,
    'figure.figsize': [5.3, 3.34], # Thesis width
    'figure.dpi' : 72, 
    'font.family': "serif",
    'text.usetex': False, # Use LaTeX when desired
    }
plt.rcParams.update(params)

# Set packages to autoreload
%load_ext autoreload
%autoreload 2

## Get all loaded in data

In [139]:
import os
all_reports = []
for file in os.listdir("data"):
    if file[-6:] == 'pickle':
        if file[:16] == 'list_of_reports_':
            with open('data/'+file, 'rb') as f:
                all_reports.extend(pickle.load(f))
                
# # Remove other devices links
for r in all_reports:
    r.pop('other medical devices reported as being used', None)

In [140]:
all_reports_df = pd.DataFrame(all_reports)
len(all_reports_df.drop_duplicates())/ len(all_reports_df)
all_reports_df = all_reports_df.drop_duplicates()

In [159]:
len(all_reports_df)

15264

# Comparing Word Usage

In [141]:
from nltk.tokenize import RegexpTokenizer
tokenizer = RegexpTokenizer('[a-z]+') # Checks that word has a valid start, also returns start

word_usage_by_provider = all_reports_df.groupby('Report source category').apply(lambda df: " ".join(df['Event description'])).str.lower().apply(tokenizer.tokenize)

In [142]:
from nltk.corpus import stopwords
from sklearn.feature_extraction.text import TfidfVectorizer
vectorizer = TfidfVectorizer(stop_words=set(stopwords.words('english')))
X = vectorizer.fit_transform(word_usage_by_provider.apply(" ".join))

word_usage = pd.DataFrame(X.toarray().T, index = vectorizer.get_feature_names(), columns = list(word_usage_by_provider.index))

top_words = []
for col in word_usage.columns:
    top_words.append(list(word_usage[col].sort_values(ascending=False)[:10].index))
top_words = pd.DataFrame(top_words, index = word_usage.columns).T
top_words

Unnamed: 0,Consumer,Government,Health Professional,Industry,Medical Scientist,Other
0,pain,wording,patient,patient,reactive,patient
1,mesh,gloves,device,reported,slides,blood
2,vaginal,required,line,device,suction,line
3,incontinence,nzs,blood,due,positive,leak
4,patient,news,removed,revision,hepatitis,device
5,back,none,patients,procedure,core,machine
6,bladder,choice,catheter,customer,result,leaking
7,pelvic,pro,one,lead,false,found
8,constant,marking,used,surgery,mucin,bag
9,erosion,sterile,noted,underwent,hbc,pump


### Getting Activity Words

In [84]:
def expand_text_style(text):
    tokens = text.split('/')
    tokens = [t.replace('(s)','') for t in tokens]
    tokens = [t.split(',')[0].rstrip().lstrip() for t in tokens]
    return tokens + list(itertools.chain.from_iterable([t.split(' ') for t in tokens]))

[ 2011 Adult Compendium of Physical Activities](https://sites.google.com/site/compendiumofphysicalactivities/home)

In [96]:
CPA_activities = pd.read_csv('vocabulary/CPA_activities.csv')
all_activities = list(itertools.chain.from_iterable(CPA_activities['SPECIFIC ACTIVITIES'].str.split(',')))
all_activities.extend(CPA_activities['MAJOR HEADING'].to_list())
all_activities = list(set([a.rstrip().lstrip().lower() for a in all_activities if type(a) is str]))

In [97]:
# Text splitting
all_activities = list(set(itertools.chain.from_iterable([expand_text_style(text) for text in all_activities])))
all_activities.remove('')

In [98]:
# Manual Adding
activites = ['intercourse',  'overactive'] + ['experienced post', 'removed meshes', 'sexual intercourse']
all_activities = all_activities+activites

### Getting medical probelm terms

Attached also a list with terminology of possible adverse events as classified by [IMDRF](http://imdrf.org/documents/documents.asp)

In [161]:
medical_problems = []
for file in os.listdir('vocabulary'):
    if (file[:5] == 'annex') and (file[-5:] == '.json'):
        with open('vocabulary/'+file, "r") as f:
            annex = json.load(f)
            medical_problems.extend(annex)
medical_problem_terms = [p['term'].lower() for p in medical_problems]
medical_problem_terms = list(set(medical_problem_terms))

In [164]:
medical_problem_terms = list(itertools.chain.from_iterable([expand_text_style(text) for text in medical_problem_terms]))
medical_problem_terms = list(set(medical_problem_terms)) # Remove duplicates

In [102]:
problems = ['dehiscence', 'discoloration', 'hernia', 'pain', 'incontinence', 'prolapse', 'pelvic prolapse', 'complications', 'infection', 'infections', 'painful', 'seroma', 'cystocele', 'dyspareunia', 'inflammation', 'discomfort', 'stress', 'complications', 'inflammation', 'recurrence'] + \
['mesh erosion', 'pain severe', 'severe scarring', 'scarring', 'bladder syndrome', 'overactive bladder', 'incisional hernia', 'hernia recurrence', 'constant pain', 'abdominal pain', 'pain intercourse', 'nerve pain', 'stress incontinence', 'umbilical hernia', 'tract infections', 'leg pain', 'painful sex', 'urinary incontinence', 'incontinence pain', 'cystocele recurrence', 'ongoing pain', 'ability walk', 'extreme pain', 'unable sex'] 

print(len([p for p in problems if p not in medical_problem_terms])/len(problems))
print([p for p in problems if p not in medical_problem_terms])
# [p for p in problems if p in medical_problem_terms]

medical_problem_terms = medical_problem_terms + problems

0.5909090909090909
['pelvic prolapse', 'painful', 'cystocele', 'recurrence', 'mesh erosion', 'pain severe', 'severe scarring', 'scarring', 'bladder syndrome', 'overactive bladder', 'incisional hernia', 'hernia recurrence', 'constant pain', 'pain intercourse', 'nerve pain', 'stress incontinence', 'umbilical hernia', 'tract infections', 'leg pain', 'painful sex', 'incontinence pain', 'cystocele recurrence', 'ongoing pain', 'ability walk', 'extreme pain', 'unable sex']


In [127]:
for w in stopwords.words('english'):
    if w in medical_problem_terms: medical_problem_terms.remove(w)
    if w in all_activities: all_activities.remove(w)

# Look for activities and problems

In [143]:
insulin_df = all_reports_df[all_reports_df['GMDN term'].str.lower().apply(lambda s:'insulin' in s)]

In [144]:
report_word_map = {}
for i, report_text in enumerate(tqdm(insulin_df['Event description'])):
    found_activities = [a for a in all_activities if a in report_text]
    found_probelms = [p for p in medical_problem_terms if p in report_text]
    report_word_map[i] = {'act':found_activities, 'medprob':found_probelms}

100%|██████████| 158/158 [00:00<00:00, 1103.09it/s]


In [148]:
# https://www.nltk.org/_modules/nltk/metrics/association.html
# https://www.nltk.org/howto/collocations.html

import nltk
bigram_measures = nltk.collocations.BigramAssocMeasures() # Use for computing scores

all_singles = list(itertools.chain.from_iterable([r['act']+r['medprob'] for r in report_word_map.values()]))
all_pairs = list(itertools.chain.from_iterable([ [(act, prob) for act in r['act'] for prob in r['medprob']] for r in report_word_map.values()]))

single_frequencies = nltk.FreqDist(all_singles)
pair_frequencies = nltk.FreqDist(all_pairs)
finder = nltk.collocations.BigramCollocationFinder(single_frequencies, pair_frequencies)
scores = finder.score_ngrams(bigram_measures.student_t) # likelihood_ratio

In [149]:
removal = ['h','ic', 'patient', 'th', '-']
[s for s in scores if (s[0][0] not in removal) and (s[0][1] not in removal) and (s[0][0]!=s[0][1])]

[(('deliver', 'liver'), 5.494173473689984),
 (('high', 'due'), 4.44405048759028),
 (('deliver', 'pump'), 4.180940676063974),
 (('2', 'pump'), 4.174166797051436),
 (('ring', 'pump'), 4.060582528552559),
 (('ring', 'liver'), 4.025732986049425),
 (('deliver', 'ring'), 3.8219712062565265),
 (('ring', 'deliver'), 3.8219712062565265),
 (('ton', 'pump'), 3.709626140145161),
 (('2', 'red'), 3.6275327605978296),
 (('level', 'blood'), 3.5511689032850455),
 (('deliver', 'delivered'), 3.5050043117909198),
 (('deliver', 'red'), 3.480801098371401),
 (('0', 'pump'), 3.477933975417401),
 (('level', 'pump'), 3.4619818129115774),
 (('one', 'pump'), 3.4540057316586656),
 (('high', 'blood'), 3.369894329355232),
 (('arm', 'alarm'), 3.2949475041439267),
 (('high', 'pump'), 3.293807797441236),
 (('1', 'pump'), 3.2731609378763493),
 (('2', 'port'), 3.2283391156032177),
 (('2', 'time'), 3.184196278227127),
 (('low', 'pump'), 3.1486344092276064),
 (('high', 'liver'), 3.1403326406510237),
 (('ton', 'button'), 3.

In [None]:
insulin_problems = ['hospitalised']

In [158]:
Counter([t for t in tokenizer.tokenize(" ".join(insulin_df['Event description'].str.lower())) if t not in stopwords.words('english')]).most_common()

[('pump', 131),
 ('patient', 100),
 ('customer', 74),
 ('insulin', 73),
 ('blood', 50),
 ('glucose', 45),
 ('high', 44),
 ('due', 34),
 ('levels', 27),
 ('hospitalised', 23),
 ('battery', 22),
 ('hospital', 21),
 ('reported', 19),
 ('delivered', 19),
 ('set', 18),
 ('needle', 18),
 ('bg', 17),
 ('infusion', 17),
 ('mmol', 16),
 ('l', 16),
 ('cannula', 15),
 ('new', 15),
 ('time', 15),
 ('error', 15),
 ('removed', 13),
 ('e', 13),
 ('advised', 12),
 ('low', 12),
 ('days', 12),
 ('delivery', 12),
 ('screen', 12),
 ('units', 12),
 ('delivering', 12),
 ('admitted', 11),
 ('bolus', 11),
 ('ketones', 11),
 ('later', 10),
 ('alarm', 10),
 ('day', 10),
 ('experienced', 10),
 ('times', 9),
 ('buttons', 9),
 ('states', 9),
 ('button', 9),
 ('level', 8),
 ('errors', 8),
 ('animas', 8),
 ('multiple', 8),
 ('display', 8),
 ('child', 7),
 ('dka', 7),
 ('called', 7),
 ('two', 7),
 ('occurred', 7),
 ('area', 7),
 ('would', 7),
 ('following', 7),
 ('occlusion', 7),
 ('motor', 7),
 ('stated', 7),
 ('bac