# Medical Text

Medical Text Dataset [https://www.kaggle.com/datasets/chaitanyakck/medical-text/data]

In [1]:
import pandas as pd
import nltk

In [2]:
df = pd.read_csv('data/train.dat', sep="\t", header=None)

In [3]:
df.rename(columns={0:'condition', 1:'abstract'}, inplace=True)
df.head()

Unnamed: 0,condition,abstract
0,4,Catheterization laboratory events and hospital...
1,5,Renal abscess in children. Three cases of rena...
2,2,Hyperplastic polyps seen at sigmoidoscopy are ...
3,5,Subclavian artery to innominate vein fistula a...
4,4,Effect of local inhibition of gamma-aminobutyr...


## Sentence splitting

In [4]:
# Split the text into sentences
sentences = df.abstract.apply(nltk.sent_tokenize)

In [5]:
sentences[0] # Print the first record

['Catheterization laboratory events and hospital outcome with direct angioplasty for acute myocardial infarction To assess the safety of direct infarct angioplasty without antecedent thrombolytic therapy, catheterization laboratory and hospital events were assessed in consecutively treated patients with infarctions involving the left anterior descending (n = 100 patients), right (n = 100), and circumflex (n = 50) coronary arteries.',
 'The groups of patients were similar for age (left anterior descending coronary artery, 59 years; right coronary artery, 58 years; circumflex coronary artery, 62 years), patients with multivessel disease (left anterior descending coronary artery, 55%; right coronary artery, 55%; circumflex coronary artery, 64%), and patients with initial grade 0/1 antegrade flow (left anterior descending coronary artery, 79%; right coronary artery, 84%; circumflex coronary artery, 90%).',
 'Cardiogenic shock was present in eight patients with infarction of the left anteri

## Word Tokenization

In [6]:
tokens = []
for sentence in sentences:
    words = [nltk.word_tokenize(sent) for sent in sentence]
    tokens.append(words)
df["token_abstract"] = tokens # set tokenized sentences as df col to save progress

In [7]:
df.head()

Unnamed: 0,condition,abstract,token_abstract
0,4,Catheterization laboratory events and hospital...,"[[Catheterization, laboratory, events, and, ho..."
1,5,Renal abscess in children. Three cases of rena...,"[[Renal, abscess, in, children, .], [Three, ca..."
2,2,Hyperplastic polyps seen at sigmoidoscopy are ...,"[[Hyperplastic, polyps, seen, at, sigmoidoscop..."
3,5,Subclavian artery to innominate vein fistula a...,"[[Subclavian, artery, to, innominate, vein, fi..."
4,4,Effect of local inhibition of gamma-aminobutyr...,"[[Effect, of, local, inhibition, of, gamma-ami..."


## Lemmatization

In [8]:
wnl = nltk.WordNetLemmatizer()

lemm = []
for record in df.token_abstract:
    for word in record:
        word = [wnl.lemmatize(t) for t in word] # update sentences into lemmatized

In [9]:
df.head()

Unnamed: 0,condition,abstract,token_abstract
0,4,Catheterization laboratory events and hospital...,"[[Catheterization, laboratory, events, and, ho..."
1,5,Renal abscess in children. Three cases of rena...,"[[Renal, abscess, in, children, .], [Three, ca..."
2,2,Hyperplastic polyps seen at sigmoidoscopy are ...,"[[Hyperplastic, polyps, seen, at, sigmoidoscop..."
3,5,Subclavian artery to innominate vein fistula a...,"[[Subclavian, artery, to, innominate, vein, fi..."
4,4,Effect of local inhibition of gamma-aminobutyr...,"[[Effect, of, local, inhibition, of, gamma-ami..."
