In [1]:
import numpy as np
import pandas as pd
# Display configuration: show every column, keep wide frames on one line,
# and never truncate long cell values (the terms can be long).
pd.set_option('display.max_columns', None)
pd.set_option('display.expand_frame_repr', False)
# None (not -1) is the supported way to disable truncation; the -1 sentinel
# was deprecated by pandas and triggers a warning or error on newer releases.
pd.set_option('display.max_colwidth', None)
import json #read json file

#text preprocessing
import nltk
from nltk.corpus import stopwords
from nltk.stem import PorterStemmer
from nltk.stem import WordNetLemmatizer
import string

from sklearn.model_selection import train_test_split

  pd.set_option('max_colwidth', -1)


In [2]:
#text preprocessing function
def prepare_text(df):
    """Normalise the ``term`` column and return a two-column modelling frame.

    The pipeline is: lower-case -> tokenise -> strip punctuation -> drop
    English stop words -> Porter-stem -> WordNet-lemmatise. Every stage is
    stored as a new column on ``df``, so the frame passed in is modified in
    place by this call.

    Parameters
    ----------
    df : pandas.DataFrame
        Must provide the columns ``term`` and ``code``.

    Returns
    -------
    pandas.DataFrame
        An independent copy containing only ``lemmatized_text`` and ``code``.
    """
    # 1. Case-fold the raw term.
    df['prepare_term'] = df['term'].str.lower()

    # 2. Tokenise. The result is kept as a column on df but is not consumed
    #    by any of the later stages in this function.
    df['tokens'] = df['prepare_term'].apply(nltk.word_tokenize)

    # 3. Remove every character listed in string.punctuation.
    punct_table = str.maketrans('', '', string.punctuation)
    df['no_punct_text'] = df['prepare_term'].str.translate(punct_table)

    # 4. Drop English stop words.
    english_stops = set(stopwords.words('english'))

    def without_stop_words(sentence):
        kept = [w for w in sentence.split() if w.lower() not in english_stops]
        return " ".join(kept)

    df['filtered_text'] = df['no_punct_text'].apply(without_stop_words)

    # 5. Porter stemming, word by word.
    porter = PorterStemmer()

    def stem_words(sentence):
        return " ".join(map(porter.stem, sentence.split()))

    df['stemmed_text'] = df['filtered_text'].apply(stem_words)

    # 6. Lemmatise the already-stemmed words.
    wordnet = WordNetLemmatizer()

    def lemmatize_words(sentence):
        return " ".join(map(wordnet.lemmatize, sentence.split()))

    df['lemmatized_text'] = df['stemmed_text'].apply(lemmatize_words)

    # Hand back only what the model needs, detached from df.
    return df[['lemmatized_text', 'code']].copy()

In [3]:
# Load the tab-separated dataset and display it.
data = pd.read_csv(
    '../data/dataset.tsv',
    sep='\t',
)
data


Unnamed: 0,term,code
0,infection caused by Staphylococcus Coagulase negative,A49.0
1,infection due to Staphylococcus Coagulase negative,A49.0
2,fetal infection caused by Staphylococcus aureus,A49.0
3,early neonatal infection caused by Staphylococcus aureus,A49.0
4,bacteremia caused by Methicillin resistant Staphylococcus aureus,A49.0
...,...,...
16563,atherosclerosis of aortoiliac bypass graft,Z95.8
16564,atherosclerosis aortoiliac bypass graft,Z95.8
16565,peripheral nerve neurostimulator device in situ,Z95.8
16566,peripheral nerve neuropacemaker in situ,Z95.8


In [4]:
# Run the preprocessing pipeline; note that prepare_text also adds its
# intermediate columns to `data` itself.
new_data = prepare_text(df=data)
new_data

Unnamed: 0,lemmatized_text,code
0,infect caus staphylococcu coagulas neg,A49.0
1,infect due staphylococcu coagulas neg,A49.0
2,fetal infect caus staphylococcu aureu,A49.0
3,earli neonat infect caus staphylococcu aureu,A49.0
4,bacteremia caus methicillin resist staphylococcu aureu,A49.0
...,...,...
16563,atherosclerosi aortoiliac bypass graft,Z95.8
16564,atherosclerosi aortoiliac bypass graft,Z95.8
16565,peripher nerv neurostimul devic situ,Z95.8
16566,peripher nerv neuropacemak situ,Z95.8


In [5]:
#Train test split
# Hold out 20% of the rows; the fixed random_state keeps the split repeatable.
train, test = train_test_split(
    new_data,
    test_size=0.2,
    random_state=42,
)
print(f'Train size : {len(train)} rows')
print(f'Test size : {len(test)} rows')

Train size : 13254 rows
Test size : 3314 rows


In [6]:
#Create train set txt for fastText
# One example per line, written as "<text> __label__<code>".
try:
    # fastText expects UTF-8 input, so do not rely on the platform default
    # encoding when creating the file.
    with open('train.txt', 'w', encoding='utf-8') as f:
        # Columns are addressed by position (0 = text, 1 = label); iterating
        # the two columns together avoids a scalar .iloc lookup per row.
        for text, label in zip(train.iloc[:, 0], train.iloc[:, 1]):
            f.write(f'{text} __label__{label}\n')
except FileNotFoundError:
    print("Could not create 'train.txt': the target directory does not exist")

In [7]:
#Create test set txt for performance measurement
# One example per line, written as "<text> __label__<code>".
try:
    # fastText expects UTF-8 input, so do not rely on the platform default
    # encoding when creating the file.
    with open('test.txt', 'w', encoding='utf-8') as f:
        # Columns are addressed by position (0 = text, 1 = label); iterating
        # the two columns together avoids a scalar .iloc lookup per row.
        for text, label in zip(test.iloc[:, 0], test.iloc[:, 1]):
            f.write(f'{text} __label__{label}\n')
except FileNotFoundError:
    print("Could not create 'test.txt': the target directory does not exist")

In [9]:
#JSON file -- not used
# The context manager closes the file handle once parsing is done (the bare
# open() call previously left it open); JSON text is read as UTF-8 explicitly.
with open('../data/icd10_choose.json', encoding='utf-8') as f:
    data_json = json.load(f)
data_json

[{'code': 'A49.0',
  'title': 'Staphylococcal infection, unspecified site',
  'inclusion': None},
 {'code': 'A56.1',
  'title': 'Chlamydial infection of pelviperitoneum and other genitourinary organs',
  'inclusion': None},
 {'code': 'C07',
  'title': 'Malignant neoplasm of parotid gland',
  'inclusion': None},
 {'code': 'C40.1',
  'title': 'Malignant neoplasm: Short bones of upper limb',
  'inclusion': None},
 {'code': 'C41.0',
  'title': 'Malignant neoplasm: Bones of skull and face',
  'inclusion': None},
 {'code': 'C43.3',
  'title': 'Malignant neoplasm: Malignant melanoma of other and unspecified parts of face',
  'inclusion': None},
 {'code': 'C44.9',
  'title': 'Malignant neoplasm: Malignant neoplasm of skin, unspecified',
  'inclusion': None},
 {'code': 'C71.9',
  'title': 'Malignant neoplasm: Brain, unspecified',
  'inclusion': None},
 {'code': 'C81.2',
  'title': 'Mixed cellularity classical Hodgkin lymphoma',
  'inclusion': None},
 {'code': 'C84.0', 'title': 'Mycosis fungoide