In [28]:
# Standard library
import string

# Third-party
import fasttext
import nltk
import pandas as pd
from nltk.corpus import stopwords
from nltk.stem import PorterStemmer, WordNetLemmatizer

# fastText model loaded once from disk; later cells use it via `model`.
model = fasttext.load_model("./model/ICD10.bin")



In [29]:
def prepare_text(df):
    """Normalise the ``term`` column and return it next to ``code``.

    Each term goes through, in order: lower-casing, removal of every
    character in ``string.punctuation``, removal of NLTK English stop
    words, Porter stemming, and WordNet lemmatization of the stems.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame with a ``term`` column (text) and a ``code`` column.

    Returns
    -------
    pandas.DataFrame
        A new frame with exactly two columns, ``lemmatized_text`` and
        ``code``, carrying the same index as ``df``.

    Raises
    ------
    KeyError
        If ``df`` has no ``term`` or ``code`` column.

    Notes
    -----
    ``df`` is not modified: no intermediate columns are written to it, so
    calling this function repeatedly on the same frame is safe.
    Missing terms (NaN/None) are treated as empty strings and produce an
    empty ``lemmatized_text`` instead of raising.
    """
    # Build the per-call helpers once instead of once per row.
    punct_table = str.maketrans('', '', string.punctuation)
    stop_words = set(stopwords.words('english'))
    stemmer = PorterStemmer()
    lemmatizer = WordNetLemmatizer()

    def _normalise(text):
        # Text is lower-cased first, so the stop-word test needs no extra
        # case folding.
        words = text.lower().translate(punct_table).split()
        kept = (word for word in words if word not in stop_words)
        # The lemmatizer runs on the stem, same order as the staged pipeline.
        return " ".join(lemmatizer.lemmatize(stemmer.stem(word)) for word in kept)

    # fillna/astype make NaN and non-string cells safe to process.
    cleaned = df['term'].fillna('').astype(str).map(_normalise)

    # Assemble the result on a copy so the caller's frame stays untouched.
    new_df = df[['code']].copy()
    new_df.insert(0, 'lemmatized_text', cleaned)
    return new_df

In [30]:
# Read the tab-separated dataset and run it through prepare_text();
# the bare name on the last line renders the resulting frame.
filename = './data/dataset.tsv'
validation_tsv = pd.read_table(filename)
prepared_val = prepare_text(validation_tsv)
prepared_val

Unnamed: 0,lemmatized_text,code
0,infect caus staphylococcu coagulas neg,A49.0
1,infect due staphylococcu coagulas neg,A49.0
2,fetal infect caus staphylococcu aureu,A49.0
3,earli neonat infect caus staphylococcu aureu,A49.0
4,bacteremia caus methicillin resist staphylococ...,A49.0
...,...,...
16563,atherosclerosi aortoiliac bypass graft,Z95.8
16564,atherosclerosi aortoiliac bypass graft,Z95.8
16565,peripher nerv neurostimul devic situ,Z95.8
16566,peripher nerv neuropacemak situ,Z95.8


In [31]:
# Predict the top label for every term and tabulate term -> predicted code.
#
# NOTE(review): prediction runs on the raw `term` text, while the normalised
# text in prepared_val['lemmatized_text'] is never passed to the model.
# Confirm which form the model was trained on.
label_prefix = '__label__'
val_terms = validation_tsv['term'].to_list()
predict = model.predict(val_terms)

# str.strip('__label__') would strip any of the characters _, l, a, b, e from
# BOTH ends of the label, which can eat into the code itself. Remove the
# literal prefix instead.
result = pd.DataFrame({
    'term': val_terms,
    'codePredict': [
        p[0][len(label_prefix):] if p[0].startswith(label_prefix) else p[0]
        for p in predict[0]
    ],
})
result

Unnamed: 0,term,codePredict
0,infection caused by Staphylococcus Coagulase n...,K76.8
1,infection due to Staphylococcus Coagulase nega...,E03.8
2,fetal infection caused by Staphylococcus aureus,Q87.8
3,early neonatal infection caused by Staphylococ...,K76.8
4,bacteremia caused by Methicillin resistant Sta...,A49.0
...,...,...
16563,atherosclerosis of aortoiliac bypass graft,Z95.8
16564,atherosclerosis aortoiliac bypass graft,Z95.8
16565,peripheral nerve neurostimulator device in situ,Z95.8
16566,peripheral nerve neuropacemaker in situ,Z95.8
