# Utils 

In [86]:
import pandas as pd
import numpy as np
import re

from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import roc_auc_score, precision_score, recall_score

In [4]:
df = pd.read_csv('AP_ICD10.tsv', sep='\t')
df.head()

Unnamed: 0,chf cmplnt,A/P,icd10encounterdiagcode,icd10encounterdiagdescr
0,"Followup: Osteoarthritis of hip; Hip pain, Pos...",Doing well post-op. Progressing as expected. o...,M1612,"M1612: Unilateral primary osteoarthritis, left..."
1,"Knee pain, Post Op",The fracture is healing well in good alignment...,S82025A,S82025A: Nondisplaced longitudinal fracture of...
2,Followup: Osteoarthritis of knee; Knee pain,Osteoarthritis of the Right knee.The patient e...,M1711,"M1711: Unilateral primary osteoarthritis, righ..."
3,Hip pain,Left hip Trocanteric Bursitis. trochanteric bu...,M7062,"M7062: Trochanteric bursitis, left hip"
4,"Knee pain, Post Op",Left Medial Meniscus Tear. Doing well post-op....,S83222A,"S83222A: Prph tear of medial meniscus, current..."


In [11]:
df.dtypes

chf cmplnt                 object
A/P                        object
icd10encounterdiagcode     object
icd10encounterdiagdescr    object
dtype: object

# Data Processing

In [27]:
df = df[~df[['A/P', 'icd10encounterdiagcode']].isnull().any(axis=1)]

In [28]:
df['icd_cat']  = df.apply(lambda row: row['icd10encounterdiagcode'][:3], axis=1)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  """Entry point for launching an IPython kernel.


In [58]:
def tokenize_txt(row):
    new_row = re.sub(r'[^\w\s-]', '', row)
    new_row = new_row.lower().strip()
    return new_row

In [59]:
df['token_ap'] = df.apply(lambda row: tokenize_txt(row['A/P']), axis=1)

In [60]:
df_new = df[['token_ap', 'icd_cat']]

In [65]:
df_new.head()

Unnamed: 0,token_ap,icd_cat
0,doing well post-op progressing as expected ost...,M16
1,the fracture is healing well in good alignment...,S82
2,osteoarthritis of the right kneethe patient ex...,M17
3,left hip trocanteric bursitis trochanteric bur...,M70
4,left medial meniscus tear doing well post-op p...,S83


# Data Modeling 

In [77]:
df_new.icd_cat.nunique()

34

In [79]:
len(df_new)

240

In [78]:
# for each icd category it's an unbalanced classification
df_new.groupby('icd_cat').count()

Unnamed: 0_level_0,token_ap
icd_cat,Unnamed: 1_level_1
G56,1
M00,2
M13,1
M16,17
M17,45
M18,1
M19,2
M21,1
M22,5
M23,16


In [110]:
def prepare_train_test_data(df_train, df_test, icd_cat):    
    # upsampling
    
    # tfidf score
    vectorizer = TfidfVectorizer()
    vectorizer.fit(df_train.token_ap.values)
    
    X_train = vectorizer.transform(df_train.token_ap.values)
    y_train = df_train.icd_cat.values == icd_cat
    
    X_test = vectorizer.transform(df_test.token_ap.values)
    y_test = df_test.icd_cat.values == icd_cat
    
    return X_train, y_train, X_test, y_test

In [114]:
# training and validation
for icd_cat in df_train.icd_cat.unique():
    # split train test
    test_idx = np.random.randint(len(df_new), size=int(len(df_new)/3))
    df_train = df_new[~df_new.index.isin(test_idx)]
    df_test = df_new[df_new.index.isin(test_idx)]
    
    if len(df_train[df_train.icd_cat==icd_cat])==0 or len(df_train[df_train.icd_cat==icd_cat])==0:
        continue
    print('Fitting classifer for ICD category {}'.format(icd_cat))
    X_train, y_train, X_test, y_test = prepare_train_test_data(df_train, df_test, icd_cat)
    
    # fit model
    clf = LogisticRegression()
    clf.fit(X_train, y_train)
    
    # train roc_auc
    y_train_score = clf.predict_proba(X_train)[:, 1]
    roc_auc_train = roc_auc_score(y_train, y_train_score)  
    print('Training roc_auc is {}'.format(roc_auc_train))
    
    # test roc_auc
    y_pred = clf.predict(X_test)
    precision = precision_score(y_test.astype(int), y_pred.astype(int))
    recall = recall_score(y_test.astype(int), y_pred.astype(int))
    
    print('Testing precision is {} and recall is {}'.format(precision, recall))
    print(8*'*')

Fitting classifer for ICD category M16
Training roc_auc is 0.9991634132738427
Testing precision is 1.0 and recall is 0.16666666666666666
********
Fitting classifer for ICD category M17
Training roc_auc is 0.9978041282389107
Testing precision is 0.875 and recall is 0.5833333333333334
********
Fitting classifer for ICD category M70
Training roc_auc is 0.9964705882352941
Testing precision is 0.0 and recall is 0.0
********
Fitting classifer for ICD category S83
Training roc_auc is 0.9815233785822022
Testing precision is 0.75 and recall is 0.6428571428571429
********
Fitting classifer for ICD category M24
Training roc_auc is 1.0
Testing precision is 0.0 and recall is 0.0
********
Fitting classifer for ICD category S46
Training roc_auc is 0.9992603550295858
Testing precision is 0.0 and recall is 0.0
********
Fitting classifer for ICD category M25
Training roc_auc is 0.9958579881656804
Testing precision is 0.0 and recall is 0.0
********
Fitting classifer for ICD category S82
Training roc_auc 

  'precision', 'predicted', average, warn_for)


Training roc_auc is 1.0
Testing precision is 0.0 and recall is 0.0
********
Fitting classifer for ICD category M00
Training roc_auc is 1.0
Testing precision is 0.0 and recall is 0.0
********
Fitting classifer for ICD category M19
Training roc_auc is 1.0
Testing precision is 0.0 and recall is 0.0
********
Fitting classifer for ICD category S63
Training roc_auc is 1.0
Testing precision is 0.0 and recall is 0.0
********
Fitting classifer for ICD category S52
Training roc_auc is 1.0
Testing precision is 0.0 and recall is 0.0
********
Fitting classifer for ICD category M23
Training roc_auc is 0.9536749482401656
Testing precision is 0.0 and recall is 0.0
********
Fitting classifer for ICD category G56
Training roc_auc is 1.0
Testing precision is 0.0 and recall is 0.0
********
Fitting classifer for ICD category M22
Training roc_auc is 0.9990253411306043
Testing precision is 0.0 and recall is 0.0
********
Fitting classifer for ICD category S68
Training roc_auc is 1.0
Testing precision is 0.0 a