# Machine Learning for ICD9 Classification

## MIMIC 2

### Preparation

Loading data

In [1]:
import sys
sys.path.append('/home/datasets/MIMIC/mimic_code')

from jumana.MIMIC_reader import MIMIC_Reader
from time import time
from sklearn import metrics
from jumana.evaluate_predictions import Evaluation
from jumana.icd92int import *
from jumana.MIMIC_data import MIMIC_Data

from sklearn import preprocessing
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.feature_selection import SelectKBest, chi2
from sklearn.multiclass import OneVsRestClassifier
from sklearn.pipeline import Pipeline
from sklearn.svm import LinearSVC
from sklearn.linear_model import LogisticRegression



# Dataset handle; '2' presumably selects the MIMIC-II release (see the section
# heading above) -- TODO confirm against MIMIC_Data.
mimic_data = MIMIC_Data('2')
corpus_path = mimic_data.get_mimic_preprocessed_path()

# Load the training split first, then the test split, using MIMIC_Reader.
data_train, data_test = (
    MIMIC_Reader(mimic_data, is_train=use_train_split)
    for use_train_split in (True, False)
)

# Label names as exposed by the training reader.
categories = data_train.target_names

In [4]:
# Swap the raw documents of both splits for their pre-tokenized versions.
# The corpus file is read one stripped line per document; presumably the lines
# are ordered as all training documents followed by all test documents
# (that is how they are sliced below) -- TODO confirm with the file's producer.
import codecs

# Record the split sizes once, before either `.data` attribute is overwritten.
n_train = len(data_train.data)
n_test = len(data_test.data)

# `with` guarantees the file handle is closed, even if decoding fails midway.
with codecs.open('mimic_code/tal/shaped_mimic2', encoding='utf-8') as corpus_file:
    tokenized_corpus = [line.strip() for line in corpus_file]

# A count mismatch would silently pair documents with the wrong labels,
# so fail loudly instead of slicing a misaligned corpus.
if len(tokenized_corpus) != n_train + n_test:
    raise ValueError(
        "tokenized corpus has %d lines but %d train + %d test documents were loaded"
        % (len(tokenized_corpus), n_train, n_test)
    )

data_train.data = tokenized_corpus[:n_train]
data_test.data = tokenized_corpus[n_train:]

In [3]:
# Convert the label sets into a 2D binary indicator matrix, which is the
# target format the multi-label classifiers below are fitted on.
#
# NOTE: this cell overwrites data_train.target and data_test.target with the
# binarized matrices, so it must be run exactly once per freshly loaded reader.
lb = preprocessing.MultiLabelBinarizer()
data_train.target = lb.fit_transform(data_train.target)

# Remove labels from the test set that don't appear in the train set.
# A set gives constant-time membership tests; testing against the
# `lb.classes_` array directly would rescan every class for every tag.
known_labels = set(lb.classes_)
test_actual_targets = [
    [tag for tag in tag_set if tag in known_labels]
    for tag_set in data_test.target
]
data_test.target = lb.transform(test_actual_targets)

# Setting training targets and testing targets.
y_train, y_test = data_train.target, data_test.target

print('data loaded')

Printing statistics about the data

In [4]:
def size_mb(docs):
    """Return the combined UTF-8 encoded size of ``docs`` in megabytes.

    ``docs`` is an iterable of text strings; one megabyte is counted as
    1e6 bytes.
    """
    total_bytes = 0
    for doc in docs:
        total_bytes += len(doc.encode('utf-8'))
    return total_bytes / 1e6

# Encoded size of each split; the feature-extraction cell reuses these
# values to report throughput in MB/s.
data_train_size_mb, data_test_size_mb = (
    size_mb(data_train.data),
    size_mb(data_test.data),
)

# Document count and size per split, followed by the number of label names.
train_summary = (len(data_train.data), data_train_size_mb)
test_summary = (len(data_test.data), data_test_size_mb)
print("%d documents - %0.3fMB (training set)" % train_summary)
print("%d documents - %0.3fMB (test set)" % test_summary)
print("%d categories\n" % len(categories))

### Extracting Features

#### Extracting tf-idf features

In [5]:
# Fit the tf-idf vocabulary on the training documents, timing the step so
# throughput can be reported.
t0 = time()
vectorizer = TfidfVectorizer(
    sublinear_tf=True,
    max_features=10000,
    stop_words='english',
    ngram_range=(1, 3),  # unigrams up to trigrams
)
X_train = vectorizer.fit_transform(data_train.data)
duration = time() - t0

print("done in %fs at %0.3fMB/s" % (duration, data_train_size_mb / duration))
print("n_samples: %d, n_features: %d\n" % X_train.shape)

# The test documents are only transformed: they reuse the vocabulary and
# idf weights learned from the training split.
print("Extracting features from the test data using the same vectorizer")
t0 = time()
X_test = vectorizer.transform(data_test.data)
duration = time() - t0

print("done in %fs at %0.3fMB/s" % (duration, data_test_size_mb / duration))
print("n_samples: %d, n_features: %d\n" % X_test.shape)

#### Defining Evaluation

In [6]:
# Evaluation is done using Perotte's code from the hierarchy SVM paper.
def evaluate_predictions(pred_binary, name):
    """Score a binary prediction matrix with the project's ``Evaluation`` class.

    pred_binary -- indicator matrix in the column layout of the fitted ``lb``
                   binarizer; it is decoded back into per-document label sets.
    name        -- run identifier; ``name + '.out'`` is handed to
                   ``Evaluation.evaluate`` (presumably an output file name --
                   TODO confirm).
    """
    label_sets = lb.inverse_transform(pred_binary)

    # Map labels to indices using Perotte's mapping; the indices come back
    # in string form, so each one is converted to int afterwards.
    icd2int = init(mimic_data)
    index_strings = icd92int(label_sets, icd2int)
    predictions_int = [
        [int(index) for index in document_indices]
        for document_indices in index_strings
    ]

    Evaluation(mimic_data).evaluate(predictions_int, name + '.out')

### Running Classifiers

In [7]:
def benchmark(clf, name):
    """Fit ``clf`` on the training features, predict the test set, and evaluate.

    Prints the classifier plus wall-clock fit and predict times, then passes
    the predictions and ``name`` on to ``evaluate_predictions``.
    """
    print('_' * 80)
    print("Training: ")
    print(clf)

    fit_started = time()
    clf.fit(X_train, y_train)
    print("train time: %0.3fs" % (time() - fit_started))

    predict_started = time()
    pred = clf.predict(X_test)
    print("test time:  %0.3fs" % (time() - predict_started))

    # Evaluation using Perotte's code.
    evaluate_predictions(pred, name)

#### Defining Classifiers

In [10]:
# NOTE(review): the tf-idf vectorizer is built with max_features=10000, so
# SelectKBest(chi2, k=10000) appears to keep every feature and act as a no-op.
# That would explain why the chi2 SVC run matched the plain SVC run exactly;
# confirm, then try a smaller k or a larger max_features (e.g. 20k/30k).

# Logistic regression on chi2-selected tf-idf features.
clf_log_chi = Pipeline([
    ('chi2', SelectKBest(chi2, k=10000)),
    ('logistic_regression', LogisticRegression(n_jobs=3)),
])

# Linear SVC on chi2-selected tf-idf features.
clf_svc_chi = Pipeline([
    ('chi2', SelectKBest(chi2, k=10000)),
    ('linear_svc', LinearSVC()),
])

# Each model is wrapped one-vs-rest so it can handle the multi-label targets.
experiments = (
    # Linear SVC directly on tf-idf (l2, squared hinge per the original note).
    (OneVsRestClassifier(LinearSVC(), n_jobs=3), 'linear_svc1'),
    (OneVsRestClassifier(clf_log_chi, n_jobs=3), 'logistic_regression_chi_tfidf'),
    # Results were exactly the same as linear_svc1 without chi2; revisit after
    # varying the number of chi2 and tf-idf features.
    (OneVsRestClassifier(clf_svc_chi, n_jobs=3), 'svc_chi_tfidf'),
)

for clf, name in experiments:
    print('=' * 80)
    print(name)
    benchmark(clf, name)