In [15]:
import sys

import numpy as np
import pandas as pd
from gensim.models import KeyedVectors
from gensim.scripts.glove2word2vec import glove2word2vec
from nltk.tokenize import TreebankWordTokenizer
from sklearn import tree
from sklearn.ensemble import AdaBoostClassifier
from sklearn.ensemble import BaggingClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.ensemble import VotingClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import classification_report, confusion_matrix, f1_score
from sklearn.model_selection import cross_val_score
from sklearn.neural_network import MLPClassifier
from sklearn.svm import SVC

# Display configuration: never truncate column contents, show up to 999 rows,
# and print numpy arrays in full.
# `None` is the supported way to disable column-width truncation; the old `-1`
# sentinel was deprecated in pandas 1.0 and is rejected by newer releases.
pd.set_option('display.max_colwidth', None)
pd.options.display.max_rows = 999
np.set_printoptions(threshold=sys.maxsize)

In [2]:
# Convert the GloVe text file into word2vec text format, then load the
# converted file as gensim KeyedVectors (bound to `model`).
glove_input_file = '../models/w2v_glove_300.txt'
word2vec_output_file = 'w2v.txt'
glove2word2vec(
    glove_input_file=glove_input_file,
    word2vec_output_file=word2vec_output_file,
)
model = KeyedVectors.load_word2vec_format(
    word2vec_output_file,
    binary=False,
)

In [3]:
# Sanity check of the loaded embeddings: print the six nearest neighbours
# of a query word.
w1 = "stroke"
neighbours = model.most_similar(positive=w1, topn=6)
print(neighbours)
#model.most_similar_cosmul(positive=['hepatoma', 'brain'], negative=['liver'])

[('heart', 0.4828336238861084), ('cardiac', 0.45815250277519226), ('debilitating', 0.4523412585258484), ('infarction', 0.4387247562408447), ('illness', 0.4336370527744293), ('complications', 0.43345579504966736)]


In [4]:
# Read the stop-word file (one entry per line) into `stop_words` and echo it.
file = "../data/stopwords.txt"
with open(file) as stopword_handle:
    raw_text = stopword_handle.read()
stop_words = raw_text.splitlines()

print(stop_words)

['i', 'me', 'my', 'myself', 'we', 'our', 'ours', 'ourselves', 'you', 'your', 'yours', 'yourself', 'yourselves', 'he', 'him', 'his', 'himself', 'she', 'her', 'hers', 'herself', 'it', 'its', 'itself', 'they', 'them', 'their', 'theirs', 'themselves', 'what', 'which', 'who', 'whom', 'this', 'that', 'these', 'those', 'am', 'is', 'are', 'was', 'were', 'be', 'been', 'being', 'have', 'has', 'had', 'having', 'do', 'does', 'did', 'doing', 'a', 'an', 'the', 'and', 'but', 'if', 'or', 'because', 'as', 'until', 'while', 'of', 'at', 'by', 'for', 'with', 'about', 'against', 'between', 'into', 'through', 'during', 'before', 'after', 'above', 'below', 'to', 'from', 'up', 'down', 'in', 'out', 'on', 'off', 'over', 'under', 'again', 'further', 'then', 'once', 'here', 'there', 'when', 'where', 'why', 'how', 'all', 'any', 'both', 'each', 'few', 'more', 'most', 'other', 'some', 'such', 'no', 'nor', 'not', 'only', 'own', 'same', 'so', 'than', 'too', 'very', 's', 't', 'can', 'will', 'just', 'don', 'should', 'no

In [5]:
def sentence_vector(sentence):
    """Return the mean embedding of the tokens in `sentence`.

    The text is tokenized with ``TreebankWordTokenizer``; tokens found in the
    module-level ``stop_words`` list are dropped (comparison is case-sensitive),
    and tokens that have no entry in the module-level ``model`` are skipped.

    Args:
        sentence: text to embed.

    Returns:
        A 1-D numpy array: the element-wise mean of the remaining token
        vectors. If no token has a vector (empty text, only stop words, or
        only out-of-vocabulary tokens) a zero vector of length
        ``model.vector_size`` is returned instead of dividing by zero.
    """
    word_list = TreebankWordTokenizer().tokenize(sentence)
    word_list = [word for word in word_list if word not in stop_words]
    word_vectors = []
    for x in word_list:
        try:
            word_vectors.append(model[x])
        except KeyError:
            # Token is not in the embedding vocabulary; leave it out of the mean.
            continue
    if not word_vectors:
        # Nothing to average: fall back to a neutral vector so callers that
        # vectorize whole columns do not crash on a single degenerate row.
        return np.zeros(model.vector_size, dtype=np.float32)
    return sum(word_vectors) / len(word_vectors)

In [6]:
# load prepartitioned train/test sets
test = pd.read_csv("../data/test.csv")
train = pd.read_csv("../data/AMIA_train_set.csv")

# Attach one sentence embedding per row (adds a 'vec' column to each frame).
test['vec'] = [sentence_vector(x) for x in test.text]
train['vec'] = [sentence_vector(x) for x in train.text]

# load full data set (test rows followed by train rows); later cells read
# df.expansion to build the label list for the confusion matrix.
frames = [test, train]
df = pd.concat(frames)

# Per-abbreviation views used by the evaluation loops.
train_grouped_abbr = train.groupby('abbrev')
test_grouped_abbr = test.groupby('abbrev')

AttributeError: 'DataFrameGroupBy' object has no attribute 'tolist'

In [None]:
# Loop through different abbreviations: fit one classifier per abbreviation
# on its training rows and report metrics on its test rows.
# valid types in ['svm', 'logistic', 'mlp']
clf_type = 'mlp'
for abbr in train.abbrev.unique():

    train_abbr = train_grouped_abbr.get_group(abbr)
    test_abbr = test_grouped_abbr.get_group(abbr)

    X_train = np.array(list(train_abbr.vec))
    y_train = train_abbr.expansion

    X_test = np.array(list(test_abbr.vec))
    y_test = test_abbr.expansion

    if clf_type == 'svm':
        # set up SVM
        clf = SVC(C=1.0, kernel='linear', degree=1, probability=True).fit(X_train, y_train)

    elif clf_type == 'logistic':
        clf = LogisticRegression().fit(X_train, y_train)

    elif clf_type == 'mlp':
        clf = MLPClassifier(hidden_layer_sizes=(13,13,13),max_iter=500).fit(X_train, y_train)

    else:
        # Fail loudly rather than hitting a NameError below, or silently
        # reusing a classifier left over from an earlier iteration or cell.
        raise ValueError(f'unknown clf_type: {clf_type!r}')

    pred = clf.predict(X_test)
    cm = confusion_matrix(y_test, pred, labels=list(set(df.expansion)))
    print()
    print("##" * 20)
    print(" " * 20 + abbr)
    print("##" * 20)

    print(classification_report(y_test, pred))
    print()
    print(f'examples (first 5 cases)\t\t\t\t\t\ttrue_abbr\t\t\tpred_abbr')

    # Print first 5 cases.
    # Rows are taken from test_abbr so each snippet lines up with its entry in
    # y_test / pred; head(5) caps the output at exactly five examples.
    for (_, input_row), true_abbr, pred_abbr in zip(test_abbr.head(5).iterrows(), y_test, pred):

        # Show up to 25 characters of context on each side of the abbreviation.
        sn_start = max(input_row.start - 25, 0)
        sn_end = min(input_row.end + 25, len(input_row.text))

        example_text = input_row.text[sn_start: sn_end]
        print(f'... {example_text} ...\t{true_abbr:<35}\t{pred_abbr}')

In [None]:
def bag_test(X, y):
    """Print cross-validated scores for bagged logistic regression and a plain SVC.

    For each fraction in a fixed list of ``max_samples`` values, a
    ``BaggingClassifier`` wrapping ``LogisticRegression`` is scored with
    7-fold ``cross_val_score`` and its mean and standard deviation are
    printed. Afterwards an un-bagged ``SVC`` is scored the same way.

    Args:
        X: feature matrix handed to ``cross_val_score``.
        y: target labels handed to ``cross_val_score``.

    Returns:
        None. Results are printed. Also seeds numpy's global random state.
    """

    # bagging test
    seed = 1075
    np.random.seed(seed)
    max_samples = [0.05, 0.1, 0.2, 0.5, 0.6, 0.7, 0.8, 0.9]

    # Define classifiers:
    svc = SVC(C=1.0, degree=1)
    logistic = LogisticRegression()

    for n in max_samples:
        print('max samples:', n)
        bagging_clf = BaggingClassifier(logistic, max_samples=n, max_features=10, random_state=seed)
        bagging_scores = cross_val_score(bagging_clf, X, y, cv=7, n_jobs=-1)

        # Label the line with the estimator that is actually being bagged.
        print("Mean of: {1:.3f}, std: (+/-) {2:.3f} [Bagging {0}]\n".format(logistic.__class__.__name__,
                            bagging_scores.mean(), bagging_scores.std()))

    # NOTE(review): the un-bagged baseline is an SVC while the bagged model is
    # LogisticRegression, so the two numbers compare different estimators.
    # Confirm this is intended.
    vanilla_scores = cross_val_score(svc, X, y, cv=7, n_jobs=-1)

    print("Mean of: {1:.3f}, std: (+/-) {2:.3f} [Vanilla {0}]\n".format(svc.__class__.__name__,
                            vanilla_scores.mean(), vanilla_scores.std()))

In [21]:
# valid types in ['svm', 'logistic', 'mlp', 'bagging', 'boosting', 'rf', 'voting']
clf_type = 'rf'
seed = 1032
# Number of AdaBoost estimators; defined up front because both the 'boosting'
# and the 'voting' branches need it.
num_trees = 70

# Test: Loop through different abbreviations.
# valid abbrevs in ['MR', 'MS', 'MOM', 'FISH', 'OR', 'US']
for abbr in ['MR']:

    train_abbr = train_grouped_abbr.get_group(abbr)
    test_abbr = test_grouped_abbr.get_group(abbr)

    X_train = np.array(list(train_abbr.vec))
    y_train = train_abbr.expansion

    X_test = np.array(list(test_abbr.vec))
    y_test = test_abbr.expansion

    if clf_type == 'svm':
        # set up SVM
        clf = SVC(C=1.0, degree=1, probability=True).fit(X_train, y_train)

    elif clf_type == 'logistic':
        clf = LogisticRegression().fit(X_train, y_train)

    elif clf_type == 'mlp':
        clf = MLPClassifier(hidden_layer_sizes=(13,13,13),max_iter=500).fit(X_train, y_train)

    elif clf_type == 'bagging':
        clf = BaggingClassifier(tree.DecisionTreeClassifier(random_state=1)).fit(X_train, y_train)

    elif clf_type == 'boosting':
        clf = AdaBoostClassifier(n_estimators=num_trees, random_state=seed).fit(X_train, y_train)

    elif clf_type == 'rf':
        clf = RandomForestClassifier().fit(X_train, y_train)

    elif clf_type == 'voting':
        svm = SVC(C=1.0, degree=1, probability=True)
        logistic = LogisticRegression()
        mlp = MLPClassifier(hidden_layer_sizes=(13,13,13),max_iter=500)
        bagging = BaggingClassifier(tree.DecisionTreeClassifier(random_state=1))
        boosting = AdaBoostClassifier(n_estimators=num_trees, random_state=seed)
        rf = RandomForestClassifier()

        estimators = [('svm', svm), ('logistic', logistic), ('mlp', mlp), ('bagging', bagging), ('boosting', boosting), ('rf', rf)]
        clf = VotingClassifier(estimators).fit(X_train, y_train)

    else:
        # Fail loudly rather than hitting a NameError below, or silently
        # reusing a classifier left over from an earlier iteration or cell.
        raise ValueError(f'unknown clf_type: {clf_type!r}')

    pred = clf.predict(X_test)
    cm = confusion_matrix(y_test, pred, labels=list(set(df.expansion)))

    print(clf_type)
    print('Accuracy:', clf.score(X_test,y_test))

    print()
    print("##" * 20)
    print(" " * 20 + abbr)
    print("##" * 20)

    print(classification_report(y_test, pred))
    print()
    print(f'examples (first 5 cases)\t\t\t\t\t\ttrue_abbr\t\t\tpred_abbr')

    # Print first 5 cases (the per-example print is currently disabled).
    # Rows are taken from test_abbr so each snippet lines up with its entry in
    # y_test / pred; head(5) caps the loop at exactly five examples.
    for (_, input_row), true_abbr, pred_abbr in zip(test_abbr.head(5).iterrows(), y_test, pred):

        # Up to 25 characters of context on each side of the abbreviation.
        sn_start = max(input_row.start - 25, 0)
        sn_end = min(input_row.end + 25, len(input_row.text))

        example_text = input_row.text[sn_start: sn_end]
        #print(f'... {example_text} ...\t{true_abbr:<35}\t{pred_abbr}')


'''
References:

https://github.com/prathamesh1993/Clinical-Acronym-disambiguation
https://www.datacamp.com/community/tutorials/ensemble-learning-python
https://scikit-learn.org/stable/modules/ensemble.html
https://machinelearningmastery.com/ensemble-machine-learning-algorithms-python-scikit-learn/
https://www.datacamp.com/community/tutorials/random-forests-classifier-python
https://www.springboard.com/blog/beginners-guide-neural-network-in-python-scikit-learn-0-18/
https://medium.com/@rrfd/boosting-bagging-and-stacking-ensemble-methods-with-sklearn-and-mlens-a455c0c982de
'''

rf
Accuracy: 0.76

########################################
                    MR
########################################
                          precision    recall  f1-score   support

         GENERAL ENGLISH       0.00      0.00      0.00         1
      magnetic resonance       0.73      0.96      0.83        28
    mitral regurgitation       0.85      0.55      0.67        20
myocardial infarction:MI       0.00      0.00      0.00         1

               micro avg       0.76      0.76      0.76        50
               macro avg       0.39      0.38      0.37        50
            weighted avg       0.75      0.76      0.73        50


examples (first 5 cases)						true_abbr			pred_abbr


  'precision', 'predicted', average, warn_for)


'\nhttps://github.com/prathamesh1993/Clinical-Acronym-disambiguation\nhttps://www.datacamp.com/community/tutorials/ensemble-learning-python\nhttps://scikit-learn.org/stable/modules/ensemble.html\nhttps://machinelearningmastery.com/ensemble-machine-learning-algorithms-python-scikit-learn/\nhttps://www.datacamp.com/community/tutorials/random-forests-classifier-python\nhttps://www.springboard.com/blog/beginners-guide-neural-network-in-python-scikit-learn-0-18/\nhttps://medium.com/@rrfd/boosting-bagging-and-stacking-ensemble-methods-with-sklearn-and-mlens-a455c0c982de\n'

In [None]:
# Summary metrics for the most recently fitted classifier; relies on `clf`,
# `X_train`, `y_train`, `y_test` and `pred` left behind by the evaluation loop.
# Cross-validated scores on the training split (default scoring for a
# classifier is accuracy); cross_val_score works on clones, so `clf` itself
# stays fitted.
cross_val_scores = cross_val_score(clf, X_train, y_train)
print('accuracy: {}'.format(cross_val_scores))
print()
#print(set(df.expansion))
#print([len(df[df.expansion == x]) for x in set(df.expansion)])
print()
#print(cm)
print()
# Support-weighted F1 on the held-out test rows.
print(f1_score(y_test,pred,average = 'weighted'))

In [None]:
# Example inputs. Each assignment replaces the previous one, so only the final
# list is what the loop below iterates over.
sentence = 'Patient was tested for US, cystic fibrosis and other heritable diseases '

sentence = (
    "Procedure went without complications, and the patient was sent to the "
    "floor postoperatively after he was extubated in the FISH. Thoracic surgery was consulted "
    "on the day of surgery. Gastrografin upper GI study performed on admission showed no leak. "
    "However, the patient was admitted under the care of thoracic surgery team, and he was kept "
    "n.p.o. and followed up on daily basis for any change in vital signs, chest pain for another "
    "upper GI swallow study which was done 7 days after his symptoms started. "
)


sentence = [
    'she had an US to determine if the baby was good',
    'he had an US to determine if there was a mass',
    'If the patient continues to require blood transfusions an/or if he becomes hemodynamically unstable he will need to be taken to the OR for cystoscopy with clot evacuation. Since his surgery was so recently performed we would rather try to hold off on this, however, as not to disrupt the anastomosis with his new transplanted kidney.',
]

# Embed each example and ask the currently fitted `clf` for its expansion.
for s in sentence:
    print('sentence:', s)
    #print(vector_breakage(sentence))
    features = sentence_vector(s).reshape(1, -1)
    print('prediction:', clf.predict(features))
    print()

## Try

In [None]:
# Scratch cell: put free text in `try_out` to get a prediction from the
# currently fitted `clf`.
# NOTE(review): `try_out` is an empty placeholder; fill it in before running.
# NOTE(review): `vector_breakage` is not defined in any cell visible here;
# confirm it exists (or remove the call) before running.
try_out = ''
print(vector_breakage(try_out))
print(clf.predict(sentence_vector(try_out).reshape(1, -1)))