In [13]:
import sys
from gensim.models import KeyedVectors
from gensim.scripts.glove2word2vec import glove2word2vec
from nltk.tokenize import TreebankWordTokenizer
import pandas as pd
import numpy as np
from sklearn.svm import SVC
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import confusion_matrix
from sklearn.model_selection import cross_val_score
from sklearn.metrics import f1_score
from sklearn.metrics import precision_recall_fscore_support
pd.set_option('display.max_colwidth', -1)
pd.options.display.max_rows = 999
np.set_printoptions(threshold=sys.maxsize)

In [14]:
glove_input_file = '../models/w2v_glove_300.txt'
word2vec_output_file = 'w2v.txt'
glove2word2vec(glove_input_file, word2vec_output_file)
model = KeyedVectors.load_word2vec_format(word2vec_output_file, binary=False)

In [15]:
w1 ="stroke"
print(model.most_similar(positive = w1, topn = 6))
#model.most_similar_cosmul(positive=['hepatoma', 'brain'], negative=['liver'])

[('heart', 0.4828336238861084), ('cardiac', 0.45815250277519226), ('debilitating', 0.4523412585258484), ('infarction', 0.4387247562408447), ('illness', 0.4336370527744293), ('complications', 0.43345579504966736)]


In [16]:
file = "../data/stopwords.txt"
with open(file) as f:
    stop_words = f.read().splitlines()

print(stop_words)

['i', 'me', 'my', 'myself', 'we', 'our', 'ours', 'ourselves', 'you', 'your', 'yours', 'yourself', 'yourselves', 'he', 'him', 'his', 'himself', 'she', 'her', 'hers', 'herself', 'it', 'its', 'itself', 'they', 'them', 'their', 'theirs', 'themselves', 'what', 'which', 'who', 'whom', 'this', 'that', 'these', 'those', 'am', 'is', 'are', 'was', 'were', 'be', 'been', 'being', 'have', 'has', 'had', 'having', 'do', 'does', 'did', 'doing', 'a', 'an', 'the', 'and', 'but', 'if', 'or', 'because', 'as', 'until', 'while', 'of', 'at', 'by', 'for', 'with', 'about', 'against', 'between', 'into', 'through', 'during', 'before', 'after', 'above', 'below', 'to', 'from', 'up', 'down', 'in', 'out', 'on', 'off', 'over', 'under', 'again', 'further', 'then', 'once', 'here', 'there', 'when', 'where', 'why', 'how', 'all', 'any', 'both', 'each', 'few', 'more', 'most', 'other', 'some', 'such', 'no', 'nor', 'not', 'only', 'own', 'same', 'so', 'than', 'too', 'very', 's', 't', 'can', 'will', 'just', 'don', 'should', 'no

In [17]:
def sentence_vector(sentence):
    word_list = TreebankWordTokenizer().tokenize(sentence)
    word_list = [word for word in word_list if word not in stop_words]
    word_vectors = []
    for x in word_list:
        try:
            word_vectors.append(model[x])
        except KeyError:
            None    
    return sum(word_vectors)/len(word_vectors)

def vector_breakage(sentence):
    word_list = TreebankWordTokenizer().tokenize(sentence)
    word_list = [word for word in word_list if word not in stop_words]
    word_vectors_list = []
    for x in word_list:
        try:
            if len(model[x])==200:
                word_vectors_list.append(x)
        except:
            None
        else:
            None
    return word_vectors_list

In [18]:
# load prepartitioned train/test sets
test = pd.read_csv("../data/test.csv")
train = pd.read_csv("../data/AMIA_train_set.csv")

#print(train)

# load full data set
frames = [test, train]
df = pd.concat(frames)
df = df[['text','expansion']]
df['vec'] = [sentence_vector(x) for x in df.text]
df.expansion.unique()

test = test[['text','expansion', 'case']]
train = train[['text','expansion']]
test['vec'] = [sentence_vector(x) for x in test.text]
train['vec'] = [sentence_vector(x) for x in train.text]

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  app.launch_new_instance()
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy


In [19]:
X = list(df.vec)
X = np.array(X)
y = df.expansion

X1 = list(train.vec)
X_train = np.array(X1)
y_train = train.expansion

X2 = list(test.vec)
X_test = np.array(X2)
y_test = test.expansion

In [23]:
# Define classifiers:
svc = SVC(C=1.0, kernel='linear', degree=1).fit(X_train, y_train)
logistic = LogisticRegression().fit(X_train, y_train)

In [26]:
# SVM results
pred = svc.predict(X_test)
cm = confusion_matrix(y_test, pred,labels=list(set(df.expansion)))
cross_val_scores = cross_val_score(clf, X, y, cv=7)

predicted_expansion = list(pred)
case = test['case'].tolist()

results = pd.DataFrame(
    {'case': case,
     'expansion': predicted_expansion
    })

#print(results)



In [24]:
print('accuracy: {}'.format(cross_val_scores))
print()
#print(set(df.expansion))
#print([len(df[df.expansion == x]) for x in set(df.expansion)])
print()
#print(cm)
print()
print(f1_score(y_test,pred,average = 'weighted'))



           case                          expansion
0    26043_OR    operating room                   
1    26272_OR    magnetic resonance               
2    19905_IT    magnetic resonance               
3    19749_IT    operating room                   
4    26095_OR    ultrasound                       
5    36650_US    United States                    
6    22809_MR    mitral regurgitation             
7    22752_MR    mitral regurgitation             
8    26358_OR    operating room                   
9    23977_MS    morphine sulfate                 
10   19624_IT    multiple sclerosis               
11   36856_US    United States                    
12   19833_IT    intrathecal                      
13   22592_MR    mitral regurgitation             
14   22822_MR    magnetic resonance               
15   21726_MOM   GENERAL ENGLISH                  
16   19714_IT    United States                    
17   36593_US    United States                    
18   26398_OR    operating room

In [21]:
# LogisticRegression results
pred = logisic.predict(X_test)
cm = confusion_matrix(y_test, pred,labels=list(set(df.expansion)))
cross_val_scores = cross_val_score(clf, X, y, cv=7)

predicted_expansion = list(pred)
case = test['case'].tolist()

results = pd.DataFrame(
    {'case': case,
     'expansion': predicted_expansion
    })

#print(results)



           case                          expansion
0    26043_OR    operating room                   
1    26272_OR    magnetic resonance               
2    19905_IT    multiple sclerosis               
3    19749_IT    operating room                   
4    26095_OR    operating room                   
5    36650_US    United States                    
6    22809_MR    mitral regurgitation             
7    22752_MR    mitral regurgitation             
8    26358_OR    operating room                   
9    23977_MS    morphine sulfate                 
10   19624_IT    operating room                   
11   36856_US    United States                    
12   19833_IT    operating room                   
13   22592_MR    mitral regurgitation             
14   22822_MR    magnetic resonance               
15   21726_MOM   GENERAL ENGLISH                  
16   19714_IT    United States                    
17   36593_US    United States                    
18   26398_OR    fluorescent in

In [25]:
print('accuracy: {}'.format(cross_val_scores))
print()
#print(set(df.expansion))
#print([len(df[df.expansion == x]) for x in set(df.expansion)])
print()
#print(cm)
print()
print(f1_score(y_test,pred,average = 'weighted'))

accuracy: [0.69942197 0.68712871 0.687251   0.68336673 0.70445344 0.70325203
 0.71574642]



0.6675536551666332


  'precision', 'predicted', average, warn_for)


In [None]:
sentence = 'Patient was tested for US, cystic fibrosis and other heritable diseases '

sentence = "Procedure went without complications, and the patient was sent to the \
floor postoperatively after he was extubated in the FISH. Thoracic surgery was consulted \
on the day of surgery. Gastrografin upper GI study performed on admission showed no leak. \
However, the patient was admitted under the care of thoracic surgery team, and he was kept \
n.p.o. and followed up on daily basis for any change in vital signs, chest pain for another \
upper GI swallow study which was done 7 days after his symptoms started. "


sentence = ['she had an US to determine if the baby was good', 'he had an US to determine if there was a mass', 'If the patient continues to require blood transfusions an/or if he becomes hemodynamically unstable he will need to be taken to the OR for cystoscopy with clot evacuation. Since his surgery was so recently performed we would rather try to hold off on this, however, as not to disrupt the anastomosis with his new transplanted kidney.']

for s in sentence:
    print('sentence:', s)
    #print(vector_breakage(sentence))
    print('prediction:', clf.predict(sentence_vector(s).reshape(1, -1)))
    print()

## Try

In [None]:
try_out = ''
print(vector_breakage(try_out))
print(clf.predict(sentence_vector(try_out).reshape(1, -1)))