In [2]:
import pandas as p
import numpy as np
train=p.read_csv("pubmed_causal_language_use.csv", delimiter=',')
y=train['label'].values
X=train['sentence'].values

(1530,) (1530,) (1531,) (1531,)
Daily consumption of 3\\xa0g of soluble fiber from 70\\xa0g\\xa0of oats leads to beneficial effects on the lipid parameters, specifically total cholesterol and low-density lipoprotein cholesterol in hypercholesterolemic Asian Indians.
1
The lack of symptoms and the preoperative EGD findings were not suggestive of this diagnosis in any case.
0


[[  0   1   2   3]
 [651 260 107 512]]


In [3]:
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.feature_extraction.text import TfidfVectorizer

# several commonly used vectorizer setting

#  unigram boolean vectorizer, set minimum document frequency to 5
unigram_bool_vectorizer = CountVectorizer(encoding='latin-1', binary=True, min_df=5, stop_words='english')

#  unigram term frequency vectorizer, set minimum document frequency to 5
unigram_count_vectorizer = CountVectorizer(encoding='latin-1', binary=False, min_df=5, stop_words='english')

#  unigram and bigram term frequency vectorizer, set minimum document frequency to 5
gram12_count_vectorizer = CountVectorizer(encoding='latin-1', ngram_range=(1,2), min_df=5, stop_words='english')

#  unigram tfidf vectorizer, set minimum document frequency to 5
unigram_tfidf_vectorizer = TfidfVectorizer(encoding='latin-1', use_idf=True, min_df=5, stop_words='english')

In [4]:
# fit vocabulary in training documents and transform the training documents into vectors

X_vec1=unigram_count_vectorizer.fit_transform(X)
X_vec2=unigram_bool_vectorizer.fit_transform(X)
X_vec3=unigram_tfidf_vectorizer.fit_transform(X)

In [16]:
from sklearn.model_selection import train_test_split
X_train, X_test, X_train1, X_test1,X_train2, X_test2,X_train3, X_test3,  y_train, y_test = train_test_split(X, X_vec1,X_vec2, X_vec3, y, test_size=0.5, random_state=0)



In [6]:
from sklearn.svm import LinearSVC

# initialize the LinearSVC model
svm_clf1 = LinearSVC(C=10)
svm_clf2 = LinearSVC(C=10)
svm_clf3 = LinearSVC(C=10)

# use the training data to train the model
svm_clf1.fit(X_train1,y_train)
svm_clf2.fit(X_train2,y_train)
svm_clf3.fit(X_train3,y_train)

LinearSVC(C=10)

In [7]:
from sklearn.metrics import confusion_matrix
y_pred1 = svm_clf1.predict(X_test1)
y_pred2 = svm_clf2.predict(X_test1)
y_pred3 = svm_clf3.predict(X_test1)
cm1=confusion_matrix(y_test, y_pred1, labels=[0,1,2,3])
cm2=confusion_matrix(y_test, y_pred2, labels=[0,1,2,3])
cm3=confusion_matrix(y_test, y_pred3, labels=[0,1,2,3])


from sklearn.metrics import classification_report
target_names = ['0','1','2','3']
print(cm1)
print(cm2)
print(cm3)
print(classification_report(y_test, y_pred1, target_names=target_names))
print(classification_report(y_test, y_pred2, target_names=target_names))
print(classification_report(y_test, y_pred3, target_names=target_names))


[[532  76  36  61]
 [ 56 122  15  41]
 [ 32  32  17  25]
 [ 73  41  17 355]]
[[526  79  34  66]
 [ 54 123  15  42]
 [ 30  33  19  24]
 [ 76  45  16 349]]
[[430 108  53 114]
 [ 29 135  19  51]
 [ 12  38  28  28]
 [ 33  39  22 392]]
              precision    recall  f1-score   support

           0       0.77      0.75      0.76       705
           1       0.45      0.52      0.48       234
           2       0.20      0.16      0.18       106
           3       0.74      0.73      0.73       486

    accuracy                           0.67      1531
   macro avg       0.54      0.54      0.54      1531
weighted avg       0.67      0.67      0.67      1531

              precision    recall  f1-score   support

           0       0.77      0.75      0.76       705
           1       0.44      0.53      0.48       234
           2       0.23      0.18      0.20       106
           3       0.73      0.72      0.72       486

    accuracy                           0.66      1531
   macro

0.6616590463749183

In [23]:
## interpreting LinearSVC models
## http://scikit-learn.org/stable/modules/generated/sklearn.svm.LinearSVC.html#sklearn.svm.LinearSVC

## LinearSVC uses a one-vs-all strategy to extend the binary SVM classifier to multi-class problems
## for the Kaggle sentiment classification problem, there are five categories 0,1,2,3,4 with 0 as very negative and 4 very positive
## LinearSVC builds five binary classifier, "very negative vs. others", "negative vs. others", "neutral vs. others", "positive vs. others", "very positive vs. others", 
## and then pick the most confident prediction as the final prediction.

## Linear SVC also ranks all features based on their contribution to distinguish the two concepts in each binary classifier
## For category "0" (very negative), get all features and their weights and sort them in increasing order
feature_ranks0 = sorted(zip(svm_clf1.coef_[0], unigram_tfidf_vectorizer.get_feature_names()))
feature_ranks1 = sorted(zip(svm_clf1.coef_[1], unigram_tfidf_vectorizer.get_feature_names()))
feature_ranks2 = sorted(zip(svm_clf1.coef_[2], unigram_tfidf_vectorizer.get_feature_names()))
feature_ranks3 = sorted(zip(svm_clf1.coef_[3], unigram_tfidf_vectorizer.get_feature_names()))

## get the 10 features that are best indicators of very negative sentiment (they are at the bottom of the ranked list)
norelation10 = feature_ranks0[-10:]
directcausal10=feature_ranks1[-10:]
conditionalcausal0=feature_ranks2[-10:]
correlational10=feature_ranks3[-10:]
print("no relation")
for i in range(0, len(norelation10)):
    print(norelation10[i])
print()

print("directcausal")
for i in range(0, len(directcausal10)):
    print(directcausal10[i])
print()

print("conditionalcausal")
for i in range(0, len(conditionalcausal0)):
    print(conditionalcausal0[i])
print()

print("correlational")
for i in range(0, len(correlational10)):
    print(correlational10[i])
print()


no relation
(1.3603567496937938, 'esophageal')
(1.3828492176709677, 'proposed')
(1.4035981750920654, 'specifically')
(1.4574677980244173, 'meal')
(1.4861196936152612, 'bias')
(1.4891036645214306, 'obtained')
(1.4947479721251886, 'policy')
(1.518923578480536, 'superior')
(1.535957270336285, 'needed')
(1.8571787441180412, 'european')

directcausal
(1.470894445409777, 'did')
(1.4725563125305057, 'resulted')
(1.4920430559946503, 'determinants')
(1.516579236450756, 'administered')
(1.5357332637946086, 'successful')
(1.5398324496044509, 'highly')
(1.5565175046165693, 'tolerated')
(1.5613627551938094, 'effective')
(1.6118872847996417, 'contributed')
(1.6408811144571323, 'theory')

conditionalcausal
(0.9524652763524939, 'appeared')
(0.9554093287798371, 'improve')
(0.959306102786325, 'mg')
(0.9854239143058787, 'cad')
(1.0066334595324449, 'influence')
(1.1826670329504154, 'helpful')
(1.2101228921921385, 'mediated')
(1.5563965209452932, 'problem')
(1.612360197993797, 'useful')
(1.81493562101969, 

In [26]:
err_cnt1 = 0
toorelate1=0
poorrelate1=0
for i in range(0, len(y_test)):
    if(y_test[i]!= y_pred1[i]):
        print(X_test[i])
        err_cnt1 = err_cnt1+1
print("errors:", err_cnt1)
for i in range(0, len(y_test)):
    if(y_test[i]> y_pred1[i]):
        print(X_test[i])
        toorelate1 = toorelate1+1
print("too related errors:", toorelate1)
for i in range(0, len(y_test)):
    if(y_test[i]< y_pred1[i]):
        print(X_test[i])
        poorrelate1 = poorrelate1+1
print("poor related errors:", poorrelate1)

errors: 505
too related errors: 251
poor related errors: 254


In [19]:
# import the MNB module
from sklearn.naive_bayes import MultinomialNB
from sklearn.naive_bayes import BernoulliNB

# initialize the MNB model
nb_clf1= MultinomialNB()
nb_clf2= MultinomialNB()
nb_clf3= MultinomialNB()

# use the training data to train the MNB model
# feature_log_prob_ stores the conditional probs for all categories
# if the labels are strings, the index is in alphabetic order
# e.g. 'f' comes before 't' in alphabet, so 'f' is in [0] dimension and 't' in [1]

nb_clf1.fit(X_train1,y_train)
nb_clf2.fit(X_train2,y_train)
nb_clf3.fit(X_train3,y_train)

MultinomialNB()

In [20]:
from sklearn.metrics import confusion_matrix
y_pred4 = nb_clf1.predict(X_test1)
y_pred5 = nb_clf2.predict(X_test1)
y_pred6 = nb_clf3.predict(X_test1)
cm4=confusion_matrix(y_test, y_pred4, labels=[0,1,2,3])
cm5=confusion_matrix(y_test, y_pred5, labels=[0,1,2,3])
cm6=confusion_matrix(y_test, y_pred6, labels=[0,1,2,3])


from sklearn.metrics import classification_report
target_names = ['0','1','2','3']
print(cm4)
print(cm5)
print(cm6)
print(classification_report(y_test, y_pred4, target_names=target_names))
print(classification_report(y_test, y_pred5, target_names=target_names))
print(classification_report(y_test, y_pred6, target_names=target_names))



[[509  75  23  98]
 [ 49 121   6  58]
 [ 31  37  10  28]
 [ 45  27   4 410]]
[[513  67  22 103]
 [ 49 122   6  57]
 [ 35  34  12  25]
 [ 45  27   4 410]]
[[543  47   2 113]
 [ 65  84   1  84]
 [ 40  25   1  40]
 [ 41  10   0 435]]
              precision    recall  f1-score   support

           0       0.80      0.72      0.76       705
           1       0.47      0.52      0.49       234
           2       0.23      0.09      0.13       106
           3       0.69      0.84      0.76       486

    accuracy                           0.69      1531
   macro avg       0.55      0.54      0.54      1531
weighted avg       0.68      0.69      0.68      1531

              precision    recall  f1-score   support

           0       0.80      0.73      0.76       705
           1       0.49      0.52      0.50       234
           2       0.27      0.11      0.16       106
           3       0.69      0.84      0.76       486

    accuracy                           0.69      1531
   macro

In [22]:
## interpreting LinearSVC models
## http://scikit-learn.org/stable/modules/generated/sklearn.svm.LinearSVC.html#sklearn.svm.LinearSVC

## LinearSVC uses a one-vs-all strategy to extend the binary SVM classifier to multi-class problems
## for the Kaggle sentiment classification problem, there are five categories 0,1,2,3,4 with 0 as very negative and 4 very positive
## LinearSVC builds five binary classifier, "very negative vs. others", "negative vs. others", "neutral vs. others", "positive vs. others", "very positive vs. others", 
## and then pick the most confident prediction as the final prediction.

## Linear SVC also ranks all features based on their contribution to distinguish the two concepts in each binary classifier
## For category "0" (very negative), get all features and their weights and sort them in increasing order
feature_ranks00 = sorted(zip(nb_clf2.coef_[0], unigram_tfidf_vectorizer.get_feature_names()))
feature_ranks11 = sorted(zip(nb_clf2.coef_[1], unigram_tfidf_vectorizer.get_feature_names()))
feature_ranks22 = sorted(zip(nb_clf2.coef_[2], unigram_tfidf_vectorizer.get_feature_names()))
feature_ranks33 = sorted(zip(nb_clf2.coef_[3], unigram_tfidf_vectorizer.get_feature_names()))

## get the 10 features that are best indicators of very negative sentiment (they are at the bottom of the ranked list)
norelation10B = feature_ranks00[-10:]
directcausal10B=feature_ranks11[-10:]
conditionalcausal0B=feature_ranks22[-10:]
correlational10B=feature_ranks33[-10:]
print("no relation")
for i in range(0, len(norelation10B)):
    print(norelation10B[i])
print()

print("directcausal")
for i in range(0, len(directcausal10B)):
    print(directcausal10B[i])
print()

print("conditionalcausal")
for i in range(0, len(conditionalcausal0B)):
    print(conditionalcausal0B[i])
print()

print("correlational")
for i in range(0, len(correlational10B)):
    print(correlational10B[i])
print()

no relation
(-5.194457226265445, 'results')
(-5.141813492780023, 'trial')
(-5.021185504991408, 'high')
(-5.021185504991408, 'treatment')
(-4.99871264913935, 'needed')
(-4.816391092345395, 'clinical')
(-4.762323871075119, 'risk')
(-4.615720396883244, 'study')
(-4.55687989686031, 'studies')
(-4.051784947803305, 'patients')

directcausal
(-5.36467714485613, 'women')
(-5.310609923585854, 'effect')
(-5.310609923585854, 'significantly')
(-5.259316629198304, 'effective')
(-5.164006449393979, 'cancer')
(-5.164006449393979, 'study')
(-5.119554686823145, 'weight')
(-5.076995072404349, 'treatment')
(-4.92284439257709, 'risk')
(-4.4264075062632, 'patients')

conditionalcausal
(-5.69204721843778, 'role')
(-5.574264182781397, 'breast')
(-5.574264182781397, 'early')
(-5.574264182781397, 'effective')
(-5.574264182781397, 'reduce')
(-5.46890366712357, 'cancer')
(-5.373593487319245, 'increase')
(-5.286582110329615, 'improve')
(-4.635994544188466, 'patients')
(-4.513392222096133, 'risk')

correlational
(

In [28]:
err_cnt2 = 0
toorelate2=0
poorrelate2=0
for i in range(0, len(y_test)):
    if(y_test[i]!= y_pred5[i]):
        print(X_test[i])
        err_cnt2 = err_cnt2+1
print("errors:", err_cnt2)
for i in range(0, len(y_test)):
    if(y_test[i]> y_pred5[i]):
        print(X_test[i])
        toorelate2 = toorelate2+1
print("too related errors:", toorelate2)
for i in range(0, len(y_test)):
    if(y_test[i]< y_pred5[i]):
        print(X_test[i])
        poorrelate2 = poorrelate2+1
print("poor related errors:", poorrelate2)

errors: 474
too related errors: 194
poor related errors: 280


In [3]:
!git clone -b master https://github.com/charles9n/bert-sklearn
!cd bert-sklearn; pip install .

SyntaxError: invalid syntax (<ipython-input-3-ed0d9fbac4d5>, line 1)

In [None]:
from bert_sklearn import BertClassifier
model1 = BertClassifier()         # text/text pair classification
print(model1)
model2.fit(X_train1, y_train)
model2 = BertClassifier()         # text/text pair classification
print(model2)
model3.fit(X_train2, y_train)
model3 = BertClassifier()         # text/text pair classification
print(model3)
model.fit(X_train3, y_train)

In [None]:
y_pred7 = model1.predict(X_test1)
y_pred8 = model2.predict(X_test1)
y_pred9 = model3.predict(X_test1)

In [1]:
from sklearn.metrics import confusion_matrix

cm7=confusion_matrix(y_test, y_pred7, labels=[0,1,2,3])
cm8=confusion_matrix(y_test, y_pred8, labels=[0,1,2,3])
cm9=confusion_matrix(y_test, y_pred9, labels=[0,1,2,3])


from sklearn.metrics import classification_report
target_names = ['0','1','2','3']
print(cm7)
print(cm8)
print(cm9)
print(classification_report(y_test, y_pred7, target_names=target_names))
print(classification_report(y_test, y_pred8, target_names=target_names))
print(classification_report(y_test, y_pred9, target_names=target_names))

NameError: name 'y_test' is not defined