In [None]:
import pandas as p
import numpy as np
# Load the PubMed causal-language dataset from the mounted Drive folder.
DATA_PATH = '/drive/My Drive/pubmed_causal_language_use.csv'
train = p.read_csv(DATA_PATH, delimiter=',')

# Raw sentences are the model inputs; the 'label' column holds the targets.
X = train['sentence'].values
y = train['label'].values

In [None]:
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.feature_extraction.text import TfidfVectorizer

# Vectorizer configurations used in the rest of the notebook.
# All of them share the same input encoding, the built-in English stop-word
# list, and a minimum document frequency of 5 (a term must appear in at least
# five documents to enter the vocabulary).
_shared_vectorizer_args = dict(encoding='latin-1', min_df=5, stop_words='english')

# unigram boolean vectorizer: 1 if the term occurs in the document, else 0
unigram_bool_vectorizer = CountVectorizer(binary=True, **_shared_vectorizer_args)

# unigram term-frequency vectorizer: raw occurrence counts
unigram_count_vectorizer = CountVectorizer(binary=False, **_shared_vectorizer_args)

# unigram + bigram term-frequency vectorizer
gram12_count_vectorizer = CountVectorizer(ngram_range=(1,2), **_shared_vectorizer_args)

# unigram tf-idf vectorizer
unigram_tfidf_vectorizer = TfidfVectorizer(use_idf=True, **_shared_vectorizer_args)

In [None]:
# Learn each vectorizer's vocabulary from the sentences and encode them:
#   X_vec1 -> term counts, X_vec2 -> boolean presence, X_vec3 -> tf-idf weights.
# NOTE(review): the vectorizers are fitted on all of X, i.e. before the
# train/test split performed in a later cell.
X_vec1, X_vec2, X_vec3 = [
    vectorizer.fit_transform(X)
    for vectorizer in (
        unigram_count_vectorizer,
        unigram_bool_vectorizer,
        unigram_tfidf_vectorizer,
    )
]

In [None]:
from sklearn.model_selection import train_test_split
# Split the raw sentences, the three vectorized representations and the labels
# in a single call so that every train/test pair refers to the same rows.
# Half of the data is held out; random_state pins the shuffle.
(X_train, X_test,
 X_train1, X_test1,
 X_train2, X_test2,
 X_train3, X_test3,
 y_train, y_test) = train_test_split(
    X, X_vec1, X_vec2, X_vec3, y,
    test_size=0.5,
    random_state=0,
)



In [None]:
from sklearn.svm import LinearSVC

# Create three independent LinearSVC models (C=10), one per feature
# representation: counts, boolean, tf-idf.
svm_clf1, svm_clf2, svm_clf3 = (LinearSVC(C=10) for _ in range(3))

# Train each model on its own training matrix; the labels are shared.
svm_clf1.fit(X_train1, y_train)
svm_clf2.fit(X_train2, y_train)
svm_clf3.fit(X_train3, y_train)

LinearSVC(C=10, class_weight=None, dual=True, fit_intercept=True,
          intercept_scaling=1, loss='squared_hinge', max_iter=1000,
          multi_class='ovr', penalty='l2', random_state=None, tol=0.0001,
          verbose=0)

In [None]:
from sklearn.metrics import confusion_matrix
from sklearn.metrics import classification_report

# Evaluate each SVM on the test matrix produced by the SAME vectorizer that
# produced its training matrix:
#   svm_clf1 <- counts (X_test1), svm_clf2 <- boolean (X_test2),
#   svm_clf3 <- tf-idf (X_test3).
# Previously all three models were scored on X_test1, so the boolean and
# tf-idf models were fed count features they were never trained on.
y_pred1 = svm_clf1.predict(X_test1)
y_pred2 = svm_clf2.predict(X_test2)
y_pred3 = svm_clf3.predict(X_test3)

# Rows are true labels, columns are predicted labels, in the order 0,1,2,3.
cm1 = confusion_matrix(y_test, y_pred1, labels=[0,1,2,3])
cm2 = confusion_matrix(y_test, y_pred2, labels=[0,1,2,3])
cm3 = confusion_matrix(y_test, y_pred3, labels=[0,1,2,3])

target_names = ['0','1','2','3']

# Confusion matrices first, then the per-class precision/recall/F1 reports.
print(cm1)
print(cm2)
print(cm3)
print(classification_report(y_test, y_pred1, target_names=target_names))
print(classification_report(y_test, y_pred2, target_names=target_names))
print(classification_report(y_test, y_pred3, target_names=target_names))


[[532  76  36  61]
 [ 56 122  15  41]
 [ 32  32  17  25]
 [ 73  41  17 355]]
[[526  79  34  66]
 [ 54 123  15  42]
 [ 30  33  19  24]
 [ 76  45  16 349]]
[[430 108  53 114]
 [ 29 135  19  51]
 [ 12  38  28  28]
 [ 33  39  22 392]]
              precision    recall  f1-score   support

           0       0.77      0.75      0.76       705
           1       0.45      0.52      0.48       234
           2       0.20      0.16      0.18       106
           3       0.74      0.73      0.73       486

    accuracy                           0.67      1531
   macro avg       0.54      0.54      0.54      1531
weighted avg       0.67      0.67      0.67      1531

              precision    recall  f1-score   support

           0       0.77      0.75      0.76       705
           1       0.44      0.53      0.48       234
           2       0.23      0.18      0.20       106
           3       0.73      0.72      0.72       486

    accuracy                           0.66      1531
   macro

In [None]:
## Interpreting the LinearSVC model trained on unigram counts (svm_clf1)
## http://scikit-learn.org/stable/modules/generated/sklearn.svm.LinearSVC.html#sklearn.svm.LinearSVC

## LinearSVC extends the binary SVM to multi-class problems with a one-vs-rest
## strategy: it builds one binary classifier per label ("label k vs. others")
## and picks the most confident one as the final prediction.
## coef_[k] holds the feature weights of the classifier for label k. The label
## names used below follow the variable names in this cell:
## 0 = no relation, 1 = direct causal, 2 = conditional causal, 3 = correlational.

## Feature names must come from the vectorizer that produced the model's
## training matrix (svm_clf1 was fitted on X_train1, built by
## unigram_count_vectorizer). get_feature_names() no longer exists in newer
## scikit-learn releases, so prefer get_feature_names_out() and fall back to
## the old method on older installations.
_svm_names_fn = (getattr(unigram_count_vectorizer, 'get_feature_names_out', None)
                 or unigram_count_vectorizer.get_feature_names)
_svm_feature_names = list(_svm_names_fn())

## For each label, pair every feature with its weight and sort in increasing order
feature_ranks0 = sorted(zip(svm_clf1.coef_[0], _svm_feature_names))
feature_ranks1 = sorted(zip(svm_clf1.coef_[1], _svm_feature_names))
feature_ranks2 = sorted(zip(svm_clf1.coef_[2], _svm_feature_names))
feature_ranks3 = sorted(zip(svm_clf1.coef_[3], _svm_feature_names))

## The 10 strongest positive indicators of each label are at the end of its sorted list
norelation10 = feature_ranks0[-10:]
directcausal10 = feature_ranks1[-10:]
conditionalcausal0 = feature_ranks2[-10:]
correlational10 = feature_ranks3[-10:]

## Print each label's top features as (weight, term) pairs, followed by a blank line
for _heading, _top_features in (
    ("no relation", norelation10),
    ("directcausal", directcausal10),
    ("conditionalcausal", conditionalcausal0),
    ("correlational", correlational10),
):
    print(_heading)
    for _pair in _top_features:
        print(_pair)
    print()


no relation
(1.360355292330231, 'esophageal')
(1.3828460165676668, 'proposed')
(1.403598126102271, 'specifically')
(1.4574656131630868, 'meal')
(1.4861245216005308, 'bias')
(1.489105477220581, 'obtained')
(1.4947499852286197, 'policy')
(1.5189292438585977, 'superior')
(1.535959348512013, 'needed')
(1.8571856939800397, 'european')

directcausal
(1.4708934685494444, 'did')
(1.4725520129997347, 'resulted')
(1.4920415794209594, 'determinants')
(1.5165766552767825, 'administered')
(1.5357363379714157, 'successful')
(1.5398294007640627, 'highly')
(1.5565140451361603, 'tolerated')
(1.561362924031826, 'effective')
(1.6118857523909604, 'contributed')
(1.6408802250967407, 'theory')

conditionalcausal
(0.9524651073301786, 'appeared')
(0.95540923656475, 'improve')
(0.9593048395901931, 'mg')
(0.985423698791247, 'cad')
(1.0066341060153399, 'influence')
(1.182666745279389, 'helpful')
(1.2101222584083318, 'mediated')
(1.556396900838478, 'problem')
(1.6123606778161472, 'useful')
(1.8149364312808731, 'l

In [None]:
# Error listing for the count-based SVM (y_pred1).
# Three passes over the test set, each printing the offending sentences and
# then a count:
#   1. every misclassified sentence,
#   2. sentences whose true label is numerically greater than the prediction,
#   3. sentences whose true label is numerically smaller than the prediction.
err_cnt1 = 0
toorelate1 = 0
poorrelate1 = 0

for sentence, gold, predicted in zip(X_test, y_test, y_pred1):
    if gold != predicted:
        print(sentence)
        err_cnt1 += 1
print("errors:", err_cnt1)

for sentence, gold, predicted in zip(X_test, y_test, y_pred1):
    if gold > predicted:
        print(sentence)
        toorelate1 += 1
print("too related errors:", toorelate1)

for sentence, gold, predicted in zip(X_test, y_test, y_pred1):
    if gold < predicted:
        print(sentence)
        poorrelate1 += 1
print("poor related errors:", poorrelate1)

"Daily treatment with the combination of ezetimibe plus fenofibrate is an acceptable alternative to atorvastatin for the treatment of dyslipidemia in patients who are intolerant of statins."
The absence of two mtDNA mutations  in ND1 gene rules out the possibility of involvement of these mutations in early  onset diabetes in Pakistani population.
It appears important for pharmacists to show their daily involvement in the quality of medical care.
Topical CsA led to an increase in corneal SBN density, improving clinical signs and symptoms of SSDE.
A simultaneous integrated boost strategy could benefit from adaptive planning during the course.
Multiple and immediate access to the web-based education program at home may prove useful as a source of reference for women with GDM.
We conclude that PTG tends to lead to less psychological distress overall but particularly so in a high impact context.(PsycINFO Database Record
The prevalence of PEW varied depending on the tool applied.
Moreover, o

In [None]:
# import the MNB module
from sklearn.naive_bayes import MultinomialNB
from sklearn.naive_bayes import BernoulliNB

# Create three independent multinomial Naive Bayes models, one per feature
# representation: counts, boolean, tf-idf.
nb_clf1, nb_clf2, nb_clf3 = (MultinomialNB() for _ in range(3))

# Train each model on its own training matrix; the labels are shared.
# After fitting, feature_log_prob_ stores the per-class conditional log
# probabilities of every feature. If the labels were strings, classes would be
# indexed in alphabetic order (e.g. 'f' in dimension [0], 't' in dimension [1]).
nb_clf1.fit(X_train1, y_train)
nb_clf2.fit(X_train2, y_train)
nb_clf3.fit(X_train3, y_train)

MultinomialNB(alpha=1.0, class_prior=None, fit_prior=True)

In [None]:
from sklearn.metrics import confusion_matrix
from sklearn.metrics import classification_report

# Evaluate each Naive Bayes model on the test matrix produced by the SAME
# vectorizer that produced its training matrix:
#   nb_clf1 <- counts (X_test1), nb_clf2 <- boolean (X_test2),
#   nb_clf3 <- tf-idf (X_test3).
# Previously all three models were scored on X_test1, so the boolean and
# tf-idf models were evaluated on count features they were never trained on.
y_pred4 = nb_clf1.predict(X_test1)
y_pred5 = nb_clf2.predict(X_test2)
y_pred6 = nb_clf3.predict(X_test3)

# Rows are true labels, columns are predicted labels, in the order 0,1,2,3.
cm4 = confusion_matrix(y_test, y_pred4, labels=[0,1,2,3])
cm5 = confusion_matrix(y_test, y_pred5, labels=[0,1,2,3])
cm6 = confusion_matrix(y_test, y_pred6, labels=[0,1,2,3])

target_names = ['0','1','2','3']

# Confusion matrices first, then the per-class precision/recall/F1 reports.
print(cm4)
print(cm5)
print(cm6)
print(classification_report(y_test, y_pred4, target_names=target_names))
print(classification_report(y_test, y_pred5, target_names=target_names))
print(classification_report(y_test, y_pred6, target_names=target_names))



[[509  75  23  98]
 [ 49 121   6  58]
 [ 31  37  10  28]
 [ 45  27   4 410]]
[[513  67  22 103]
 [ 49 122   6  57]
 [ 35  34  12  25]
 [ 45  27   4 410]]
[[543  47   2 113]
 [ 65  84   1  84]
 [ 40  25   1  40]
 [ 41  10   0 435]]
              precision    recall  f1-score   support

           0       0.80      0.72      0.76       705
           1       0.47      0.52      0.49       234
           2       0.23      0.09      0.13       106
           3       0.69      0.84      0.76       486

    accuracy                           0.69      1531
   macro avg       0.55      0.54      0.54      1531
weighted avg       0.68      0.69      0.68      1531

              precision    recall  f1-score   support

           0       0.80      0.73      0.76       705
           1       0.49      0.52      0.50       234
           2       0.27      0.11      0.16       106
           3       0.69      0.84      0.76       486

    accuracy                           0.69      1531
   macro

In [None]:
## Interpreting the MultinomialNB model trained on boolean unigrams (nb_clf2)
## http://scikit-learn.org/stable/modules/generated/sklearn.naive_bayes.MultinomialNB.html

## feature_log_prob_[k] holds log P(feature | label k) for every feature.
## These are per-class likelihoods, not discriminative weights, so the top
## entries are the terms most probable within a class and can be shared
## across classes.
## The label names used below follow the variable names in this cell:
## 0 = no relation, 1 = direct causal, 2 = conditional causal, 3 = correlational.

## MultinomialNB.coef_ was deprecated and later removed from scikit-learn;
## feature_log_prob_ provides the same values for a multi-class model.
## Feature names must come from the vectorizer that produced the model's
## training matrix (nb_clf2 was fitted on X_train2, built by
## unigram_bool_vectorizer). Prefer get_feature_names_out() and fall back to
## get_feature_names() on older scikit-learn installations.
_nb_names_fn = (getattr(unigram_bool_vectorizer, 'get_feature_names_out', None)
                or unigram_bool_vectorizer.get_feature_names)
_nb_feature_names = list(_nb_names_fn())

## For each label, pair every feature with its log probability and sort in increasing order
feature_ranks00 = sorted(zip(nb_clf2.feature_log_prob_[0], _nb_feature_names))
feature_ranks11 = sorted(zip(nb_clf2.feature_log_prob_[1], _nb_feature_names))
feature_ranks22 = sorted(zip(nb_clf2.feature_log_prob_[2], _nb_feature_names))
feature_ranks33 = sorted(zip(nb_clf2.feature_log_prob_[3], _nb_feature_names))

## The 10 most probable features of each label are at the end of its sorted list
norelation10B = feature_ranks00[-10:]
directcausal10B = feature_ranks11[-10:]
conditionalcausal0B = feature_ranks22[-10:]
correlational10B = feature_ranks33[-10:]

## Print each label's top features as (log probability, term) pairs, followed by a blank line
for _heading, _top_features in (
    ("no relation", norelation10B),
    ("directcausal", directcausal10B),
    ("conditionalcausal", conditionalcausal0B),
    ("correlational", correlational10B),
):
    print(_heading)
    for _pair in _top_features:
        print(_pair)
    print()

no relation
(-5.194457226265445, 'results')
(-5.141813492780023, 'trial')
(-5.021185504991408, 'high')
(-5.021185504991408, 'treatment')
(-4.99871264913935, 'needed')
(-4.816391092345395, 'clinical')
(-4.762323871075119, 'risk')
(-4.615720396883244, 'study')
(-4.55687989686031, 'studies')
(-4.051784947803305, 'patients')

directcausal
(-5.36467714485613, 'women')
(-5.310609923585854, 'effect')
(-5.310609923585854, 'significantly')
(-5.259316629198304, 'effective')
(-5.164006449393979, 'cancer')
(-5.164006449393979, 'study')
(-5.119554686823145, 'weight')
(-5.076995072404349, 'treatment')
(-4.92284439257709, 'risk')
(-4.4264075062632, 'patients')

conditionalcausal
(-5.69204721843778, 'role')
(-5.574264182781397, 'breast')
(-5.574264182781397, 'early')
(-5.574264182781397, 'effective')
(-5.574264182781397, 'reduce')
(-5.46890366712357, 'cancer')
(-5.373593487319245, 'increase')
(-5.286582110329615, 'improve')
(-4.635994544188466, 'patients')
(-4.513392222096133, 'risk')

correlational
(

In [None]:
# Error listing for the boolean-feature Naive Bayes model (y_pred5).
# Each pass prints the matching test sentences and then reports how many
# there were: all mismatches, then true label > prediction, then
# true label < prediction (numeric comparison of the label ids).
err_cnt2 = 0
toorelate2 = 0
poorrelate2 = 0

for text, truth, guess in zip(X_test, y_test, y_pred5):
    if truth != guess:
        print(text)
        err_cnt2 += 1
print("errors:", err_cnt2)

for text, truth, guess in zip(X_test, y_test, y_pred5):
    if truth > guess:
        print(text)
        toorelate2 += 1
print("too related errors:", toorelate2)

for text, truth, guess in zip(X_test, y_test, y_pred5):
    if truth < guess:
        print(text)
        poorrelate2 += 1
print("poor related errors:", poorrelate2)

The absence of two mtDNA mutations  in ND1 gene rules out the possibility of involvement of these mutations in early  onset diabetes in Pakistani population.
It appears important for pharmacists to show their daily involvement in the quality of medical care.
A simultaneous integrated boost strategy could benefit from adaptive planning during the course.
Multiple and immediate access to the web-based education program at home may prove useful as a source of reference for women with GDM.
We conclude that PTG tends to lead to less psychological distress overall but particularly so in a high impact context.(PsycINFO Database Record
Seoul's ongoing program aimed to increase PAD coverage should also pay attention to improving community-level inequality as well as distributional efficiency.
The prevalence of PEW varied depending on the tool applied.
Moreover, optimizing vitamin D in early life may be critical for later health.
Our study confirms the well-known negative metabolic changes in PC

In [None]:
# Fetch the bert-sklearn wrapper (master branch) into the current working directory.
# NOTE(review): neither the commit nor the package version is pinned, so a
# re-run may install different code.
!git clone -b master https://github.com/charles9n/bert-sklearn
# Install the package from the cloned checkout; the `cd` only applies to this shell line.
!cd bert-sklearn; pip install .
import os
# Change the kernel's working directory into the checkout and list its contents.
# NOTE(review): because of this chdir, re-running the cell clones into a nested
# bert-sklearn/bert-sklearn directory; the recorded output
# ("Processing /content/bert-sklearn/bert-sklearn") suggests that has happened.
os.chdir("bert-sklearn")
print(os.listdir())

Cloning into 'bert-sklearn'...
remote: Enumerating objects: 259, done.[K
remote: Total 259 (delta 0), reused 0 (delta 0), pack-reused 259[K
Receiving objects: 100% (259/259), 516.15 KiB | 12.00 MiB/s, done.
Resolving deltas: 100% (131/131), done.
Processing /content/bert-sklearn/bert-sklearn
[33m  DEPRECATION: A future pip version will change local packages to be built in-place without first copying to a temporary directory. We recommend you use --use-feature=in-tree-build to test your packages with this new behavior before it becomes the default.
   pip 21.3 will remove support for this functionality. You can find discussion regarding this at https://github.com/pypa/pip/issues/7555.[0m
Building wheels for collected packages: bert-sklearn
  Building wheel for bert-sklearn (setup.py) ... [?25l[?25hdone
  Created wheel for bert-sklearn: filename=bert_sklearn-0.3.1-py3-none-any.whl size=54247 sha256=e07bdbc46ae9bd370289ae8061e3a0dab3f100d8d864323d5bd922e62de3cbab
  Stored in directo

In [None]:
from bert_sklearn import BertClassifier
# Build the classifier with its default settings; the printed repr below lists
# them (e.g. bert_model='bert-base-uncased', epochs=3, validation_fraction=0.1).
model1 = BertClassifier()         # handles single-text and text-pair classification
print(model1)
# Fine-tune on the raw training sentences (not the vectorized matrices).
# As the last expression of the cell, the fitted model's repr is displayed.
model1.fit(X_train, y_train)



Building sklearn text classifier...
BertClassifier(bert_config_json=None, bert_model='bert-base-uncased',
               bert_vocab=None, do_lower_case=None, epochs=3, eval_batch_size=8,
               fp16=False, from_tf=False, gradient_accumulation_steps=1,
               ignore_label=None, label_list=None, learning_rate=2e-05,
               local_rank=-1, logfile='bert_sklearn.log', loss_scale=0,
               max_seq_length=128, num_mlp_hiddens=500, num_mlp_layers=0,
               random_state=42, restore_file=None, train_batch_size=32,
               use_cuda=True, validation_fraction=0.1, warmup_proportion=0.1)


100%|██████████| 231508/231508 [00:00<00:00, 708350.18B/s]


Loading bert-base-uncased model...


100%|██████████| 440473133/440473133 [00:14<00:00, 30861007.94B/s]
100%|██████████| 433/433 [00:00<00:00, 255973.73B/s]


Defaulting to linear classifier/regressor
Loading Pytorch checkpoint

train data size: 1377, validation data size: 153



  cpuset_checked))


Training  :   0%|          | 0/44 [00:00<?, ?it/s]

	add_(Number alpha, Tensor other)
Consider using one of the following signatures instead:
	add_(Tensor other, *, Number alpha) (Triggered internally at  ../torch/csrc/utils/python_arg_parser.cpp:1050.)
  next_m.mul_(beta1).add_(1 - beta1, grad)


Validating:   0%|          | 0/20 [00:00<?, ?it/s]


Epoch 1, Train loss: 1.0655, Val loss: 0.6270, Val accy: 76.47%



Training  :   0%|          | 0/44 [00:00<?, ?it/s]

Validating:   0%|          | 0/20 [00:00<?, ?it/s]


Epoch 2, Train loss: 0.4898, Val loss: 0.4711, Val accy: 83.01%



Training  :   0%|          | 0/44 [00:00<?, ?it/s]

Validating:   0%|          | 0/20 [00:00<?, ?it/s]


Epoch 3, Train loss: 0.1568, Val loss: 0.3271, Val accy: 86.27%



BertClassifier(bert_config_json=None, bert_model='bert-base-uncased',
               bert_vocab=None, do_lower_case=True, epochs=3, eval_batch_size=8,
               fp16=False, from_tf=False, gradient_accumulation_steps=1,
               ignore_label=None, label_list=array([0, 1, 2, 3]),
               learning_rate=2e-05, local_rank=-1, logfile='bert_sklearn.log',
               loss_scale=0, max_seq_length=128, num_mlp_hiddens=500,
               num_mlp_layers=0, random_state=42, restore_file=None,
               train_batch_size=32, use_cuda=True, validation_fraction=0.1,
               warmup_proportion=0.1)

In [None]:
# Predict labels for the held-out raw sentences with the fine-tuned BERT model.
y_pred7 = model1.predict(X_test)


  cpuset_checked))


Predicting:   0%|          | 0/192 [00:00<?, ?it/s]

In [None]:
from sklearn.metrics import classification_report, confusion_matrix

# Score the BERT predictions against the held-out labels.
# Confusion matrix: rows are true labels, columns are predictions, order 0,1,2,3.
target_names = ['0','1','2','3']
cm7 = confusion_matrix(y_test, y_pred7, labels=[0,1,2,3])
print(cm7)

# Per-class precision / recall / F1 plus the overall averages.
bert_report = classification_report(y_test, y_pred7, target_names=target_names)
print(bert_report)


[[592  41  17  55]
 [ 16 198   6  14]
 [  6   6  91   3]
 [ 28  12   3 443]]
              precision    recall  f1-score   support

           0       0.92      0.84      0.88       705
           1       0.77      0.85      0.81       234
           2       0.78      0.86      0.82       106
           3       0.86      0.91      0.89       486

    accuracy                           0.86      1531
   macro avg       0.83      0.86      0.85      1531
weighted avg       0.87      0.86      0.87      1531



In [None]:
# Error listing for the BERT classifier (y_pred7).
# Pass 1 prints and counts every misclassified test sentence; passes 2 and 3
# do the same for the cases where the true label id is greater than / smaller
# than the predicted label id.
err_cnt3 = 0
toorelate3 = 0
poorrelate3 = 0

for doc, expected, got in zip(X_test, y_test, y_pred7):
    if expected != got:
        print(doc)
        err_cnt3 += 1
print("errors:", err_cnt3)

for doc, expected, got in zip(X_test, y_test, y_pred7):
    if expected > got:
        print(doc)
        toorelate3 += 1
print("too related errors:", toorelate3)

for doc, expected, got in zip(X_test, y_test, y_pred7):
    if expected < got:
        print(doc)
        poorrelate3 += 1
print("poor related errors:", poorrelate3)

The absence of two mtDNA mutations  in ND1 gene rules out the possibility of involvement of these mutations in early  onset diabetes in Pakistani population.
Multiple and immediate access to the web-based education program at home may prove useful as a source of reference for women with GDM.
We conclude that PTG tends to lead to less psychological distress overall but particularly so in a high impact context.(PsycINFO Database Record
Seoul's ongoing program aimed to increase PAD coverage should also pay attention to improving community-level inequality as well as distributional efficiency.
In ACS patients, without previous history of DM, MS is highly prevalent.
The prevalence of PEW varied depending on the tool applied.
Our study confirms the well-known negative metabolic changes in PCOS patients.
Fibroadenomas and phyllodes tumours differ with regard to various cytological features, aiding in their distinction on fine-needle aspiration biopsy.
Further studies may improve the understan

In [None]:
import tensorflow as tf
from tensorflow import keras

In [None]:
# Vocabulary cap: only the MAX_NB_WORDS most frequent words are kept when
# texts are converted to integer sequences.
MAX_NB_WORDS = 10000

# The original cell referenced `news_data['text']`, a name that is never
# defined in this notebook (the recorded output is a NameError). Use the
# sentence column of the dataset loaded in the first cell instead.
# NOTE(review): this assumes the tokenizer is meant to cover the whole corpus,
# as the original did; switch to X_train if only the training split should
# define the vocabulary.
sentences = train['sentence']

tokenizer = keras.preprocessing.text.Tokenizer(num_words=MAX_NB_WORDS)
tokenizer.fit_on_texts(sentences)

# One list of word indices per sentence.
train_data = tokenizer.texts_to_sequences(sentences)

# Mapping word -> integer index learned by fit_on_texts.
word_index = tokenizer.word_index

NameError: ignored