In [1]:
import sys
import random
import os
from pathlib import Path
import shutil
import json
import pandas as pd
from collections import defaultdict

import numpy as np
import argparse
import tqdm
import spacy
from spacy.gold import minibatch
from spacy.language import Language
from spacy import util

from sklearn.metrics import classification_report
from sklearn.model_selection import train_test_split

In [2]:
from scispacy.data_util import read_full_med_mentions, read_ner_from_tsv
from scispacy.util import WhitespaceTokenizer
from scispacy.per_class_scorer import PerClassScorer
from scispacy.train_utils import evaluate_ner

In [3]:
import en_core_sci_sm
nlp = en_core_sci_sm.load()

In [4]:
agreed_df = pd.read_csv('agreed_.csv', names=['expert','sentence', 'label'])

In [5]:
agreed_df = agreed_df[['sentence', 'label']] 
agreed_df

Unnamed: 0,sentence,label
0,PIK3CA is frequently mutated in breast cancer ...,Yes
1,Levels of TBXA2R (left) and TBXAS1 (right) and...,Yes
2,Evaluation of the toll-like receptor 6 Ser249P...,Yes
3,Mice expressing Wnt-1 under the control of the...,Yes
4,IL-1β remained significantly increased in E/P-...,Yes
...,...,...
376,In contrast only MUC1 levels increased with no...,Altered Expression
377,The expression of the ubiquitin ligase subunit...,Altered Expression
378,Analyses of both cytokine and chemokine levels...,Altered Expression
379,As shown in table 2 ER positive tumours had si...,Altered Expression


In [6]:
data_train, data_test = train_test_split(agreed_df, test_size=.1, random_state=31)

In [7]:
uniq_labels = list(agreed_df['label'].unique())

In [8]:
def get_categories(categories, uniq_labels_):
    cats = dict(zip(uniq_labels_,[0.0]*len(uniq_labels_)))
    for category in categories:
        cats[category] = 1.0
    return {'cats': cats}

get_categories(['Neutral', 'Yes'],uniq_labels )

{'cats': {'Yes': 1.0,
  'No': 0.0,
  'Positive': 0.0,
  'Negative': 0.0,
  'Neutral': 1.0,
  'Altered Expression': 0.0,
  'Genetic Variation': 0.0,
  'Any': 0.0,
  'Regulatory modification': 0.0}}

In [9]:
# check if `textcat` is already in the pipe, add if not
if 'textcat' not in nlp.pipe_names:
    print('creating!!')
    textcat = nlp.create_pipe('textcat', config={'choice_style': 'multiple'})
    nlp.add_pipe(textcat, last=True)
else:
    textcat = nlp.get_pipe('textcat')

# add labels to the model    
for label in uniq_labels:
    textcat.add_label(label)

creating!!


In [10]:
datas_train = defaultdict(set)
for iter_, row_ in data_train.iterrows():
    datas_train[row_['sentence']].add(row_['label'])
    
datas_test = defaultdict(set)
for iter_, row_ in data_test.iterrows():
    datas_test[row_['sentence']].add(row_['label'])    

In [11]:
data_train_spacy = []
for sentences, labels in datas_train.items():
    data_train_spacy.append((sentences,get_categories(list(labels),uniq_labels)))
    
data_test_spacy = []
for sentences, labels in datas_test.items():
    data_test_spacy.append((sentences,get_categories(list(labels),uniq_labels)))    

In [12]:
# train the model
other_pipes = [pipe for pipe in nlp.pipe_names if pipe != 'textcat']
with nlp.disable_pipes(*other_pipes):
    optimizer = nlp.begin_training()
    for i in range(50):
        print('Epoch %d' % i)
        losses = {}
        batches = minibatch(data_train_spacy, size=16)
        for batch in batches:
            texts, annotations = zip(*batch)
            nlp.update(texts, annotations, sgd=optimizer, drop=0.2,
                       losses=losses)

Epoch 0
Epoch 1
Epoch 2
Epoch 3
Epoch 4
Epoch 5
Epoch 6
Epoch 7
Epoch 8
Epoch 9
Epoch 10
Epoch 11
Epoch 12
Epoch 13
Epoch 14
Epoch 15
Epoch 16
Epoch 17
Epoch 18
Epoch 19
Epoch 20
Epoch 21
Epoch 22
Epoch 23
Epoch 24
Epoch 25
Epoch 26
Epoch 27
Epoch 28
Epoch 29
Epoch 30
Epoch 31
Epoch 32
Epoch 33
Epoch 34
Epoch 35
Epoch 36
Epoch 37
Epoch 38
Epoch 39
Epoch 40
Epoch 41
Epoch 42
Epoch 43
Epoch 44
Epoch 45
Epoch 46
Epoch 47
Epoch 48
Epoch 49


In [13]:
def roundoff(dict_y):
    for k, v in dict_y.items():
        v = round(v,2) 
        dict_y[k] = v 
    return dict_y


In [28]:
p = 20
data_test.iloc[p].sentence, datas_test[data_test.iloc[p].sentence]

('The two lowest scores 68 and 85 occurred in two sibs (one with autism the other with "not quite autism") who both were heterozygous for the DLX2 Serine insertion/deletion and both had early trigonocephaly which normalized.',
 {'Yes'})

In [29]:
docs = nlp(data_test.iloc[p].sentence)
roundoff(docs.cats)

{'Yes': 0.01,
 'No': 0.02,
 'Positive': 1.0,
 'Negative': 0.22,
 'Neutral': 0.0,
 'Altered Expression': 0.04,
 'Genetic Variation': 0.93,
 'Any': 0.02,
 'Regulatory modification': 0.0}

In [25]:
# Save model 
from pathlib import Path

output_dir = '/nfs/gns/literature/Santosh_Tirunagari/GitHub/spacy_models/textcat/relation/'
new_model_name = 'relationv01'

if output_dir is not None:
    output_dir = Path(output_dir)
    if not output_dir.exists():
        output_dir.mkdir()
    nlp.meta['name'] = new_model_name  # rename model
    nlp.to_disk(output_dir)
    print("Saved model to", output_dir)

Saved model to /nfs/gns/literature/Santosh_Tirunagari/GitHub/spacy_models/textcat/relation


In [35]:
!python -m spacy package /nfs/gns/literature/Santosh_Tirunagari/GitHub/spacy_models/textcat/relation /nfs/gns/literature/Santosh_Tirunagari/GitHub/spacy_models/textcat/model

[38;5;2m✔ Loaded meta.json from file[0m
/nfs/gns/literature/Santosh_Tirunagari/GitHub/spacy_models/textcat/relation/meta.json
[38;5;2m✔ Successfully created package 'en_relationv01-0.2.5'[0m
/nfs/gns/literature/Santosh_Tirunagari/GitHub/spacy_models/textcat/model/en_relationv01-0.2.5
To build the package, run `python setup.py sdist` in this directory.


In [34]:
from spacy import displacy
displacy.render(docs, style='dep', jupyter=True)



In [30]:
docs.ents

(scores,
 sibs,
 autism,
 heterozygous,
 DLX2,
 Serine,
 insertion/deletion,
 early,
 trigonocephaly,
 normalized)

In [16]:
# with textcat.model.use_params(optimizer.averages):
#     docs = [nlp.tokenizer(h) for h in data_test.sentence]
#     test_pred = np.array(
#         [sorted(doc.cats.items(), key=lambda x: -x[1])[0][0]
#          for doc in textcat.pipe(docs)])
#     print('Test Acc: %.4f' %
#           (pd.Series(test_pred == data_test.label.values).sum() / data_test.shape[0]))

# Spacy transformer