In [27]:
import tensorflow as tf

# Get the GPU device name.
device_name = tf.test.gpu_device_name()

# The device name should look like the following:
if device_name == '/device:GPU:0':
    print('Found GPU at: {}'.format(device_name))
else:
    raise SystemError('GPU device not found')

SystemError: GPU device not found

In [28]:
import torch

# If there's a GPU available...
if torch.cuda.is_available():    

    # Tell PyTorch to use the GPU.    
    device = torch.device("cuda")

    print('There are %d GPU(s) available.' % torch.cuda.device_count())

    print('We will use the GPU:', torch.cuda.get_device_name(0))

# If not...
else:
    print('No GPU available, using the CPU instead.')
    device = torch.device("cpu")

No GPU available, using the CPU instead.


In [29]:
!pip3 install transformers

Defaulting to user installation because normal site-packages is not writeable


In [4]:
pip install --upgrade pip --user

Requirement already up-to-date: pip in /home/vaibhav.raj/.local/lib/python3.6/site-packages (20.0.2)
Note: you may need to restart the kernel to use updated packages.


In [30]:
from transformers import BertForSequenceClassification, AdamW, BertConfig
from transformers import BertTokenizer

In [31]:
# Load a trained model and vocabulary that you have fine-tuned
model = BertForSequenceClassification.from_pretrained('model_save1')
tokenizer = BertTokenizer.from_pretrained('model_save1')

# Copy the model to the GPU.
model.to(device)

BertForSequenceClassification(
  (bert): BertModel(
    (embeddings): BertEmbeddings(
      (word_embeddings): Embedding(30522, 768, padding_idx=0)
      (position_embeddings): Embedding(512, 768)
      (token_type_embeddings): Embedding(2, 768)
      (LayerNorm): LayerNorm((768,), eps=1e-12, elementwise_affine=True)
      (dropout): Dropout(p=0.1, inplace=False)
    )
    (encoder): BertEncoder(
      (layer): ModuleList(
        (0): BertLayer(
          (attention): BertAttention(
            (self): BertSelfAttention(
              (query): Linear(in_features=768, out_features=768, bias=True)
              (key): Linear(in_features=768, out_features=768, bias=True)
              (value): Linear(in_features=768, out_features=768, bias=True)
              (dropout): Dropout(p=0.1, inplace=False)
            )
            (output): BertSelfOutput(
              (dense): Linear(in_features=768, out_features=768, bias=True)
              (LayerNorm): LayerNorm((768,), eps=1e-12, element

In [36]:
import pandas as pd

# Load the dataset into a pandas dataframe.
df1 = pd.read_csv("deal_test1.csv")

df1 = df1[df1['SENTENCES'].str.len() <512]

df1['LABEL'] = df1['LABEL'].astype(int)

df1.head()

sentences = df1.SENTENCES.values
labels =df1.LABEL.values

In [37]:
print(labels)

[0 1 1 1 0 0 1 1 1 0 1 1 1 1 1 0 1 1 0 0 1 0 0 0 0 0 1 1 0 0 0 1 0 1 0 1 1
 1 1 1 1 0 0 0 1 0 1 1 1 1 0 1 0 0 0 0 1 1 1 1 1 0 1 1 0 0 0 0 0 0 0 0 1 0]


In [38]:
print(type( dict(df1.iloc[0])['LABEL']) )

df1.count()

<class 'numpy.int64'>


LINKS        74
SENTENCES    74
LABEL        74
Queries      12
FN           13
dtype: int64

In [39]:
from keras.preprocessing.sequence import pad_sequences
from sklearn.model_selection import train_test_split
from torch.utils.data import TensorDataset, DataLoader, RandomSampler, SequentialSampler

In [40]:
MAX_LEN =256
# Tokenize all of the sentences and map the tokens to thier word IDs.
input_ids = []

# For every sentence...
for sent in sentences:
    # `encode` will:
    #   (1) Tokenize the sentence.
    #   (2) Prepend the `[CLS]` token to the start.
    #   (3) Append the `[SEP]` token to the end.
    #   (4) Map tokens to their IDs.
    encoded_sent = tokenizer.encode(
                        sent,                      # Sentence to encode.
                        add_special_tokens = True, # Add '[CLS]' and '[SEP]'
                   )
    
    input_ids.append(encoded_sent)

print('Maximum_sentence_length:-',max([len(sen) for sen in input_ids]))

# Pad our input tokens
input_ids = pad_sequences(input_ids, maxlen=MAX_LEN, 
                          dtype="long", truncating="post", padding="post")

# Create attention masks
attention_masks = []

# Create a mask of 1s for each token followed by 0s for padding
for seq in input_ids:
    seq_mask = [float(i>0) for i in seq]
    attention_masks.append(seq_mask) 

# Convert to tensors.
prediction_inputs = torch.tensor(input_ids)
prediction_masks = torch.tensor(attention_masks)
prediction_labels = torch.tensor(labels)

# Set the batch size.  
batch_size = 30  

# Create the DataLoader.
prediction_data = TensorDataset(prediction_inputs, prediction_masks, prediction_labels)
prediction_sampler = SequentialSampler(prediction_data)
prediction_dataloader = DataLoader(prediction_data, sampler=prediction_sampler, batch_size=batch_size)

Maximum_sentence_length:- 74


In [41]:
# Prediction on test set

print('Predicting labels for {:,} test sentences...'.format(len(prediction_inputs)))

# Put model in evaluation mode
model.eval()

# Tracking variables 
predictions , true_labels = [], []

# Predict 
for batch in prediction_dataloader:
    # Add batch to GPU
    batch = tuple(t.to(device) for t in batch)


    # Unpack the inputs from our dataloader
    b_input_ids, b_input_mask, b_labels = batch

    # Telling the model not to compute or store gradients, saving memory and 
    # speeding up prediction
    with torch.no_grad():
        # Forward pass, calculate logit predictions
        outputs = model(b_input_ids, token_type_ids=None, 
                        attention_mask=b_input_mask)

    logits = outputs[0]

    # Move logits and labels to CPU
    logits = logits.detach().cpu().numpy()
    label_ids = b_labels.to('cpu').numpy()

    # Store predictions and true labels
    predictions.append(logits)
    true_labels.append(label_ids)

print('    DONE.')

Predicting labels for 74 test sentences...
    DONE.


In [42]:
print(predictions)
print('-'*50)
print(true_labels)

[array([[ 0.30837977, -0.18365942],
       [ 0.6811949 , -0.6678633 ],
       [-1.4250894 ,  0.85718834],
       [-0.56047523,  0.2359885 ],
       [ 1.534073  , -1.1917535 ],
       [ 1.7418637 , -1.3553119 ],
       [-1.4937885 ,  0.9425752 ],
       [-1.530865  ,  0.8851419 ],
       [-1.3353658 ,  0.6926642 ],
       [ 1.699431  , -1.1628115 ],
       [-0.4885103 ,  0.18428466],
       [-0.7694634 ,  0.37012166],
       [-0.75417596,  0.36999458],
       [-1.2260853 ,  0.77012336],
       [ 1.1864306 , -0.9415839 ],
       [ 1.4719377 , -0.9795939 ],
       [-1.5866519 ,  0.93857545],
       [-1.2763113 ,  0.67101055],
       [-0.17478517, -0.13741815],
       [ 1.3174405 , -1.0245115 ],
       [ 0.7727344 , -0.8454611 ],
       [-0.67265403,  0.15766264],
       [-1.0939124 ,  0.66917425],
       [ 1.7244306 , -1.3163557 ],
       [ 1.6721957 , -1.2892926 ],
       [ 0.46148017, -0.40228555],
       [-1.4076521 ,  0.8651201 ],
       [ 0.33882448, -0.17439142],
       [ 1.6575406 

In [43]:
print('Positive samples: %d of %d (%.2f%%)' % (df1.LABEL.sum(), len(df1.LABEL), (df1.LABEL.sum() / len(df1.LABEL) * 100.0)))

Positive samples: 38 of 74 (51.35%)


In [44]:
import numpy as np

# Function to calculate the accuracy of our predictions vs labels
def flat_accuracy(preds, labels):
    pred_flat = preds.flatten()
    labels_flat = labels.flatten()
    return np.sum(pred_flat == labels_flat) / len(labels_flat)

In [45]:
def fns(preds, labels ,i1):
    #'numpy.ndarray' object type
    pred_flat = preds.flatten()
    #'numpy.ndarray' object type
    labels_flat = labels.flatten()
  
    for j1 in range(len(pred_flat)):
        if(pred_flat[j1] == 0 and labels_flat[j1] ==1):
            fn_index_list.append(batch_size*i1 + j1)

In [46]:
def fps(preds, labels, i1):
    #'numpy.ndarray' object type
    pred_flat = preds.flatten()
    #'numpy.ndarray' object type
    labels_flat = labels.flatten()
  
    for j1 in range(len(pred_flat)):
        if(pred_flat[j1] == 1 and labels_flat[j1] ==0):
            fp_index_list.append(batch_size*i1 + j1)

In [47]:
from sklearn.metrics import matthews_corrcoef

matthews_set = []
sum_l,ln =0,0

# Evaluate each test batch using Matthew's correlation coefficient
print('Calculating Matthews Corr. Coef. for each batch...')

predicted_labels =[]
# For each input batch...
for i in range(len(true_labels)):
    # The predictions for this batch are a 2-column ndarray (one column for "0" 
    # and one column for "1"). Pick the label with the highest value and turn this
    # in to a list of 0s and 1s.
    pred_labels_i = np.argmax(predictions[i], axis=1).flatten()
    predicted_labels+= list(pred_labels_i)
    print(pred_labels_i)
    print('-'*50)
    print(true_labels[i])

    batch_acc = flat_accuracy(pred_labels_i, true_labels[i])
    print(batch_acc)
    print('~'*50)
    sum_l += batch_acc*len(true_labels[i])
    ln+= len(true_labels[i])

    # Calculate and store the coef for this batch.  
    matthews = matthews_corrcoef(true_labels[i], pred_labels_i)                
    matthews_set.append(matthews)

Calculating Matthews Corr. Coef. for each batch...
[0 0 1 1 0 0 1 1 1 0 1 1 1 1 0 0 1 1 1 0 0 1 1 0 0 0 1 0 0 0]
--------------------------------------------------
[0 1 1 1 0 0 1 1 1 0 1 1 1 1 1 0 1 1 0 0 1 0 0 0 0 0 1 1 0 0]
0.7666666666666667
~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
[0 0 0 1 0 1 1 1 1 1 1 0 0 0 1 0 1 1 1 1 1 1 0 0 0 0 1 1 1 1]
--------------------------------------------------
[0 1 0 1 0 1 1 1 1 1 1 0 0 0 1 0 1 1 1 1 0 1 0 0 0 0 1 1 1 1]
0.9333333333333333
~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
[1 0 1 1 0 0 0 0 0 0 0 0 1 0]
--------------------------------------------------
[1 0 1 1 0 0 0 0 0 0 0 0 1 0]
1.0
~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~


In [48]:
avg_acc = (sum_l/ln)*100
print(avg_acc)

87.83783783783784


In [49]:
matthews_set

[0.5345224838248488, 0.8611111111111112, 1.0]

In [50]:
fn_index_list =[]
# For each input batch...
for i in range(len(true_labels)):
    pred_labels_i = np.argmax(predictions[i], axis=1).flatten()
    fns(pred_labels_i, true_labels[i],i)

print(fn_index_list)

[1, 14, 20, 27, 31]


In [51]:
fp_index_list =[]
# For each input batch...
for i in range(len(true_labels)):
    pred_labels_i = np.argmax(predictions[i], axis=1).flatten()
    fps(pred_labels_i, true_labels[i],i)

print(fp_index_list)

[18, 21, 22, 50]


In [52]:
tp = df1.LABEL.sum() - len(fn_index_list)
tn = len(df1.LABEL) - df1.LABEL.sum() - len(fp_index_list)
fp =len(fp_index_list)
fn = len(fn_index_list)

print('tp', tp)
print('fp', fp)
print('fn', fn)
print('tn', tn)
precision = tp/(tp+fp)
recall = tp/(tp+fn)

f1_score = 2*precision*recall/(precision+recall)

print('precision:-',precision)
print('recall:-',recall)
print('f1_score:-',f1_score)
print('overall_accuracy:-',avg_acc)

tp 33
fp 4
fn 5
tn 32
precision:- 0.8918918918918919
recall:- 0.868421052631579
f1_score:- 0.88
overall_accuracy:- 87.83783783783784


In [53]:
# Load your usual SpaCy model (one of SpaCy English models)
import spacy
nlp = spacy.load('en')

# Add neural coref to SpaCy's pipe
import neuralcoref
neuralcoref.add_to_pipe(nlp)

<spacy.lang.en.English at 0x7fb81e110fd0>

In [54]:
# You're done. You can now use NeuralCoref as you usually manipulate a SpaCy document annotations.
doc = nlp('Sun Pharmaceutical recently debuted on the NYSE exchange. It is expected to raise $1 billion with a bond sale after paying completing its $4 billion deal for Ranbaxy Laboratories.')

In [55]:
for x in doc.ents:
    print(x.text, x.label_)

Sun Pharmaceutical ORG
NYSE ORG
$1 billion MONEY
$4 billion MONEY
Ranbaxy Laboratories ORG


In [56]:
doc._.has_coref
doc._.coref_clusters

[Sun Pharmaceutical: [Sun Pharmaceutical, It, its]]

In [57]:
print(doc._.coref_clusters[0].mentions)
print(doc._.coref_clusters[0].mentions[-1])
print(doc._.coref_clusters[0].mentions[-1]._.coref_cluster.main)

print('~'*50)
print(doc._.coref_clusters[1].mentions)
print(doc._.coref_clusters[1].mentions[-1])
print(doc._.coref_clusters[1].mentions[-1]._.coref_cluster.main)

[Sun Pharmaceutical, It, its]
its
Sun Pharmaceutical
~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~


IndexError: list index out of range

In [115]:
from pprint import pprint
pprint(doc._.coref_scores)

{Sun Pharmaceutical: {Sun Pharmaceutical: 0.8751775026321411},
 the NYSE exchange: {Sun Pharmaceutical: -1.158556342124939,
                     the NYSE exchange: 0.4251871109008789,
                     NYSE: -1.5458831787109375},
 NYSE: {Sun Pharmaceutical: -1.5722846984863281, NYSE: 3.334199905395508},
 It: {Sun Pharmaceutical: 6.316093444824219,
      the NYSE exchange: -1.129673719406128,
      NYSE: -1.7919954061508179,
      It: 0.2531677484512329},
 a bond sale: {Sun Pharmaceutical: -2.3145387172698975,
               the NYSE exchange: -1.9362432956695557,
               NYSE: -2.042330265045166,
               It: -1.4826641082763672,
               a bond sale: 1.732793927192688},
 paying completing its $4 billion deal for Ranbaxy Laboratories: {Sun Pharmaceutical: -1.9962166547775269,
                                                                  the NYSE exchange: -2.0024654865264893,
                                                                  NYSE: -2.2155206203

In [111]:
token_list = []
for token in doc:
    token_list.append(token.text)
print(token_list)

['We', 'are', 'looking', 'for', 'a', 'region', 'of', 'central', 'Italy', 'bordering', 'the', 'Adriatic', 'Sea', '.', 'The', 'area', 'is', 'mostly', 'mountainous', 'and', 'includes', 'Mt.', 'Corno', ',', 'the', 'highest', 'peak', 'of', 'the', 'mountain', 'range', '.', 'It', 'also', 'includes', 'many', 'sheep', 'and', 'an', 'Italian', 'entrepreneur', 'has', 'an', 'idea', 'about', 'how', 'to', 'make', 'a', 'little', 'money', 'of', 'them', '.']


In [109]:
print(doc._.coref_scores.keys())

[We, a region of central Italy bordering the Adriatic Sea, central Italy bordering the Adriatic Sea, Italy, the Adriatic Sea, The area, Mt. Corno, Mt. Corno, the highest peak of the mountain range, the mountain range, It, many sheep and an Italian entrepreneur, Italian, an Italian entrepreneur, an idea about how to make a little money of them, how to make a little money of them, a little money of them, them]


In [34]:
span = doc[-1:]
print(span._.is_coref)
print(span._.coref_cluster.main)
print(span._.coref_cluster.main._.coref_cluster)


False


AttributeError: 'NoneType' object has no attribute 'main'

In [32]:
token = doc[-1]
token._.in_coref
token._.coref_clusters

[]

In [79]:
# You're done. You can now use NeuralCoref as you usually manipulate a SpaCy document annotations.
doc1 = nlp('The legal pressures facing Michael Cohen are growing in a wide-ranging investigation of his personal business affairs and his work on behalf of his former client, President Trump.  In addition to his work for Mr. Trump, he pursued his own business interests, including ventures in real estate, personal loans and investments in taxi medallions.')
doc1._.has_coref
doc1._.coref_clusters

[Michael Cohen: [Michael Cohen, his, his, his, his, he, his]]

In [80]:
print(doc1._.coref_clusters[0].mentions)
print(doc1._.coref_clusters[0].mentions[-1])
print(doc1._.coref_clusters[0].mentions[-1]._.coref_cluster.main)

[Michael Cohen, his, his, his, his, he, his]
his
Michael Cohen


In [99]:
for x in doc1._.coref_clusters[0].mentions:
    print(x, x.start)

Michael Cohen 4
his 15
his 20
his 25
his 36
he 42
his 44


In [100]:
for token in doc1:
    if token._.in_coref:
        for cluster in token._.coref_clusters:
            print(token.text , " REFERS TO", cluster.main.text)

Michael  REFERS TO Michael Cohen
Cohen  REFERS TO Michael Cohen
his  REFERS TO Michael Cohen
his  REFERS TO Michael Cohen
his  REFERS TO Michael Cohen
his  REFERS TO Michael Cohen
he  REFERS TO Michael Cohen
his  REFERS TO Michael Cohen


In [98]:
mlist[45]

'own'

In [2]:
from pymongo import MongoClient

mongoClient = MongoClient('10.240.0.46', 36018, username='read',
                            password='fdfREsse', connectTimeoutMS=100000)

db=mongoClient["pharma_crawling"]
coll=db['vaibhav_test']

In [12]:
for cur in coll.find().skip(1000).limit(50):
    #print(cur['desc'])
    #print('~'*60)
    doc1 = nlp(cur['desc'])
    
    for x in doc1.ents:
        print(x.text, x.label_)
    
    print('-'*70)
    
    doc1._.has_coref
    print(doc1._.coref_clusters)
    
    break

Allergan PERSON
$40 billion MONEY
Actavis ORG
the Justice Department's ORG
SEC ORG
Thursday DATE
June 25 DATE
DOJ ORG
Antitrust Division ORG
Company ORG
four CARDINAL
Bloomberg PERSON
Vermont GPE
Bernie Sanders Rapidly PERSON
Medicare ORG
10% PERCENT
a single year DATE
between July of 2013 and 2014 DATE
Vermont GPE
Bernie Sanders PERSON
DOJ Antitrust ORG
last fall DATE
Actavis ORG
fourth ORDINAL
Impax Laboratories ORG
IPXL ORG
Lannett PERSON
Par Pharmaceutical PERSON
DOJ ORG
Policy and Regulatory Report ORG
DOJ ORG
Allergan PERSON
DOJ ORG
just two days DATE
last week DATE
Teva PERSON
TEVA PERSON
$40.5 billion MONEY
Allergan PERSON
$66 billion MONEY
Botox ORG
SEC ORG
Bloomberg PERSON
Allergan PERSON
DOJ ORG
Allergan PERSON
Congress ORG
Generics NORP
U.S. GPE
----------------------------------------------------------------------
[Allergan ($AGN): [Allergan ($AGN), its, its, it, The drugmaker, its, it, It, it], Actavis: [Actavis, Actavis, Actavis], the four drugmakers: [the four drugmaker

In [4]:
doc2 = nlp("It's discussing a $2.5 billion buyoutof ZS Pharma")
for x1 in doc2.ents:
    print(x1.text,x1.label_)

$2.5 billion MONEY


In [58]:
from nltk.parse.corenlp import CoreNLPParser
st1 = CoreNLPParser(url="http://localhost:9000", tagtype='ner')

In [59]:
op = st1.tag_sents([["Horizon hikes Depomed offer to $2B in bid for 'friendly' deal talks"]])
print(op)

[[('Horizon', 'O'), ('hikes', 'O'), ('Depomed', 'ORGANIZATION'), ('offer', 'O'), ('to', 'O'), ('$', 'MONEY'), ('2B', 'MONEY'), ('in', 'O'), ('bid', 'O'), ('for', 'O'), ('`', 'O'), ('friendly', 'O'), ("'", 'O'), ('deal', 'O'), ('talks', 'O')]]


In [24]:
print(len(sentences))

74


In [39]:
print(predicted_labels,len(predicted_labels))

[0, 0, 1, 1, 0, 0, 1, 1, 1, 0, 1, 1, 1, 1, 0, 0, 1, 1, 1, 0, 0, 1, 1, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 1, 0, 1, 1, 1, 1, 1, 1, 0, 0, 0, 1, 0, 1, 1, 1, 1, 1, 1, 0, 0, 0, 0, 1, 1, 1, 1, 1, 0, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0] 74


In [74]:
import re
output =[]
for i in range(len(sentences) ):
    d ={}
    org_set,money_set =set(),set()
    if(predicted_labels[i] == 1):
        tup_list = st1.tag_sents([[sentences[i]]])
#         print(tup_list)
        val,temp ='',''
        for j in tup_list[0]:
            if(j[1] == 'O' and temp):
                if(temp == 'MONEY' and val and not re.search('[$] [A-Za-z]+',val) ):
                    money_set.add(val.lower())
                elif((temp == 'ORGANIZATION' or temp == 'PERSON') and val and not re.search('[$] [A-Za-z]+',val)):
                    org_set.add(val)
                
                val =''
            
            if(j[1]!= 'O'):
                val+= j[0]+ ' '
                temp = j[1]
                
        if(temp):
            if(temp == 'MONEY' and val and not re.search('[$] [A-Za-z]+',val) ):
                money_set.add(val.lower())
            elif((temp == 'ORGANIZATION' or temp == 'PERSON') and val and not re.search('[$] [A-Za-z]+',val)):
                org_set.add(val)
#         print(sentences[i])
#         print(org_list)
#         print(money_list)
        
    d['sentence'] = sentences[i]
    d['status'] = predicted_labels[i]
    d['organizations/persons'] = list(org_set)
    d['money'] = list(money_set)
    output.append(d)

#print(output)    
df_output = pd.DataFrame(output)
#print(df_output)            

In [75]:
df_output.head(10)

Unnamed: 0,sentence,status,organizations/persons,money
0,Allergan ($AGN) may be shedding its background...,0,[],[]
1,Allergan received the DOJ inquiry just two day...,0,[],[]
2,It recently took on the Allergan name after co...,1,[Allergan ],[$ 66 billion ]
3,"Horizon has offered $2 billion for Depomed, bu...",1,[Depomed ],[$ 2 billion ]
4,The Hyperion acquisition was completed on May ...,0,[],[]
5,Adjusted operating cash flow in the second qua...,0,[],[]
6,Merck sold Bayer its consumer business for $14...,1,"[Bayer , Merck ]",[$ 14 billion ]
7,Sanofi deepens its Evotec ties with a $330M di...,1,[Sanofi ],[$ 330m ]
8,Teva Reinforces Leadership Position in Respira...,1,"[Teva , Acquisition of Gecko Health Innovations ]",[]
9,As Leerink Partners' Jason Gerberry wrote in a...,0,[],[]


In [76]:
df_output.to_csv('deal_updated.csv')