In [None]:
import os,time
import re

import nltk
import numpy as np
import pandas as pd
import sklearn
from gensim.models import Word2Vec
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize, sent_tokenize
from sklearn import preprocessing
from sklearn import svm
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score, confusion_matrix
from sklearn.metrics import classification_report
from tqdm import tqdm_notebook
from tqdm.auto import tqdm


In [None]:
# Seed reused further down as the RandomForest `random_state`.
rnd = 42

# NLTK resources needed by `stopwords.words` and `word_tokenize`.
for nltk_resource in ('stopwords', 'punkt'):
    nltk.download(nltk_resource)

[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Package punkt is already up-to-date!


# Carregar dados

In [None]:
from google.colab import drive

# Mount Google Drive; the dataset paths used below live under this mount point.
drive_mount_point = '/content/drive'
drive.mount(drive_mount_point)

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


In [None]:
# Directories of the three BioNLP-OST-2019 BB-rel splits on the mounted Drive.
bionlp_root = '/content/drive/My Drive/BioNLP'

uri_dev = f'{bionlp_root}/BioNLP-OST-2019_BB-rel_dev'
url_test = f'{bionlp_root}/BioNLP-OST-2019_BB-rel_test'
uri_trein = f'{bionlp_root}/BioNLP-OST-2019_BB-rel_train'

In [None]:
# os.listdir returns entries in arbitrary order; sorting makes the row order of
# the loaded frames (and everything trained on them) reproducible across runs.
train_files = sorted(os.listdir(uri_trein))
valid_files = sorted(os.listdir(uri_dev))
test_files = sorted(os.listdir(url_test))

In [None]:
def get_data_files(files, types, base_dir='/content/drive/My Drive/BioNLP'):
    """Load the entity (.a1) and relation (.a2) annotations of one dataset split.

    Parameters
    ----------
    files : iterable of str
        File names found in the split directory. Only names ending in ``.a1``
        are processed; the matching ``.a2`` name is derived from them.
    types : str
        Split suffix used to build the directory name
        ``BioNLP-OST-2019_BB-rel_{types}`` (the notebook passes 'train'/'dev').
    base_dir : str, optional
        Folder that contains the split directories.

    Returns
    -------
    a1_df : pandas.DataFrame
        One row per entity annotation with columns ``Words`` (third
        tab-separated field), ``Entity`` (first token of the second field) and
        ``sc_ec`` (rest of the second field; presumably the character
        offsets -- TODO confirm).
    a2_df : pandas.DataFrame
        One row per relation annotation with columns ``0`` (first
        tab-separated field, kept because the notebook drops it later),
        ``Relation``, ``word_1`` and ``word_2``. The two word columns hold the
        ``Words`` text of the referenced entities, or NaN when the argument has
        no ``role:id`` form or the id is not in the ``.a1`` file.
    """
    data_dir = f'{base_dir}/BioNLP-OST-2019_BB-rel_{types}'

    # Collect per-file frames and concatenate once: DataFrame.append was
    # removed in pandas 2.0 and re-copies the accumulated frame on every call.
    a1_frames = []
    a2_frames = []

    for file in tqdm(files):
        if not file.endswith('.a1'):
            continue

        a1_path = f'{data_dir}/{file}'
        if os.path.getsize(a1_path) == 0:
            continue

        a1 = pd.read_csv(a1_path, sep='\t', header=None)
        type_and_span = a1[1].str.split(' ', n=1, expand=True)
        a1['Words'] = a1[2]
        a1['sc_ec'] = type_and_span[1]
        a1['Entity'] = type_and_span[0]
        a1 = a1.drop([1, 2], axis=1)
        a1_frames.append(a1)

        # splitext strips only the trailing '.a1', so stems containing other
        # dots are preserved.
        a2_path = f'{data_dir}/{os.path.splitext(file)[0]}.a2'
        if not os.path.exists(a2_path) or os.path.getsize(a2_path) == 0:
            continue

        a2 = pd.read_csv(a2_path, sep='\t', header=None)
        # "<Relation> <role>:<id> <role>:<id>"; .str[i] yields NaN instead of
        # raising when a row has fewer parts (e.g. arguments without ':').
        parts = a2[1].str.split(' ', n=2)
        arg1_id = parts.str[1].str.split(':', n=1).str[1]
        arg2_id = parts.str[2].str.split(':', n=1).str[1]

        # Annotation id (column 0 of the .a1 file) -> entity text.
        id_to_words = a1.drop_duplicates(subset=[0]).set_index(0)['Words']

        a2['Relation'] = parts.str[0]
        a2['word_1'] = arg1_id.map(id_to_words)
        a2['word_2'] = arg2_id.map(id_to_words)
        a2_frames.append(a2.drop([1], axis=1))

    if a1_frames:
        # The id column is only needed for the lookup above; drop it
        # unconditionally so the returned schema never depends on which file
        # happened to be processed last.
        a1_df = pd.concat(a1_frames, ignore_index=True).drop(columns=[0])
    else:
        a1_df = pd.DataFrame(columns=['Words', 'sc_ec', 'Entity'])

    if a2_frames:
        a2_df = pd.concat(a2_frames, ignore_index=True)
    else:
        a2_df = pd.DataFrame(columns=['Relation', 'word_1', 'word_2'])

    return a1_df, a2_df


In [None]:
# Entity (a1) and relation (a2) tables for the training and development splits.
train_a1, train_a2 = get_data_files(files=train_files, types='train')
valid_a1, valid_a2 = get_data_files(files=valid_files, types='dev')



Please use `tqdm.notebook.tqdm` instead of `tqdm.tqdm_notebook`
  


HBox(children=(FloatProgress(value=0.0, max=377.0), HTML(value='')))




HBox(children=(FloatProgress(value=0.0, max=194.0), HTML(value='')))




In [None]:
# Remove the 'Title' and 'Paragraph' annotations from both entity tables,
# in place, so only the remaining entity types are kept.
for entity_table in (train_a1, valid_a1):
    is_structural = entity_table['Entity'].isin(['Title', 'Paragraph'])
    entity_table.drop(entity_table.index[is_structural], axis=0, inplace=True)



In [None]:

for relation_table in (train_a2, valid_a2):
    # Column 0 holds the first field of the .a2 files and is not used as a
    # feature. errors='ignore' keeps this cell re-runnable: without it a second
    # execution (or a split with no relations) raises KeyError.
    relation_table.drop(columns=[0], errors='ignore', inplace=True)

    # 'Equiv' rows are discarded so only the remaining relation labels are kept.
    na_inx = relation_table[relation_table['Relation'] == 'Equiv'].index.values
    relation_table.drop(na_inx, inplace=True)


# stop words

In [None]:
_NON_ALNUM_RE = re.compile("[^a-zA-Z0-9]")
_STOPWORD_CACHE = {}


def _english_stopwords():
    """Return the NLTK English stop-word set, reading the corpus only once."""
    if "english" not in _STOPWORD_CACHE:
        _STOPWORD_CACHE["english"] = frozenset(stopwords.words("english"))
    return _STOPWORD_CACHE["english"]


def clean_data(txts):
    """Normalise a text: keep alphanumerics, lowercase, drop English stop words.

    Parameters
    ----------
    txts : str
        Raw text. Non-string values (e.g. None, or the NaN pandas uses for a
        missing cell) are treated as empty text instead of raising TypeError.

    Returns
    -------
    str
        The remaining tokens joined by single spaces; may be the empty string.
    """
    if not isinstance(txts, str):
        return ""

    # The stop-word list is cached: this function is applied once per row and
    # reloading the corpus on every call is wasted work.
    stops = _english_stopwords()
    tokens = _NON_ALNUM_RE.sub(" ", txts).lower().split()
    return " ".join(w for w in tokens if w not in stops)

In [None]:
# Clean both entity texts of every relation, then build the space-joined
# "entity1 entity2" string used for the combined-embedding features.
for relation_table in (train_a2, valid_a2):
    for entity_column in ('word_1', 'word_2'):
        relation_table[entity_column] = relation_table[entity_column].apply(clean_data)
    relation_table['combined_word'] = (
        relation_table[['word_1', 'word_2']].apply(' '.join, axis=1)
    )

In [None]:
# Plain Python lists of the combined entity text and the relation label,
# one entry per relation row, for each split.
train_words, train_relation = (
    train_a2[column].tolist() for column in ('combined_word', 'Relation')
)
valid_words, valid_relation = (
    valid_a2[column].tolist() for column in ('combined_word', 'Relation')
)

In [None]:
# Per-entity text lists (a later cell rebuilds them before they are used).
train_words1 = train_a2['word_1'].tolist()
train_words2 = train_a2['word_2'].tolist()

valid_words1 = valid_a2['word_1'].tolist()
valid_words2 = valid_a2['word_2'].tolist()

# Tokenize each combined text once; the Word2Vec corpus is simply the training
# sentences followed by the validation sentences.
train = [word_tokenize(text) for text in train_words]
valid = [word_tokenize(text) for text in valid_words]
corpus = train + valid

# seed + workers=1: gensim training is only repeatable with a fixed seed and a
# single worker thread (multi-threaded runs depend on OS thread scheduling).
# NOTE(review): gensim >= 4.0 renamed `size` to `vector_size`; switch the
# keyword if the environment is upgraded.
model = Word2Vec(corpus, size=50, window=2, min_count=1, seed=rnd, workers=1)

# Corpus

In [None]:
# Recomputes the same four lists an earlier cell already produced from
# train_a2 / valid_a2 (combined entity text and relation label per row).
train_words, valid_words = (
    train_a2['combined_word'].tolist(),
    valid_a2['combined_word'].tolist(),
)
train_relation, valid_relation = (
    train_a2['Relation'].tolist(),
    valid_a2['Relation'].tolist(),
)

## Feature extraction W2V

In [None]:
# One row per training relation: the mean of the word vectors of its tokens.
# The width is read from the model instead of repeating the literal 50.
embedding_dim = model.wv.vector_size
X_train = np.zeros((len(train), embedding_dim))

for i, tokens in enumerate(train):
    # A text reduced to nothing by the cleaning step has no tokens; np.mean of
    # an empty list would write NaN into the row, so it is left as zeros.
    if tokens:
        X_train[i] = np.mean([model.wv[w] for w in tokens], axis=0)

In [None]:

# One row per validation relation: the mean of the word vectors of its tokens.
# The width is read from the model instead of repeating the literal 50.
embedding_dim = model.wv.vector_size
X_valid = np.zeros((len(valid), embedding_dim))

for i, tokens in enumerate(valid):
    # A text reduced to nothing by the cleaning step has no tokens; np.mean of
    # an empty list would write NaN into the row, so it is left as zeros.
    if tokens:
        X_valid[i] = np.mean([model.wv[w] for w in tokens], axis=0)

In [None]:
# Fit the label -> integer mapping on the training labels only and reuse it
# for validation. Calling fit_transform again would refit the encoder, so the
# same relation could get a different id whenever the two splits do not
# contain exactly the same set of labels. transform() raises ValueError on a
# validation label never seen in training, which makes that mismatch visible.
le = preprocessing.LabelEncoder()
Y_train = le.fit_transform(train_relation)
Y_valid = le.transform(valid_relation)

In [None]:
# Cleaned text of the first and second entity of every relation, per split.
train_words1 = train_a2['word_1'].tolist()
train_words2 = train_a2['word_2'].tolist()

valid_words1 = valid_a2['word_1'].tolist()
valid_words2 = valid_a2['word_2'].tolist()

# Each entity text is its own "sentence"; order is train entity 1, train
# entity 2, validation entity 1, validation entity 2.
corpus = [
    word_tokenize(text)
    for texts in (train_words1, train_words2, valid_words1, valid_words2)
    for text in texts
]

# seed + workers=1: gensim training is only repeatable with a fixed seed and a
# single worker thread (multi-threaded runs depend on OS thread scheduling).
# NOTE(review): gensim >= 4.0 renamed `size` to `vector_size`; switch the
# keyword if the environment is upgraded.
model = Word2Vec(corpus, size=50, window=2, min_count=1, seed=rnd, workers=1)

In [None]:
# Mean word vector of each training entity text: X1_train for the first
# entity of a relation, X2_train for the second.
embedding_dim = model.wv.vector_size
X1_train = np.zeros((len(train_words1), embedding_dim))
X2_train = np.zeros((len(train_words2), embedding_dim))

for features, texts in ((X1_train, train_words1), (X2_train, train_words2)):
    for i, text in enumerate(texts):
        words = word_tokenize(text)
        # An entity made only of stop words / punctuation is cleaned to '' and
        # has no tokens; np.mean of an empty list would store NaN, so the row
        # is left as zeros instead.
        if words:
            features[i] = np.mean([model.wv[w] for w in words], axis=0)

In [None]:
# Side-by-side features: first-entity embedding followed by second-entity embedding.
X_train = np.hstack((X1_train, X2_train))

In [None]:
# Mean word vector of each validation entity text: X1_valid for the first
# entity of a relation, X2_valid for the second.
embedding_dim = model.wv.vector_size
X1_valid = np.zeros((len(valid_words1), embedding_dim))
X2_valid = np.zeros((len(valid_words2), embedding_dim))

for features, texts in ((X1_valid, valid_words1), (X2_valid, valid_words2)):
    for i, text in enumerate(texts):
        words = word_tokenize(text)
        # An entity made only of stop words / punctuation is cleaned to '' and
        # has no tokens; np.mean of an empty list would store NaN, so the row
        # is left as zeros instead.
        if words:
            features[i] = np.mean([model.wv[w] for w in words], axis=0)

# Side-by-side features, same layout as the training matrix.
X_valid = np.concatenate((X1_valid, X2_valid), axis=1)

# Teste

In [None]:
# Train a seeded random forest on the training features and report
# per-class precision / recall / F1 on the validation split.
rf = RandomForestClassifier(random_state=rnd).fit(X_train, Y_train)
pre = rf.predict(X_valid)
validation_report = classification_report(Y_valid, pre)
print(validation_report)

              precision    recall  f1-score   support

           0       0.90      0.34      0.50       154
           1       0.82      0.99      0.89       454

    accuracy                           0.82       608
   macro avg       0.86      0.67      0.70       608
weighted avg       0.84      0.82      0.79       608

