In [1]:
import os
import pandas as pd
import spacy
import nltk
from nltk.corpus import stopwords
import re
from tqdm import tqdm

In [2]:
# Make sure the NLTK stop-word corpus is available locally; the call's result
# is the cell's displayed value.
nltk.download('stopwords')

[nltk_data] Downloading package stopwords to
[nltk_data]     /Users/victor/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


True

In [3]:
# Spanish spaCy pipeline; get_pos() calls it to tag single words.
nlp = spacy.load('es_core_news_sm')
# NLTK Spanish stop-word list; the extraction loops drop tokens that match an
# entry exactly.
stops = stopwords.words('spanish')

In [4]:
def get_pos(x):
    """Return the spaCy part-of-speech tag of the first token of ``x``.

    Parameters
    ----------
    x : str
        Text to tag; only its first token is looked at.

    Returns
    -------
    str
        ``nlp(x)[0].pos_``, or the placeholder ``'DUMMY'`` when tagging fails
        (for example when ``x`` yields no tokens and indexing raises).
    """
    try:
        return nlp(x)[0].pos_
    except Exception:
        # The placeholder has to be returned explicitly; as a bare expression
        # it would be discarded and the function would yield None.
        # `Exception` (not a bare except) lets KeyboardInterrupt propagate.
        return 'DUMMY'

In [5]:
# Build the token-level training table from the subtrack1 .txt/.ann pairs.
train_dir = '../raw_data/train/subtrack1/'
# One of these characters directly before a space is cut off the end of the token.
trailing_punct = (',', '.', '?', '%', ';', '-', '+', '&')
# Hash lookup instead of scanning the stop-word list once per token.
stop_set = set(stops)

document_count = 0  # never incremented; kept so the name stays defined
documents = []
sentences = []
words = []
tags = []
poss = []

for file in tqdm(os.listdir(train_dir)):
    if not file.endswith('.txt'):
        continue

    document = file[:-len('.txt')]
    annfile = document + '.ann'

    # Context managers close both handles even if parsing fails further down.
    with open(os.path.join(train_dir, file), 'r', encoding='utf-8') as fh:
        lines = fh.read()
    with open(os.path.join(train_dir, annfile), 'r', encoding='utf-8') as fh:
        annotations = fh.readlines()

    # (start, end) character offsets of every annotated token -> entity tag.
    annotation_dict = {}
    for annotation in annotations:
        annotation = annotation.replace('\n', '').replace('\t', ' ')
        # startswith() also copes with blank lines, where annotation[0] would raise.
        if not annotation.startswith('T'):
            continue
        fields = annotation.split()
        label = fields[1]
        id1 = int(fields[2])
        # Not used below; parsing it keeps the failure on a non-numeric end offset.
        span_end = int(fields[3])
        # Each whitespace-separated piece of the annotated text gets its own
        # key, assuming exactly one character between consecutive pieces.
        for v in fields[4:]:
            annotation_dict[(id1, id1 + len(v))] = label
            id1 += len(v) + 1

    # NOTE(review): tokens are only closed by a space, so text after the last
    # space is never emitted and a token may span a newline — confirm intended.
    sentence_count = 0
    token_start = 0
    for char_idx, c in enumerate(lines):
        if c == ' ':
            token_end = char_idx
            # The char_idx > 0 guard stops index -1 from wrapping around to the
            # last character of the text when the document starts with a space.
            if char_idx > 0 and lines[char_idx - 1] in trailing_punct:
                token_end = char_idx - 1
            word = lines[token_start:token_end]
            # NOTE(review): the stop-word test sees the original casing while
            # the stored word is lower-cased — presumably capitalised stop
            # words slip through; confirm before changing.
            if len(word) > 0 and word not in stop_set:
                documents.append(document)
                sentences.append(sentence_count)
                words.append(word.lower())
                poss.append(get_pos(word))
                if len(annotation_dict) > 0:
                    tags.append(annotation_dict.get((token_start, token_end), "OTHER"))
                else:
                    # Documents without any T-annotation are labelled NULL throughout.
                    tags.append('NULL')
            token_start = char_idx + 1
        if c == '\n':
            sentence_count += 1

subtask1_train = pd.DataFrame({
    'document': documents,
    'sentence': sentences,
    'word': words,
    'tag': tags,
    'pos': poss,
})

100%|██████████| 1000/1000 [08:35<00:00,  1.94it/s]


In [6]:
# Distribution of part-of-speech tags over the training tokens.
subtask1_train['pos'].value_counts()

NOUN     52251
ADJ      23696
VERB     13541
NUM       6019
ADV       3502
PROPN     3451
ADP       2011
PUNCT     1984
PRON      1791
DET       1713
AUX        776
CONJ       257
SCONJ      234
INTJ       170
SYM         32
SPACE        4
Name: pos, dtype: int64

In [7]:
# Distribution of entity tags over the training tokens.
train_tag_counts = subtask1_train['tag'].value_counts()
print(train_tag_counts)

OTHER               93482
NULL                14188
NORMALIZABLES        2174
PROTEINAS            1501
UNCLEAR                68
NO_NORMALIZABLES       19
Name: tag, dtype: int64


In [8]:
# Size of the training table: number of documents, the largest number of
# distinct sentence ids in one document, and the largest number of distinct
# words sharing one (document, sentence) pair.
n_documents = subtask1_train['document'].nunique()
max_sentences = subtask1_train.groupby(['document'])['sentence'].nunique().max()
max_words = subtask1_train.groupby(['document', 'sentence'])['word'].nunique().max()
print(n_documents)
print(max_sentences)
print(max_words)

500
33
359


In [9]:
# Build the token-level dev table (with character offsets) from the subtrack1
# .txt/.ann pairs.
dev_dir = '../raw_data/dev/subtrack1/'
# One of these characters directly before a space is cut off the end of the token.
trailing_punct = (',', '.', '?', '%', ';', '-', '+', '&')
# Hash lookup instead of scanning the stop-word list once per token.
stop_set = set(stops)

document_count = 0  # never incremented; kept so the name stays defined
documents = []
sentences = []
words = []
tags = []
n1s = []
n2s = []
poss = []

for file in tqdm(os.listdir(dev_dir)):
    if not file.endswith('.txt'):
        continue

    document = file[:-len('.txt')]
    annfile = document + '.ann'

    # Context managers close both handles even if parsing fails further down.
    with open(os.path.join(dev_dir, file), 'r', encoding='utf-8') as fh:
        lines = fh.read()
    with open(os.path.join(dev_dir, annfile), 'r', encoding='utf-8') as fh:
        annotations = fh.readlines()

    # (start, end) character offsets of every annotated token -> entity tag.
    annotation_dict = {}
    for annotation in annotations:
        annotation = annotation.replace('\n', '').replace('\t', ' ')
        # startswith() also copes with blank lines, where annotation[0] would raise.
        if not annotation.startswith('T'):
            continue
        fields = annotation.split()
        label = fields[1]
        id1 = int(fields[2])
        # Not used below; parsing it keeps the failure on a non-numeric end offset.
        span_end = int(fields[3])
        # Each whitespace-separated piece of the annotated text gets its own
        # key, assuming exactly one character between consecutive pieces.
        for v in fields[4:]:
            annotation_dict[(id1, id1 + len(v))] = label
            id1 += len(v) + 1

    # NOTE(review): tokens are only closed by a space, so text after the last
    # space is never emitted and a token may span a newline — confirm intended.
    sentence_count = 0
    token_start = 0
    for char_idx, c in enumerate(lines):
        if c == ' ':
            token_end = char_idx
            # The char_idx > 0 guard stops index -1 from wrapping around to the
            # last character of the text when the document starts with a space.
            if char_idx > 0 and lines[char_idx - 1] in trailing_punct:
                token_end = char_idx - 1
            word = lines[token_start:token_end]
            # NOTE(review): the stop-word test sees the original casing while
            # the stored word is lower-cased — presumably capitalised stop
            # words slip through; confirm before changing.
            if len(word) > 0 and word not in stop_set:
                documents.append(document)
                sentences.append(sentence_count)
                words.append(word.lower())
                poss.append(get_pos(word))
                if len(annotation_dict) > 0:
                    tags.append(annotation_dict.get((token_start, token_end), "OTHER"))
                else:
                    # Documents without any T-annotation are labelled NULL throughout.
                    tags.append('NULL')
                # Offsets of the emitted token, computed once above instead of
                # re-running the punctuation test.
                n1s.append(token_start)
                n2s.append(token_end)
            token_start = char_idx + 1
        if c == '\n':
            sentence_count += 1

subtask1_dev = pd.DataFrame({
    'document': documents,
    'sentence': sentences,
    'n1': n1s,
    'n2': n2s,
    'word': words,
    'tag': tags,
    'pos': poss,
})

100%|██████████| 500/500 [04:07<00:00,  1.13it/s]


In [10]:
# Distribution of part-of-speech tags over the dev tokens.
subtask1_dev['pos'].value_counts()

NOUN     25032
ADJ      11334
VERB      6632
NUM       2863
ADV       1656
PROPN     1535
ADP       1067
PUNCT      933
DET        894
PRON       863
AUX        343
CONJ       139
INTJ       104
SCONJ      103
SYM         15
SPACE        1
Name: pos, dtype: int64

In [11]:
# Distribution of entity tags over the dev tokens.
subtask1_dev['tag'].value_counts()

OTHER               47127
NULL                 4463
NORMALIZABLES        1048
PROTEINAS             836
UNCLEAR                27
NO_NORMALIZABLES       13
Name: tag, dtype: int64

In [12]:
# Size of the dev table: number of documents, the largest number of distinct
# sentence ids in one document, and the largest number of distinct words
# sharing one (document, sentence) pair.
n_documents = subtask1_dev['document'].nunique()
max_sentences = subtask1_dev.groupby(['document'])['sentence'].nunique().max()
max_words = subtask1_dev.groupby(['document', 'sentence'])['word'].nunique().max()
print(n_documents)
print(max_sentences)
print(max_words)

250
22
264


In [13]:
# Write the train and dev token tables to CSV, leaving out the index column.
for frame, csv_path in (
    (subtask1_train, '../data/subtask1_train.csv'),
    (subtask1_dev, '../data/subtask1_dev.csv'),
):
    frame.to_csv(csv_path, index=False)

In [14]:
# Build the token-level table for the background documents. Only the .txt
# files are read here, so the table carries no tag column.
background_dir = '../raw_data/background/'
# One of these characters directly before a space is cut off the end of the token.
trailing_punct = (',', '.', '?', '%', ';', '-', '+', '&')
# Hash lookup instead of scanning the stop-word list once per token.
stop_set = set(stops)

document_count = 0  # never incremented; kept so the name stays defined
documents = []
sentences = []
words = []
tags = []  # stays empty; kept so the name is reset like in the other cells
n1s = []
n2s = []
poss = []

for file in tqdm(os.listdir(background_dir)):
    if not file.endswith('.txt'):
        continue

    document = file[:-len('.txt')]

    # Context manager closes the handle as soon as the text has been read.
    with open(os.path.join(background_dir, file), 'r', encoding='utf-8') as fh:
        lines = fh.read()

    # No annotation lookup here: consulting a dictionary left over from another
    # cell would tie this cell to execution order, and its result was unused.
    # NOTE(review): tokens are only closed by a space, so text after the last
    # space is never emitted and a token may span a newline — confirm intended.
    sentence_count = 0
    token_start = 0
    for char_idx, c in enumerate(lines):
        if c == ' ':
            token_end = char_idx
            # The char_idx > 0 guard stops index -1 from wrapping around to the
            # last character of the text when the document starts with a space.
            if char_idx > 0 and lines[char_idx - 1] in trailing_punct:
                token_end = char_idx - 1
            word = lines[token_start:token_end]
            # NOTE(review): the stop-word test sees the original casing while
            # the stored word is lower-cased — presumably capitalised stop
            # words slip through; confirm before changing.
            if len(word) > 0 and word not in stop_set:
                documents.append(document)
                sentences.append(sentence_count)
                words.append(word.lower())
                poss.append(get_pos(word))
                n1s.append(token_start)
                n2s.append(token_end)
            token_start = char_idx + 1
        if c == '\n':
            sentence_count += 1

subtask1_test = pd.DataFrame({
    'document': documents,
    'sentence': sentences,
    'n1': n1s,
    'n2': n2s,
    'word': words,
    'pos': poss,
})

100%|██████████| 3751/3751 [3:00:26<00:00,  1.06s/it]    


In [15]:
# Preview the first ten dev rows, including the n1/n2 character offsets.
subtask1_dev.head(10)

Unnamed: 0,document,sentence,n1,n2,word,tag,pos
0,S1139-76322016000300016-2,0,0,8,lactante,OTHER,ADV
1,S1139-76322016000300016-2,0,12,13,1,OTHER,NUM
2,S1139-76322016000300016-2,0,14,17,mes,OTHER,NOUN
3,S1139-76322016000300016-2,0,20,22,29,OTHER,NUM
4,S1139-76322016000300016-2,0,23,27,días,OTHER,NOUN
5,S1139-76322016000300016-2,0,33,45,antecedentes,OTHER,NOUN
6,S1139-76322016000300016-2,0,49,56,interés,OTHER,NOUN
7,S1139-76322016000300016-2,0,62,67,acude,OTHER,VERB
8,S1139-76322016000300016-2,0,70,79,urgencias,OTHER,PROPN
9,S1139-76322016000300016-2,0,83,92,pediatría,OTHER,VERB


In [16]:
# Preview the first ten rows of the table built from the background documents.
subtask1_test.head(10)

Unnamed: 0,document,sentence,n1,n2,word,pos
0,S0004-06142008000100008-1,0,0,11,presentamos,VERB
1,S0004-06142008000100008-1,0,15,19,caso,NOUN
2,S0004-06142008000100008-1,0,27,32,mujer,NOUN
3,S0004-06142008000100008-1,0,36,38,30,NUM
4,S0004-06142008000100008-1,0,39,43,años,NOUN
5,S0004-06142008000100008-1,0,45,53,fumadora,ADJ
6,S0004-06142008000100008-1,0,57,59,20,NUM
7,S0004-06142008000100008-1,0,60,75,cigarrillos/día,NOUN
8,S0004-06142008000100008-1,0,88,100,antecedentes,NOUN
9,S0004-06142008000100008-1,0,101,111,personales,ADJ


In [17]:
# Write the background-document token table to CSV, leaving out the index column.
subtask1_test.to_csv('../data/subtask1_test.csv',index=False)