In [11]:
import os
import re
import codecs
import nltk

In [12]:
def process(text):
    """Normalize one raw sentence and return it as a space-joined token string.

    The steps run in this order:

    1. Lowercase the whole string.
    2. Add a space after each opening entity tag (``<e1>``, ``<e2>``) and
       before each closing tag (``</e1>``, ``</e2>``).
    3. Replace every ``digits.digits`` number with the placeholder
       ``NUMERICAL``.
    4. Replace every run of ASCII letters/dots ending in ``.com`` or ``.org``
       with the placeholder ``URL``.
    5. Tokenize with NLTK's ``TweetTokenizer`` and join the tokens with single
       spaces.

    Args:
        text (str): the raw sentence, including its entity tags.

    Returns:
        str: the normalized, tokenized sentence.
    """
    tokenizer = nltk.tokenize.TweetTokenizer()
    text = text.lower()

    # The tags are plain literals, so a string replace is enough.
    for tag in ('<e1>', '<e2>'):
        text = text.replace(tag, tag + ' ')
    for tag in ('</e1>', '</e2>'):
        text = text.replace(tag, ' ' + tag)

    # Raw strings keep `\d` and `\.` as regex escapes instead of (invalid)
    # Python string escapes. The placeholders are inserted after lowercasing,
    # so they stay upper-case in the output.
    text = re.sub(r'\d+\.\d+', 'NUMERICAL', text)
    text = re.sub(r'[a-zA-Z\.]+\.com', 'URL', text)
    text = re.sub(r'[a-zA-Z\.]+\.org', 'URL', text)

    return ' '.join(tokenizer.tokenize(text))

In [13]:
# Parse the training file into parallel lists of processed sentences (`texts`)
# and their labels (`categories`).
# The `with` block guarantees the file handle is closed once the lines are read.
with codecs.open('./SemEval2010_task8_training/TRAIN_FILE.TXT', 'r', encoding='utf8') as train_file:
    lines = train_file.readlines()

texts = []
categories = []

for index, line in enumerate(lines):
    # Only lines containing a tab are treated as sentence records.
    if '\t' in line:
        line = line.strip()
        # Take the second tab-separated field and drop its first and last
        # character (presumably the surrounding quotes -- confirm against the
        # data file).
        text = line.split('\t')[1][1:-1]
        text = process(text)
        # The label is read from the line immediately after the sentence line.
        category = lines[index + 1].strip()
        texts.append(text)
        categories.append(category)

# Sanity check: exactly 8000 sentence/label pairs are expected.
assert len(texts) == 8000
assert len(categories) == 8000

In [14]:
from sklearn.model_selection import StratifiedKFold

# 10-fold stratified split with a fixed seed so the partition is reproducible.
sk_fold = StratifiedKFold(n_splits=10, random_state=1229, shuffle=True)

# Walk through every fold; once the loop finishes, the names hold the index
# arrays of the LAST fold, which is the one used as the train/valid partition.
train_ids, valid_ids = [], []
for train_ids, valid_ids in sk_fold.split(texts, categories):
    pass

# Materialize the selected sentences and labels for each side of the split.
train_texts = [texts[i] for i in train_ids]
train_categories = [categories[i] for i in train_ids]

valid_texts = [texts[i] for i in valid_ids]
valid_categories = [categories[i] for i in valid_ids]

print("Train Data Size: {}".format(len(train_texts)))
print("Valid Data Size: {}".format(len(valid_texts)))

Train Data Size: 7208
Valid Data Size: 792


In [15]:
# NOTE: disabled alternative to the StratifiedKFold split above -- a plain
# seeded shuffle of all indices, with the first 10% held out for validation.
# `valid_rate` is assigned but never used; the slices use the literal 0.1.
# import random

# random.seed(666)
# indexs = [i for i in range(len(texts))]
# random.shuffle(indexs)
# valid_rate = 0.1

# valid_ids = indexs[: int(len(texts) * 0.1)]
# train_ids = indexs[int(len(texts) * 0.1): ]

# train_texts = []
# train_categories = []

# valid_texts = []
# valid_categories = []

# for train_id in train_ids:
#     train_texts.append(texts[train_id])
#     train_categories.append(categories[train_id])

# for valid_id in valid_ids:
#     valid_texts.append(texts[valid_id])
#     valid_categories.append(categories[valid_id])
    
# print("Train Data Size: {}".format(len(train_texts)))
# print("Valid Data Size: {}".format(len(valid_texts)))

In [16]:
# NOTE: disabled alternative to the StratifiedKFold split above -- an
# unstratified 10-fold KFold that keeps the indices of the last fold.
# NOTE(review): random_state is passed without shuffle=True; recent
# scikit-learn releases appear to reject that combination -- confirm before
# re-enabling this cell.
# from sklearn.model_selection import KFold
# sk_fold = KFold(n_splits=10, random_state=666)
# train_ids = []
# valid_ids = []

# for train_idx, valid_idx in sk_fold.split(texts, categories):
#     train_ids = train_idx
#     valid_ids = valid_idx

# train_texts = []
# train_categories = []

# valid_texts = []
# valid_categories = []

# for train_id in train_ids:
#     train_texts.append(texts[train_id])
#     train_categories.append(categories[train_id])

# for valid_id in valid_ids:
#     valid_texts.append(texts[valid_id])
#     valid_categories.append(categories[valid_id])
    
# print("Train Data Size: {}".format(len(train_texts)))
# print("Valid Data Size: {}".format(len(valid_texts)))

In [17]:
def write2file(texts, categories, target_file):
    """Write paired texts and categories to a UTF-8, tab-separated file.

    Each output line holds three tab-separated fields -- a 1-based running
    index, the text and its category -- and ends with a newline. The file is
    created or overwritten.

    Args:
        texts: iterable of sentence strings.
        categories: iterable of labels, paired positionally with ``texts``.
            Pairing uses ``zip``, so extra items in the longer iterable are
            silently ignored.
        target_file (str): path of the file to write.

    Returns:
        None
    """
    # The context manager closes the handle even if a write raises.
    with codecs.open(target_file, 'w', encoding='utf8') as out:
        for index, (text, category) in enumerate(zip(texts, categories), start=1):
            out.write("{}\t{}\t{}\n".format(index, text, category))

In [18]:
# Persist the training split first, then the validation split.
for split_texts, split_categories, split_path in (
    (train_texts, train_categories, './train.txt'),
    (valid_texts, valid_categories, './valid.txt'),
):
    write2file(split_texts, split_categories, split_path)

In [19]:
# NOTE: disabled cell that would build test.txt by reading TEST_FILE.txt and
# TEST_FILE_KEY.TXT in lockstep and writing id, processed sentence and key.
# NOTE(review): `target_file` is never closed here, and the key file is opened
# without an explicit mode or encoding -- fix both before re-enabling.
# target_file = codecs.open('test.txt', 'w', encoding='utf8')
# with codecs.open('./SemEval2010_task8_testing/TEST_FILE.txt', 'r', encoding='utf8') as f, codecs.open('./SemEval2010_task8_testing_keys/TEST_FILE_KEY.TXT') as f1:
#     for line, line1 in zip(f.readlines(), f1.readlines()):
#         line = line.strip()
#         items = line.split('\t')
        
#         line1 = line1.strip()
#         items1 = line1.split('\t')
        
#         target_file.write('{}\t{}\t{}\n'.format(items[0], process(items[1][1:-1]), items1[1]))

In [20]:
# Parse the combined test file the same way as the training file and write the
# result to ./test.txt.
# NOTE: this cell rebinds `lines`, `texts` and `categories`; re-running the
# split cell after it would operate on the test data instead of the training
# data.
# The `with` block guarantees the file handle is closed once the lines are read.
with codecs.open('./SemEval2010_task8_testing_keys/TEST_FILE_FULL.TXT', 'r', encoding='utf8') as test_file:
    lines = test_file.readlines()

texts = []
categories = []

for index, line in enumerate(lines):
    # Only lines containing a tab are treated as sentence records.
    if '\t' in line:
        line = line.strip()
        # Take the second tab-separated field and drop its first and last
        # character (presumably the surrounding quotes -- confirm against the
        # data file).
        text = line.split('\t')[1][1:-1]
        text = process(text)
        # The label is read from the line immediately after the sentence line.
        category = lines[index + 1].strip()
        texts.append(text)
        categories.append(category)

write2file(texts, categories, './test.txt')