In [1]:
from underthesea import word_tokenize, sent_tokenize
from string import punctuation
from sklearn.model_selection import train_test_split
from collections import Counter

In [2]:
def train_preprocess(text):
    """Split raw text into sentences and return each as a list of clean tokens.

    Every sentence produced by ``sent_tokenize`` is segmented with
    ``word_tokenize(..., format='text')`` and split on whitespace. Tokens are
    lower-cased; tokens made up solely of punctuation characters and tokens
    found in the module-level ``stop_words`` collection are dropped.

    Args:
        text: Raw input string; may contain several sentences.

    Returns:
        A list with one entry per sentence, each entry being the list of
        remaining lower-cased tokens (an entry may be empty).
    """
    # Build the lookup set once per call instead of scanning the stop-word
    # list for every single token.
    stop_set = set(stop_words)

    res = []
    for sent in sent_tokenize(text):
        tokens = []
        for word in word_tokenize(sent, format='text').split():
            # `word in punctuation` is only a substring test and lets tokens
            # such as "..." or "!?" through, so inspect every character.
            if all(ch in punctuation for ch in word):
                continue
            lowered = word.lower()
            # Also compare the lower-cased form so that capitalised stop words
            # (e.g. at the start of a sentence) are filtered out as well.
            if word in stop_set or lowered in stop_set:
                continue
            tokens.append(lowered)
        res.append(tokens)
    return res

In [3]:
# Load the stop-word list, one entry per line. The context manager closes the
# file even when reading fails; blank lines and surrounding whitespace are
# removed because such entries could never equal a whitespace-split token.
with open('wordset/stop_words.txt', 'r', encoding='utf-8') as r:
    stop_words = [entry.strip() for entry in r.read().split('\n') if entry.strip()]

In [4]:
def build_vocab(file, dataset, threshold=5):
    """Count token frequencies, keep the frequent words and write them to disk.

    Args:
        file: Path of the vocabulary file to (over)write, UTF-8 encoded, one
            word per line.
        dataset: Iterable of sentences, each an iterable of string tokens.
        threshold: Exclusive frequency cutoff; a word is kept only when it
            occurs strictly more than ``threshold`` times. Defaults to 5.

    Returns:
        dict mapping every kept word to its frequency, ordered by first
        appearance in ``dataset``.
    """
    counts = Counter(word for sent in dataset for word in sent)
    vocab = {word: freq for word, freq in counts.items() if freq > threshold}

    # The context manager guarantees the handle is flushed and closed even if
    # a write fails part-way through.
    with open(file, 'w', encoding='utf-8') as w:
        w.writelines(word + '\n' for word in vocab)

    return vocab

In [5]:
def test_preprocess(src_file, test_file, dev_file):
    """Build test/dev files from a labelled corpus, keeping single-aspect lines.

    Each line of ``src_file`` is split on ``'|'``: field 0 is the raw sentence
    and every odd-indexed field after it is read as ``ASPECT#...``, of which
    only the part before ``'#'`` is used. A line is skipped when it has two or
    fewer fields, when ``train_preprocess`` returns no sentence for it, or when
    it names more than one distinct aspect. Only the first sentence returned
    by ``train_preprocess`` is kept for a line.

    The surviving (sentence, aspect) pairs are divided with
    ``train_test_split(test_size=0.4, random_state=42)``; the first part it
    returns is written to ``test_file`` and the held-out 40% part to
    ``dev_file``, one ``sentence|aspect`` pair per line.

    Args:
        src_file: Path of the labelled input file (UTF-8).
        test_file: Output path for the first part of the split.
        dev_file: Output path for the second part of the split.

    Returns:
        The list of aspect labels of all kept sentences, in input order
        (i.e. before the split).
    """
    with open(src_file, 'r', encoding='utf-8') as r:
        # Iterating the handle keeps a final line that lacks a trailing
        # newline, which `read().split('\n')[:-1]` would silently discard.
        text = [line.rstrip('\n') for line in r]

    sents = []
    aspects = []
    for line in text:
        ele = line.split('|')
        # Cheap structural check first, so unusable lines are never tokenized.
        if len(ele) <= 2:
            continue

        sent = train_preprocess(ele[0])
        if len(sent) == 0:
            continue

        # Distinct aspect categories named on this line.
        found = {ele[i].split('#')[0] for i in range(1, len(ele), 2)}
        if len(found) == 1:
            sents.append(' '.join(sent[0]))
            aspects.append(found.pop())

    sents_test, sents_dev, aspects_test, aspects_dev = train_test_split(
        sents, aspects, test_size=0.4, random_state=42
    )

    outputs = (
        (test_file, sents_test, aspects_test),
        (dev_file, sents_dev, aspects_dev),
    )
    for path, part_sents, part_aspects in outputs:
        # `with` closes each output file even if a write raises.
        with open(path, 'w', encoding='utf-8') as w:
            for sent, asp in zip(part_sents, part_aspects):
                w.write(sent + '|' + asp + '\n')

    return aspects

## Hotel

In [6]:
# Read the unlabelled hotel corpus, one raw text per line.
with open('original/unlabelled_hotel.txt', 'r', encoding='utf-8') as r:
    # Iterating the handle keeps a last line that has no trailing newline,
    # which `read().split('\n')[:-1]` would drop.
    prehotel = [line.rstrip('\n') for line in r]

# Flatten all texts into a single list of tokenized sentences. `extend` grows
# the list in place; rebuilding it with unpacking on every iteration copied
# the whole list each time (quadratic in the number of sentences).
hotel = []
for text in prehotel:
    hotel.extend(train_preprocess(text))
len(hotel)

47835

In [7]:
# Build the hotel vocabulary from the tokenized sentences and save it to disk.
hotel_vocab = build_vocab(
    file='wordset/hotel_vocab.txt',
    dataset=hotel,
)

In [8]:
# Write the hotel training corpus: every sentence is reduced to its
# in-vocabulary words and kept only if more than two words remain.
# The context manager closes the file even if a write fails.
with open('processed/hotel_train.txt', 'w', encoding='utf-8') as w:
    for sent in hotel:
        kept = [word for word in sent if word in hotel_vocab]
        if len(kept) > 2:
            w.write(' '.join(kept) + '\n')

In [9]:
# Create the hotel test/dev files; the returned list holds the aspect label
# of every sentence that was kept.
hotel_aspects = test_preprocess(
    src_file='original/labelled_hotel.txt',
    test_file='processed/hotel_test.txt',
    dev_file='processed/hotel_dev.txt',
)

In [10]:
# Number of kept hotel sentences per aspect label.
len_asp = Counter()
len_asp.update(hotel_aspects)
len_asp

Counter({'ROOM_AMENITIES': 135,
         'SERVICE': 223,
         'HOTEL': 234,
         'FACILITIES': 59,
         'ROOMS': 159,
         'LOCATION': 84,
         'FOOD&DRINKS': 94})

## Restaurant

In [11]:
# Read the unlabelled restaurant corpus, one raw text per line.
with open('original/unlabelled_restaurant.txt', 'r', encoding='utf-8') as r:
    # Iterating the handle keeps a last line that has no trailing newline,
    # which `read().split('\n')[:-1]` would drop.
    prerestaurant = [line.rstrip('\n') for line in r]

# Flatten all texts into a single list of tokenized sentences. `extend` grows
# the list in place; rebuilding it with unpacking on every iteration copied
# the whole list each time (quadratic in the number of sentences).
restaurant = []
for text in prerestaurant:
    restaurant.extend(train_preprocess(text))
len(restaurant)

35485

In [12]:
# Build the restaurant vocabulary from the tokenized sentences and save it to disk.
restaurant_vocab = build_vocab(
    file='wordset/restaurant_vocab.txt',
    dataset=restaurant,
)

In [13]:
# Write the restaurant training corpus: every sentence is reduced to its
# in-vocabulary words and kept only if more than two words remain.
# The context manager closes the file even if a write fails.
with open('processed/restaurant_train.txt', 'w', encoding='utf-8') as w:
    for sent in restaurant:
        kept = [word for word in sent if word in restaurant_vocab]
        if len(kept) > 2:
            w.write(' '.join(kept) + '\n')

In [14]:
# Create the restaurant test/dev files; the returned list holds the aspect
# label of every sentence that was kept.
restaurant_aspects = test_preprocess(
    src_file='original/labelled_restaurant.txt',
    test_file='processed/restaurant_test.txt',
    dev_file='processed/restaurant_dev.txt',
)

In [15]:
# Number of kept restaurant sentences per aspect label.
len_asp = Counter()
len_asp.update(restaurant_aspects)
len_asp

Counter({'FOOD': 604,
         'RESTAURANT': 92,
         'LOCATION': 36,
         'AMBIENCE': 48,
         'SERVICE': 48,
         'DRINKS': 54})