In [1]:
from sklearn.feature_extraction.text import TfidfVectorizer
import numpy as np
from collections import Counter, OrderedDict

## Restaurant

In [2]:
# Restaurant aspect categories; each name maps to its integer label id,
# which is simply its position in this tuple.
_restaurant_aspects = (
    'RESTAURANT',
    'FOOD',
    'DRINKS',
    'LOCATION',
    'AMBIENCE',
    'SERVICE',
)
aspect2id = {name: idx for idx, name in enumerate(_restaurant_aspects)}

In [3]:
# Read the restaurant dev split and build two views of its text:
#   sents           -- every sentence concatenated into one string (each
#                      sentence followed by a single space)
#   sent_of_aspects -- {label id: [one string with all sentences of that aspect]}
# Each record is split on '|': field 0 is the sentence and field 1 is the
# aspect name, which is translated to a label id through `aspect2id`.

# `with` closes the file even if a malformed line raises below.
with open('processed/restaurant_dev.txt', 'r', encoding='utf-8') as r:
    # Drop blank lines instead of blindly discarding the last element: the old
    # `[:-1]` lost the final record whenever the file had no trailing newline.
    txt = [line for line in r.read().split('\n') if line.strip()]

# Collect pieces in lists and join once; repeated `str + str` is quadratic.
corpus_parts = []
aspect_parts = {}
for line in txt:
    fields = line.split('|')
    sent, aspect = fields[0], fields[1]
    label = aspect2id[aspect]  # KeyError here means an unknown aspect name
    corpus_parts.append(sent + ' ')
    aspect_parts.setdefault(label, []).append(sent + ' ')

sents = ''.join(corpus_parts)
# Keep the one-element-list shape: the vectorizer is later fitted on this list
# as a single document per aspect.
sent_of_aspects = {
    label: [''.join(parts)] for label, parts in aspect_parts.items()
}

In [4]:
# Weight every word over the whole corpus, treated as ONE document. With a
# single document the IDF factor is identical for all words, so after L1
# normalisation each weight is effectively the word's share of all term counts.
vectorizer = TfidfVectorizer(norm='l1')
t = vectorizer.fit_transform([sents])

# get_feature_names() was deprecated in scikit-learn 1.0 and removed in 1.2;
# use its replacement when available and fall back only on old installs.
if hasattr(vectorizer, 'get_feature_names_out'):
    feature_names = vectorizer.get_feature_names_out()
else:
    feature_names = vectorizer.get_feature_names()

# t: {word: corpus-level weight}
t = dict(zip(feature_names, t.toarray()[0]))



In [5]:
# For every aspect, rank its words by
#     score(w) = p_aspect(w) * log2(p_aspect(w) / p_corpus(w))
# where p_aspect is the L1-normalised TF-IDF weight inside the aspect's text
# and p_corpus (`t`) is the same weight over the whole corpus. The 30
# highest-scoring words become the aspect's seed words.
seeds = {}
for label in sent_of_aspects:
    vectorizer = TfidfVectorizer(norm='l1')
    ta = vectorizer.fit_transform(sent_of_aspects[label])

    # get_feature_names() was removed in scikit-learn 1.2; prefer the
    # replacement and fall back only on old installs.
    if hasattr(vectorizer, 'get_feature_names_out'):
        aspect_feature_names = vectorizer.get_feature_names_out()
    else:
        aspect_feature_names = vectorizer.get_feature_names()
    ta = dict(zip(aspect_feature_names, ta.toarray()[0]))

    score = {
        word: weight * np.log2(weight / t[word])
        for word, weight in ta.items()
    }

    # One stable descending sort replaces the former
    # sort -> Counter -> most_common chain; ties keep vocabulary order,
    # exactly as before.
    ranked = sorted(score.items(), key=lambda item: item[1], reverse=True)
    seeds[label] = [word for word, _ in ranked[:30]]
    

In [6]:
# Write the seed words to disk, one line per aspect in ascending label-id
# order. Every word is followed by '|' (including the last one on a line).
# NOTE(review): an aspect with no sentences in the dev file has no entry in
# `seeds`, so its line is simply absent -- confirm that readers of this file
# do not assume line i belongs to label i.
od = OrderedDict(sorted(seeds.items()))
# `with` guarantees the file is flushed and closed even if a write fails.
with open('wordset/restaurant_seeds.txt', 'w', encoding='utf-8') as w:
    for words in od.values():
        w.write(''.join(word + '|' for word in words) + '\n')

## Hotel

In [7]:
# Hotel aspect categories; each name maps to its integer label id,
# which is simply its position in this tuple.
_hotel_aspects = (
    'ROOM_AMENITIES',
    'SERVICE',
    'ROOMS',
    'LOCATION',
    'FOOD&DRINKS',
    'FACILITIES',
    'HOTEL',
)
aspect2id = {name: idx for idx, name in enumerate(_hotel_aspects)}

In [8]:
# Read the hotel dev split and build two views of its text:
#   sents           -- every sentence concatenated into one string (each
#                      sentence followed by a single space)
#   sent_of_aspects -- {label id: [one string with all sentences of that aspect]}
# Each record is split on '|': field 0 is the sentence and field 1 is the
# aspect name, which is translated to a label id through `aspect2id`.

# `with` closes the file even if a malformed line raises below.
with open('processed/hotel_dev.txt', 'r', encoding='utf-8') as r:
    # Drop blank lines instead of blindly discarding the last element: the old
    # `[:-1]` lost the final record whenever the file had no trailing newline.
    txt = [line for line in r.read().split('\n') if line.strip()]

# Collect pieces in lists and join once; repeated `str + str` is quadratic.
corpus_parts = []
aspect_parts = {}
for line in txt:
    fields = line.split('|')
    sent, aspect = fields[0], fields[1]
    label = aspect2id[aspect]  # KeyError here means an unknown aspect name
    corpus_parts.append(sent + ' ')
    aspect_parts.setdefault(label, []).append(sent + ' ')

sents = ''.join(corpus_parts)
# Keep the one-element-list shape: the vectorizer is later fitted on this list
# as a single document per aspect.
sent_of_aspects = {
    label: [''.join(parts)] for label, parts in aspect_parts.items()
}

In [9]:
# Weight every word over the whole corpus, treated as ONE document. With a
# single document the IDF factor is identical for all words, so after L1
# normalisation each weight is effectively the word's share of all term counts.
vectorizer = TfidfVectorizer(norm='l1')
t = vectorizer.fit_transform([sents])

# get_feature_names() was deprecated in scikit-learn 1.0 and removed in 1.2;
# use its replacement when available and fall back only on old installs.
if hasattr(vectorizer, 'get_feature_names_out'):
    feature_names = vectorizer.get_feature_names_out()
else:
    feature_names = vectorizer.get_feature_names()

# t: {word: corpus-level weight}
t = dict(zip(feature_names, t.toarray()[0]))

In [10]:
# For every aspect, rank its words by
#     score(w) = p_aspect(w) * log2(p_aspect(w) / p_corpus(w))
# where p_aspect is the L1-normalised TF-IDF weight inside the aspect's text
# and p_corpus (`t`) is the same weight over the whole corpus. The 30
# highest-scoring words become the aspect's seed words.
seeds = {}
for label in sent_of_aspects:
    vectorizer = TfidfVectorizer(norm='l1')
    ta = vectorizer.fit_transform(sent_of_aspects[label])

    # get_feature_names() was removed in scikit-learn 1.2; prefer the
    # replacement and fall back only on old installs.
    if hasattr(vectorizer, 'get_feature_names_out'):
        aspect_feature_names = vectorizer.get_feature_names_out()
    else:
        aspect_feature_names = vectorizer.get_feature_names()
    ta = dict(zip(aspect_feature_names, ta.toarray()[0]))

    score = {
        word: weight * np.log2(weight / t[word])
        for word, weight in ta.items()
    }

    # One stable descending sort replaces the former
    # sort -> Counter -> most_common chain; ties keep vocabulary order,
    # exactly as before.
    ranked = sorted(score.items(), key=lambda item: item[1], reverse=True)
    seeds[label] = [word for word, _ in ranked[:30]]

In [11]:
# Write the seed words to disk, one line per aspect in ascending label-id
# order. Every word is followed by '|' (including the last one on a line).
# NOTE(review): an aspect with no sentences in the dev file has no entry in
# `seeds`, so its line is simply absent -- confirm that readers of this file
# do not assume line i belongs to label i.
od = OrderedDict(sorted(seeds.items()))
# `with` guarantees the file is flushed and closed even if a write fails.
with open('wordset/hotel_seeds.txt', 'w', encoding='utf-8') as w:
    for words in od.values():
        w.write(''.join(word + '|' for word in words) + '\n')