In [1]:
! pip install stanza
! pip install -U 'scikit-learn<0.24'
!pip install sklearn-crfsuite

Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/
Collecting stanza
  Downloading stanza-1.4.2-py3-none-any.whl (691 kB)
[K     |████████████████████████████████| 691 kB 3.9 MB/s 
Collecting emoji
  Downloading emoji-2.2.0.tar.gz (240 kB)
[K     |████████████████████████████████| 240 kB 54.9 MB/s 
Building wheels for collected packages: emoji
  Building wheel for emoji (setup.py) ... [?25l[?25hdone
  Created wheel for emoji: filename=emoji-2.2.0-py3-none-any.whl size=234926 sha256=a2ac05255e464715c451c81ddb30a0a2edb12c4361fe5a72077a28dad4db3d5c
  Stored in directory: /root/.cache/pip/wheels/86/62/9e/a6b27a681abcde69970dbc0326ff51955f3beac72f15696984
Successfully built emoji
Installing collected packages: emoji, stanza
Successfully installed emoji-2.2.0 stanza-1.4.2
Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/
Collecting scikit-learn<0.24
  Downloading scikit_learn-0.23.2-cp38-cp38

In [2]:
import pandas as pd
import stanza
from tqdm import tqdm
from collections import defaultdict
from sklearn.model_selection import train_test_split
import sklearn_crfsuite
from sklearn_crfsuite import scorers
from sklearn_crfsuite import metrics

In [3]:
stanza.download('ru')
nlp = stanza.Pipeline('ru', processors='tokenize,pos')

Downloading https://raw.githubusercontent.com/stanfordnlp/stanza-resources/main/resources_1.4.1.json:   0%|   …

INFO:stanza:Downloading default packages for language: ru (Russian) ...


Downloading https://huggingface.co/stanfordnlp/stanza-ru/resolve/v1.4.1/models/default.zip:   0%|          | 0…

INFO:stanza:Finished downloading models and saved to /root/stanza_resources.
INFO:stanza:Checking for updates to resources.json in case models have been updated.  Note: this behavior can be turned off with download_method=None or download_method=DownloadMethod.REUSE_RESOURCES


Downloading https://raw.githubusercontent.com/stanfordnlp/stanza-resources/main/resources_1.4.1.json:   0%|   …

INFO:stanza:Loading these models for language: ru (Russian):
| Processor | Package   |
-------------------------
| tokenize  | syntagrus |
| pos       | syntagrus |

INFO:stanza:Use device: cpu
INFO:stanza:Loading: tokenize
INFO:stanza:Loading: pos
INFO:stanza:Done loading processors!


Функция для BIO-тэггинга последовательностей

In [4]:
def bio_tagged_df(reviews, aspects):
  texts_ids = []
  texts = []
  pos_tags = []
  bio_tags = []
  starts = []
  stops = []

  for text_id, text in tqdm(reviews.items()):
  
    processed = nlp(text)
  
    for token in processed.iter_tokens():
    
      add = False
    
      for mention in aspects[text_id]:
        if token.start_char == int(mention['start']) and token.end_char <= int(mention['stop']):

          texts_ids.append(text_id)
          texts.append(token.text)
          pos_tags.append(token.words[0].upos)
          bio_tags.append('B-'+mention['aspect'])
          starts.append(token.start_char)
          stops.append(token.end_char)
          add = True

        elif token.start_char > int(mention['start']) and token.end_char <= int(mention['stop']):
          texts_ids.append(text_id)
          texts.append(token.text)
          pos_tags.append(token.words[0].upos)
          bio_tags.append('I-'+mention['aspect'])
          starts.append(token.start_char)
          stops.append(token.end_char)
          add = True

      if not add:
        texts_ids.append(text_id)
        texts.append(token.text)
        pos_tags.append(token.words[0].upos)
        bio_tags.append('O')
        starts.append(token.start_char)
        stops.append(token.end_char)

  bio_df = pd.DataFrame({'text_id': texts_ids,
                         'text': texts,
                         'pos_tag': pos_tags,
                         'bio_tag': bio_tags,
                         'start': starts,
                         'stop': stops
                         })
  return bio_df

Сразу возьмем все тестовые отзывы (не split), чтобы было больше данных, как тест будем использовать таргетные отзывы

#### Train

In [5]:
#загружаем данные
!wget https://raw.githubusercontent.com/named-entity/hse-nlp/master/4th_year/Project/train_aspects.txt
!wget https://raw.githubusercontent.com/named-entity/hse-nlp/master/4th_year/Project/train_reviews.txt

--2022-12-29 07:38:05--  https://raw.githubusercontent.com/named-entity/hse-nlp/master/4th_year/Project/train_aspects.txt
Resolving raw.githubusercontent.com (raw.githubusercontent.com)... 185.199.108.133, 185.199.109.133, 185.199.110.133, ...
Connecting to raw.githubusercontent.com (raw.githubusercontent.com)|185.199.108.133|:443... connected.
HTTP request sent, awaiting response... 200 OK
Length: 228391 (223K) [text/plain]
Saving to: ‘train_aspects.txt’


2022-12-29 07:38:05 (7.07 MB/s) - ‘train_aspects.txt’ saved [228391/228391]

--2022-12-29 07:38:05--  https://raw.githubusercontent.com/named-entity/hse-nlp/master/4th_year/Project/train_reviews.txt
Resolving raw.githubusercontent.com (raw.githubusercontent.com)... 185.199.109.133, 185.199.111.133, 185.199.110.133, ...
Connecting to raw.githubusercontent.com (raw.githubusercontent.com)|185.199.109.133|:443... connected.
HTTP request sent, awaiting response... 200 OK
Length: 446118 (436K) [text/plain]
Saving to: ‘train_reviews.txt’



In [6]:
reviews = {}
with open('train_reviews.txt') as f:
  for line in f:
    line = line.rstrip('\r\n').split('\t')
    reviews[line[0]] = line[1]

In [7]:
train_aspects = pd.read_csv('train_aspects.txt', delimiter = '\t', names = ['review_id', 'aspect', 'text', 'start', 'stop', 'sent'])

aspects = defaultdict(list)
with open('train_aspects.txt') as f:
  for line in f:
    line = line.rstrip('\r\n').split('\t')
    keys = ('aspect', 'text', 'start', 'stop', 'sent')
    # ['text_id', 'category', 'mention', 'start', 'end', 'sentiment']
    # тут можно отдельно запомнить начало и конец каждого упоминания
    aspects[line[0]].append(dict(zip(keys, line[1:])))

In [8]:
bio_df_train = bio_tagged_df(reviews=reviews,
                             aspects=aspects)

100%|██████████| 284/284 [06:44<00:00,  1.43s/it]


Наблюдается дисбаланс классов

In [9]:
bio_df_train['bio_tag'].value_counts()

O             40858
B-Food         1877
B-Service      1246
I-Food          959
B-Whole         795
B-Interior      686
I-Service       283
I-Interior      189
I-Whole         187
B-Price         134
I-Price          30
Name: bio_tag, dtype: int64

#### Тест

In [10]:
#загружаем данные
!wget https://raw.githubusercontent.com/named-entity/hse-nlp/master/4th_year/Project/dev_aspects.txt
!wget https://raw.githubusercontent.com/named-entity/hse-nlp/master/4th_year/Project/dev_reviews.txt

--2022-12-29 07:46:21--  https://raw.githubusercontent.com/named-entity/hse-nlp/master/4th_year/Project/dev_aspects.txt
Resolving raw.githubusercontent.com (raw.githubusercontent.com)... 185.199.111.133, 185.199.109.133, 185.199.110.133, ...
Connecting to raw.githubusercontent.com (raw.githubusercontent.com)|185.199.111.133|:443... connected.
HTTP request sent, awaiting response... 200 OK
Length: 57508 (56K) [text/plain]
Saving to: ‘dev_aspects.txt’


2022-12-29 07:46:22 (3.99 MB/s) - ‘dev_aspects.txt’ saved [57508/57508]

--2022-12-29 07:46:22--  https://raw.githubusercontent.com/named-entity/hse-nlp/master/4th_year/Project/dev_reviews.txt
Resolving raw.githubusercontent.com (raw.githubusercontent.com)... 185.199.110.133, 185.199.111.133, 185.199.108.133, ...
Connecting to raw.githubusercontent.com (raw.githubusercontent.com)|185.199.110.133|:443... connected.
HTTP request sent, awaiting response... 200 OK
Length: 110515 (108K) [text/plain]
Saving to: ‘dev_reviews.txt’


2022-12-29 07

In [11]:
reviews = {}
with open('dev_reviews.txt') as f:
  for line in f:
    line = line.rstrip('\r\n').split('\t')
    reviews[line[0]] = line[1]

test_aspects = pd.read_csv('dev_aspects.txt', delimiter = '\t', names = ['review_id', 'aspect', 'text', 'start', 'stop', 'sent'])

aspects = defaultdict(list)
with open('dev_aspects.txt') as f:
  for line in f:
    line = line.rstrip('\r\n').split('\t')
    keys = ('aspect', 'text', 'start', 'stop', 'sent')
    # ['text_id', 'category', 'mention', 'start', 'end', 'sentiment']
    # тут можно отдельно запомнить начало и конец каждого упоминания
    aspects[line[0]].append(dict(zip(keys, line[1:])))

In [12]:
bio_df_test = bio_tagged_df(reviews=reviews,
                            aspects=aspects)

100%|██████████| 71/71 [01:46<00:00,  1.49s/it]


### CRF

In [18]:
def word2features(sent, i):
    # достаёт фичи для i-го токена в предложении
    word = sent[i][0]
    postag = sent[i][1]
    
    features = {
        'word.lower()': word.lower(),
        'word[-3:]': word[-3:],
        'word[-2:]': word[-2:],
        'word.isupper()': word.isupper(),
        'word.istitle()': word.istitle(),
        'word.isdigit()': word.isdigit(),
        'postag': postag,
    }
    if i > 0:
        word1 = sent[i-1][0]
        postag1 = sent[i-1][1]
        features.update({
            '-1:word.lower()': word1.lower(),
            '-1:word.istitle()': word1.istitle(),
            '-1:word.isupper()': word1.isupper(),
            '-1:postag': postag1,
        })
    else:
        features['BOS'] = True
        
    if i < len(sent)-1:
        word1 = sent[i+1][0]
        postag1 = sent[i+1][1]
        features.update({
            '+1:word.lower()': word1.lower(),
            '+1:word.istitle()': word1.istitle(),
            '+1:word.isupper()': word1.isupper(),
            '+1:postag': postag1,
            '+1:postag[:2]': postag1[:2],
        })
    else:
        features['EOS'] = True
                
    return features


def sent2features(sent):
    # достаёт фичи для всех токенов в предложении
    return [word2features(sent, i) for i in range(len(sent))]

def sent2labels(sent):
    return [label for token, postag, label in sent]

def sent2tokens(sent):
    return [token for token, postag, label in sent]

In [19]:
sentences_train = defaultdict(list)
for elem in bio_df_train.itertuples():
  sentences_train[elem.text_id].append((elem.text, elem.pos_tag, elem.bio_tag))

sentences_train = list(sentences_train.values())

sentences_test = defaultdict(list)
for elem in bio_df_test.itertuples():
  sentences_test[elem.text_id].append((elem.text, elem.pos_tag, elem.bio_tag))

sentences_test = list(sentences_test.values())

In [20]:
X_train = [sent2features(s) for s in sentences_train]
y_train = [sent2labels(s) for s in sentences_train]
X_test = [sent2features(s) for s in sentences_test]
y_test = [sent2labels(s) for s in sentences_test]

In [21]:
crf = sklearn_crfsuite.CRF(
    algorithm='lbfgs', 
    c1=0.1, 
    c2=0.1, 
    max_iterations=100, 
    all_possible_transitions=True
)
crf.fit(X_train, y_train)

CRF(algorithm='lbfgs', all_possible_transitions=True, c1=0.1, c2=0.1,
    keep_tempfiles=None, max_iterations=100)

In [17]:
labels = list(set([elem for e in y_train for elem in e]))

y_pred = crf.predict(X_test)
metrics.flat_f1_score(y_test, y_pred, 
                      average='weighted', labels=labels)

0.988720485112953

In [22]:
sorted_labels = sorted(
    labels, 
    key=lambda name: (name[1:], name[0])
)

print(metrics.flat_classification_report(
    y_test, y_pred, labels=sorted_labels, digits=3
))

              precision    recall  f1-score   support

           O      0.994     0.995     0.994     10211
      B-Food      0.937     0.962     0.949       449
      I-Food      0.982     0.993     0.987       273
  B-Interior      0.976     0.938     0.957       176
  I-Interior      0.853     1.000     0.921        29
     B-Price      0.971     1.000     0.986        34
     I-Price      1.000     1.000     1.000        11
   B-Service      0.971     0.899     0.934       338
   I-Service      0.958     0.919     0.938        74
     B-Whole      0.926     0.941     0.933       185
     I-Whole      1.000     0.979     0.989        48

    accuracy                          0.989     11828
   macro avg      0.961     0.966     0.963     11828
weighted avg      0.989     0.989     0.989     11828





In [23]:
def raw_cat_from_bio(text):
  return text.replace('B-', '').replace('I-', '')

In [27]:
#перевод в обычный формат

predicted_tags = [elem for e in y_pred for elem in e]

aspect_spans = []
text_id = 0
cur_span = ''
start = 0
end = 0
tag_ = ''

for elem in bio_df_test.itertuples():
  tag = predicted_tags[elem.Index]
  if tag != 'O':
    
    # продолжение последовательности
    if tag.startswith('I-'):

      spaces = ' ' * (int(elem.start) - int(end))
      cur_span = cur_span + spaces + elem.text 
      end = elem.stop

    #обновление последовательности
    elif tag.startswith('B-'):

      if cur_span:
        aspect_spans.append((text_id, cur_span, start, end, tag_))

      cur_span = elem.text
      text_id = str(elem.text_id)
      start = str(elem.start)
      end = str(elem.stop)
      tag_ = tag.replace('B-', '')

aspect_spans.append((text_id, cur_span, start, end, tag_))

In [28]:
aspect_spans[:5]

[('13823', '"аппетит"', '7', 16, 'Whole'),
 ('13823', 'встретил', '138', '146', 'Service'),
 ('13823', 'менеджер', '147', '155', 'Service'),
 ('13823', 'девушка', '179', '186', 'Service'),
 ('13823', 'проводила к столу', '188', 205, 'Service')]

In [29]:
#запись результатов выделения аспектов в файл

with open('dev_aspects_spans_predicted.txt', 'w', encoding = 'utf-8') as f:
  for elem in aspect_spans:
    f.write('\t'.join([str(e) for e in elem]) + '\n')

In [30]:
# сохранение модели
import pickle

filename = 'aspect_extraction_model.pkl'
pickle.dump(crf, open(filename, 'wb'))