# IMPORTS AND STUFF

In [None]:
!pip install stanza
!pip install sklearn_crfsuite
!pip install -U 'scikit-learn<0.24'

Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/
Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/
Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/


In [None]:
from collections import defaultdict, Counter
from tqdm.notebook import tqdm
import stanza
from sklearn.model_selection import train_test_split
import sklearn_crfsuite
from sklearn_crfsuite import scorers
from sklearn_crfsuite import metrics
from sklearn.model_selection import cross_val_score

stanza.download('ru')
nlp = stanza.Pipeline('ru', processors='tokenize,pos')

Downloading https://raw.githubusercontent.com/stanfordnlp/stanza-resources/main/resources_1.4.1.json:   0%|   …

INFO:stanza:Downloading default packages for language: ru (Russian) ...
INFO:stanza:File exists: /root/stanza_resources/ru/default.zip
INFO:stanza:Finished downloading models and saved to /root/stanza_resources.
INFO:stanza:Checking for updates to resources.json in case models have been updated.  Note: this behavior can be turned off with download_method=None or download_method=DownloadMethod.REUSE_RESOURCES


Downloading https://raw.githubusercontent.com/stanfordnlp/stanza-resources/main/resources_1.4.1.json:   0%|   …

INFO:stanza:Loading these models for language: ru (Russian):
| Processor | Package   |
-------------------------
| tokenize  | syntagrus |
| pos       | syntagrus |

INFO:stanza:Use device: gpu
INFO:stanza:Loading: tokenize
INFO:stanza:Loading: pos
INFO:stanza:Done loading processors!


# DATA PREPARATION

In [None]:
from google.colab import drive
drive.mount('/content/drive')

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


In [None]:
texts, ids = [], []
with open('drive/My Drive/nlp/train_reviews.txt') as f:
  for line in f:
    text_id, text = line.rstrip('\r\n').split('\t')
    texts.append(text)
    ids.append(text_id)

In [None]:
train_texts, dev_texts, train_ids, dev_ids = train_test_split(texts, ids)

In [None]:
train_aspects, dev_aspects = [], []
with open("drive/My Drive/nlp/train_aspects.txt") as f:
  for line in f:
    line = line.rstrip('\r\n')
    text_id = line.split('\t')[0]
    if text_id in train_ids:
      train_aspects.append(line)
    if text_id in dev_ids:
      dev_aspects.append(line)

In [None]:
train_sentiment, dev_sentiment = [], []
with open('drive/My Drive/nlp/train_cats.txt') as f:
  for line in f:
    line = line.rstrip('\r\n')
    text_id = line.split('\t')[0]
    if text_id in train_ids:
      train_sentiment.append(line)
    if text_id in dev_ids:
      dev_sentiment.append(line)

In [None]:
with open("train_split_aspects.txt", 'w') as f:
  for l in train_aspects:
    print(l, file=f)
with open('dev_aspects.txt', "w") as f:
  for l in dev_aspects:
    print(l, file=f)
with open('train_split_reviews.txt', "w") as f:
  for i, l in zip(train_ids, train_texts):
    print(i, l, sep="\t", file=f)
with open('dev_reviews.txt', "w") as f:
  for i, l in zip(dev_ids, dev_texts):
    print(i, l, sep="\t", file=f)
with open('train_split_cats.txt', "w") as f:
  for l in train_sentiment:
    print(l, file=f)
with open('dev_cats.txt', "w") as f:
  for l in dev_sentiment:
    print(l, file=f)

### reading files for further processing 

In [None]:
dev_aspects = []
with open("dev_aspects.txt") as f:
  for line in f:
    line = line.rstrip('\r\n')
    text_id = line.split('\t')[0]
    dev_aspects.append(line)

In [None]:
words_dict = {}
for aspect in dev_aspects:
  cat = aspect.split('\t')[1]
  word = aspect.split('\t')[2]
  words_dict[word] = cat

In [None]:
sents_dict = {}
for aspect in dev_aspects:
  cat = aspect.split('\t')[-1]
  word = aspect.split('\t')[2]
  sents_dict[word] = cat

In [None]:
dev_texts, dev_ids = [], []
with open('dev_reviews.txt') as f:
  for line in f:
    text_id, text = line.rstrip('\r\n').split('\t')
    dev_texts.append(text)
    dev_ids.append(text_id)

In [None]:
dev_sentiment = []
with open('dev_cats.txt') as f:
  for line in f:
    line = line.rstrip('\r\n')
    text_id = line.split('\t')[0]
    dev_sentiment.append(line)

# BIO AND SENTIMENT TAGGING

In [None]:
reviews_train, review_test = {}, {}

for i in range(len(train_texts)):
  reviews_train[train_ids[i]] = train_texts[i]

for i in range(len(dev_texts)):
  review_test[dev_ids[i]] = dev_texts[i]

In [None]:
aspects_train, aspects_test = defaultdict(list), defaultdict(list)

In [None]:
keys = ('category', 'mention', 'start', 'end', 'sentiment')

for t in train_aspects:
  text_id = t.split('\t')[0]
  line = t.split('\t')[1:]
  aspects_train[text_id].append(dict(zip(keys, line)))

for t in dev_aspects:
  text_id = t.split('\t')[0]
  line = t.split('\t')[1:]
  aspects_test[text_id].append(dict(zip(keys, line)))

In [None]:
bio_train_texts = {}

for text_id, text in tqdm(list(reviews_train.items())):
  text_bio = []
  processed = nlp(text)
  for token in processed.iter_tokens():
    add = False
    for mention in aspects_train[text_id]:
      if token.start_char == int(mention['start']):
        bio = 'B-' + mention['category'] + '-' + mention['sentiment']
        add = True
        break
      elif token.end_char == int(mention['end']):
        bio = 'E-' + mention['category'] + '-' + mention['sentiment']
        add = True
        break
      elif int(mention['start']) < token.start_char < int(mention['end']):
        bio = 'I-' + mention['category'] + '-' + mention['sentiment']
        add = True
        break

    if not add:
      bio = 'O'

    text_bio.append(bio)

  bio_train_texts[text_id] = text_bio

  0%|          | 0/213 [00:00<?, ?it/s]

In [None]:
bio_test_texts = {}

for text_id, text in tqdm(list(review_test.items())):
  text_bio = []
  processed = nlp(text)
  for token in processed.iter_tokens():
    add = False
    for mention in aspects_test[text_id]:
      if token.start_char == int(mention['start']):
        bio = 'B-' + mention['category'] + '-' + mention['sentiment']
        add = True
        break
      elif token.end_char == int(mention['end']):
        bio = 'E-' + mention['category'] + '-' + mention['sentiment']
        add = True
        break
      elif int(mention['start']) < token.start_char < int(mention['end']):
        bio = 'I-' + mention['category'] + '-' + mention['sentiment']
        add = True
        break

    if not add:
      bio = 'O'

    text_bio.append(bio)

  bio_test_texts[text_id] = text_bio

  0%|          | 0/71 [00:00<?, ?it/s]

In [None]:
def word2features(sent, i):
    word = sent[i].text
    postag = sent[i].upos
    
    features = {
        'word.lower()': word.lower(),
        'word[-3:]': word[-3:],
        'word[-2:]': word[-2:],
        'word.isupper()': word.isupper(),
        'word.istitle()': word.istitle(),
        'word.isdigit()': word.isdigit(),
        'postag': postag,
    }
    if i > 0:
        word1 = sent[i-1].text
        postag1 = sent[i-1].pos
        features.update({
            '-1:word.lower()': word1.lower(),
            '-1:word.istitle()': word1.istitle(),
            '-1:word.isupper()': word1.isupper(),
            '-1:postag': postag1,
        })
    else:
        features['BOS'] = True
        
    if i < len(sent)-1:
        word1 = sent[i+1].text
        postag1 = sent[i+1].pos
        features.update({
            '+1:word.lower()': word1.lower(),
            '+1:word.istitle()': word1.istitle(),
            '+1:word.isupper()': word1.isupper(),
            '+1:postag': postag1,
            '+1:postag[:2]': postag1[:2],
        })
    else:
        features['EOS'] = True
                
    return features


def text2features(text):
  prep_text = nlp(text)
  toks = []
  for sent in prep_text.sentences:
    for token in sent.words:
      toks.append(token)
  return [word2features(toks, i) for i in range(len(toks))]

def text2labels(text_ids, labels):
    return [labels[i] for i in text_ids]

def text2tokens(text):
    prep_text = nlp(text)
    toks = []
    for sent in prep_text.sentences:
      for token in sent.words:
        toks.append(token)
    return toks

In [None]:
X_train = [text2features(s) for s in train_texts]
y_train = text2labels(train_ids, bio_train_texts)

X_test = [text2features(s) for s in dev_texts]
y_test = text2labels(dev_ids, bio_test_texts)

# CRF

In [None]:
crf = sklearn_crfsuite.CRF(
    algorithm='lbfgs', 
    c1=0.1, 
    c2=0.1, 
    max_iterations=100, 
    all_possible_transitions=True
)
crf.fit(X_train, y_train)



CRF(algorithm='lbfgs', all_possible_transitions=True, c1=0.1, c2=0.1,
    keep_tempfiles=None, max_iterations=100)

In [None]:
labels = list(crf.classes_)
labels.remove('O')
y_pred = crf.predict(X_test)
metrics.flat_f1_score(y_test, y_pred, 
                      average='weighted', labels=labels)

  _warn_prf(


0.4154056385630595

In [None]:
sorted_labels = sorted(
    labels, 
    key=lambda name: (name[1:], name[0])
)
print(metrics.flat_classification_report(
    y_test, y_pred, labels=sorted_labels, digits=3
))

                     precision    recall  f1-score   support

        B-Food-both      0.000     0.000     0.000         4
        E-Food-both      0.000     0.000     0.000         1
        I-Food-both      0.000     0.000     0.000         0
    B-Food-negative      0.250     0.062     0.100        64
    E-Food-negative      0.000     0.000     0.000        14
    I-Food-negative      0.000     0.000     0.000         2
     B-Food-neutral      0.424     0.199     0.271       141
     E-Food-neutral      0.167     0.065     0.093        31
     I-Food-neutral      0.000     0.000     0.000        35
    B-Food-positive      0.564     0.475     0.516       297
    E-Food-positive      0.379     0.446     0.410        74
    I-Food-positive      0.347     0.461     0.395        76
    B-Interior-both      0.000     0.000     0.000         5
    E-Interior-both      0.000     0.000     0.000         2
B-Interior-negative      0.000     0.000     0.000        20
E-Interior-negative    

  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))


### For those words to which CRF has not assigned a category, but which occur in the training data and have a category there, we assign the same category and tone as in the training data. We write the results to a file




In [None]:
with open('dev_pred_aspects_sent_crf.txt', "w") as f:
  for i in tqdm(range(len(dev_texts))):
    preds = y_pred[i]
    dev_text = nlp(dev_texts[i])
    dev = []
    dev_ends = []
    dev_starts = []
    text = nlp(dev_text)
    sents = text.sentences
    for sent in sents:
      toks = sent.tokens
      for tok in toks:
        dev.append(tok.text)
        dev_ends.append(tok.end_char)
        dev_starts.append(tok.start_char)
    
    y_2 = 0
    for y in range(len(preds)):
      if y == y_2:
        y_2 = y + 1
        if preds[y] != 'O':
          if 'positive' in preds[y]:
            sent = 'positive'
            cat = preds[y][2:-9]

          elif 'negative' in preds[y]:
            sent = 'negative'
            cat = preds[y][2:-9]
          
          elif 'neutral' in preds[y]:
            sent = 'neutral'
            cat = preds[y][2:-8]
          
          elif 'both' in preds[y]:
            sent = 'both'
            cat = preds[y][2:-5]
          
          
          id_text = dev_ids[i]
          tokens_span = dev[y]
          ends_span = dev_ends[y]
          starts_span = dev_starts[y]
          for e in range(y+1, len(preds)):
            if 'positive' in preds[y]:
              cat_e = preds[e][2:-9]

            elif 'negative' in preds[y]:
              cat_e = preds[e][2:-9]
            
            elif 'neutral' in preds[y]:
              cat_e = preds[e][2:-8]
            
            elif 'both' in preds[y]:
              cat_e = preds[e][2:-5]

            if cat_e != cat:
              break
            else:
              token = ' ' + dev[e]
              ends_span = dev_ends[e]
              tokens_span += token
              y_2 = e + 1

          print(id_text, cat, tokens_span, starts_span, ends_span, sent, sep="\t", file=f)

        elif preds[y] == 'O':
          if dev[y] in sents_dict.keys():
            id_text = dev_ids[i]
            cat = words_dict[dev[y]]
            sent = sents_dict[dev[y]]
            tokens_span = dev[y]
            ends_span = dev_ends[y]
            starts_span = dev_starts[y]
            print(id_text, cat, tokens_span, starts_span, ends_span, sent, sep="\t", file=f)

  0%|          | 0/71 [00:00<?, ?it/s]

In [None]:
gold_aspect_cats = {}
with open('dev_aspects.txt') as fg:
    for line in fg:
        line = line.rstrip('\r\n').split('\t')
        if line[0] not in gold_aspect_cats:
            gold_aspect_cats[line[0]] = {"starts":[], "ends":[], "cats":[], "sents":[]}
        gold_aspect_cats[line[0]]["starts"].append(int(line[3]))
        gold_aspect_cats[line[0]]["ends"].append(int(line[4]))
        gold_aspect_cats[line[0]]["cats"].append(line[1])
        gold_aspect_cats[line[0]]["sents"].append(line[5])

# EVALUATION 1

In [None]:
full_match, partial_match, full_cat_match, partial_cat_match = 0, 0, 0, 0
total = 0
fully_matched_pairs = []
partially_matched_pairs = []
with open('dev_pred_aspects_sent_crf.txt') as fp:
    for line in fp:    
        total += 1
        line = line.rstrip('\r\n').split('\t')
        start, end = int(line[3]), int(line[4])
        category = line[1]
        doc_gold_aspect_cats = gold_aspect_cats[line[0]]
        if start in doc_gold_aspect_cats["starts"]:
            i = doc_gold_aspect_cats["starts"].index(start)
            if doc_gold_aspect_cats["ends"][i] == end:
                full_match += 1
                if doc_gold_aspect_cats["cats"][i] == category:
                    full_cat_match += 1
                else:
                    partial_cat_match += 1
                fully_matched_pairs.append(
                    (
                        [
                            doc_gold_aspect_cats["starts"][i], 
                            doc_gold_aspect_cats["ends"][i], 
                            doc_gold_aspect_cats["cats"][i],
                            doc_gold_aspect_cats["sents"][i]
                        ],
                        line
                    )
                )
                continue
        for s_pos in doc_gold_aspect_cats["starts"]:
            if start <= s_pos:
                i = doc_gold_aspect_cats["starts"].index(s_pos)
                if doc_gold_aspect_cats["ends"][i] == end:
                    partial_match += 1
                    partially_matched_pairs.append(
                        (
                            [
                                doc_gold_aspect_cats["starts"][i], 
                                doc_gold_aspect_cats["ends"][i], 
                                doc_gold_aspect_cats["cats"][i],
                                doc_gold_aspect_cats["sents"][i]
                            ],
                            line
                        )
                    )
                    if doc_gold_aspect_cats["cats"][i] == category:
                        partial_cat_match += 1
                    continue
                matched = False
                for e_pos in doc_gold_aspect_cats["ends"][i:]:
                    if s_pos <= end <= e_pos:
                        partial_match += 1
                        partially_matched_pairs.append(
                            (
                                [
                                    doc_gold_aspect_cats["starts"][i], 
                                    doc_gold_aspect_cats["ends"][i], 
                                    doc_gold_aspect_cats["cats"][i],
                                    doc_gold_aspect_cats["sents"][i]
                                ],
                                line
                            )
                        )
                        if doc_gold_aspect_cats["cats"][i] == category:
                            partial_cat_match += 1
                        matched = True
                        break
                if matched:
                    break
            if start > s_pos:
                i = doc_gold_aspect_cats["starts"].index(s_pos)
                if start < doc_gold_aspect_cats["ends"][i] <= end:
                    partial_match += 1
                    partially_matched_pairs.append(
                        (
                            [
                                doc_gold_aspect_cats["starts"][i], 
                                doc_gold_aspect_cats["ends"][i], 
                                doc_gold_aspect_cats["cats"][i],
                                doc_gold_aspect_cats["sents"][i]
                            ],
                            line
                        )
                    )
                    if doc_gold_aspect_cats["cats"][i] == category:
                        partial_cat_match += 1
                    break


In [None]:
gold_size = sum([len(gold_aspect_cats[x]["cats"]) for x in gold_aspect_cats])

In [None]:
print(f"""
Full match precision: {full_match / total}
Full match recall: {full_match / gold_size}
Partial match ratio in pred: {(full_match + partial_match)  / total}
Full category accuracy: {full_cat_match / total}
Partial category accuracy: {(full_cat_match + partial_cat_match) / total}
""")


Full match precision: 0.7187060478199718
Full match recall: 0.8241935483870968
Partial match ratio in pred: 0.8037974683544303
Full category accuracy: 0.7025316455696202
Partial category accuracy: 0.7862165963431786



# EVALUATION 2

In [None]:
def sentiment_accuracy(matches):
    matched_sentiment = 0.
    for pair in matches:
        *_, gold_s = pair[0]
        *_, pred_s = pair[1]
        if gold_s == pred_s:
            matched_sentiment += 1
    print(f"Mention sentiment accuracy: {matched_sentiment / len(matches)}")

In [None]:
sentiment_accuracy(fully_matched_pairs)

Mention sentiment accuracy: 0.7710371819960861


In [None]:
sentiment_accuracy(partially_matched_pairs)

Mention sentiment accuracy: 0.5289256198347108


# EVALUATION 3

In [None]:
def from_aspects_to_cats(path_from, path_to):
    aspects_d = {}
    with open (path_from, encoding='utf-8') as f:
        for line in f:
            line = line.split('\t')
            sent_id = line[0]
            cat = line[1]
            sent = line[-1].replace('\n', '')
            if sent_id not in aspects_d.keys():
                aspects_d[sent_id] = {}
            if cat not in aspects_d[sent_id]:
                aspects_d[sent_id][cat] = []
            aspects_d[sent_id][cat].append(sent)
    aspects_cat = dict.fromkeys(aspects_d.keys())
    cats = ['Food', 'Interior', 'Price', 'Whole', 'Service']
    for k in aspects_cat.keys():
        aspects_cat[k] = {}
        for c in cats:
            if c not in aspects_d[k].keys():
                aspects_cat[k][c] = 'absence'
            else:
                counter = Counter(aspects_d[k][c]).most_common()
                if len(counter)>1 and counter[0][0]==counter[1][0]:
                    aspects_cat[k][c] = 'both'
                else:
                    aspects_cat[k][c] = counter[0][0]
    with open(path_to, 'w', encoding='utf-8') as f:
        for k, v in aspects_cat.items():
            for c, s in v.items():
                print(k, c, s, sep='\t', file=f)

In [None]:
from_aspects_to_cats('dev_pred_aspects_sent_crf.txt', 'dev_pred_cats_sent.txt')

In [None]:
with open('dev_cats.txt') as gc, open('dev_pred_cats_sent.txt') as pc:
    gold_labels = set(gc.readlines())
    pred_labels = set(pc.readlines())
    print(
        "Overall sentiment accuracy:",
        len(gold_labels & pred_labels) / len(gold_labels)
    )

Overall sentiment accuracy: 0.6507042253521127
