In [28]:
! pip install stanza --quiet
! pip install sklearn_crfsuite --quiet

In [29]:
import pandas as pd
import copy
import stanza

In [30]:
# Download the default Russian stanza models, then build a pipeline limited to
# tokenization, POS tagging and lemmatization - the only annotations read later
# in this notebook (token text, lemma, pos).
stanza.download('ru')
nlp = stanza.Pipeline('ru', processors='tokenize,pos,lemma')

Downloading https://raw.githubusercontent.com/stanfordnlp/stanza-resources/main/resources_1.7.0.json:   0%|   …

INFO:stanza:Downloading default packages for language: ru (Russian) ...
INFO:stanza:File exists: /root/stanza_resources/ru/default.zip
INFO:stanza:Finished downloading models and saved to /root/stanza_resources.
INFO:stanza:Checking for updates to resources.json in case models have been updated.  Note: this behavior can be turned off with download_method=None or download_method=DownloadMethod.REUSE_RESOURCES


Downloading https://raw.githubusercontent.com/stanfordnlp/stanza-resources/main/resources_1.7.0.json:   0%|   …

INFO:stanza:Loading these models for language: ru (Russian):
| Processor | Package            |
----------------------------------
| tokenize  | syntagrus          |
| pos       | syntagrus_charlm   |
| lemma     | syntagrus_nocharlm |

INFO:stanza:Using device: cpu
INFO:stanza:Loading: tokenize
INFO:stanza:Loading: pos
INFO:stanza:Loading: lemma
INFO:stanza:Done loading processors!


# Загрузка данных

In [31]:
!wget https://raw.githubusercontent.com/named-entity/hse-nlp/master/4th_year/Project/dev_reviews.txt

--2023-12-28 21:49:59--  https://raw.githubusercontent.com/named-entity/hse-nlp/master/4th_year/Project/dev_reviews.txt
Resolving raw.githubusercontent.com (raw.githubusercontent.com)... 185.199.108.133, 185.199.109.133, 185.199.110.133, ...
Connecting to raw.githubusercontent.com (raw.githubusercontent.com)|185.199.108.133|:443... connected.
HTTP request sent, awaiting response... 200 OK
Length: 110515 (108K) [text/plain]
Saving to: ‘dev_reviews.txt.1’


2023-12-28 21:49:59 (3.93 MB/s) - ‘dev_reviews.txt.1’ saved [110515/110515]



Здесь нужно указать путь к файлу с тестовыми данными:

In [32]:
# Path to the tab-separated reviews file that predictions are generated for.
test_texts_path = 'dev_reviews.txt'

In [33]:
# Headerless TSV: one review per line, "<review_id>\t<text>".
# NOTE(review): default CSV quoting is active, so a text field that starts with
# a double quote would be unquoted on load - confirm the data has no such rows,
# since character offsets are later computed against the loaded text.
test_texts = pd.read_csv(
    test_texts_path,
    sep='\t',
    names=['review_id', 'text'],
)

In [34]:
# Tokenize every review and collect, per sentence, one row per token:
# [review_id, token text, start offset, end offset, lemma, POS tag].
dev_sentences = []
for row in test_texts.itertuples():  # same preprocessing as at training time
    doc = nlp(row.text)
    # Cursor into the raw review text. Moving it past each matched token makes
    # repeated words resolve to successive occurrences rather than the first one.
    current_position = 0
    for sentence in doc.sentences:
        sentence_rows = []
        for token in sentence.words:
            start = row.text.find(token.text, current_position)
            if start == -1:
                # str.find reports a miss with -1. Using that value as-is would
                # store a negative offset and pull the cursor back to near the
                # start of the review, corrupting every later offset. Fall back
                # to the cursor so offsets stay monotonic (approximate position).
                start = current_position
            end = start + len(token.text)
            current_position = end  # advance the cursor

            sentence_rows.append([row.review_id, token.text, start, end, token.lemma, token.pos])
        dev_sentences.append(sentence_rows)

In [35]:
def word2features(sent, i):
    """Build the feature dict for the token at index ``i`` of ``sent``.

    Each item of ``sent`` is an indexable token row whose element ``[1]`` is
    the surface form (a string) and whose last element is the POS tag.

    The dict always describes the token itself: lower-cased form, last three
    and last two characters, upper-case and digit flags, POS tag. It also holds
    the lower-cased form, upper-case flag and POS tag of the previous and next
    token under ``-1:`` / ``+1:`` keys; where such a neighbour does not exist,
    the flag ``BOS`` or ``EOS`` is set to ``True`` instead.
    """
    current = sent[i]
    token = current[1]

    features = {
        'word.lower()': token.lower(),
        'word[-3:]': token[-3:],
        'word[-2:]': token[-2:],
        'word.isupper()': token.isupper(),
        'word.isdigit()': token.isdigit(),
        'postag': current[-1],
    }

    # Left context, or the beginning-of-sentence marker.
    if i <= 0:
        features['BOS'] = True
    else:
        previous = sent[i - 1]
        features['-1:word.lower()'] = previous[1].lower()
        features['-1:word.isupper()'] = previous[1].isupper()
        features['-1:postag'] = previous[-1]

    # Right context, or the end-of-sentence marker.
    if i >= len(sent) - 1:
        features['EOS'] = True
    else:
        following = sent[i + 1]
        features['+1:word.lower()'] = following[1].lower()
        features['+1:word.isupper()'] = following[1].isupper()
        features['+1:postag'] = following[-1]

    return features


def sent2features(sent):
    """Return the feature dicts of all tokens in ``sent``, in token order."""
    features = []
    for position in range(len(sent)):
        features.append(word2features(sent, position))
    return features

def sent2labels(sent):
    """Return the second field (the label) of every row in ``sent``.

    Every row must unpack into exactly seven fields, otherwise ``ValueError``
    is raised. NOTE(review): the rows built for inference in this notebook
    have six fields, so this helper cannot be applied to them as-is.
    """
    labels = []
    for _, label, _, _, _, _, _ in sent:
        labels.append(label)
    return labels

In [36]:
# One list of token feature dicts per sentence - the input layout for the CRF.
X_test = list(map(sent2features, dev_sentences))

In [37]:
import pickle

In [38]:
! wget https://raw.githubusercontent.com/zadushevno/nlp-4-project/main/aspect_extraction/crf.pkl

--2023-12-28 21:51:35--  https://raw.githubusercontent.com/zadushevno/nlp-4-project/main/aspect_extraction/crf.pkl
Resolving raw.githubusercontent.com (raw.githubusercontent.com)... 185.199.108.133, 185.199.109.133, 185.199.110.133, ...
Connecting to raw.githubusercontent.com (raw.githubusercontent.com)|185.199.108.133|:443... connected.
HTTP request sent, awaiting response... 200 OK
Length: 1052200 (1.0M) [application/octet-stream]
Saving to: ‘crf.pkl.1’


2023-12-28 21:51:36 (14.5 MB/s) - ‘crf.pkl.1’ saved [1052200/1052200]



In [39]:
# NOTE(security): unpickling can execute arbitrary code - only load crf.pkl
# when it comes from a trusted source.
# The context manager closes the file handle as soon as the model is loaded.
with open('crf.pkl', 'rb') as model_file:
    crf = pickle.load(model_file)

In [40]:
# Predict tags with the loaded CRF model; the result is indexed below as
# y_pred[sentence][token], parallel to dev_sentences.
y_pred = crf.predict(X_test)

In [41]:
# Keep every token whose predicted tag is not 'O', as
# [review_id, tag, token text, start offset, end offset].
results = [
    [token_row[0], y_pred[sent_idx][tok_idx], token_row[1], token_row[2], token_row[3]]
    for sent_idx, sentence_rows in enumerate(dev_sentences)
    for tok_idx, token_row in enumerate(sentence_rows)
    if y_pred[sent_idx][tok_idx] != 'O'
]

In [42]:
def _merge_bio_spans(sentences, predictions):
    """Collapse token-level B-/I- tags into aspect spans, sentence by sentence.

    ``sentences`` is a list of sentences, each a list of token rows
    ``[review_id, text, start, end, ...]``; ``predictions`` holds one tag per
    token in the same layout.

    A ``B-`` token starts a span that absorbs the ``I-`` tokens immediately
    following it in the same sentence. Any other non-``O`` token (for example
    an ``I-`` without a preceding ``B-``) becomes a single-token span. The
    category is the tag without its two-character prefix.

    Returns a list of ``[review_id, category, text, start, end]`` rows, where
    ``text`` is the span's tokens joined by single spaces.
    """
    spans = []
    for sentence, tags in zip(sentences, predictions):
        pos = 0
        while pos < len(sentence):
            tag = tags[pos]
            if tag == 'O':
                pos += 1
                continue

            review_id, text, start, end = sentence[pos][:4]
            words = [text]
            nxt = pos + 1
            if tag.startswith('B-'):
                # Extend only over directly adjacent I- tokens of this sentence.
                while nxt < len(sentence) and tags[nxt].startswith('I-'):
                    words.append(sentence[nxt][1])
                    end = sentence[nxt][3]
                    nxt += 1
            spans.append([review_id, tag[2:], ' '.join(words), start, end])
            pos = nxt
    return spans


# Merge B- I-* sequences into one n-gram. The spans are decoded from the
# per-sentence predictions rather than from the flattened `results` list:
# that list has already dropped the 'O' tokens, so a B- token could wrongly
# absorb I- tokens that are separated from it by other words, or that belong
# to another sentence or even another review.
results_concatenated = _merge_bio_spans(dev_sentences, y_pred)


In [48]:
# Tabular form of the merged aspect spans; no column names are given, so
# columns are positional.
result_df = pd.DataFrame(results_concatenated)

In [50]:
# Write the predictions as a tab-separated file without header row or index column.
result_df.to_csv('pred_aspects.txt', sep='\t', index=False, header=False)