# Imports

In [None]:
import typing as t

import pandas as pd
import pymorphy2
import natasha

# Read Our Parallel Corpus

In [None]:
# Load the parallel corpus. index_col=False tells pandas not to use the first
# CSV column as the index, so every column stays available as data.
df = pd.read_csv("data/parallel_dataset.csv", index_col=False)

In [None]:
# Bare expression: in a notebook cell this renders the loaded frame for inspection.
df

# Data Processing

## Extract Linguistic Characteristics from Each Text via Pymorphy and Natasha

In [None]:
# pymorphy2 analyzer; process_review uses it for per-word lemma / POS / tag lookup.
morph = pymorphy2.MorphAnalyzer()

# Natasha pipeline components, created once at module level and reused for every text.
segmenter = natasha.Segmenter()
# A single embedding instance is shared by the morphology tagger and the syntax parser.
emb = natasha.NewsEmbedding()
morph_tagger = natasha.NewsMorphTagger(emb)
syntax_parser = natasha.NewsSyntaxParser(emb)


def get_dependency_relations(text: str) -> t.Sequence[str]:
    """Return the dependency relation label of every Natasha token in *text*.

    The text is run through the module-level Natasha components in order
    (segmentation, morphology tagging, syntax parsing).  The result has one
    entry per token produced by the segmenter, in token order.
    """
    parsed = natasha.Doc(text)

    # Each stage annotates the document in place; the order matters.
    stages = (
        (parsed.segment, segmenter),
        (parsed.tag_morph, morph_tagger),
        (parsed.parse_syntax, syntax_parser),
    )
    for run_stage, component in stages:
        run_stage(component)

    return [token.rel for token in parsed.tokens]


def process_review(review: str) -> t.Sequence[t.Mapping[str, t.Optional[str]]]:
    """Annotate every whitespace-separated word of *review*.

    Each word gets its pymorphy2 analysis (first parse variant) and the
    dependency relation assigned by the Natasha syntax parser.

    Natasha tokenizes more finely than ``str.split`` (punctuation attached to
    a word becomes a token of its own), so tokens are matched to words by
    character offsets rather than by list position.  Pairing the two lists
    positionally would shift every relation after the first punctuation mark
    onto the wrong word.

    Args:
        review: Raw review text.

    Returns:
        One mapping per whitespace-separated word, in order, with the keys
        ``word``, ``lemma``, ``pos``, ``morph`` and ``dep``.  ``pos`` is
        ``None`` when pymorphy2 assigns no part of speech; ``dep`` is ``None``
        when no parser token overlaps the word.
    """
    words = review.split()
    if not words:
        # Nothing to annotate; skip the comparatively expensive parse.
        return []

    doc = natasha.Doc(review)
    doc.segment(segmenter)
    doc.tag_morph(morph_tagger)
    doc.parse_syntax(syntax_parser)
    tokens = doc.tokens

    processed_words = []
    cursor = 0     # offset in `review` from which the next word is searched
    token_idx = 0  # first parser token not yet consumed by an earlier word
    for word in words:
        # Words come from split(), so each one occurs at or after `cursor`.
        start = review.index(word, cursor)
        stop = start + len(word)
        cursor = stop

        # Drop tokens that end before this word begins.
        while token_idx < len(tokens) and tokens[token_idx].stop <= start:
            token_idx += 1

        # Among the tokens overlapping the word, prefer the first one that is
        # not punctuation ("punct" relation); fall back to the first token.
        dep = None
        while token_idx < len(tokens) and tokens[token_idx].start < stop:
            token = tokens[token_idx]
            if dep is None or (dep == "punct" and token.rel != "punct"):
                dep = token.rel
            if token.stop > stop:
                # Token extends past this word; keep it for the next one.
                break
            token_idx += 1

        p = morph.parse(word)[0]
        processed_words.append(
            {
                "word": word,
                "lemma": p.normal_form,
                "pos": p.tag.POS,
                "morph": str(p.tag),
                "dep": dep,
            }
        )
    return processed_words

## Convert to CSV

In [None]:
def convert_to_csv_format(
    processed_review: t.Iterable[t.Sequence[t.Mapping[str, t.Optional[str]]]],
    attrs: t.Sequence[str] = ("word", "lemma", "pos", "morph"),
) -> pd.DataFrame:
    """Flatten per-word annotations into one wide row per review.

    The resulting frame has a ``sentence`` column (the review's words joined
    by single spaces) followed by ``<attr><i>`` columns for every word
    position ``i`` (1-based) up to the longest review.  Positions a shorter
    review does not have are filled with ``None``.

    Args:
        processed_review: Collection of reviews, each a sequence of per-word
            mappings.  Any iterable works (list, pandas Series, generator);
            it is materialized once because it has to be traversed twice.
        attrs: Keys of the per-word mappings to export, in column order.
            The default keeps the original four columns; pass e.g.
            ``("word", "lemma", "pos", "morph", "dep")`` to export the
            dependency relation as well.

    Returns:
        A DataFrame with one row per review.  Empty input yields an empty
        frame that only has the ``sentence`` column.
    """
    reviews = list(processed_review)
    attrs = tuple(attrs)

    # default=0 keeps empty input from raising ValueError in max().
    max_length = max((len(review) for review in reviews), default=0)

    columns = ["sentence"]
    columns.extend(
        f"{attr}{i}" for i in range(1, max_length + 1) for attr in attrs
    )
    data = {column: [] for column in columns}

    for review in reviews:
        row = {"sentence": " ".join(word["word"] for word in review)}
        for i, word in enumerate(review, start=1):
            for attr in attrs:
                row[f"{attr}{i}"] = word[attr]
        # Pad positions this review does not reach with None.
        for column in columns:
            data[column].append(row.get(column))

    return pd.DataFrame(data, columns=columns)

## The Preprocessing Itself

In [None]:
# Annotate every review in both columns of the parallel corpus; each Series
# element becomes the list of per-word mappings returned by process_review.
preprocessed_gen_reviews = df["gen_review"].apply(process_review)
preprocessed_actual_reviews = df["actual_review"].apply(process_review)

# Flatten the per-word annotations into one wide row per review.
csv_ready_gen_reviews = convert_to_csv_format(preprocessed_gen_reviews)
csv_ready_actual_reviews = convert_to_csv_format(preprocessed_actual_reviews)

# Persist both tables; index=False leaves the DataFrame index out of the files.
csv_ready_gen_reviews.to_csv("data/gen_linguistic_char.csv", index=False)
csv_ready_actual_reviews.to_csv("data/actual_linguistic_char.csv", index=False)