In [None]:
# Uncomment on Google Colab: the dataset path used further down lives under /content/drive.
#from google.colab import drive
#drive.mount('/content/drive')

In [2]:
import pandas as pd
import numpy as np
import json
import re
import spacy # NLP Preprocessor
from textblob import TextBlob
# Let pandas render up to 1000 characters per cell so long texts are not truncated.
pd.set_option('display.max_colwidth', 1000)
# Ask spaCy to run on a GPU when one is available.
spacy.prefer_gpu()

In [None]:
def get_data_from_json(path: str) -> pd.DataFrame:
    """Flatten a JSON Lines file of posts into one row per (post, comment) pair.

    Each non-blank line must be a JSON object with a ``"text"`` field and a
    ``"comments"`` list whose items carry ``"text"`` and ``"score"``.

    Args:
        path: Location of the ``.jsonl`` file.

    Returns:
        A DataFrame with columns ``text`` (post text, repeated for every
        comment of that post), ``comment`` and ``score``. Posts without
        comments contribute no rows.

    Raises:
        json.JSONDecodeError: If a non-blank line is not valid JSON.
        KeyError: If an expected field is missing.
    """
    rows = []
    # Decode explicitly as UTF-8 so the result does not depend on the
    # platform's default locale encoding.
    with open(path, "r", encoding="utf-8") as json_file:
        for line in json_file:
            # Tolerate blank lines (e.g. a trailing newline at end of file).
            if not line.strip():
                continue
            data = json.loads(line)
            for comment in data["comments"]:
                rows.append(
                    {
                        "text": data["text"],
                        "comment": comment["text"],
                        "score": comment["score"],
                    }
                )
    return pd.DataFrame(rows, columns=["text", "comment", "score"])



In [None]:
# Training data in JSON Lines format, stored on the mounted Google Drive.
dataset_path = '/content/drive/MyDrive/datasets/ranking_train.jsonl'
# One row per (post, comment) pair.
df = get_data_from_json(path=dataset_path)

In [None]:
# Show three random rows of the loaded frame.
df.sample(3)

### Функции для предобработки текста

In [5]:
# spaCy English pipeline; the cells shown here only read its default stop-word set.
nlp = spacy.load('en_core_web_sm')
stop_words = nlp.Defaults.stop_words

# Precompiled patterns: URL-like chunks, and runs of non-alphanumeric characters.
url_pattern = re.compile(r'(http|www)\S+')
special_char_pattern = re.compile('[^A-Za-z0-9]+')

# Memoization tables keyed by the input string.
processed_strings = dict()
text_sentiments = dict()

def remove_urls(text):
    """Return ``text`` with every match of ``url_pattern`` deleted."""
    return re.sub(url_pattern, '', text)

def remove_stops(text):
    """Drop whitespace-separated tokens whose lowercase form is in ``stop_words``.

    The surviving tokens keep their original casing and are re-joined with
    single spaces.
    """
    kept = []
    for word in text.split():
        if word.lower() in stop_words:
            continue
        kept.append(word)
    return " ".join(kept)

def remove_specials(text):
    """Replace non-alphanumeric runs with a space and drop words shorter than 3 characters."""
    # Every run matched by special_char_pattern collapses to a single space.
    spaced = special_char_pattern.sub(' ', text)
    # Keep only pieces longer than two characters (this also discards empty pieces).
    long_enough = filter(lambda piece: len(piece) > 2, spaced.split(' '))
    return " ".join(long_enough)

def extract_feats(text):
    """Return the whitespace-separated tokens of ``text`` tagged as nouns or verbs.

    Each token is tagged on its own with TextBlob and kept when the first tag
    is one of NN, NNS, VB, VBD, VBG, VBN, VBP or VBZ.

    Args:
        text: Input string.

    Returns:
        List of the matching tokens, in their original order.
    """
    wanted_tags = {'NN', 'NNS', 'VB', 'VBD', 'VBG', 'VBN', 'VBP', 'VBZ'}
    feats = []
    for token in text.split():
        tags = TextBlob(token).tags
        # TextBlob can return no tags for a token (e.g. pure punctuation);
        # indexing [0] unguarded would raise IndexError, so skip such tokens.
        if tags and tags[0][1] in wanted_tags:
            feats.append(token)
    return feats

def preprocess_text(text):
    """Clean ``text`` (URLs, then stop words, then special characters), memoizing the result.

    Results are stored in ``processed_strings`` keyed by the raw input, so a
    repeated string is cleaned only once.
    """
    try:
        return processed_strings[text]
    except KeyError:
        pass

    cleaned = remove_specials(remove_stops(remove_urls(text)))
    processed_strings[text] = cleaned
    return cleaned

def get_text_sentiments(text):
    """Return TextBlob's polarity for ``text`` rounded to 5 decimals, memoized in ``text_sentiments``."""
    if text not in text_sentiments:
        polarity = TextBlob(text).sentiment.polarity
        text_sentiments[text] = round(polarity, 5)
    return text_sentiments[text]

## Обработка текста

In [None]:
text_columns = ["text", "comment"]
# Clean both text columns element-wise. Series.map is used per column because
# DataFrame.applymap is deprecated in recent pandas releases; this form behaves
# the same on old and new versions.
df[text_columns] = df[text_columns].apply(lambda column: column.map(preprocess_text))
# NOTE(review): polarity is computed on the already-cleaned text. Stop-word removal
# presumably drops negations such as "not", which can flip the sentiment — confirm
# this is intended, or compute sentiments before cleaning.
df["post_sentiments"] = df["text"].apply(get_text_sentiments)
df["comment_sentiments"] = df["comment"].apply(get_text_sentiments)

In [None]:
# Show one random row of df, which now includes the sentiment columns.
df.sample(1)

In [None]:
import locale

# Force UTF-8 as the reported preferred encoding for the rest of the session.
# The replacement keeps the optional ``do_setlocale`` parameter of the real
# function so callers such as ``locale.getpreferredencoding(False)`` keep
# working instead of raising TypeError.
locale.getpreferredencoding = lambda do_setlocale=True: "UTF-8"

In [None]:
# Model inputs: the two cleaned text columns plus their sentiment scores.
text_features = ["text", "comment"]
num_features = ["post_sentiments", "comment_sentiments"]
X = df[text_features + num_features]
y = df["score"]

from sklearn.model_selection import train_test_split
# Fixed random_state so the 90/10 split, and every metric computed from it,
# is reproducible across kernel restarts.
# NOTE(review): df holds one row per comment, so comments of the same post can
# land in both splits — confirm this is acceptable for validation.
x_train, x_valid, y_train, y_valid = train_test_split(
    X, y, test_size=0.1, train_size=0.9, random_state=42
)

In [None]:
# Install CatBoost into the kernel's environment (the version is not pinned).
%pip install catboost

In [None]:
from catboost import CatBoostClassifier

In [None]:
# Gradient-boosted classifier trained on GPU; Accuracy is tracked as an extra metric.
model = CatBoostClassifier(
    iterations=50000,
    learning_rate=0.05,
    early_stopping_rounds=10,
    custom_loss=['Accuracy'],
    task_type="GPU",
    verbose=True,
)

In [None]:
# Pass the validation split as eval_set: the model is configured with
# early_stopping_rounds, which needs an evaluation set to monitor. Without
# one, training would run for the full iteration budget.
model.fit(
    x_train,
    y_train,
    text_features=text_features,
    eval_set=(x_valid, y_valid),
)

In [None]:
# Predicted labels for the validation split.
preds = model.predict(x_valid)

In [None]:
from sklearn.metrics import ndcg_score

In [None]:
# Display the validation targets.
y_valid

In [None]:
# Predicted probabilities for the validation rows (not used further in the visible cells).
y_valid_pred = model.predict_proba(x_valid)

In [None]:
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score, roc_auc_score
# Fraction of validation rows whose predicted label equals the true score.
accuracy_score(y_valid, preds)

In [None]:
# CatBoost models are persisted with save_model(); there is no save() method,
# so the previous call raised AttributeError. The default output format is "cbm".
model.save_model("model.cbm")