In [1]:
#for collab

#from google.colab import drive
#drive.mount('/content/drive')

In [2]:
import pandas as pd
import numpy as np
import json
import re
import spacy # NLP Preprocessor
from textblob import TextBlob
pd.set_option('display.max_colwidth', 1000)

In [3]:
def get_data_from_json(path: str) -> pd.DataFrame:
    with open(path, "r") as json_file:
        json_lines = (line for line in json_file)
        rows = (
            {
                "text": data["text"],
                "comment": comment["text"],
                "score": comment["score"]
            }
            for line in json_lines
            for data in [json.loads(line)]
            for comment in data["comments"]
        )
        return pd.DataFrame(rows, columns=["text", "comment", "score"])



In [4]:
dataset_path = 'datasets/ranking_train.jsonl'
df = get_data_from_json(dataset_path)

In [5]:
#df.sample(3)

### Функци для предобработки текста

In [6]:
# Compile regex pattern for removing urls and special characters
url_pattern = re.compile(r'(http|www)\S+')
special_char_pattern = re.compile('[^A-Za-z0-9]+')

nlp = spacy.load('en_core_web_sm')
stop_words = nlp.Defaults.stop_words

# Cache processed strings and text sentiments
processed_strings = {}
text_sentiments = {}

def remove_urls(text):
    # Use compiled regex pattern to remove urls
    return url_pattern.sub('', text)

def remove_stops(text):
    # Split the text into tokens
    tokens = text.split()
    # Filter out stop words using the stop_words set
    wo_stops = [token for token in tokens if token.lower() not in stop_words]
    return " ".join(wo_stops)

def remove_specials(text):
    # Use compiled regex pattern to remove special characters
    clean_text = special_char_pattern.sub(' ', text)
    # Use list comprehension to filter out short words
    clean_text = " ".join(text for text in clean_text.split(' ') if len(text) > 2)
    return clean_text

def extract_feats(text):
    # Split the text into tokens
    tokens = text.split()
    # Filter out tokens that are not nouns or verbs
    feats = [token for token in tokens if TextBlob(token).tags[0][1] in ['NN', 'NNS', 'VB', 'VBD', 'VBG', 'VBN', 'VBP', 'VBZ']]
    return feats

def preprocess_text(text):
    if text in processed_strings:
        return processed_strings[text]

    wo_urls = remove_urls(text)
    wo_stops = remove_stops(wo_urls)
    clean_text = remove_specials(wo_stops)
    processed_strings[text] = clean_text
    return clean_text

def get_text_sentiments(text):
    if text in text_sentiments:
        return text_sentiments[text]
    sentiment = round(TextBlob(text).sentiment.polarity, 5)
    text_sentiments[text] = sentiment
    return sentiment

## Обработка текста

In [7]:
text_columns = ["text", "comment"]
df[text_columns] = df[text_columns].applymap(preprocess_text)
#df["post_sentiments"] = df["text"].apply(get_text_sentiments)
#df["comment_sentiments"] = df["comment"].apply(get_text_sentiments)

### Разделение данные и начало тренировки

In [8]:
df.sample(1)

Unnamed: 0,text,comment,score
357442,ridiculous Common Core test graders,test bad exposed math grade hate think life different degrees mathematics x27 heard stories grader niece crying trying complete daily homework x27 starting understand why,2


In [9]:
text_features = ["text", "comment"]
# Сентиментальность не оказала эффекта на скоре
#num_features = ["post_sentiments", "comment_sentiments"]
X = df[text_features]
y = df["score"]

from sklearn.model_selection import train_test_split
x_train, x_valid, y_train, y_valid = train_test_split(X, y, test_size=0.1, train_size=0.9, shuffle=False)

In [10]:
from catboost import CatBoostClassifier, Pool

In [11]:
model = CatBoostClassifier(task_type="GPU", 
                           verbose=True)

In [12]:
pool = Pool(x_train, y_train, text_features=text_features, feature_names=list(x_train.columns))

In [13]:
model.fit(x_train, y_train, text_features=text_features)

Learning rate set to 0.219949
0:	learn: 1.5586656	total: 306ms	remaining: 5m 5s
1:	learn: 1.5310953	total: 432ms	remaining: 3m 35s
2:	learn: 1.4950929	total: 643ms	remaining: 3m 33s
3:	learn: 1.4729976	total: 833ms	remaining: 3m 27s
4:	learn: 1.4603432	total: 966ms	remaining: 3m 12s
5:	learn: 1.4527315	total: 1.1s	remaining: 3m 2s
6:	learn: 1.4425627	total: 1.28s	remaining: 3m 1s
7:	learn: 1.4389195	total: 1.42s	remaining: 2m 55s
8:	learn: 1.4366518	total: 1.56s	remaining: 2m 51s
9:	learn: 1.4352297	total: 1.7s	remaining: 2m 48s
10:	learn: 1.4343452	total: 1.84s	remaining: 2m 45s
11:	learn: 1.4284775	total: 1.99s	remaining: 2m 43s
12:	learn: 1.4239555	total: 2.13s	remaining: 2m 42s
13:	learn: 1.4232328	total: 2.27s	remaining: 2m 40s
14:	learn: 1.4196859	total: 2.44s	remaining: 2m 40s
15:	learn: 1.4193417	total: 2.58s	remaining: 2m 38s
16:	learn: 1.4166549	total: 2.72s	remaining: 2m 37s
17:	learn: 1.4144716	total: 2.86s	remaining: 2m 36s
18:	learn: 1.4124785	total: 3.01s	remaining: 2m 3

: 

: 

In [None]:
preds = model.predict(x_valid)

In [None]:
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score, roc_auc_score
accuracy_score(y_valid, preds)

In [None]:
model.save("catboost_model.cbm")