In [None]:
import pandas as pd

import gc
import torch    

from catboost import CatBoostClassifier
import pandas as pd
from sklearn.model_selection import train_test_split
from transformers.models.deprecated.graphormer.collating_graphormer import preprocess_item

In [None]:
# Do not truncate the column list when a DataFrame is displayed.
pd.set_option("display.max_columns", None)

# Load the three parquet inputs (in this order) with the pyarrow engine.
user_item, item_features, test = (
    pd.read_parquet(parquet_path, engine='pyarrow')
    for parquet_path in (
        '/home/pret/PycharmProjects/Vseros_classification/Datasets/train.parquet',
        '/home/pret/PycharmProjects/Vseros_classification/Datasets/video_stat.parquet',
        '/home/pret/PycharmProjects/Vseros_classification/Datasets/test.parquet',
    )
)

### Предобработка

In [None]:
def target_preprocessing(data):
    """Derive the binary target and the long-video flag for one interaction row.

    A video counts as long when its ``v_duration`` exceeds 300. A long video
    is a positive example when ``watchtime`` exceeds 25% of the duration; any
    other video is positive when ``watchtime`` exceeds 30.

    Args:
        data: Mapping-like row exposing ``'v_duration'`` and ``'watchtime'``.

    Returns:
        Tuple ``(res, long_video)`` of two ints, each either 0 or 1.
    """
    duration = data['v_duration']
    long_video = int(duration > 300)
    # Long videos use a relative watch threshold, all others a fixed one.
    watch_limit = duration * 0.25 if long_video else 30
    res = int(data['watchtime'] > watch_limit)
    return res, long_video

# Attach every video's duration to the interaction log.
duration = item_features[['v_duration', 'video_id']]
user_item = pd.merge(user_item, duration, on='video_id', how='left')

# Vectorized equivalent of the row-wise rule in target_preprocessing(data):
# a video is "long" when v_duration > 300; long videos are positive when
# watchtime > 25% of the duration, all other videos when watchtime > 30.
# Whole-column operations avoid calling a Python function once per row, which
# DataFrame.apply(axis=1) would do. Rows with a missing duration compare as
# False and therefore fall into the fixed-threshold branch, as before.
is_long = user_item['v_duration'] > 300
watch_threshold = (user_item['v_duration'] * 0.25).where(is_long, 30)

user_item['long_video'] = is_long.astype('int64')
# 'watchtime' is overwritten in place with the binary target (0/1).
user_item['watchtime'] = (user_item['watchtime'] > watch_threshold).astype('int64')
user_item = user_item.drop(columns=['v_duration'])

def target_preprocessing(time):
    """Return 1 when a duration exceeds 300, otherwise 0.

    NOTE(review): this definition reuses the name of the two-value
    ``target_preprocessing(data)`` helper defined earlier in the notebook and
    replaces it from this point on.

    Args:
        time: Video duration to classify.

    Returns:
        int: 1 for a long video (``time > 300``), else 0.
    """
    return int(time > 300)

# Attach durations to the test interactions, derive the long-video flag with
# the single-argument target_preprocessing, then drop the raw duration again.
duration = item_features[['v_duration', 'video_id']]

test = (
    test
    .merge(duration, on='video_id', how='left')
    .assign(long_video=lambda frame: frame['v_duration'].apply(target_preprocessing))
    .drop(columns=['v_duration'])
)


def time_preprocessing(column):
    """Min-max scale a datetime-like column to floats in the range [0, 1].

    The values are parsed with ``pd.to_datetime``; the earliest moment maps to
    0.0 and the latest to 1.0. The scaling is done directly on the datetime
    values (timedelta / timedelta), so no per-element Python call is needed.

    Args:
        column: pandas Series of datetime-like values.

    Returns:
        pandas Series of float64 aligned with the index of ``column``.
        Missing timestamps (NaT) become NaN. When all valid timestamps are
        identical, each of them maps to 0.0 rather than the NaN a 0/0
        division would produce. An empty or all-NaT input yields all NaN.
    """
    column = pd.to_datetime(column)
    start = column.min()
    span = column.max() - start

    if pd.isna(span):
        # Empty or all-NaT column: there is no range to scale against.
        return pd.Series(float('nan'), index=column.index, name=column.name, dtype='float64')

    if span == pd.Timedelta(0):
        # Single distinct timestamp: every offset is zero, so any non-zero
        # divisor gives the desired 0.0 and avoids dividing by zero.
        span = pd.Timedelta(seconds=1)

    return (column - start) / span

# Normalise the time column of every table and export the table to CSV
# (to_csv is called with its defaults, so the index is written as well).
# NOTE(review): each table is scaled by its own min/max, so the train and test
# 'event_timestamp' values do not share a common scale - confirm this is intended.
for frame, time_column, csv_path in (
    (user_item, 'event_timestamp',
     '/home/pret/PycharmProjects/Vseros_classification/Datasets/Interaction_table.csv'),
    (item_features, 'v_pub_datetime',
     '/home/pret/PycharmProjects/Vseros_classification/Datasets/Item_features.csv'),
    (test, 'event_timestamp',
     '/home/pret/PycharmProjects/Vseros_classification/Datasets/Interaction_table_test.csv'),
):
    frame[time_column] = time_preprocessing(frame[time_column])
    frame.to_csv(csv_path)

In [None]:
# Free memory: run the garbage collector first, then release PyTorch's cached CUDA memory.

gc.collect()
torch.cuda.empty_cache()

In [None]:
# Columns removed after the merge: the two suffixed 'Unnamed: 0' columns left
# over from the CSV round trip, plus 'row_number'.
leftover_columns = ['Unnamed: 0_x', 'Unnamed: 0_y', 'row_number']

train_dataset = pd.read_csv('/home/pret/PycharmProjects/Vseros_classification/Datasets/Interaction_table.csv')
# Item features without the free-text columns.
item_features = (
    pd.read_csv('/home/pret/PycharmProjects/Vseros_classification/Datasets/Item_features.csv')
    .drop(columns=['description', 'title'])
)
train_dataset = (
    train_dataset
    .merge(item_features, on='video_id', how='left')
    .drop(columns=leftover_columns)
)

# Same join for the test interactions.
test = (
    pd.read_csv('/home/pret/PycharmProjects/Vseros_classification/Datasets/Interaction_table_test.csv')
    .merge(item_features, on='video_id', how='left')
    .drop(columns=leftover_columns)
)
print('Merged successfully!')

### Обучение модели

In [None]:
# Columns passed to CatBoost as categorical features.
# Author's note: dropping features from this list will only make the result worse.
categorical_data = ['user_id', 'video_id', 'category_id', 'region', 'city', 'long_video', 'author_id']

model = CatBoostClassifier(
    # Optional settings, currently disabled:
    #   auto_class_weights='SqrtBalanced',
    #   l2_leaf_reg=int(2),
    learning_rate=0.8,  # knob to experiment with
    loss_function='Logloss',
    eval_metric='F1',
    random_seed=42,
    logging_level='Silent',
    iterations=5,  # knob to experiment with
    task_type="GPU",
    devices='0',
)

# Split features and target ('watchtime' is the target column).
# DataFrame.drop already returns a new frame, so no extra full-frame copy is
# needed for X_train. Only the target column is copied for y_train, instead of
# duplicating the whole training frame to pull one column out of it.
X_train = train_dataset.drop(columns=['watchtime'])
y_train = train_dataset['watchtime'].copy()

In [None]:
# Being careful with memory: remove the names that are not needed any more.
del train_dataset, item_features

In [None]:
# Fit on the training split; 'Verbose' is passed to fit() as the logging level.
fit_options = {
    'cat_features': categorical_data,
    'logging_level': 'Verbose',
}
model.fit(X_train, y_train, **fit_options)

# Persist the trained model to disk.
model_path = "/home/pret/PycharmProjects/Vseros_classification/Models/catboost_model_LOL.cbm"
model.save_model(model_path)

In [None]:
# Keep the probability column(s) from index 1 on and flatten them to 1-D.
class_probabilities = model.predict_proba(test)
predict = class_probabilities[:, 1:].flatten()
submission = pd.DataFrame({'target': predict})

def threshold(item, holding):
    """Binarise a score against a cut-off.

    Args:
        item: Score to classify.
        holding: Cut-off value; the comparison is strict.

    Returns:
        int: 1 when ``item > holding``, otherwise 0.
    """
    return int(item > holding)

# Probability cut-off above which a row is labelled 1.
hold = 0.3

# After this step `submission` is the thresholded 'target' Series (0/1 labels),
# which is then written without the index.
submission = submission['target'].apply(threshold, args=(hold,))
submission.to_csv('/home/pret/PycharmProjects/Vseros_classification/Submissions/HUITASUB.csv', index=False)

In [None]:
# Show the thresholded predictions as the cell output.
submission