In [None]:
from google.colab import drive
drive.mount('/content/drive')

In [None]:
!pip install pymorphy3
!pip install catboost

In [None]:
import pandas as pd
from sklearn.metrics import roc_auc_score
import matplotlib.pyplot as plt
import numpy as np
from sklearn.model_selection import train_test_split
from catboost import Pool, CatBoostClassifier

from torch import nn
from torch.utils.data import Dataset, DataLoader, TensorDataset
import torch.nn.functional as F
import json
import torch

import nltk
import string
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer
from nltk.corpus import stopwords
from nltk.stem.snowball import SnowballStemmer
from sklearn.linear_model import LogisticRegression
from nltk.tokenize import RegexpTokenizer
import pymorphy3
from tqdm import tqdm


nltk.download('stopwords')
russian_stopwords = stopwords.words("russian")
np.random.seed(0)
torch.manual_seed(0)
torch.backends.cudnn.deterministic = True
torch.backends.cudnn.benchmark = False

In [None]:
from IPython.display import clear_output


def plot_progress(train_losses, val_loss, train_accs, val_accs, lrs):
    """Redraw the live training dashboard in the current cell.

    Clears the cell output and draws three side-by-side panels: loss curves,
    AUC curves and the learning-rate schedule.

    Args:
        train_losses: sequence of training-loss values.
        val_loss: sequence of validation-loss values.
        train_accs: sequence of training AUC values.
        val_accs: sequence of validation AUC values.
        lrs: sequence of learning-rate values.
    """
    clear_output(True)

    f, (ax1, ax2, ax3) = plt.subplots(nrows=1, ncols=3, figsize=(16, 6))

    ax1.plot(train_losses, label='train loss')
    # The second curve is the validation loss (see the parameter name), so label it as such.
    ax1.plot(val_loss, label='val loss')
    ax1.plot(np.zeros_like(train_losses), '--', label='zero')
    ax1.set_title('Loss', fontsize=14)
    ax1.set_ylabel('Loss')
    ax1.set_xlabel('Batch number')
    ax1.legend()

    ax2.plot(train_accs, label='Train Auc')
    ax2.plot(val_accs, label='Val Auc')
    # Dashed reference line at the constant 1, mirroring the 'zero' line on the loss panel.
    ax2.plot(np.ones_like(train_accs), '--', label='one')
    ax2.set_title('Auc', fontsize=14)
    ax2.set_ylabel('Auc')
    ax2.set_xlabel('Batch number')
    ax2.legend()

    ax3.plot(lrs, label='learning rate')
    ax3.set_title('Learning rate')
    ax3.set_xlabel('Batch number')
    ax3.legend()

    plt.show()

In [None]:
patterns = "".join(string.punctuation)
stopwords_ru = stopwords.words("russian")
morph = pymorphy3.MorphAnalyzer()


def lemmatize(doc):
    """Strip punctuation and stop words from ``doc``.

    Every character of ``string.punctuation`` is replaced by a space, the
    text is split on whitespace and tokens found in ``stopwords_ru`` are
    dropped. No morphological normalisation is applied (the pymorphy step is
    disabled), so tokens keep their original form and case.

    Returns:
        The remaining tokens joined by single spaces.
    """
    punctuation_to_space = str.maketrans({mark: ' ' for mark in string.punctuation})
    kept = [word for word in doc.translate(punctuation_to_space).split()
            if word not in stopwords_ru]
    return ' '.join(kept)

In [None]:
# Training-side tables; every CSV in the Drive data folder is ';'-separated.
data_dir = '/content/drive/MyDrive/data'
geo = pd.read_csv(f'{data_dir}/geo_info.csv', sep=';')
train = pd.read_csv(f'{data_dir}/train.csv', sep=';')
labels = pd.read_csv(f'{data_dir}/train_labels.csv', sep=';')
vectors = pd.read_csv(f'{data_dir}/referer_vectors.csv', sep=';')

In [None]:
# Test-side tables: the raw requests and the list of users to score.
test, test_users = (
    pd.read_csv(csv_path, sep=';')
    for csv_path in ('/content/drive/MyDrive/data/test.csv',
                     '/content/drive/MyDrive/data/test_users.csv')
)

In [None]:
train.info()

In [None]:
geo.info() #region_id - много пропусков

In [None]:
vectors.info()

# Merge all dataframes

In [None]:
# Remove exact duplicate rows (in place) before the tables are merged.
for frame in (train, vectors):
    frame.drop_duplicates(inplace=True)

In [None]:
# Attach geo info (left join), referer vectors (inner join) and labels (left join),
# then keep only the rows that actually have a target.
df = (
    train
    .merge(geo, how='left', on='geo_id')
    .merge(vectors, how='inner', on='referer')
    .merge(labels, how='left', on='user_id')
)
df = df[df.target.notna()]
df.head()

In [None]:
from collections import defaultdict


# Count rows per referer host: the host is the third '/'-separated piece of the referer.
domains = defaultdict(int)
for referer in df.referer:
    host = referer.split('/')[2]
    domains[host] += 1

# Hosts ordered by ascending frequency, and the number of hosts seen only once.
sorted_domains = dict(sorted(domains.items(), key=lambda pair: pair[1]))
cnt = sum(count < 2 for count in sorted_domains.values())
cnt

In [None]:
len(domains)

In [None]:
# Same ordering and singleton count as computed right after `domains` was built.
by_frequency = sorted(domains.items(), key=lambda pair: pair[1])
sorted_domains = dict(by_frequency)
cnt = len([count for count in sorted_domains.values() if count < 2])
cnt

# Baseline

Baseline ROC-AUC = 0.8972650049187818, obtained with all user_agent fields + the categorical features from geo + domain.

# Prepare df for train

Add the parsed user_agent fields and the referer domain as features.

In [None]:
# Working copy of the merged training frame, indexed by user_id.
df_new = df.copy()
df_new.index = df_new.user_id

# Split each referer on '/': piece [2] is the host, piece [3] the first path segment.
referer_parts = [referer.split('/') for referer in df_new.referer]
df_new['domain'] = [parts[2] for parts in referer_parts]
# A referer without a path segment has no piece [3]; use '' instead of raising IndexError.
second_part_domain = [parts[3] if len(parts) > 3 else '' for parts in referer_parts]
df_new['second_domain'] = second_part_domain

# These raw columns are not used as features from here on.
df_new.drop(columns=['request_ts', 'user_id', 'geo_id', 'referer'], inplace=True)

In [None]:
# Parse the stringified user_agent dict of every row into four flat columns.
ua_fields = ('browser', 'os', 'browser_version', 'os_version')
parsed_rows = []
for raw_agent in tqdm(df_new.user_agent):
    try:
        agent = json.loads(str(raw_agent).replace("'", '"'))
        # Read all four fields before storing anything, so a missing key cannot
        # leave the output columns with different lengths.
        row = tuple(agent[field] for field in ua_fields)
    except (ValueError, KeyError, TypeError):
        # Best-effort fallback kept from the original: reuse the previous row's
        # values. If the very first row is unparsable there is nothing to reuse.
        row = parsed_rows[-1] if parsed_rows else (None,) * len(ua_fields)
    parsed_rows.append(row)

browsers = [row[0] for row in parsed_rows]
os = [row[1] for row in parsed_rows]
versions = [row[2] for row in parsed_rows]
os_version = [row[3] for row in parsed_rows]

df_new['browser'] = browsers
df_new['os'] = os
df_new['browser_version'] = versions
df_new['os_version'] = os_version
df_new.drop(columns=['user_agent'], inplace=True)

In [None]:
df_new['os'].value_counts()

In [None]:
# os_to_common = {'Fedora': 'Linux', 'Ubuntu': 'Linux', 'Tizen': 'Phone', 'Chrome OS': 'Linux', 'Windows Phone': 'Phone', 'FreeBSD': 'Other', 'Chromecast': 'Other'}
# df_new['os'] = df_new['os'].apply(lambda x: os_to_common[x] if x in os_to_common else x)

In [None]:
# Most frequent region_id per timezone; a timezone whose rows have no region
# at all falls back to the most frequent region of the whole frame.
timezone_modes = {}
for tz in tqdm(list(df_new.timezone_id.unique())):
    tz_regions = df_new.loc[df_new.timezone_id == tz, 'region_id'].mode()
    if tz_regions.empty:
        timezone_modes[tz] = df_new.region_id.mode()[0]
    else:
        timezone_modes[tz] = tz_regions[0]

In [None]:
# Fill missing region_id with the most common region of the row's timezone.
# Columns are read by name rather than by position in `df_new.values`, so the
# result does not depend on column order or on which cells ran before this one.
regions = [
    timezone_modes[tz] if pd.isna(region) else region
    for region, tz in tqdm(zip(df_new['region_id'], df_new['timezone_id']), total=len(df_new))
]
df_new['region_id'] = regions

In [None]:
# Build the test feature frame with the same steps used for df_new.
test2 = test.copy()
test2 = pd.merge(test2, geo, how='left', on="geo_id")
test2 = pd.merge(test2, vectors, how='inner', on='referer')
test2['domain'] = [i.split('/')[2] for i in test2.referer]

# Keep one row per requested test user; copy so later column assignments
# write to an independent frame rather than a filtered view.
test2 = pd.merge(test_users, test2, how='inner', on='user_id')
test2 = test2[~test2.user_id.duplicated()].copy()
test2.index = test2.user_id

# Fill missing region_id from the timezone -> region map learned on train.
# Columns are addressed by name instead of positional indices into `.values`.
fallback_region = None  # test-set region mode, computed at most once and only if needed
regions = []
for region, tz in tqdm(zip(test2['region_id'], test2['timezone_id']), total=len(test2)):
    if not pd.isna(region):
        regions.append(region)
    elif tz in timezone_modes:
        regions.append(timezone_modes[tz])
    else:
        if fallback_region is None:
            fallback_region = test2.region_id.mode()[0]
        regions.append(fallback_region)
test2['region_id'] = regions

# First path segment of the referer; '' when the referer has no path segment.
second_part_domain = []
for referer in test2.referer:
    parts = referer.split('/')
    second_part_domain.append(parts[3] if len(parts) > 3 else '')
test2['second_domain'] = second_part_domain
test2.drop(columns=['request_ts', 'user_id', 'geo_id', 'referer'], inplace=True)

# Parse the stringified user_agent dict into four flat columns.
ua_fields = ('browser', 'os', 'browser_version', 'os_version')
parsed_rows = []
for raw_agent in tqdm(test2.user_agent):
    try:
        agent = json.loads(str(raw_agent).replace("'", '"'))
        # Read all four fields before storing anything so the columns stay aligned.
        row = tuple(agent[field] for field in ua_fields)
    except (ValueError, KeyError, TypeError):
        # Best-effort fallback kept from the original: reuse the previous row.
        row = parsed_rows[-1] if parsed_rows else (None,) * len(ua_fields)
    parsed_rows.append(row)

browsers = [row[0] for row in parsed_rows]
os = [row[1] for row in parsed_rows]
versions = [row[2] for row in parsed_rows]
os_version = [row[3] for row in parsed_rows]

test2['browser'] = browsers
test2['os'] = os
test2['browser_version'] = versions
test2['os_version'] = os_version

test2.drop(columns=['user_agent'], inplace=True)

test2.head()

In [None]:
# For each categorical column, print how many distinct test values never occur in train.
compared_columns = ['browser', 'os_version', 'browser_version', 'domain',
                    'second_domain', 'country_id', 'timezone_id', 'region_id']
for column in compared_columns:
    unseen_values = set(test2[column]).difference(set(df_new[column]))
    print(len(unseen_values), column)

In [None]:
#le = LabelEncoder()
#for i in ['domain']:
#    all_elems = list(set(test2[i]) | set(df_new[i]))
#    le.fit(all_elems)
#    df_new[i] = le.transform(df_new[i])
#    test2[i] = le.transform(test2[i])

In [None]:
# Replacement for label-encoding `domain`: domains are matched by the Euclidean
# distance between their vectors from the `vectors` dataset.
def euclidean_distance(vec1, vec2):
    """Return the Euclidean (L2) distance between two equal-length vectors."""
    difference = np.asarray(vec1) - np.asarray(vec2)
    return np.linalg.norm(difference)


vectors['domain'] = [i.split('/')[2] for i in vectors.referer]
component_cols = [f'component{i}' for i in range(10)]

# Integer-truncated mean component vector for every domain seen in training.
domains = {}
for elem in tqdm(df_new.domain.unique()):
    domains[elem] = vectors.loc[vectors.domain == elem, component_cols].mean().astype(int).values.tolist()

# Only test domains that never occur in training need a substitute. A domain
# that already exists in training must keep its own name: searching for its
# nearest neighbour could return a different train domain with an identical vector.
domains_test = {}
for elem in tqdm([d for d in test2.domain.unique() if d not in domains]):
    domains_test[elem] = vectors.loc[vectors.domain == elem, component_cols].mean().astype(int).values.tolist()

# Nearest training domain (first one found on ties) for every unseen test domain.
res_test_domains = {}
for key in tqdm(domains_test.keys()):
    closest_object = None
    min_distance = float('inf')
    for name, vector in domains.items():
        distance = euclidean_distance(domains_test[key], vector)
        if distance < min_distance:
            min_distance = distance
            closest_object = name
    res_test_domains[key] = closest_object

# Apply all replacements in one pass so one mapping cannot feed into another.
test2['domain'] = test2['domain'].map(lambda d: res_test_domains.get(d, d))
set(test2.domain) - set(df_new.domain)

In [None]:
# 'HbbTV' is recoded as 'Chrome'; the set below shows which test browsers are still unseen in train.
test2['browser'] = test2['browser'].mask(test2['browser'] == 'HbbTV', 'Chrome')
set(test2.browser) - set(df_new.browser)

In [None]:
# Country ids present in test but never seen in train are derived from the data
# instead of being hard-coded; the affected rows get the most frequent test country.
unseen_countries = {c for c in set(test2.country_id) - set(df_new.country_id) if pd.notna(c)}
unseen_country_rows = test2.country_id.isin(unseen_countries)
print(unseen_countries, int(unseen_country_rows.sum()), 'rows')
test2.loc[unseen_country_rows, 'country_id'] = test2.country_id.mode()[0]
set(test2.country_id) - set(df_new.country_id)

In [None]:
# Timezone ids present in test but never seen in train are derived from the data
# instead of being hard-coded; the affected rows get the most frequent test timezone.
unseen_timezones = {t for t in set(test2.timezone_id) - set(df_new.timezone_id) if pd.notna(t)}
unseen_timezone_rows = test2.timezone_id.isin(unseen_timezones)
print(unseen_timezones, int(unseen_timezone_rows.sum()), 'rows')
test2.loc[unseen_timezone_rows, 'timezone_id'] = test2.timezone_id.mode()[0]
set(test2.timezone_id) - set(df_new.timezone_id)

In [None]:
# Region ids present in test but never seen in train are derived from the data
# instead of being hard-coded; the affected rows get the most frequent test region.
unseen_regions = {r for r in set(test2.region_id) - set(df_new.region_id) if pd.notna(r)}
unseen_region_rows = test2.region_id.isin(unseen_regions)
print(unseen_regions, int(unseen_region_rows.sum()), 'rows')
test2.loc[unseen_region_rows, 'region_id'] = test2.region_id.mode()[0]
set(test2.region_id) - set(df_new.region_id)

In [None]:
# Differences between test and train (count of unseen values per column)
# => browser, country, timezone and region can be kept as features.
'''1 browser
6 os_version
75 browser_version
228 domain
13903 second_domain
2 country_id
2 timezone_id
3 region_id'''

In [None]:
# Categorical model inputs, plus the ten referer-vector components as numeric inputs.
category = ['region_id', 'timezone_id', 'country_id', 'os', 'browser', 'domain']
columns = [
    'country_id', 'region_id', 'domain', 'browser', 'os', 'timezone_id',
    *(f'component{i}' for i in range(10)),
]
X = df_new[columns]
y = df_new['target']
# Stratified 80/20 hold-out split.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=42, stratify=y
)

In [None]:
X_train['os'].value_counts()

# Catboost Model

In [None]:
# Wrap both splits in CatBoost Pools, declaring the same categorical columns for each.
train_dataset, eval_dataset = (
    Pool(data=features, label=target, cat_features=category)
    for features, target in ((X_train, y_train), (X_test, y_test))
)

catboost_params = dict(
    iterations=700,
    learning_rate=0.01,
    random_seed=42,
    max_depth=6,
    # loss_function='CrossEntropy',
    custom_metric=['AUC'],
)
model = CatBoostClassifier(**catboost_params)

# Evaluate on the hold-out pool and keep the best iteration.
model.fit(
    train_dataset,
    eval_set=eval_dataset,
    use_best_model=True,
    early_stopping_rounds=30,
    verbose=True,
)

In [None]:
preds_proba = model.predict_proba(eval_dataset)[:, 1]
roc_auc_score(y_test, preds_proba)

In [None]:
importances = model.get_feature_importance(type='PredictionValuesChange')
feature_importances = pd.Series(importances, index=X.columns).sort_values()
feature_importances

component0      0.800107
timezone_id     0.849820
component9      1.025836
component4      1.099582
component7      1.105323
component6      1.968028
region_id       2.149833
component5      2.486600
country_id      2.971283
component3      3.159298
component1      3.643665
component2      4.152498
component8      5.700547
os              7.060173
browser        16.055573
domain         45.771834

# 0.88

In [None]:
train_X = X_train.copy()
train_X.columns

In [None]:
# Cast the string-valued feature columns to pandas 'category' dtype.
# second_domain, browser_version and os_version are not model features, so they are not cast.
for column in ['country_id', 'region_id', 'timezone_id', 'domain', 'browser', 'os']:
    train_X[column] = train_X[column].astype('category')
train_X.info()

In [None]:
y_train.info()

In [None]:
from xgboost import XGBClassifier

# Define the model
# XGBoost counterpart of the CatBoost baseline, using the same core hyper-parameters.
xgb_params = {
    'n_estimators': 700,          # counterpart of CatBoost `iterations`
    'learning_rate': 0.01,
    'random_state': 42,           # counterpart of CatBoost `random_seed`
    'max_depth': 6,
    'eval_metric': 'auc',
    'use_label_encoder': False,   # avoids warnings about label encoding
    'enable_categorical': True,
}
model = XGBClassifier(**xgb_params)

# No eval_set is passed here (it is commented out in the original cell).
model.fit(train_X, y_train, verbose=True)

In [None]:
test_X = X_test.copy()
# Reuse the categorical dtypes of train_X instead of a fresh astype('category').
# A fresh cast derives categories from the hold-out alone, so the same string
# can get a different integer code than in training, and XGBoost with
# enable_categorical reads those codes. Values unseen in training become NaN.
for column in ['country_id', 'region_id', 'timezone_id', 'domain', 'browser', 'os']:
    test_X[column] = test_X[column].astype(train_X[column].dtype)
test_X.info()

In [None]:
preds_proba = model.predict_proba(test_X)[:, 1]
roc_auc_score(y_test, preds_proba)

In [None]:
import numpy as np
from xgboost import XGBClassifier
from sklearn.model_selection import GridSearchCV

# Define the model
# Base estimator; the values overlapping with the grid are overridden per candidate.
base_params = {
    'n_estimators': 700,
    'learning_rate': 0.01,
    'random_state': 42,
    'max_depth': 6,
    'eval_metric': 'auc',
    'use_label_encoder': False,
    'enable_categorical': True,
}
model = XGBClassifier(**base_params)

# Search space: 2 * 2 * 3 * 3 * 3 * 3 = 324 candidates.
param_grid = dict(
    n_estimators=[700, 900],
    learning_rate=[0.01, 0.05],
    max_depth=[4, 6, 8],
    min_child_weight=[1, 3, 5],        # regularization parameter
    subsample=[0.6, 0.8, 1.0],         # proportion of samples used per tree
    colsample_bytree=[0.6, 0.8, 1.0],  # proportion of features used per tree
)

# 3-fold cross-validated search scored by ROC-AUC, using all available cores.
grid_search = GridSearchCV(
    estimator=model,
    param_grid=param_grid,
    scoring='roc_auc',
    cv=3,
    verbose=1,
    n_jobs=-1,
)
grid_search.fit(train_X, y_train)

best_params, best_score = grid_search.best_params_, grid_search.best_score_

print("Best Parameters:", best_params)
print("Best AUC Score:", best_score)


# Stacking

In [None]:
columns = ['country_id', 'region_id', 'domain', 'browser', 'os', 'timezone_id'] + [f'component{i}' for i in range(10)]
category = ['region_id', 'timezone_id', 'country_id', 'os', 'browser', 'domain']
# astype() returns a new frame, so the categorical cast no longer assigns into a
# slice of df_new (which raised SettingWithCopyWarning). Casting before the split
# keeps the category sets identical in the train and hold-out parts.
X = df_new[columns].astype({cat: 'category' for cat in category})
y = df_new['target']
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=42, stratify=y)

In [None]:
import xgboost as xgb
import lightgbm as lgbm


# Base learners for the stacking ensemble.
# XGBoost fixes: `min_chile_weight` was a typo for `min_child_weight`, and the
# scikit-learn wrapper takes its seed as `random_state`, not `random_seed`;
# both misspelt arguments were silently ignored by the model.
xgb_model = xgb.XGBClassifier(
    n_estimators=300,
    enable_categorical=True,
    random_state=42,
    verbosity=1,
    learning_rate=0.25,
    max_depth=6,
    min_child_weight=2,
)
lgbm_model = lgbm.LGBMClassifier(n_estimators = 300, random_seed=42, categorical_feature=category, verbose=1, learning_rate = 0.25, num_leaves = 63)
cat_model = CatBoostClassifier(cat_features=category, iterations = 700, depth = 6, l2_leaf_reg = 5, verbose=True, random_seed=42)

In [None]:
# список базовых моделей
from sklearn.ensemble import ExtraTreesClassifier, RandomForestClassifier, StackingClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.neural_network import MLPClassifier
from sklearn.svm import LinearSVC
import xgboost as xgb


# Base models of the stacking ensemble as (name, estimator) pairs.
estimators = [

    ("XGBoost", xgb_model),
    ("LightGBM", lgbm_model),
    ("CatBoost", cat_model),

    # Models that gave no improvement in the ensemble
    # ("SVM", make_pipeline(preprocessor, LinearSVC(verbose=False))),
    # ("MLP", make_pipeline(preprocessor, MLPClassifier(verbose=False, hidden_layer_sizes=(100, 30, ), alpha=0.001,random_state=75, max_iter = 1300, ))),

]

# Earlier variant, kept for reference: LogisticRegression as the meta-model
# (the active meta-model below is an XGBClassifier).
# meta_model = StackingClassifier(
#     estimators=estimators,
#     final_estimator=LogisticRegression(random_state=42, verbose=True),
#     # final_estimator=RandomForestClassifier(n_estimators = 10_000,
#                                            # max_depth = 5,
#                                            # verbose=False),
#     n_jobs=-1,
#     verbose=True,
# )

from sklearn.neural_network import MLPClassifier

# Active ensemble: the three base models above feed an XGBClassifier final estimator.
meta_model = StackingClassifier(
    estimators=estimators,
    final_estimator=xgb.XGBClassifier(use_label_encoder=False, eval_metric='logloss', random_state=42),
    n_jobs=-1,
    verbose=True,
)

# `stacking_classifier` is the name used by the fit / predict cells.
stacking_classifier = meta_model
stacking_classifier

In [None]:
stacking_classifier.fit(X_train, y_train)

In [None]:
# corr_df = pd.DataFrame()

# for model, (name, _) in zip(stacking_classifier.estimators_, stacking_classifier.estimators):
#     preprocessed = stacking_classifier.estimators[0][1].steps[0][1].fit(X_train, y_train).transform(X_test)
#     print(name, 'roc-auc: ', round(roc_auc_score(y_test, model.predict_proba(X_test)[:, 1]), 4))

#     corr_df[name] = model.predict(X_test)


In [None]:
print('ensemble score:', round(roc_auc_score(y_test, stacking_classifier.predict_proba(X_test)[:, 1]), 4))

# Make test prediction

In [None]:
test_users

In [None]:
test2.head()

In [None]:
test2.info()

In [None]:
# Model features: reuse the category order of the training frame X, so every
# value seen in training keeps the integer code the models were fitted on.
# A plain astype('category') would derive the codes from the test data alone.
# Values that occur only in test are appended after the training categories.
for col in category:
    train_categories = list(X[col].cat.categories)
    seen_in_train = set(train_categories)
    test_only = [v for v in test2[col].dropna().unique() if v not in seen_in_train]
    test2[col] = pd.Categorical(test2[col], categories=train_categories + test_only)

# Columns that are not model inputs: a plain categorical cast is enough.
# This also fixes the 'browser_vesion' typo, which created a stray column
# instead of converting browser_version.
for col in ['second_domain', 'browser_version', 'os_version']:
    test2[col] = test2[col].astype('category')
test2.info()

In [None]:
test_preds = stacking_classifier.predict_proba(test2[columns])[:, 1]

In [None]:
test_preds.shape

In [None]:
test_users.shape

In [None]:
# test2 is indexed by user_id and can hold fewer users than test_users, because
# it was built with inner merges. Attach predictions by user_id instead of by
# position, so a dropped user cannot cause a length error or a silent shift.
preds_by_user = pd.Series(test_preds, index=test2.index)
test_users['target'] = test_users['user_id'].map(preds_by_user)
n_missing = int(test_users['target'].isna().sum())
if n_missing:
    print(f'{n_missing} test users have no prediction (dropped by the inner merges)')

In [None]:
test_users

In [None]:
test_users.to_csv('baseline_fix_cat_features.csv', index=False, sep=';')