In [None]:
import pandas as pd
import numpy as np
from sklearn.decomposition import TruncatedSVD
import re
from nltk.corpus import stopwords
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.compose import ColumnTransformer
from sklearn.preprocessing import StandardScaler, OneHotEncoder
from sklearn.linear_model import Ridge
from sklearn.model_selection import train_test_split
from sklearn.metrics import r2_score

In [None]:
# Read the training and test tables from CSV files in the working directory.
train, test = (pd.read_csv(csv_path) for csv_path in ('train.csv', 'test.csv'))

# Preview the first rows of the training frame.
train.head()

In [None]:
def build_text(frame):
    """Join the text columns of ``frame`` into one space-separated string per row.

    Column order: title, company, location, description, skills. Every pair of
    neighbouring columns is separated by a single space so that the last token
    of one column never fuses with the first token of the next.
    """
    return (
        frame['title'] + ' ' + frame['company'] + ' ' + frame['location']
        + ' ' + frame['description'] + ' ' + frame['skills']
    )


# Impute missing experience with the TRAIN median in both frames, so the test
# rows receive exactly the fill value the models were fitted with.
experience_median = train['experience_from'].median()
train.fillna({'experience_from': experience_median}, inplace=True)
test.fillna({'experience_from': experience_median}, inplace=True)

# Every remaining NaN becomes an empty string so the string concatenation
# below cannot propagate NaN into 'text'.
train.fillna('', inplace=True)
test.fillna('', inplace=True)

# Keep only the first space-separated token of the location string.
train['location'] = train['location'].str.split(' ').str[0]
test['location'] = test['location'].str.split(' ').str[0]

train['text'] = build_text(train)
test['text'] = build_text(test)

In [None]:
def clean_text(text):
    """Normalise whitespace in ``text`` and return the stripped result.

    Applied in order: runs of two or more newlines collapse to one newline,
    runs of tabs become a single space, runs of spaces become a single space.
    Leading and trailing whitespace is removed last.
    """
    substitutions = (
        (r'\n\n+', '\n'),
        (r'\t+', ' '),
        (r' +', ' '),
    )
    cleaned = text
    for pattern, replacement in substitutions:
        cleaned = re.sub(pattern, replacement, cleaned)
    return cleaned.strip()

# Normalise whitespace of the combined text column in both frames.
for frame in (train, test):
    frame['text'] = frame['text'].apply(clean_text)

In [None]:
# Regression target: the log-salary column.
y = train['log_salary_from']

# Features: everything except the two salary columns and the raw text columns
# that were concatenated into 'text' earlier.
X = train.drop(columns=['log_salary_from', 'salary_from', 'title', 'description', 'skills'])

In [None]:
# Bucket the target into quartiles (labels 0..3) and stratify the split on
# them so train and validation share the same target distribution.
# Only the interior quartile edges are used: including the 0th/100th
# percentiles would put the maximum target value into a bucket of its own.
y_quantiles = np.percentile(y, [25, 50, 75])
bins = np.digitize(y, bins=y_quantiles)

X_train, X_val, y_train, y_val = train_test_split(
    X, y, test_size=0.2, random_state=42, stratify=bins
)

In [None]:
import random
import numpy as np
import torch
from transformers import AutoTokenizer
from datasets import Dataset

def set_seed(seed: int = 42):
    """Seed Python's ``random``, NumPy and PyTorch (CPU and all CUDA devices).

    Also switches cuDNN to deterministic mode and turns off its benchmark
    auto-tuner.
    """
    seeders = (
        random.seed,
        np.random.seed,
        torch.manual_seed,
        torch.cuda.manual_seed_all,
    )
    for seed_fn in seeders:
        seed_fn(seed)

    cudnn = torch.backends.cudnn
    cudnn.deterministic = True
    cudnn.benchmark = False

# Hub identifier of the pretrained checkpoint used for both tokenizer and encoder.
MODEL_NAME = "ai-forever/ruBert-large"

# Fix all random number generators before any model work starts.
set_seed(42)

tokenizer = AutoTokenizer.from_pretrained(MODEL_NAME)

def tokenize_function(examples):
    """Tokenize the ``"text"`` field of a batch with the module-level ``tokenizer``.

    Every sequence is truncated and padded to exactly 512 tokens.
    """
    tokenizer_options = {"padding": "max_length", "truncation": True, "max_length": 512}
    return tokenizer(examples["text"], **tokenizer_options)

# Wrap each split's text and target into a `Dataset`, then tokenize it in
# batches; the raw "text" column is dropped once it has been tokenized.
train_dataset, val_dataset = (
    Dataset
    .from_dict({"text": features['text'], "label": targets.tolist()})
    .map(tokenize_function, batched=True, remove_columns=["text"])
    for features, targets in ((X_train, y_train), (X_val, y_val))
)

In [None]:
from transformers.trainer_utils import get_last_checkpoint
from transformers import AutoModel
from torch import nn

class BertRegressor(nn.Module):
    """Pretrained encoder with a single-output linear head for regression.

    The encoder is loaded with ``AutoModel.from_pretrained`` using dropout 0.1
    for both hidden states and attention probabilities. Training loss is
    ``nn.HuberLoss``.
    """

    def __init__(self, model_name):
        """Load the encoder named ``model_name`` and create the regression head."""
        super().__init__()
        encoder_kwargs = {
            "hidden_dropout_prob": 0.1,
            "attention_probs_dropout_prob": 0.1,
        }
        self.bert = AutoModel.from_pretrained(model_name, **encoder_kwargs)
        self.regressor = nn.Linear(self.bert.config.hidden_size, 1)
        self.loss_fn = nn.HuberLoss()

    def forward(self, input_ids, attention_mask, labels=None):
        """Run the encoder and the head.

        Returns a dict with ``"logits"`` (trailing size-1 dimension squeezed
        away) and, when ``labels`` is given, ``"loss"`` as the first key.
        """
        encoded = self.bert(input_ids=input_ids, attention_mask=attention_mask)
        # The encoder's pooled output is what feeds the regression head.
        logits = self.regressor(encoded.pooler_output).squeeze(-1)

        result = {"logits": logits}
        if labels is not None:
            result = {"loss": self.loss_fn(logits, labels), "logits": logits}
        return result


# Instantiate the regressor on top of the pretrained checkpoint.
model = BertRegressor(model_name=MODEL_NAME)

In [None]:
from transformers import TrainingArguments, Trainer

def compute_metrics(eval_pred):
    """Compute RMSE and R^2 from a ``(predictions, labels)`` pair.

    Predictions are flattened before being compared with the labels. R^2 is
    computed as ``1 - MSE / var(labels)``.
    """
    raw_predictions, labels = eval_pred
    residuals = raw_predictions.flatten() - labels
    mse = np.mean(residuals ** 2)
    return {
        "rmse": np.sqrt(mse),
        "r2": 1 - mse / np.var(labels),
    }

output_dir = "./results_ruBERT"

# Mixed precision is only requested when a CUDA device is present: bf16 where
# the device supports it, fp16 otherwise. Without the availability check a
# CPU-only machine would end up asking for fp16.
cuda_available = torch.cuda.is_available()
use_bf16 = cuda_available and torch.cuda.is_bf16_supported()
use_fp16 = cuda_available and not use_bf16

training_args = TrainingArguments(
    output_dir=output_dir,
    # Evaluate and checkpoint once per epoch; log every 200 steps.
    eval_strategy="epoch",
    save_strategy="epoch",
    logging_strategy="steps",
    logging_steps=200,
    learning_rate=6e-6,
    per_device_train_batch_size=16,
    per_device_eval_batch_size=128,
    save_total_limit=5,
    num_train_epochs=25,
    weight_decay=0.1,
    warmup_ratio=0.05,
    max_grad_norm=1.5,
    lr_scheduler_type="linear",
    # output_dir already starts with "./", so it is not prefixed again.
    logging_dir=f"{output_dir}/logs",
    bf16=use_bf16,
    fp16=use_fp16,
    seed=42,
    report_to="tensorboard",
    # Reload the checkpoint with the highest validation R^2 when training ends.
    load_best_model_at_end=True,
    metric_for_best_model="r2",
    greater_is_better=True,
)

trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=train_dataset,
    eval_dataset=val_dataset,
    compute_metrics=compute_metrics,
)

# Rebind every parameter to a contiguous tensor.
# NOTE(review): presumably needed so checkpoint serialization accepts the
# weights — confirm against the transformers version in use.
for param in model.parameters():
    param.data = param.data.contiguous()

In [None]:
import os

# Resume from the newest checkpoint in output_dir when one exists; with
# `checkpoint` set to None training starts from scratch. The directory check
# avoids asking for a checkpoint inside a folder that has not been created.
# NOTE(review): delete stale checkpoints in output_dir after changing the
# model or training configuration, otherwise they will be resumed.
checkpoint = get_last_checkpoint(output_dir) if os.path.isdir(output_dir) else None
trainer.train(resume_from_checkpoint=checkpoint)

In [None]:
# Run the trained model over the train and validation splits.
bert_train_pred, bert_val_pred = (
    trainer.predict(split) for split in (train_dataset, val_dataset)
)

# Show the metrics reported for each split (train first, then validation).
for split_pred in (bert_train_pred, bert_val_pred):
    print(split_pred.metrics)

In [None]:
# Tokenize the test texts the same way as the training data (no labels here).
test_dataset = (
    Dataset
    .from_dict({"text": test['text']})
    .map(tokenize_function, batched=True, remove_columns=["text"])
)

bert_test_pred = trainer.predict(test_dataset)

In [None]:
# Submission made of the BERT test predictions alone, keyed by the test row index.
bert_submission_columns = {
    'index': test.index,
    'prediction': bert_test_pred.predictions.flatten(),
}
submission = pd.DataFrame(bert_submission_columns)

submission.to_csv('bert submission.csv', index=False)

In [None]:
def _bert_feature_frame(prediction_output, index):
    """Wrap BERT predictions in a one-column frame named 'bert' aligned to ``index``."""
    return pd.DataFrame(prediction_output.predictions, index=index, columns=['bert'])


# BERT predictions become a single feature for the downstream models.
X_train_bert_df = _bert_feature_frame(bert_train_pred, X_train.index)
X_val_bert_df = _bert_feature_frame(bert_val_pred, X_val.index)
X_test_bert_df = _bert_feature_frame(bert_test_pred, test.index)

In [None]:
# Word-level TF-IDF over unigrams and bigrams, capped at 20000 features,
# with Russian stop words removed and sublinear term-frequency scaling.
vectorizer = TfidfVectorizer(max_features=20000, stop_words=stopwords.words("russian"),
                            sublinear_tf = True, analyzer = "word", token_pattern=r"\w{1,}", ngram_range=(1, 2))

# Keep the TF-IDF matrices sparse: TruncatedSVD consumes sparse input
# directly, so there is no need to materialise dense n x 20000 arrays.
X_train_tfidf = vectorizer.fit_transform(X_train['text'])
X_val_tfidf = vectorizer.transform(X_val['text'])
X_test_tfidf = vectorizer.transform(test['text'])

# Reduce to 400 dense components; fitted on the training split only.
svd = TruncatedSVD(n_components=400, random_state=52)

X_train_tfidf = svd.fit_transform(X_train_tfidf)
X_val_tfidf = svd.transform(X_val_tfidf)
X_test_tfidf = svd.transform(X_test_tfidf)

# Cache the reduced matrices on disk.
np.save('X_train_tfidf.npy', X_train_tfidf)
np.save('X_val_tfidf.npy', X_val_tfidf)
np.save('X_test_tfidf.npy', X_test_tfidf)

# To reuse the cached matrices instead of recomputing them:
# X_train_tfidf = np.load('X_train_tfidf.npy')
# X_val_tfidf = np.load('X_val_tfidf.npy')
# X_test_tfidf = np.load('X_test_tfidf.npy')

In [None]:
# One column name per SVD component: svd_0, svd_1, ...
svd_features = [f'svd_{component}' for component in range(X_train_tfidf.shape[1])]

# Wrap each reduced matrix in a frame that shares its source frame's index.
X_train_tfidf_df, X_val_tfidf_df, X_test_tfidf_df = (
    pd.DataFrame(matrix, index=source.index, columns=svd_features)
    for matrix, source in (
        (X_train_tfidf, X_train),
        (X_val_tfidf, X_val),
        (X_test_tfidf, test),
    )
)

In [None]:
cat_features = ['location', 'company']
num_features = ['experience_from']

# Column layout: categorical columns first, then numeric, then the SVD
# components, then the BERT prediction.
tabular_features = cat_features + num_features

catboost_train = pd.concat([X_train[tabular_features], X_train_tfidf_df, X_train_bert_df], axis=1)
catboost_val = pd.concat([X_val[tabular_features], X_val_tfidf_df, X_val_bert_df], axis=1)
catboost_test = pd.concat([test[tabular_features], X_test_tfidf_df, X_test_bert_df], axis=1)

In [None]:
# Scale the numeric column and one-hot encode the categorical ones.
# sparse_threshold=0 makes the transformer always return a dense array, so
# the stacking below does not depend on whether the density heuristic happens
# to choose a sparse result.
preprocessor = ColumnTransformer(
    [
        ('scaler', StandardScaler(), num_features),
        ('ohe', OneHotEncoder(handle_unknown='ignore'), cat_features),
    ],
    sparse_threshold=0,
)

# Dense design matrices: SVD components, BERT prediction, tabular features.
# The preprocessor is fitted on the training split only.
lgb_train = np.hstack([X_train_tfidf, X_train_bert_df, preprocessor.fit_transform(X_train)])
lgb_val = np.hstack([X_val_tfidf, X_val_bert_df, preprocessor.transform(X_val)])
lgb_test = np.hstack([X_test_tfidf, X_test_bert_df, preprocessor.transform(test)])

In [None]:
import lightgbm as lgb

# LightGBM regressor with Huber objective; `verbose` is passed as the integer
# -1 rather than the string "-1".
gbr = lgb.LGBMRegressor(n_estimators=543, learning_rate=0.1301314273764898, max_depth=7,
                        objective="huber", verbose=-1, random_state=42)

gbr.fit(lgb_train, y_train)

In [None]:
# Validation-set predictions and R^2 of the LightGBM model.
gbr_pred_val = gbr.predict(lgb_val)
gbr_val_r2 = r2_score(y_val, gbr_pred_val)
print('R2 Score:', gbr_val_r2)

In [None]:
# Ridge regression on the same dense design matrix as LightGBM.
ridge = Ridge(alpha=1).fit(lgb_train, y_train)
ridge

In [None]:
# Validation-set predictions and R^2 of the ridge model.
ridge_pred_val = ridge.predict(lgb_val)
ridge_val_r2 = r2_score(y_val, ridge_pred_val)
print('R2 Score:', ridge_val_r2)

In [None]:
from catboost import CatBoostRegressor

# CatBoost configuration. The categorical columns are referenced by position:
# the first len(cat_features) columns of the CatBoost frames.
catboost_params = dict(
    iterations=808,
    learning_rate=0.1610083734630271,
    depth=9,
    l2_leaf_reg=0.00034558902847955095,
    cat_features=list(range(len(cat_features))),
    verbose=100,
    random_seed=42,
    task_type="GPU",
    eval_metric="R2",
    od_wait=150,
    use_best_model=True,
)
catboost = CatBoostRegressor(**catboost_params)

# The validation split is passed as eval_set for the fit.
catboost.fit(catboost_train, y_train, eval_set=(catboost_val, y_val))

In [None]:
# Validation-set predictions and R^2 of the CatBoost model.
catboost_pred_val = catboost.predict(catboost_val)
catboost_val_r2 = r2_score(y_val, catboost_pred_val)
print('R2 Score:', catboost_val_r2)

In [None]:
import matplotlib.pyplot as plt
import numpy as np

feature_importance = catboost.get_feature_importance()
feature_names = catboost.feature_names_

# Indices of the features ordered from most to least important.
top_n = 30
sorted_idx = np.argsort(feature_importance)[::-1]
top_idx = sorted_idx[:top_n]
top_features = np.array(feature_names)[top_idx]
top_importance = feature_importance[top_idx]

# Horizontal bars; the selection is reversed so the most important feature
# is drawn at the top of the chart.
fig, ax = plt.subplots(figsize=(12, 6))
ax.barh(top_features[::-1], top_importance[::-1], color="royalblue")
ax.set_xlabel("Feature Importance")
ax.set_ylabel("Features")
ax.set_title("CatBoost Feature Importance (Top 30)")
plt.show()

In [None]:
from sklearn.model_selection import cross_val_predict

# Stack the validation predictions of the three base models
# (column order: LightGBM, CatBoost, ridge).
X_meta = np.column_stack((gbr_pred_val, catboost_pred_val, ridge_pred_val))
meta_model = Ridge(alpha=0.8)

# Out-of-fold estimate: each validation row is predicted by a meta-model that
# was fitted without it, so this score is not inflated by fitting and scoring
# on the same rows.
y_pred_oof = cross_val_predict(meta_model, X_meta, y_val, cv=5)
print('R2 Score (5-fold out-of-fold):', r2_score(y_val, y_pred_oof))

# Final meta-model fitted on the whole validation split; its score on those
# same rows is in-sample and therefore optimistic.
meta_model.fit(X_meta, y_val)
y_pred = meta_model.predict(X_meta)

print('R2 Score (in-sample):', r2_score(y_val, y_pred))

In [None]:
# Base-model predictions on the test set, in the same column order the
# meta-model was fitted with (LightGBM, CatBoost, ridge).
base_test_predictions = (
    gbr.predict(lgb_test),
    catboost.predict(catboost_test),
    ridge.predict(lgb_test),
)
X_meta_test = np.column_stack(base_test_predictions)
final_pred_test = meta_model.predict(X_meta_test)

# Final stacked submission keyed by the test row index.
submission = pd.DataFrame({'index': test.index, 'prediction': final_pred_test})

submission.to_csv('meta submission.csv', index=False)