In [24]:
import numpy as np
import pandas as pd
import pickle

In [25]:
import nltk
nltk.download("stopwords")
from nltk.corpus import stopwords
import string
!pip install stop_words
import stop_words
import re

[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!




In [26]:
!pip install --upgrade scikit-learn==1.4.2



In [28]:
from sklearn.preprocessing import StandardScaler
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.model_selection import train_test_split
from sklearn.ensemble import GradientBoostingRegressor
from sklearn.linear_model import Lasso
from sklearn.ensemble import RandomForestRegressor
from sklearn.model_selection import GridSearchCV
from sklearn.metrics import mean_squared_error

In [148]:
raw_data = pd.read_csv('/content/TRAIN_SAL.csv', encoding='utf-8')
raw_data.shape

  raw_data = pd.read_csv('/content/TRAIN_SAL.csv', encoding='utf-8')


(631117, 78)

In [149]:
raw_data = raw_data.sample(frac=1).reset_index(drop=True)


In [150]:
def preprocessing_salary_prediction(raw_data):
  #отбор наиболее релевантных колонок
  features_name = ['id', 'required_experience', 'vacancy_address_latitude', 'vacancy_address_longitude', 'professionalSphereName', 'position_requirements', 'salary']

  raw_data = raw_data[features_name]

  #удаление полей с отсутствующей зарплатой
  raw_data['salary'] = raw_data['salary'].replace({0:np.nan})
  raw_data = raw_data.dropna()

  vacancies_copy = raw_data.copy()

  # NLP препроцессинг поля position_requirements
  def remove_punctuation(text):
    return ''.join([ch if ch not in string.punctuation else ' ' for ch in text])
  def remove_numbers(text):
    return ''.join([i if not i.isdigit() else ' ' for i in text])
  def remove_eng(text):
    return ''.join([' ' if i[0] in ['a', 'b', 'c', 'd', 'e', 'f', 'g', 'h', 'i', 'j', 'k', 'l', 'm', 'n', 'o', 'p', 'q', 'r', 's', 't', 'u', 'v', 'w', 'x', 'y', 'z'] else i for i in text])
  def remove_multiple_spaces(text):
    return re.sub(r'\s+', ' ', text, flags=re.I)

  prep_text = [remove_multiple_spaces(remove_eng(remove_numbers(remove_punctuation(text.lower())))) for text in vacancies_copy['position_requirements'].astype('str')]


  # Кодируем position_requirements c помощью преобразования TF-IDF
  russian_stopwords = stop_words.get_stop_words('ru')
  tfidf = TfidfVectorizer(max_features=5, stop_words=russian_stopwords)
  skills_encoded = tfidf.fit_transform(prep_text)
  skills_encoded = pd.DataFrame(skills_encoded.toarray(), columns=tfidf.get_feature_names_out()).iloc[:, 1:]

  vacancies_copy = pd.concat([vacancies_copy, skills_encoded], axis=1)
  vacancies_copy.drop('position_requirements', axis=1, inplace=True)
  vacancies_copy = vacancies_copy.dropna()

  # Стандартизируем числовые признаки
  scaler = StandardScaler()
  vacancies_copy[['required_experience', 'vacancy_address_latitude', 'vacancy_address_longitude']] = scaler.fit_transform(vacancies_copy[['required_experience', 'vacancy_address_latitude', 'vacancy_address_longitude']])

  '''Удаление выбросов'''
  # Колонки, в которых ищем выбросы
  cols = ['vacancy_address_latitude', 'vacancy_address_longitude', 'salary']
  # Считаем квантили и межквартильный диапазон
  Q1 = vacancies_copy[cols].quantile(0.25)
  Q3 = vacancies_copy[cols].quantile(0.75)
  IQR = Q3 - Q1
  # Возвращает логический массив строк со значением признака в допустимых пределах
  condition = ~((vacancies_copy[cols] < (Q1 - 1.5 * IQR)) | (vacancies_copy[cols] > (Q3 + 1.5 * IQR))).any(axis=1)
  # Отбираем строки соответствующие условию
  vacancies_non_outs = vacancies_copy[condition]

  final = vacancies_non_outs.copy()
  # One-hot encoding поля professionalSphereName
  final = pd.get_dummies(final, columns=['professionalSphereName'])
  final = final.drop(['professionalSphereName_Маркетинг, реклама, PR'], axis=1)

  return final


In [151]:
vacancies_prepared = preprocessing_salary_prediction(raw_data)
vacancies_prepared = vacancies_prepared.drop('id', axis=1)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  raw_data['salary'] = raw_data['salary'].replace({0:np.nan})


In [154]:
n = 200000
y = vacancies_prepared['salary'][:n]
X = vacancies_prepared.drop('salary', axis=1)[:n]

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2)

In [153]:
estimator = RandomForestRegressor()

parameters = {}

estimator.fit(X_train, y_train)

best_estimator = estimator

In [155]:
def preprocessing_salary_prediction_test(raw_data):
  #отбор наиболее релевантных колонок
  features_name = ['id', 'required_experience', 'vacancy_address_latitude', 'vacancy_address_longitude', 'professionalSphereName', 'position_requirements']

  raw_data = raw_data[features_name]
  vacancies_copy = raw_data.copy()

  # NLP препроцессинг поля position_requirements
  def remove_punctuation(text):
    return ''.join([ch if ch not in string.punctuation else ' ' for ch in text])
  def remove_numbers(text):
    return ''.join([i if not i.isdigit() else ' ' for i in text])
  def remove_eng(text):
    return ''.join([' ' if i[0] in ['a', 'b', 'c', 'd', 'e', 'f', 'g', 'h', 'i', 'j', 'k', 'l', 'm', 'n', 'o', 'p', 'q', 'r', 's', 't', 'u', 'v', 'w', 'x', 'y', 'z'] else i for i in text])
  def remove_multiple_spaces(text):
    return re.sub(r'\s+', ' ', text, flags=re.I)

  prep_text = [remove_multiple_spaces(remove_eng(remove_numbers(remove_punctuation(text.lower())))) for text in vacancies_copy['position_requirements'].astype('str')]


  # Кодируем position_requirements c помощью преобразования TF-IDF
  russian_stopwords = stop_words.get_stop_words('ru')
  tfidf = TfidfVectorizer(max_features=5, stop_words=russian_stopwords)
  skills_encoded = tfidf.fit_transform(prep_text)
  skills_encoded = pd.DataFrame(skills_encoded.toarray(), columns=tfidf.get_feature_names_out()).iloc[:, 1:]

  vacancies_copy = pd.concat([vacancies_copy, skills_encoded], axis=1)
  vacancies_copy.drop('position_requirements', axis=1, inplace=True)

  # Стандартизируем числовые признаки
  scaler = StandardScaler()
  vacancies_copy[['required_experience', 'vacancy_address_latitude', 'vacancy_address_longitude']] = scaler.fit_transform(vacancies_copy[['required_experience', 'vacancy_address_latitude', 'vacancy_address_longitude']])

  final = vacancies_copy.copy()
  # One-hot encoding поля professionalSphereName
  final = pd.get_dummies(final, columns=['professionalSphereName'])
  final = final.drop(['professionalSphereName_Маркетинг, реклама, PR'], axis=1)

  return final

In [156]:
test = pd.read_csv('TEST_SAL.csv')

test_pr = preprocessing_salary_prediction_test(test)

test_pr = test_pr.drop(['id'], axis=1)

y_pred_t = best_estimator.predict(test_pr)


In [157]:

y_pred = best_estimator.predict(X_test)
rmse = mean_squared_error(y_test, y_pred, squared=False)

print(str(estimator) + " RMSE: %.2f" % rmse)
custom_metric = max(0,1-(rmse/33000))
print("custom_metric: %.2f" % custom_metric)


RandomForestRegressor() RMSE: 7058.03
custom_metric: 0.79




In [158]:
import joblib
filename = 'salary_prediction_model_0.joblib'
joblib.dump(best_estimator, filename)

['salary_prediction_model_0.joblib']