In [2]:
import ast
import json
import os
import re

import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import requests
import seaborn as sns
import torch
import torchvision.transforms as transforms
from accelerate.test_utils.testing import get_backend
from joblib import dump, load
from PIL import Image, UnidentifiedImageError
from scipy.sparse import csr_matrix, hstack
from sklearn.ensemble import RandomForestRegressor
from sklearn.metrics import mean_squared_error, accuracy_score
from sklearn.metrics import classification_report, precision_score
from sklearn.metrics import recall_score, f1_score, roc_auc_score
from sklearn.model_selection import train_test_split
from transformers import CLIPProcessor, CLIPModel, pipeline

### Загружаем данные из гугл карт по спортивным центрам города

In [None]:
# Load the spreadsheet of sports centres into a DataFrame.
# NOTE(review): '-.xlsx' looks like a placeholder / redacted file name — confirm the real path.
city = pd.read_excel('-.xlsx')

In [None]:
# Columns kept from the raw export for the rest of the notebook.
features = ['input_id', 'group',
            'build', 'category', 'review_count', 'images']
# .copy() makes `city` an independent frame: later cells assign new values into
# its columns, which on a plain slice triggers SettingWithCopyWarning.
city = city[features].copy()

In [None]:
# Show the value distribution of every column except the first one and the last two.
for column_name in city.columns[1:-2]:
    print(column_name, '', city[column_name].value_counts(), '____________', sep='\n')

### Приведем данные к нужному виду

In [None]:
def to_remake_category(value):
    """Collapse a raw category label into one coarse category.

    Matching is case-insensitive. Rules are checked in order and the first
    match wins.

    Args:
        value: Raw category text. Anything that is not a string (for example
            NaN from an empty spreadsheet cell) is treated as unknown.

    Returns:
        One of 'gym', 'fitness', 'yoga', 'complex', 'court', 'school', 'other'.
    """
    # Empty cells arrive as float NaN; calling .lower() on them would raise.
    if not isinstance(value, str):
        return 'other'

    value = value.lower()
    if value in ('gym', 'fitness center'):
        return 'gym'

    # (pattern, label) pairs, evaluated top to bottom.
    keyword_rules = (
        (r'\b(fitness)\b', 'fitness'),
        (r'\b(yoga|gymnastic|pilates|health|dance|shaping|dietitian|spa|diabet|sauna|weight|nutritionist)\b', 'yoga'),
        (r'\b(complex|athletic|stadium)\b', 'complex'),
        (r'\b(court)\b', 'court'),
    )
    for pattern, label in keyword_rules:
        if re.search(pattern, value):
            return label

    if 'school' in value:
        return 'school'
    return 'other'

def to_remake_group(value):
    """Map a raw (Russian) group label to 'no_pool', 'pool', 'studio' or 'other'.

    All keyword checks are case-insensitive. Previously only the 'студия'
    check lower-cased the text, so a capitalised "Без бассейна" skipped the
    'без' rule and was classified as 'pool'.

    Args:
        value: Raw group text. Non-string values (e.g. NaN) map to 'other'.

    Returns:
        'no_pool', 'pool', 'studio' or 'other'.
    """
    if not isinstance(value, str):
        return 'other'

    value = value.lower()
    # Order matters: "без бассейна" contains both keywords and must be 'no_pool'.
    if 'без' in value:
        return 'no_pool'
    if 'бассейн' in value:
        return 'pool'
    if 'студия' in value:
        return 'studio'
    return 'other'

def to_remake_build(value):
    """Translate a building-type code into 'res', 'adm' or 'com'.

    The code is compared case-insensitively: 'ж' gives 'res', 'адм' gives
    'adm', and every other value gives 'com'.
    """
    known_codes = {'ж': 'res', 'адм': 'adm'}
    return known_codes.get(value.lower(), 'com')
        

In [None]:
# Replace each raw label column with its normalised version.
for column_name, remap in (
    ('category', to_remake_category),
    ('group', to_remake_group),
    ('build', to_remake_build),
):
    city[column_name] = city[column_name].apply(remap)

In [None]:
# Re-check the distributions of the three remapped columns.
# Columns are named explicitly: the positional slice columns[1:-3] only covers
# 'category' when an extra column has already been appended to `city`, so on a
# fresh kernel it silently skipped the most heavily remapped column.
for col in ['group', 'build', 'category']:
    print(col)
    print()
    print(city[col].value_counts())
    print('____________')

In [None]:
# One-hot columns used downstream, in a fixed order.
dummy_columns = [
    'group_no_pool', 'group_pool', 'group_studio',
    'build_adm', 'build_com', 'build_res',
    'category_gym', 'category_school', 'category_yoga',
    'category_complex', 'category_fitness',
]

# Encode straight to integers so no positional iloc cast is needed afterwards.
city_encoded = pd.get_dummies(city, columns=['group', 'build', 'category'], dtype=int)

# A level that never occurs in this city's data produces no dummy column;
# add it as all zeros instead of failing with KeyError on the selection below.
for dummy_name in dummy_columns:
    if dummy_name not in city_encoded.columns:
        city_encoded[dummy_name] = 0

city_encoded = city_encoded[['input_id', 'images'] + dummy_columns + ['review_count']]

# Pairwise correlations between the dummies and the review count, 2 decimals.
correlation_matrix = city_encoded[dummy_columns + ['review_count']].corr().round(2)


plt.figure(figsize=(10, 8))
sns.heatmap(correlation_matrix, annot=True, cmap='coolwarm', square=True, cbar_kws={"shrink": .8}, fmt='.2f', linewidths=.5)
plt.title('Корреляционная матрица')
plt.show()

## Перейдем к классификации спортивных центров на те, что больше 1000 м2 и меньше 1000 м2
- `1`: > 1000 м2
- `0`: < 1000 м2

**Если есть следующие признаки:**

- `тип здания`
- `количество отзывов`
- `наличие бассейна`
- `студия?`
- `категория "комплекс"`

Воспользуемся `RandomForestClassifier`

In [None]:
# NOTE(security): joblib.load unpickles the file and can execute arbitrary
# code — only load model files from a trusted source.
rf_model = load("модель//rf_model_no_photo.pkl")

# Feature columns passed to the model, in this order.
features = [
    'group_pool',
    'group_studio',
    'build_com',
    'build_res',
    'category_complex',
    'review_count'
]

# The one-hot columns exist only in `city_encoded`; `city` still holds the
# string-valued 'group' / 'build' / 'category' columns, so selecting these
# features from `city` raises KeyError. `city_encoded` was derived from `city`
# row by row, so the predictions can be written back positionally.
y_pred = rf_model.predict(city_encoded[features])
city['target_class'] = y_pred

In [None]:
# # Обучение

# city_encoded = pd.read_excel('data/city_lines_depth.xlsx')
# start = city_encoded[city_encoded['target_class']==0].sample(499, random_state = 42)
# end = city_encoded[city_encoded['target_class']==1]

# df = pd.concat([start, end], axis = 0).reset_index(drop = True)

# features = [
#     # 'chain',
#     # 'group_no_pool',
#     'group_pool',
#     'group_studio',
#     # 'build_adm',
#     'build_com',
#     'build_res',
#     # 'category_gym',
#     'category_complex',
#     # 'category_yoga',
#     'review_count'
# ]


# X = df[features]
# y = df['target_class']

# X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=42)

# model = RandomForestClassifier(n_estimators=500, random_state=42, min_samples_leaf=4,
#                                warm_start=True, max_features = "log2")
# model.fit(X_train, y_train)

# y_pred = model.predict(X_test)
# y_proba = model.predict_proba(X_test)

# accuracy = accuracy_score(y_test, y_pred)
# precision = precision_score(y_test, y_pred)
# recall = recall_score(y_test, y_pred)
# f1 = f1_score(y_test, y_pred)
# roc_auc = roc_auc_score(y_test, y_proba[:, 1])

# print(f'Accuracy: {accuracy:.2f}')
# print(f'Precision: {precision:.2f}')
# print(f'Recall: {recall:.2f}')
# print(f'F1 Score: {f1:.2f}')
# print(f'ROC AUC: {roc_auc:.2f}')
# print('Classification Report:')
# print(classification_report(y_test, y_pred))

# # Метрики

# # Accuracy: 0.89
# # Precision: 0.87
# # Recall: 0.91
# # F1 Score: 0.89
# # ROC AUC: 0.95
# # Classification Report:
# #               precision    recall  f1-score   support

# #            0       0.91      0.87      0.89       155
# #            1       0.87      0.91      0.89       145

# #     accuracy                           0.89       300
# #    macro avg       0.89      0.89      0.89       300
# # weighted avg       0.89      0.89      0.89       300

### Отфильтруем изображения, указанные в табличке напротив спорт. центров, используя предобученную модель [openai/clip-vit-large-patch14](https://huggingface.co/openai/clip-vit-large-patch14)

In [None]:
# Load the CLIP weights and the matching pre-processor from the same checkpoint.
clip_checkpoint = "openai/clip-vit-large-patch14"
model = CLIPModel.from_pretrained(clip_checkpoint)
processor = CLIPProcessor.from_pretrained(clip_checkpoint)

In [None]:
# city = pd.read_excel('train//city_encoded.xlsx')

def extract_images_and_titles(image_link_str):
    """Return the 'image' values stored in a JSON-encoded cell.

    Args:
        image_link_str: JSON text expected to hold a list of objects, each
            with an 'image' key.

    Returns:
        List of the 'image' values in their original order. An empty list is
        returned when the cell is not valid JSON, is not text at all (e.g. NaN
        from an empty spreadsheet cell), or does not decode to a list. Entries
        without an 'image' key are skipped.
    """
    try:
        image_data = json.loads(image_link_str)
    except (json.JSONDecodeError, TypeError):
        # TypeError: json.loads was handed a non-text value such as float NaN.
        return []

    if not isinstance(image_data, list):
        return []
    return [
        item['image']
        for item in image_data
        if isinstance(item, dict) and 'image' in item
    ]

# Decode the 'images' cell of every row into a list of image links.
city['images_links'] = city['images'].apply(extract_images_and_titles)

In [None]:
# Flat list of every normalised link across all rows; filled as a side effect
# of to_correct_links.
links_lst = []


def to_correct_links(links):
    """Make every link in `links` an absolute URL.

    Links that already carry an explicit 'http:' or 'https:' scheme are left
    untouched; anything else gets 'https:' prepended (protocol-relative
    '//host/path' becomes 'https://host/path'). Previously an 'http://...'
    link was turned into the invalid 'https:http://...'.

    Side effect: every resulting link is also appended to the module-level
    `links_lst`.

    Args:
        links: Iterable of link strings for one row.

    Returns:
        New list with the normalised links, in the same order.
    """
    lst = []
    for link in links:
        if not link.startswith(('http:', 'https:')):
            link = 'https:' + link
        lst.append(link)
    links_lst.extend(lst)
    return lst

# Normalise the links of every row; as a side effect to_correct_links also
# appends each link to the flat `links_lst`.
city['images_links'] = city['images_links'].apply(to_correct_links)

In [None]:
# Total number of image links collected across all rows.
len(links_lst)

#### Теперь отфильтруем только те, где уверенность в информативном классе более или равна 90 %

In [None]:
# Prompts scored against each image: index 0 is the "informative" class,
# index 1 the "not informative" class. Built once instead of per iteration.
clip_prompts = ["indoor gym fitness center yoga studio pilates pool exercise room sports club with exercise equipment yoga mats mirror",
                "building outdoors or branding sign or trees or other"]
# Minimum (rounded) probability of the informative class to keep a link.
MIN_USEFUL_PROB = 0.9
# Seconds to wait for the image server before giving up on a link.
REQUEST_TIMEOUT = 30

lst_probs = []          # one entry per link in links_lst (NaN when the image could not be scored)
lst_useful_links = []   # links whose informative-class probability passed the threshold
for url in links_lst:
    try:
        # The context manager closes the HTTP connection once the image is read.
        with requests.get(url, stream=True, timeout=REQUEST_TIMEOUT) as response:
            response.raise_for_status()
            image = Image.open(response.raw)
            inputs = processor(text=clip_prompts, images=image,
                               return_tensors="pt", padding=True)
    except (requests.RequestException, UnidentifiedImageError, OSError) as e:
        # One unreachable or corrupt image must not abort the whole run.
        print(f"Error processing {url}: {e}")
        lst_probs.append(float('nan'))
        continue

    # Inference only: skip gradient bookkeeping.
    with torch.no_grad():
        outputs = model(**inputs)
    logits_per_image = outputs.logits_per_image
    prob = round(float(logits_per_image.softmax(dim=1)[0][0]), 2)
    lst_probs.append(prob)
    if prob >= MIN_USEFUL_PROB:
        lst_useful_links.append(url)

In [None]:
def to_filter_links(links):
    """Return the entries of `links` that also occur in the global `lst_useful_links`.

    The order of `links` is preserved.
    """
    return [candidate for candidate in links if candidate in lst_useful_links]
    
# Per row, keep only the links that are present in lst_useful_links.
city['useful_images'] = city['images_links'].apply(to_filter_links)

In [None]:
# Persist the table with the filtered links.
city.to_excel('train//city_filtered_img.xlsx', index=False)

# Persist the flat list of useful links, one per line.
city_useful_img = list(lst_useful_links)
with open('data//city_useful_img.txt', 'w') as out_file:
    out_file.writelines(f"{url}\n" for url in city_useful_img)

print("Список сохранён в файл city_useful_img.txt")

### Применим модель, которая описывает картинки [nlpconnect/vit-gpt2-image-captioning](https://huggingface.co/nlpconnect/vit-gpt2-image-captioning)

In [None]:
def convert_to_list(image_str):
    """Turn a stringified list of links back into a real list.

    Args:
        image_str: Text such as "['a', 'b']" (a list written out as a string),
            or a value that is already a list.

    Returns:
        The parsed list. A list input is returned unchanged; '[]' and
        non-text values (e.g. NaN from an empty cell) give an empty list.
    """
    if isinstance(image_str, list):
        return image_str
    if not isinstance(image_str, str) or image_str == '[]':
        return []

    # literal_eval understands both quote styles and embedded apostrophes,
    # which plain slicing/splitting on "', '" does not.
    try:
        parsed = ast.literal_eval(image_str)
    except (ValueError, SyntaxError):
        parsed = None
    if isinstance(parsed, (list, tuple)):
        return list(parsed)

    # Not a valid Python literal: fall back to the quote-based split.
    return image_str[2:-2].split("', '")

# city_encoded = pd.read_excel('/content/drive/MyDrive/FitnessData/Кейсы/Фитнес_фото/city_smart_depth.xlsx')

# Parse the stringified link lists back into Python lists.
# NOTE(review): `city_encoded` as built with pd.get_dummies earlier in this
# notebook has no 'useful_images' column (that column is created on `city`);
# presumably the frame is meant to be reloaded from a saved file first, as in
# the commented-out line above — confirm.
city_encoded['useful_images'] = city_encoded['useful_images'].apply(convert_to_list)

In [None]:
# Image-captioning pipeline used by to_text.
# Note: the name `pipe` is rebound to a depth-estimation pipeline further down.
pipe = pipeline("image-to-text", model="nlpconnect/vit-gpt2-image-captioning")

In [None]:
def to_text(links, timeout=30):
    """Caption every image in `links` and record the captions for this row.

    Side effects: increments the global row counter `total` (printed every
    500 rows) and appends this row's caption list to the global `text_lst`.

    Args:
        links: Iterable of image URLs for one row.
        timeout: Seconds to wait for each image download.

    Returns:
        List of generated captions for the images that could be processed.
        Links that fail to download or decode are reported and skipped.
    """
    global total
    total += 1
    if total % 500 == 0:
        print(total)
    lst = []
    for link in links:
        try:
            # Close the HTTP connection as soon as the image has been captioned.
            with requests.get(link, stream=True, timeout=timeout) as response:
                response.raise_for_status()
                text = pipe(Image.open(response.raw))[0]['generated_text']
            lst.append(text)
        except UnidentifiedImageError as e:
            print(f"Could not identify image from {link}: {e}")
            continue
        except Exception as e:
            # Deliberate best effort: report and move on to the next link.
            print(f"Error processing {link}: {e}")
            continue
    text_lst.append(lst)
    return lst


total = 0
text_lst = []
# NOTE(review): the captions end up only in `text_lst`; a later cell reads a
# 'text_descriptions' column that is not created here — confirm where it comes from.
city_encoded['useful_images'].apply(to_text)

In [None]:
def to_clear(text):
    """Strip list punctuation from a stringified caption list.

    Applies a fixed sequence of substring replacements (brackets, quotes and
    doubled commas) in order and then drops the last character of the result.
    """
    # (old, new) pairs; the order is significant and must not change.
    substitutions = (
        ('[', ' '),
        (']', ''),
        (" '", ", "),
        (" '", ""),
        ("'", ""),
        (",, ", ""),
    )
    cleaned = text
    for old, new in substitutions:
        cleaned = cleaned.replace(old, new)
    return cleaned[:-1]
    
# Prefix each description with the category label, then strip list punctuation.
# NOTE(review): nothing visible in this notebook creates 'text_descriptions'
# (captions are only collected in `text_lst`), and a city_encoded frame built
# with get_dummies(columns=[..., 'category']) no longer has a 'category'
# column; presumably both come from a reloaded file — confirm.
city_encoded['text_descriptions'] = city_encoded['category'] + city_encoded['text_descriptions']
city_encoded['text_descriptions'] = city_encoded['text_descriptions'].apply(to_clear)

### Применим модель, которая считает глубину [depth-anything/Depth-Anything-V2-Base-hf](https://huggingface.co/depth-anything/Depth-Anything-V2-Base-hf)

In [None]:
def convert_to_list(image_str):
    """Turn a stringified list of links back into a real list.

    Args:
        image_str: Text such as "['a', 'b']" (a list written out as a string),
            or a value that is already a list.

    Returns:
        The parsed list. A list input is returned unchanged; '[]' and
        non-text values (e.g. NaN from an empty cell) give an empty list.
    """
    if isinstance(image_str, list):
        return image_str
    if not isinstance(image_str, str) or image_str == '[]':
        return []

    # literal_eval understands both quote styles and embedded apostrophes,
    # which plain slicing/splitting on "', '" does not.
    try:
        parsed = ast.literal_eval(image_str)
    except (ValueError, SyntaxError):
        parsed = None
    if isinstance(parsed, (list, tuple)):
        return list(parsed)

    # Not a valid Python literal: fall back to the quote-based split.
    return image_str[2:-2].split("', '")

# Reload the saved table and parse its stringified link lists in one chain.
city_encoded = pd.read_excel('train/city_smart_depth.xlsx').assign(
    useful_images=lambda frame: frame['useful_images'].apply(convert_to_list)
)

In [None]:
# Depth-estimation pipeline on whichever device get_backend() reports first.
checkpoint = "depth-anything/Depth-Anything-V2-base-hf"
device = get_backend()[0]
pipe = pipeline("depth-estimation", model=checkpoint, device=device)

In [None]:
def to_depth(links, timeout=30):
    """Run depth estimation on every image in `links` for one row.

    For each image the pipeline's 'predicted_depth' is stored as
    (its maximum - predicted_depth).

    Side effects: increments the global row counter `total` (printed every
    500 rows) and stores this row's list under `depth_lst[total]`.

    Args:
        links: Iterable of image URLs for one row.
        timeout: Seconds to wait for each image download.

    Returns:
        List with one transformed depth tensor per successfully processed
        image. Links that fail to download or decode are reported and skipped.
    """
    global total
    total += 1
    if total % 500 == 0:
        print(total)
    lst = []
    for link in links:
        try:
            # Close the HTTP connection as soon as the image has been processed.
            with requests.get(link, stream=True, timeout=timeout) as response:
                response.raise_for_status()
                dictt = pipe(Image.open(response.raw))
            predicted = dictt['predicted_depth']
            maxx = float(predicted.max())
            lst.append(maxx - predicted)
        except UnidentifiedImageError as e:
            print(f"Could not identify image from {link}: {e}")
            continue
        except Exception as e:
            # Deliberate best effort: report and move on to the next link.
            print(f"Error processing {link}: {e}")
            continue
    depth_lst[total] = lst
    return lst


total = 0
# Maps the 1-based row counter to that row's list of depth tensors.
depth_lst = {}
city_encoded['useful_images'].apply(to_depth)
print('DONE')

In [None]:
# with open('/content/drive/MyDrive/FitnessData/Кейсы/Фитнес_фото/depth_tensors_lst.txt', 'w') as file:
#     for item in depth_lst:
#         file.write(f"{item}\n")

# print("Список сохранён в файл depth_tensors_lst.txt")

In [None]:
# Distance below a line's maximum that still counts towards the "*_p" share.
DEPTH_MARGIN = 1.5


def _r4(value):
    """Convert a scalar (tensor element or number) to a float rounded to 4 decimals."""
    return round(float(value), 4)


def _depth_line_features(ten):
    """Summarise one depth tensor by 25 statistics taken along six lines.

    Three lines are taken along the first axis ("floor": indices h//4, h//2,
    h//4*3) and three along the second axis ("wall": indices w//4, w//2,
    w//4*3). For each line the maximum, the share of values above a threshold,
    the standard deviation and the length-normalised sum are computed.
    # assumes `ten` is 2-D (height, width) — TODO confirm

    Returns:
        [maxx, 3 floor max, 3 wall max, 3 floor p, 3 wall p,
         3 floor std, 3 wall std, 3 floor sum_norm, 3 wall sum_norm]
    """
    height = ten.shape[0]
    width = ten.shape[1]
    floors = (ten[height // 4], ten[height // 2], ten[height // 4 * 3])
    walls = (ten[:, width // 4], ten[:, width // 2], ten[:, width // 4 * 3])

    floor_max = [_r4(line.max()) for line in floors]
    wall_max = [_r4(line.max()) for line in walls]

    # Share of each line lying within DEPTH_MARGIN of the (rounded) maximum.
    floor_p = [_r4(len(line[line > peak - DEPTH_MARGIN]) / width)
               for line, peak in zip(floors, floor_max)]
    # NOTE(review): the wall shares are thresholded with the *floor* maxima,
    # exactly as in the original code. This looks like a copy-paste slip, but
    # the saved models were presumably trained on these values — confirm
    # before changing it.
    wall_p = [_r4(len(line[line > peak - DEPTH_MARGIN]) / height)
              for line, peak in zip(walls, floor_max)]

    floor_std = [_r4(line.std()) for line in floors]
    wall_std = [_r4(line.std()) for line in walls]

    floor_sum_norm = [_r4(line.sum() / width) for line in floors]
    wall_sum_norm = [_r4(line.sum() / height) for line in walls]

    return [_r4(ten.max()),
            *floor_max, *wall_max,
            *floor_p, *wall_p,
            *floor_std, *wall_std,
            *floor_sum_norm, *wall_sum_norm]


# One entry per row of depth_lst: a list with one 25-value record per image
# (empty when the row has no processed images).
depth_stats = [
    [_depth_line_features(ten) for ten in tensors]
    for tensors in depth_lst.values()
]

In [None]:
# with open('/content/drive/MyDrive/FitnessData/Кейсы/Фитнес_фото/depth_stats_lst.txt', 'w') as file:
#     for item in depth_stats:
#         file.write(f"{item}\n")

# print("Список сохранён в файл depth_stats_lst.txt")

In [None]:
# Number of statistics stored per image (length of every record in depth_stats).
N_DEPTH_FEATURES = 25

# For every row keep the record of the image with the largest overall maximum
# (first value of the record); rows without images get an all-NaN record.
# numpy is already imported at the top of the notebook, and the index variable
# no longer shadows the builtin `id`.
itog = []
for image_stats in depth_stats:
    if image_stats:
        best_idx = int(np.argmax([stats[0] for stats in image_stats]))
        itog.append(image_stats[best_idx])
    else:
        itog.append([np.nan] * N_DEPTH_FEATURES)

In [None]:
# Column names in the same order as the values inside each record of `itog`:
# 'maxx', then for each statistic the three floor lines followed by the three wall lines.
depth_columns = ['maxx'] + [
    f'{surface}_{position}_{stat}'
    for stat in ('max', 'p', 'std', 'sum_norm')
    for surface in ('floor', 'wall')
    for position in ('14', '12', '34')
]
city_encoded[depth_columns] = itog

In [None]:
# city_encoded.to_excel('/content/drive/MyDrive/FitnessData/Кейсы/Фитнес_фото/city_lines_depth.xlsx', index = False)

### После того как мы посчитали все нужные признаки, решим задачу классификации с помощью `HistGradientBoostingClassifier`, используя из данных Google Maps только количество отзывов

In [None]:
# NOTE(security): joblib.load unpickles the files and can execute arbitrary
# code — only load model files from a trusted source.
boost_model = load("модель//rf_bosting_model_photo-reviews.pkl")
boot_model = boost_model  # original (misspelled) name kept as an alias
vectorizer = load('model/vectorizer_photo-reviews.joblib')

# Numeric feature columns appended after the text features, in this order.
features = [
    'review_count',
    'maxx',
    'floor_14_max',
    'floor_12_max',
    'wall_14_max',
    'wall_12_max',
    'wall_34_max',
    'floor_14_p',
    'floor_12_p',
    'wall_14_p',
    'wall_12_p',
    'wall_34_p',
    'floor_14_std',
    'floor_12_std',
    'wall_14_std',
    'wall_12_std',
    'wall_34_std',
    'floor_14_sum_norm',
    'floor_12_sum_norm',
    'wall_14_sum_norm',
    'wall_12_sum_norm',
           ]

# The text descriptions and depth statistics were added to `city_encoded`
# (the undefined name `cityl` and the frame `city` do not carry them), so all
# inputs are taken from that frame and the predictions are stored on it.
city_text = vectorizer.transform(city_encoded['text_descriptions'])
additional_features_city = csr_matrix(city_encoded[features].values)
combined_features_city = hstack([city_text, additional_features_city]).toarray()

y_pred = boost_model.predict(combined_features_city)
city_encoded['target_class'] = y_pred

In [None]:
# start_time = time.time()

# start = city_encoded[city_encoded['target_class'] == 0].sample(499, random_state=42)
# end = city_encoded[city_encoded['target_class'] == 1]

# # Объединение данных
# df = pd.concat([start, end], axis=0).reset_index(drop=True)

# features = [
#         'review_count',
#         'maxx', 'floor_14_max',
#         'floor_12_max',
#             # 'floor_34_max',
#         'wall_14_max',
#     'wall_12_max',
#         'wall_34_max',
#             'floor_14_p',
#         'floor_12_p',
#             # 'floor_34_p',
#         'wall_14_p',
#     'wall_12_p',
#         'wall_34_p',
#             'floor_14_std',
#         'floor_12_std',
#     # 'floor_34_std',
#         'wall_14_std',
#     'wall_12_std',
#         'wall_34_std',
#     'floor_14_sum_norm',
#         'floor_12_sum_norm',
#     # 'floor_34_sum_norm',
#         'wall_14_sum_norm',
#     'wall_12_sum_norm',
#         # 'wall_34_sum_norm'
#            ]
# # Предобработка текста
# X = df[['text_descriptions',
#         'review_count', 
#         'maxx',
#         'floor_14_max',
#         'floor_12_max',
#         # 'floor_34_max',
#         'wall_14_max',
#         'wall_12_max',
#         'wall_34_max',
#         'floor_14_p',
#         'floor_12_p',
#         # 'floor_34_p',
#         'wall_14_p',
#         'wall_12_p',
#         'wall_34_p',
#         'floor_14_std',
#         'floor_12_std',
#         # 'floor_34_std',
#         'wall_14_std',
#         'wall_12_std',
#         'wall_34_std',
#         'floor_14_sum_norm',
#         'floor_12_sum_norm',
#         # 'floor_34_sum_norm',
#         'wall_14_sum_norm',
#         'wall_12_sum_norm',
#         # 'wall_34_sum_norm'
#        ]]
# y = df['target_class']

# X_train, X_val, y_train, y_val = train_test_split(X, y, test_size=0.2, random_state=42)

# stops_eng = set(stopwords.words('english'))

# vectorizer = TfidfVectorizer(token_pattern=r'\b[a-zA-Zа-яА-ЯёЁ]+\b', 
#                              stop_words=list(stops_eng), 
#                              ngram_range=(1, 3), max_df=0.6, min_df=10)

# X_train_text = vectorizer.fit_transform(X_train['text_descriptions'])
# X_val_text = vectorizer.transform(X_val['text_descriptions'])

# additional_features_train = csr_matrix(X_train[features].values)
# combined_features_train = hstack([X_train_text, additional_features_train]).toarray()

# additional_features_val = csr_matrix(X_val[features].values)
# combined_features_val = hstack([X_val_text, additional_features_val]).toarray()

# model = HistGradientBoostingClassifier(random_state=42, max_iter=1000,  learning_rate=0.08, l2_regularization=1, min_samples_leaf=12, warm_start=True)
# model.fit(combined_features_train, y_train)

# y_pred = model.predict(combined_features_val)

# y_proba = model.predict_proba(combined_features_val)

# accuracy = accuracy_score(y_val, y_pred)
# precision = precision_score(y_val, y_pred)
# recall = recall_score(y_val, y_pred)
# f1 = f1_score(y_val, y_pred)
# roc_auc = roc_auc_score(y_val, y_proba[:, 1])

# end_time = time.time()
# execution_time = end_time - start_time

# print(f'Accuracy: {accuracy:.2f}')
# print(f'Precision: {precision:.2f}')
# print(f'Recall: {recall:.2f}')
# print(f'F1 Score: {f1:.2f}')
# print(f'ROC AUC: {roc_auc:.2f}')
# print('Classification Report:')
# print(classification_report(y_val, y_pred))
# print(f"\nВремя выполнения: {execution_time // 60:.2f} минут")

# # Метрики

# # Accuracy: 0.85
# # Precision: 0.88
# # Recall: 0.84
# # F1 Score: 0.86
# # ROC AUC: 0.91
# # Classification Report:
# #               precision    recall  f1-score   support

# #            0       0.83      0.87      0.85        94
# #            1       0.88      0.84      0.86       106

# #     accuracy                           0.85       200
# #    macro avg       0.85      0.86      0.85       200
# # weighted avg       0.86      0.85      0.86       200


# # Время выполнения: 0.00 минут

### Решим задачу классификации с помощью `HistGradientBoostingClassifier`, используя из данных Google Maps количество отзывов, информацию о наличии бассейна и признак того, является ли объект студией

In [None]:
# NOTE(security): joblib.load unpickles the files and can execute arbitrary
# code — only load model files from a trusted source.
boost_model = load("модель//rf_bosting_model_photo-reviews-ps.pkl")
boot_model = boost_model  # original (misspelled) name kept as an alias
vectorizer = load('model/vectorizer_photo-reviews-ps.joblib')

# Numeric feature columns appended after the text features, in this order.
features = [
    'group_pool',
    'group_studio',
    'review_count',
    'maxx',
    'floor_14_max',
    'floor_12_max',
    'wall_14_max',
    'wall_12_max',
    'wall_34_max',
    'floor_14_p',
    'floor_12_p',
    'wall_14_p',
    'wall_12_p',
    'wall_34_p',
    'floor_14_std',
    'floor_12_std',
    'wall_14_std',
    'wall_12_std',
    'wall_34_std',
    'floor_14_sum_norm',
    'floor_12_sum_norm',
    'wall_14_sum_norm',
    'wall_12_sum_norm',
           ]

# The text descriptions and depth statistics were added to `city_encoded`
# (the undefined name `cityl` and the frame `city` do not carry them), so all
# inputs are taken from that frame and the predictions are stored on it.
city_text = vectorizer.transform(city_encoded['text_descriptions'])
additional_features_city = csr_matrix(city_encoded[features].values)
combined_features_city = hstack([city_text, additional_features_city]).toarray()

y_pred = boost_model.predict(combined_features_city)
city_encoded['target_class'] = y_pred

In [None]:
# start_time = time.time()

# start = city_encoded[city_encoded['target_class'] == 0].sample(499, random_state=42)
# end = city_encoded[city_encoded['target_class'] == 1]

# df = pd.concat([start, end], axis=0).reset_index(drop=True)

# features = [
#     'group_pool',
#     'group_studio',
#     'review_count',
#     'maxx',
#     'floor_14_max',
#     'floor_12_max',
#     # 'floor_34_max',
#     'wall_14_max',
#     'wall_12_max',
#     'wall_34_max',
#     'floor_14_p',
#     'floor_12_p',
#     # 'floor_34_p',
#     'wall_14_p',
#     'wall_12_p',
#     'wall_34_p',
#     'floor_14_std',
#     'floor_12_std',
#     # 'floor_34_std',
#     'wall_14_std',
#     'wall_12_std',
#     'wall_34_std',
#     'floor_14_sum_norm',
#     'floor_12_sum_norm',
#     # 'floor_34_sum_norm',
#     'wall_14_sum_norm',
#     'wall_12_sum_norm',
#     # 'wall_34_sum_norm'
#            ]

# X = df[['text_descriptions',
#         'group_pool',
#         'group_studio',
#         'review_count', 
#         'maxx',
#         'floor_14_max',
#         'floor_12_max',
#         # 'floor_34_max',
#         'wall_14_max',
#         'wall_12_max',
#         'wall_34_max',
#         'floor_14_p',
#         'floor_12_p',
#         # 'floor_34_p',
#         'wall_14_p',
#         'wall_12_p',
#         'wall_34_p',
#         'floor_14_std',
#         'floor_12_std',
#         # 'floor_34_std',
#         'wall_14_std',
#         'wall_12_std',
#         'wall_34_std',
#         'floor_14_sum_norm',
#         'floor_12_sum_norm',
#         # 'floor_34_sum_norm',
#         'wall_14_sum_norm',
#         'wall_12_sum_norm',
#         # 'wall_34_sum_norm'
#        ]]
# y = df['target_class']

# X_train, X_val, y_train, y_val = train_test_split(X, y, test_size=0.2, random_state=42)

# stops_eng = set(stopwords.words('english'))

# vectorizer = TfidfVectorizer(token_pattern=r'\b[a-zA-Zа-яА-ЯёЁ]+\b', 
#                              stop_words=list(stops_eng), 
#                              ngram_range=(1, 3), max_df=0.6, min_df=10)

# X_train_text = vectorizer.fit_transform(X_train['text_descriptions'])
# X_val_text = vectorizer.transform(X_val['text_descriptions'])

# additional_features_train = csr_matrix(X_train[features].values)
# combined_features_train = hstack([X_train_text, additional_features_train]).toarray()

# additional_features_val = csr_matrix(X_val[features].values)
# combined_features_val = hstack([X_val_text, additional_features_val]).toarray()

# model = HistGradientBoostingClassifier(random_state=42, max_iter=1000,  learning_rate=0.08, l2_regularization=1, min_samples_leaf=12, warm_start=True)
# model.fit(combined_features_train, y_train)

# y_pred = model.predict(combined_features_val)

# y_proba = model.predict_proba(combined_features_val)

# accuracy = accuracy_score(y_val, y_pred)
# precision = precision_score(y_val, y_pred)
# recall = recall_score(y_val, y_pred)
# f1 = f1_score(y_val, y_pred)
# roc_auc = roc_auc_score(y_val, y_proba[:, 1])

# end_time = time.time()
# execution_time = end_time - start_time

# print(f'Accuracy: {accuracy:.2f}')
# print(f'Precision: {precision:.2f}')
# print(f'Recall: {recall:.2f}')
# print(f'F1 Score: {f1:.2f}')
# print(f'ROC AUC: {roc_auc:.2f}')
# print('Classification Report:')
# print(classification_report(y_val, y_pred))
# print(f"\nВремя выполнения: {execution_time // 60:.2f} минут")

# # Метрики

# # Accuracy: 0.85
# # Precision: 0.87
# # Recall: 0.84
# # F1 Score: 0.86
# # ROC AUC: 0.93
# # Classification Report:
# #               precision    recall  f1-score   support

# #            0       0.83      0.86      0.84        94
# #            1       0.87      0.84      0.86       106

# #     accuracy                           0.85       200
# #    macro avg       0.85      0.85      0.85       200
# # weighted avg       0.85      0.85      0.85       200


# # Время выполнения: 1.00 минут