# импорты и загрузки

In [2]:
import pandas as pd
import numpy as np

In [318]:
xlsx = pd.ExcelFile("data.xlsx")
xlsx.sheet_names

['test data', 'train', 'TV Viewing']

In [326]:
df = xlsx.parse('train')

# preprocessing

In [231]:
df['Break flight start'] = pd.to_datetime(df['Break flight start'], format='%H:%M:%S')
df['Break flight end'] = pd.to_datetime(df['Break flight end'], format='%H:%M:%S')
df['Programme flight start'] = pd.to_datetime(df['Programme flight start'], format='%H:%M:%S')
df['Programme flight end'] = pd.to_datetime(df['Programme flight end'], format='%H:%M:%S')

In [232]:
df['day_of_year'] = df['Date'].dt.dayofyear
df['день недели'] = df['Date'].dt.day_name()
df['тип дня'] = df['Date'].dt.dayofweek.apply(lambda x: 'weekday' if x < 5 else 'weekend')

In [233]:
df['длительность рекламы'] = (pd.to_datetime(df['Break flight end']) - pd.to_datetime(df['Break flight start'])).dt.total_seconds() / 60
# Исправляем длительность рекламы для случаев, когда конец рекламы на следующий день

for i in range(len(df)):
    if pd.to_datetime(df['Break flight end'][i]) < pd.to_datetime(df['Break flight start'][i]):
        df.loc[i, 'длительность рекламы'] = ((pd.to_datetime(df['Break flight end'][i]) + pd.Timedelta(days=1)) - pd.to_datetime(df['Break flight start'][i])).total_seconds() / 60


In [234]:
df['длительность программы'] = (pd.to_datetime(df['Programme flight end']) - pd.to_datetime(df['Programme flight start'])).dt.total_seconds() / 60

# Исправляем длительность рекламы для случаев, когда конец рекламы на следующий день
for i in range(len(df)):
    if pd.to_datetime(df['Programme flight end'][i]) < pd.to_datetime(df['Programme flight start'][i]):
        df.loc[i, 'длительность программы'] = ((pd.to_datetime(df['Programme flight end'][i]) + pd.Timedelta(days=1)) - pd.to_datetime(df['Programme flight start'][i])).total_seconds() / 60

In [235]:
df['начало рекламы'] = (pd.to_datetime(df['Break flight end']) - pd.to_datetime(df['Programme flight start'])).dt.total_seconds() / 60

# Исправляем длительность рекламы для случаев, когда конец рекламы на следующий день
for i in range(len(df)):
    if pd.to_datetime(df['Break flight end'][i]) < pd.to_datetime(df['Programme flight start'][i]):
        df.loc[i, 'начало рекламы'] = ((pd.to_datetime(df['Break flight end'][i]) + pd.Timedelta(days=1)) - pd.to_datetime(df['Programme flight start'][i])).total_seconds() / 60

In [236]:
df.loc[(df['Programme flight start'] < df['Programme flight end']) &
       (df['Break flight start'] < df['Programme flight start']), 'начало рекламы'] = -1
df.loc[(df['Programme flight start'] < df['Programme flight end']) &
       (df['Break flight start'] < df['Programme flight start']), 'начало рекламы'] = -1

In [237]:
df['длительность рекламы'] = df['длительность рекламы'].round(1)
df['длительность программы'] = df['длительность программы'].round(1)
df['начало рекламы'] = df['начало рекламы'].round(1)

In [238]:
df['между программами'] = np.zeros(len(df))

df.loc[(df['Programme flight start'] < df['Programme flight end']) &
       (df['Break flight start'] < df['Programme flight start']), 'между программами'] = 1

df.loc[(df['Programme flight start'] < df['Programme flight end']) &
       (df['Break flight start'] < df['Programme flight start']), 'между программами'] = 1

In [239]:
bins = np.arange(7)

df['начало прогрммы бин'] = pd.cut(pd.to_datetime(df['Programme flight start'], format='%H:%M:%S'), 
                                   bins=[pd.to_datetime('00:00', format='%H:%M'), 
                                         pd.to_datetime('07:00', format='%H:%M'), 
                                         pd.to_datetime('10:00', format='%H:%M'),
                                         pd.to_datetime('12:00', format='%H:%M'),
                                         pd.to_datetime('15:00', format='%H:%M'),
                                         pd.to_datetime('18:00', format='%H:%M'),
                                         pd.to_datetime('22:00', format='%H:%M'),
                                         pd.to_datetime('23:59', format='%H:%M')],
                                   labels=bins)

In [240]:
def get_season(month):
    if month in [3, 4, 5]:
        return 'spring'
    elif month in [6, 7, 8]:
        return 'summer'
    elif month in [9, 10, 11]:
        return 'autumn'
    else:
        return 'winter'

df['season'] = df['Date'].dt.month.apply(get_season)
df['month'] = df['Date'].dt.strftime('%B')

In [241]:
Label_encoder_cols = ['Break content', 'Break distribution', 'Programme category',
                      'Programme genre', 'Programme', 'день недели', 'тип дня',
                      'month', 'season']
one_hot_cols = ['Programme']
del_cols = ['Date', 'Break flight ID', 'Break flight start', 'Break flight end',
            'Programme flight start', 'Programme flight end']
target_cols = ['TVR Index']

In [242]:
df['emb'] = df['Break content']
emb_cols = ['Break distribution', 'Programme', 'день недели', 'тип дня', 'season', 'month']
for col in emb_cols:
    df['emb'] = df['emb'] + ', ' + df[col]

In [243]:
from sklearn.preprocessing import LabelEncoder

for col in Label_encoder_cols:
    df[col] = LabelEncoder().fit_transform(df[col])

In [355]:
df['TVR Index'] = df['TVR Index'].round(3)
df = df[df['TVR Index'] < 2]
df = df[df['TVR Index'] > 0.1]

In [245]:
df = df.drop(columns = del_cols)

In [356]:
df.head(5)

Unnamed: 0,TVR Index,Date,Break flight ID,Break flight start,Break flight end,Break content,Break distribution,Programme,Programme flight start,Programme flight end,Programme category,Programme genre
0,0.615,2023-01-02,4870830561,08:17:33,08:21:40,Commercial,Network,"Telekanal ""Dobroe utro""",08:00:13,10:00:14,Morning airplay,Entertainment programs
1,0.87,2023-01-02,4870830614,08:34:45,08:38:52,Commercial,Network,"Telekanal ""Dobroe utro""",08:00:13,10:00:14,Morning airplay,Entertainment programs
2,0.99,2023-01-02,4870830629,08:52:19,08:56:23,Commercial,Network,"Telekanal ""Dobroe utro""",08:00:13,10:00:14,Morning airplay,Entertainment programs
3,0.885,2023-01-02,4870830684,08:56:31,08:57:28,Announcement,Network,"Telekanal ""Dobroe utro""",08:00:13,10:00:14,Morning airplay,Entertainment programs
4,0.84,2023-01-02,4870830685,09:12:04,09:16:13,Commercial,Network,"Telekanal ""Dobroe utro""",08:00:13,10:00:14,Morning airplay,Entertainment programs


In [247]:
df = df.fillna(0)

## сохранение/загрузка готового датасета

In [225]:
df.sort_index().to_csv("new_data_5.csv")

In [321]:
df = pd.read_csv('new_data_5.csv')
df = df.drop(columns='Unnamed: 0')

# модели

In [229]:
df

Unnamed: 0,TVR Index,Break content,Break distribution,Programme,Programme category,Programme genre,day_of_year,день недели,тип дня,длительность рекламы,длительность программы,начало рекламы,между программами,начало прогрммы бин,month,season
0,0.615,1,0,17,7,1,2,1,0,4.1,120.0,21.4,0.0,1.0,January,зима
1,0.870,1,0,17,7,1,2,1,0,4.1,120.0,38.6,0.0,1.0,January,зима
2,0.990,1,0,17,7,1,2,1,0,4.1,120.0,56.2,0.0,1.0,January,зима
3,0.885,0,0,17,7,1,2,1,0,1.0,120.0,57.2,0.0,1.0,January,зима
4,0.840,1,0,17,7,1,2,1,0,4.2,120.0,76.0,0.0,1.0,January,зима
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
27805,0.150,1,0,11,2,1,304,5,0,0.5,52.0,17.7,0.0,0.0,October,осень
27806,0.165,0,0,11,2,1,304,5,0,0.9,52.0,32.6,0.0,0.0,October,осень
27807,0.240,1,0,11,2,1,304,5,0,4.1,52.0,36.7,0.0,0.0,October,осень
27808,0.210,1,0,11,2,1,304,5,0,4.1,52.0,52.0,0.0,0.0,October,осень


In [248]:
df = df.drop(columns='emb')

In [258]:
from sklearn.model_selection import train_test_split

X = df.drop(columns=target_cols)
Y = df[target_cols]

X_train, X_test, y_train, y_test = train_test_split(X, Y, test_size=0.2, random_state=42)

In [250]:
from xgboost import XGBRegressor
model = XGBRegressor(n_estimators = 500, tree_method='gpu_hist')
model.fit(X_train.values, y_train.values)

In [251]:
from sklearn.metrics import mean_absolute_percentage_error
predictions = model.predict(X_test.values)
mean_absolute_percentage_error(y_test, predictions)

0.17116879735538032

скор:

с эмбендингами: 0.1745903989630204

без ембендингов: 0.17121545961409496

In [None]:
!pip install pytorch_tabnet

In [256]:
from pytorch_tabnet.tab_model import TabNetRegressor
import torch
import numpy as np

# преобразуем данные в массивы NumPy
X_train_np = X_train.values
y_train_np = y_train.values
X_test_np = X_test.values
y_test_np = y_test.values

# define the model
tabnet_model = TabNetRegressor(optimizer_fn=torch.optim.Adam,
                      scheduler_params={"step_size":10, 
                                        "gamma":0.9},
                      scheduler_fn=torch.optim.lr_scheduler.StepLR,
                     )

# fit the model 
tabnet_model.fit(
    X_train_np, y_train_np,
    eval_set=[(X_train_np, y_train_np), (X_test_np, y_test_np)],
    eval_name=['train', 'test'],
    eval_metric=['rmse'],
    max_epochs=200, patience=60,
    batch_size=512, virtual_batch_size=512,
    num_workers=0,
    drop_last=False
)




epoch 0  | loss: 0.87819 | train_rmse: 0.4925  | test_rmse: 0.4878  |  0:00:02s
epoch 1  | loss: 0.21008 | train_rmse: 0.49815 | test_rmse: 0.49802 |  0:00:04s
epoch 2  | loss: 0.12862 | train_rmse: 0.36955 | test_rmse: 0.37309 |  0:00:07s
epoch 3  | loss: 0.09432 | train_rmse: 0.34377 | test_rmse: 0.34475 |  0:00:09s
epoch 4  | loss: 0.08986 | train_rmse: 0.32414 | test_rmse: 0.32375 |  0:00:11s
epoch 5  | loss: 0.08303 | train_rmse: 0.29552 | test_rmse: 0.28905 |  0:00:14s
epoch 6  | loss: 0.07786 | train_rmse: 0.2991  | test_rmse: 0.29431 |  0:00:16s
epoch 7  | loss: 0.0756  | train_rmse: 0.3046  | test_rmse: 0.30129 |  0:00:18s
epoch 8  | loss: 0.07346 | train_rmse: 0.29993 | test_rmse: 0.2958  |  0:00:21s
epoch 9  | loss: 0.07276 | train_rmse: 0.28243 | test_rmse: 0.27853 |  0:00:23s
epoch 10 | loss: 0.07525 | train_rmse: 0.3201  | test_rmse: 0.31916 |  0:00:25s
epoch 11 | loss: 0.06867 | train_rmse: 0.30216 | test_rmse: 0.3002  |  0:00:27s
epoch 12 | loss: 0.06975 | train_rmse: 0



In [257]:
from sklearn.metrics import mean_absolute_percentage_error
predictions = tabnet_model.predict(X_test_np)
mean_absolute_percentage_error(y_test, predictions)

0.22325552365527224

In [None]:
# from catboost import CatBoostRegressor
# import lightgbm as lgb
# from xgboost import XGBRegressor
# from pytorch_tabnet.tab_model import TabNetRegressor
# import numpy as np

# # Создание и обучение отдельных моделей
# cat_model = CatBoostRegressor(iterations=500, verbose=100)
# lgb_model = lgb.LGBMRegressor(n_estimators=500)
# xgb_model = XGBRegressor(n_estimators=500, tree_method='gpu_hist')

# cat_model.fit(X_train_np, y_train_np)
# lgb_model.fit(X_train_np, y_train_np)
# xgb_model.fit(X_train_np, y_train_np)

# Получение прогнозов от каждой модели
cat_preds = cat_model.predict(X_test_np)
lgb_preds = lgb_model.predict(X_test_np)
xgb_preds = xgb_model.predict(X_test_np)
tabnet_preds = tabnet_model.predict(X_test_np)

In [292]:
ensemble_preds = lgb_preds + tabnet_preds

In [312]:
ensemble_preds = np.zeros(len(lgb_preds))
for i in range(len(lgb_preds)):
    ensemble_preds[i] = (lgb_preds[i] + cat_preds[i] + xgb_preds[i] + tabnet_preds[i]) / 4


In [313]:
ensemble_preds

array([1.21324074, 0.72248435, 0.95899642, ..., 1.15378594, 1.00915158,
       1.52091467])

In [317]:
from sklearn.metrics import mean_absolute_percentage_error
# predictions = voit_model.predict(X_test.values)
mean_absolute_percentage_error(y_test_np, tabnet_preds)

0.22325552365527224

# embendingi

In [None]:
import nltk
from nltk.tokenize import word_tokenize

sentence = '. '.join(df['emb'])

# Токенизация слов
tokens = word_tokenize(sentence, language='english')

# Вывод токенов
print(tokens)

In [130]:
from gensim.models import Word2Vec
# Обучение модели Word2Vec
model = Word2Vec(tokens, min_count=1)

In [131]:
text_data = df['emb']
sentence_embeddings = []
for sentence in text_data:
    words = sentence.split()
    sentence_vectors = []
    for word in words:
        if word in model.wv:
            sentence_vectors.append(model.wv[word])
        else:
            sentence_vectors.append(np.array([0] * 100))
    if sentence_vectors:
        sentence_embeddings.append(sum(sentence_vectors) / len(sentence_vectors))

In [253]:
data = pd.DataFrame(sentence_embeddings, columns = [ f"emb_cols_{i}" for i in range(100)])
data['index'] = np.arange(len(data))
df['index'] = np.arange(len(df))
# df = df.drop(columns='emb')

In [254]:
from sklearn.model_selection import train_test_split

X = df.drop(columns=target_cols)
Y = df[target_cols]

X = X.merge(data, on='index')
X = X.drop(columns = 'index')

X_train, X_test, y_train, y_test = train_test_split(X, Y, random_state=42)

# нейронка для регрессии 

In [157]:
from sklearn.preprocessing import StandardScaler
from tensorflow.keras.models import Model
from tensorflow.keras.layers import Input, Dense, BatchNormalization, Activation, Add, GlobalAveragePooling1D, Reshape

In [159]:
import tensorflow as tf
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Dense, Dropout

In [161]:
import torch
import torch.nn as nn
import torch.optim as optim

In [162]:
# Нормализация данных
scaler = StandardScaler()
X_train = scaler.fit_transform(X_train)
X_test = scaler.transform(X_test)

# Преобразование данных в тензоры PyTorch
X_train, X_test = torch.tensor(X_train).float(), torch.tensor(X_test).float()
y_train, y_test = torch.tensor(y_train.values).float(), torch.tensor(y_test.values).float()

In [None]:
# scaler = StandardScaler()
# X_train = scaler.fit_transform(X_train)
# X_val = scaler.transform(X_val)

In [163]:
import tensorflow as tf
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Dense, Dropout

# Загрузка данных в формате TensorFlow Dataset
train_data = tf.data.Dataset.from_tensor_slices((X_train, y_train))
train_data = train_data.shuffle(len(X_train)).batch(100)

val_data = tf.data.Dataset.from_tensor_slices((X_test, y_test))
val_data = val_data.batch(100)


In [164]:
import tensorflow as tf
from tensorflow.keras import layers, models

model = models.Sequential([
    layers.Dense(128, activation='relu', input_shape=(115,)),  # Полносвязный слой с 128 нейронами и функцией активации ReLU
    layers.Dropout(0.5),  # Слой регуляризации для предотвращения переобучения
    layers.Dense(64, activation='relu'),  # Еще один полносвязный слой с 64 нейронами и функцией активации ReLU
    layers.BatchNormalization(),  # Слой нормализации для стабилизации обучения
    layers.Dense(32, activation='relu'),  # Еще один полносвязный слой с 32 нейронами и функцией активации ReLU
    layers.Dense(1)  # Выходной слой с одним нейроном (для регрессии) без функции активации
])

In [173]:
model.compile(optimizer='adam', loss='mean_squared_error')  # Используем оптимизатор Adam и функцию потерь среднеквадратичной ошибки
model.fit(train_data, epochs=10, batch_size=100)  # Обучаем модель на данных в течение 10 эпох с размером пакета 32

Epoch 1/10
Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
Epoch 7/10
Epoch 8/10
Epoch 9/10
Epoch 10/10


<keras.src.callbacks.History at 0x1da1c0bb5b0>

In [174]:
from sklearn.metrics import mean_absolute_percentage_error
predictions = model.predict(val_data)
mean_absolute_percentage_error(y_val, predictions)



0.7084517493835688