# EDA, Feature engineering

In [1]:
import pandas as pd
df = pd.read_csv("../data/interim/full_collected_data.csv", parse_dates=[0])

In [2]:
df

Unnamed: 0,Date,15_year_mortgage_rate,30_year_mortgage_rate,auto_exports,average_house_prices,average_mortgage_size,balance_of_trade,bank_lending_rate,banks_balance_sheet,building_permits,...,unemployed_persons,unemployment_rate,used_car_prices_mom,used_car_prices_yoy,wage_growth,wages,wages_in_manufacturing,weekly_crude_oil_production,wholesale_inventories,youth_unemployment_rate
0,2001-01-01,6.6400,7.0325,67.0,208100.0,163.857143,-32.242,9.50,6192.220,1543.0,...,5634.0,3.9,0.2,1.8,7.02,14.29,14.50,5929.434783,0.0,9.2
1,2001-02-01,6.6400,7.0500,52.1,209000.0,162.756522,-32.258,9.05,6143.275,1699.0,...,6023.0,4.2,0.0,2.5,5.52,14.29,14.48,5884.750000,-0.1,9.6
2,2001-03-01,6.5080,6.9520,47.0,211000.0,160.110000,-35.202,8.50,6163.050,1656.0,...,6089.0,4.2,0.1,2.2,4.92,14.36,14.55,5874.227273,0.1,9.6
3,2001-04-01,6.5950,7.0775,44.2,210200.0,159.509091,-29.300,8.32,6224.075,1659.0,...,6141.0,4.3,-0.7,0.9,4.79,14.42,14.59,5867.380952,0.0,9.8
4,2001-05-01,6.6960,7.1640,52.2,205500.0,159.542857,-32.811,7.80,6207.180,1666.0,...,6271.0,4.4,-0.2,0.9,4.68,14.44,14.63,5814.217391,0.5,10.2
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
287,2024-12-01,5.9300,6.7150,72.9,478400.0,402.873000,-73.730,7.75,23721.200,1493.0,...,7121.0,4.2,1.3,0.2,5.00,30.58,28.22,13598.250000,-0.1,9.4
288,2025-01-01,6.1640,6.9580,55.4,509700.0,400.930000,-78.240,7.50,23693.400,1482.0,...,6886.0,4.1,-0.3,0.4,5.12,30.67,28.33,13447.800000,-0.4,9.0
289,2025-02-01,6.0300,6.8425,68.7,510000.0,403.416000,-98.060,7.50,23766.975,1473.0,...,6849.0,4.0,0.4,0.8,5.04,30.80,28.58,13500.250000,0.8,9.0
290,2025-03-01,5.8275,6.6500,64.4,487100.0,397.516000,-130.650,7.50,23975.150,1459.0,...,7052.0,4.1,-0.7,0.1,4.23,30.91,28.68,13575.500000,0.5,9.7


In [3]:
# 1. Получим минимальную и максимальную дату
start = df['Date'].min()
end = df['Date'].max()

# 2. Создаём список всех последних дней месяца в диапазоне
expected_month_ends = pd.date_range(start=start, end=end, freq='MS')

# 3. Получаем уникальные даты из таблицы
actual_month_ends = pd.Series(df['Date'].unique()).sort_values()

# 4. Сравним
missing = expected_month_ends.difference(actual_month_ends)

# 5. Выводим
if missing.empty:
    print("Все месяцы присутствуют.")
else:
    print("Пропущены месяцы:")
    print(missing)

Все месяцы присутствуют.


In [4]:
from statsmodels.tsa.stattools import adfuller
import matplotlib.pyplot as plt

# Преобразуем Date в индекс
df['Date'] = pd.to_datetime(df['Date'])
df = df.set_index('Date')

target = df['inflation_rate']

def check_stationarity(series, name):
    result_adf = adfuller(series.dropna(), autolag='AIC')

    return {
        'feature': name,
        'adf_pvalue': result_adf[1],
        'adf_statistic': result_adf[0]
    }

stationarity_results = check_stationarity(target, 'inflation_rate')
p_val = stationarity_results["adf_pvalue"]

if p_val < 0.05:
    print(f"p-значение: {p_val}, значит гипотеза о нестационарности отвергается на уровне значимости 0.05.")
else:
    print("Нет возомжноси отвергнуть нулевую гипотезу о нестационарности.")

p-значение: 0.03864665608011116, значит гипотеза о нестационарности отвергается на уровне значимости 0.05.


In [6]:
intervals = df.index.to_series().diff().value_counts()
print(intervals)

Date
31 days    170
30 days     96
28 days     19
29 days      6
Name: count, dtype: int64


In [7]:
import pandas as pd
from tsfresh import extract_features, select_features
from tsfresh.utilities.dataframe_functions import impute, roll_time_series

# 1. Загрузка и подготовка данных
df = pd.read_csv("../data/interim/full_collected_data.csv")
df['Date'] = pd.to_datetime(df['Date'])
df = df.set_index('Date')
df = df.sort_index()

# 2. Условия генерации окон
window_size = 12  # длина окна в месяцах
stride = 1        # шаг сдвига
id_column = "window_id"
time_column = "time"

# 3. Подготовка к roll_time_series
exog = df.drop(columns=['inflation_rate']).copy()
exog[time_column] = exog.index
exog[id_column] = 1  # временно одна серия

# 4. Генерация окон
rolled = roll_time_series(
    exog,
    column_id=id_column,
    column_sort=time_column,
    max_timeshift=window_size,
    min_timeshift=0,
    rolling_direction=1
)

# 5. Извлечение признаков из каждого окна
features = extract_features(
    rolled,
    column_id="id",
    column_sort="time",
    n_jobs=6
)
features = impute(features)


Rolling: 100%|██████████| 20/20 [00:01<00:00, 14.60it/s]
Feature Extraction: 100%|██████████| 30/30 [04:36<00:00,  9.23s/it]
 '15_year_mortgage_rate__partial_autocorrelation__lag_7'
 '15_year_mortgage_rate__partial_autocorrelation__lag_8' ...
 'window_id__fourier_entropy__bins_10'
 'window_id__fourier_entropy__bins_100'
 'window_id__query_similarity_count__query_None__threshold_0.0'] did not have any finite values. Filling with zeros.


In [10]:
features

Unnamed: 0,Unnamed: 1,15_year_mortgage_rate__variance_larger_than_standard_deviation,15_year_mortgage_rate__has_duplicate_max,15_year_mortgage_rate__has_duplicate_min,15_year_mortgage_rate__has_duplicate,15_year_mortgage_rate__sum_values,15_year_mortgage_rate__abs_energy,15_year_mortgage_rate__mean_abs_change,15_year_mortgage_rate__mean_change,15_year_mortgage_rate__mean_second_derivative_central,15_year_mortgage_rate__median,...,window_id__fourier_entropy__bins_5,window_id__fourier_entropy__bins_10,window_id__fourier_entropy__bins_100,window_id__permutation_entropy__dimension_3__tau_1,window_id__permutation_entropy__dimension_4__tau_1,window_id__permutation_entropy__dimension_5__tau_1,window_id__permutation_entropy__dimension_6__tau_1,window_id__permutation_entropy__dimension_7__tau_1,window_id__query_similarity_count__query_None__threshold_0.0,window_id__mean_n_absolute_max__number_of_maxima_7
1,2001-01-01,0.0,0.0,0.0,0.0,6.6400,44.089600,0.108958,-0.016333,-0.000307,6.6400,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0
1,2001-02-01,0.0,1.0,1.0,1.0,13.2800,88.179200,0.000000,0.000000,-0.000307,6.6400,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0
1,2001-03-01,0.0,1.0,0.0,1.0,19.7880,130.533264,0.066000,-0.066000,-0.066000,6.6400,...,0.0,0.0,0.0,-0.0,0.0,0.0,0.0,0.0,0.0,1.0
1,2001-04-01,0.0,1.0,0.0,1.0,26.3830,174.027289,0.073000,-0.015000,0.021750,6.6175,...,0.0,0.0,0.0,-0.0,-0.0,0.0,0.0,0.0,0.0,1.0
1,2001-05-01,0.0,0.0,0.0,1.0,33.0790,218.863705,0.080000,0.014000,0.016833,6.6400,...,0.0,0.0,0.0,-0.0,-0.0,-0.0,0.0,0.0,0.0,1.0
1,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1,2024-12-01,0.0,0.0,0.0,0.0,77.6640,465.153499,0.221792,-0.017292,0.007091,6.1060,...,0.0,0.0,0.0,-0.0,-0.0,-0.0,-0.0,-0.0,0.0,1.0
1,2025-01-01,0.0,0.0,0.0,0.0,77.6905,465.479489,0.220167,0.023333,-0.001114,6.1060,...,0.0,0.0,0.0,-0.0,-0.0,-0.0,-0.0,-0.0,0.0,1.0
1,2025-02-01,0.0,0.0,0.0,0.0,77.8365,467.218933,0.209792,-0.009375,-0.007568,6.1060,...,0.0,0.0,0.0,-0.0,-0.0,-0.0,-0.0,-0.0,0.0,1.0
1,2025-03-01,0.0,0.0,0.0,0.0,77.5215,463.448383,0.223958,-0.028958,-0.013182,6.0300,...,0.0,0.0,0.0,-0.0,-0.0,-0.0,-0.0,-0.0,0.0,1.0


In [11]:
features.index.names = ['feature', 'dt']

In [18]:
features.reset_index(inplace=True)
del features["feature"]

In [26]:
features.head()

Unnamed: 0,dt,15_year_mortgage_rate__variance_larger_than_standard_deviation,15_year_mortgage_rate__has_duplicate_max,15_year_mortgage_rate__has_duplicate_min,15_year_mortgage_rate__has_duplicate,15_year_mortgage_rate__sum_values,15_year_mortgage_rate__abs_energy,15_year_mortgage_rate__mean_abs_change,15_year_mortgage_rate__mean_change,15_year_mortgage_rate__mean_second_derivative_central,...,window_id__fourier_entropy__bins_5,window_id__fourier_entropy__bins_10,window_id__fourier_entropy__bins_100,window_id__permutation_entropy__dimension_3__tau_1,window_id__permutation_entropy__dimension_4__tau_1,window_id__permutation_entropy__dimension_5__tau_1,window_id__permutation_entropy__dimension_6__tau_1,window_id__permutation_entropy__dimension_7__tau_1,window_id__query_similarity_count__query_None__threshold_0.0,window_id__mean_n_absolute_max__number_of_maxima_7
0,2001-01-01,0.0,0.0,0.0,0.0,6.64,44.0896,0.108958,-0.016333,-0.000307,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0
1,2001-02-01,0.0,1.0,1.0,1.0,13.28,88.1792,0.0,0.0,-0.000307,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0
2,2001-03-01,0.0,1.0,0.0,1.0,19.788,130.533264,0.066,-0.066,-0.066,...,0.0,0.0,0.0,-0.0,0.0,0.0,0.0,0.0,0.0,1.0
3,2001-04-01,0.0,1.0,0.0,1.0,26.383,174.027289,0.073,-0.015,0.02175,...,0.0,0.0,0.0,-0.0,-0.0,0.0,0.0,0.0,0.0,1.0
4,2001-05-01,0.0,0.0,0.0,1.0,33.079,218.863705,0.08,0.014,0.016833,...,0.0,0.0,0.0,-0.0,-0.0,-0.0,0.0,0.0,0.0,1.0


In [29]:
from sklearn.feature_selection import VarianceThreshold
# Подготовка: удалим целевую переменную и приведём к числовым признакам
features_numeric = features.select_dtypes(include='number')

# Применим фильтр по дисперсии
var_filter = VarianceThreshold(threshold=1e-6)
features_filtered_array = var_filter.fit_transform(features_numeric)

# Получим имена признаков, прошедших фильтр
selected_columns = features_numeric.columns[var_filter.get_support()]
features_filtered = pd.DataFrame(features_filtered_array, columns=selected_columns, index=features_numeric.index)

# Выведем количество удалённых признаков
features_filtered.shape, features_numeric.shape

((292, 57142), (292, 134676))

In [32]:
time_range = features["dt"].copy(deep=True)

In [34]:
features_filtered["dt"] = time_range
features_filtered.set_index("dt", inplace=True)

In [37]:
# 6. Подготовка целевой переменной (инфляция через 12 месяцев)
# Возьмём таргет на момент времени t+12 для каждого окна
inflation = df["inflation_rate"].reset_index(drop=True)
y = inflation.shift(-12)
y = y.dropna()
y = y.iloc[window_size:]  # выравниваем с features по окнам
y.index = features_filtered.index[:len(y)]  # синхронизируем индексы

# 7. Отбор признаков
selected_features = select_features(features_filtered.iloc[:len(y)], y)

# 8. Результаты
print("Отобрано признаков:", selected_features.shape[1])
print("Список отобранных признаков:")
print(selected_features.columns.tolist())

Отобрано признаков: 11381
Список отобранных признаков:
['cfnai_personal_consumption_and_housing_index__linear_trend__attr_"stderr"', 'retail_sales_mom__mean_abs_change', 'cfnai_personal_consumption_and_housing_index__change_quantiles__f_agg_"var"__isabs_True__qh_1.0__ql_0.0', 'retail_sales_mom__change_quantiles__f_agg_"mean"__isabs_True__qh_1.0__ql_0.0', 'total_vehicle_sales__mean_abs_change', 'total_vehicle_sales__change_quantiles__f_agg_"var"__isabs_False__qh_1.0__ql_0.0', 'cfnai_personal_consumption_and_housing_index__change_quantiles__f_agg_"var"__isabs_False__qh_1.0__ql_0.0', 'total_vehicle_sales__change_quantiles__f_agg_"mean"__isabs_True__qh_0.8__ql_0.0', 'cfnai_personal_consumption_and_housing_index__cid_ce__normalize_False', 'total_vehicle_sales__change_quantiles__f_agg_"mean"__isabs_True__qh_1.0__ql_0.0', 'total_vehicle_sales__cid_ce__normalize_False', 'core_inflation_rate_mom__lempel_ziv_complexity__bins_100', 'retail_sales_mom__change_quantiles__f_agg_"var"__isabs_False__qh

In [39]:
selected_features.to_csv("../data/interim/data_after_tsfresh.csv")