# EDA, Feature engineering

In [1]:
import pandas as pd

from statsmodels.tsa.stattools import adfuller
import matplotlib.pyplot as plt

from tsfresh import extract_features, select_features
from tsfresh.utilities.dataframe_functions import impute, roll_time_series
from tsfresh.feature_extraction import EfficientFCParameters

import warnings

# Отключить все предупреждения tsfresh
warnings.filterwarnings("ignore", module="tsfresh")

In [2]:
df = pd.read_csv("../data/interim/full_collected_data.csv", parse_dates=[0])
df.rename(columns={"Date":"time"}, inplace=True)
df.tail()

Unnamed: 0,time,15_year_mortgage_rate,30_year_mortgage_rate,auto_exports,average_house_prices,average_mortgage_size,balance_of_trade,bank_lending_rate,banks_balance_sheet,building_permits,...,unemployed_persons,unemployment_rate,used_car_prices_mom,used_car_prices_yoy,wage_growth,wages,wages_in_manufacturing,weekly_crude_oil_production,wholesale_inventories,youth_unemployment_rate
287,2024-12-01,5.93,6.715,72.9,478400.0,402.873,-73.73,7.75,23721.2,1493.0,...,7121.0,4.2,1.3,0.2,5.0,30.58,28.22,13598.25,-0.1,9.4
288,2025-01-01,6.164,6.958,55.4,509700.0,400.93,-78.24,7.5,23693.4,1482.0,...,6886.0,4.1,-0.3,0.4,5.12,30.67,28.33,13447.8,-0.4,9.0
289,2025-02-01,6.03,6.8425,68.7,510000.0,403.416,-98.06,7.5,23766.975,1473.0,...,6849.0,4.0,0.4,0.8,5.04,30.8,28.58,13500.25,0.8,9.0
290,2025-03-01,5.8275,6.65,64.4,487100.0,397.516,-130.65,7.5,23975.15,1459.0,...,7052.0,4.1,-0.7,0.1,4.23,30.91,28.68,13575.5,0.5,9.7
291,2025-04-01,5.9025,6.725,101.1,497700.0,381.921,-122.66,7.5,24156.2,1467.0,...,7083.0,4.2,-0.7,-0.2,3.54,30.96,28.92,13460.0,0.5,9.4


**Проверка наличия всех месяцев в датах**

In [3]:
start = df['time'].min()
end = df['time'].max()

expected_month_ends = pd.date_range(start=start, end=end, freq='MS')
actual_month_ends = pd.Series(df['time'].unique()).sort_values()
missing = expected_month_ends.difference(actual_month_ends)

if missing.empty:
    print("Все месяцы присутствуют.")
else:
    print("Пропущены месяцы:")
    print(missing)

Все месяцы присутствуют.


**Проверка таргета на стационарность**

In [4]:
target = df['inflation_rate']

def check_stationarity(series, name):
    result_adf = adfuller(series.dropna(), autolag='AIC')

    return {
        'feature': name,
        'adf_pvalue': result_adf[1],
        'adf_statistic': result_adf[0]
    }

stationarity_results = check_stationarity(target, 'inflation_rate')
p_val = stationarity_results["adf_pvalue"]

if p_val < 0.05:
    print(f"p-значение: {p_val}, значит гипотеза о нестационарности отвергается на уровне значимости 0.05.")
else:
    print("Нет возомжноси отвергнуть нулевую гипотезу о нестационарности.")

p-значение: 0.03864665608011116, значит гипотеза о нестационарности отвергается на уровне значимости 0.05.


**Работа с Tsfresh для генерациии новых признаков в виде новых рядов**

In [None]:
def tsfresh_feature_engineering(dataset: pd.DataFrame) -> pd.DataFrame:

    wide_dataset = dataset.drop(columns=['inflation_rate']).copy()
    wide_dataset.set_index("time", inplace=True)
    wide_dataset.sort_index(inplace=True)

    # Определяем начальную и конечную дату для генерации признаков
    start_date_obj = wide_dataset.index.min()
    start_date_str = start_date_obj.strftime("%Y-%m-%d")
    last_date_obj = wide_dataset.index.max()
    last_date_str = last_date_obj.strftime('%Y-%m-%d')

    extracted_features_dataset = pd.DataFrame(index=pd.date_range(start=start_date_str,
                                                                  end=last_date_str,
                                                                  freq="MS"))
    extracted_features_dataset.index.name = "time"
    
    long_dataset = wide_dataset.reset_index().melt(id_vars=['time'], var_name='id', value_name='value')

    for size in [2,3,6]:
        # 1. Условия генерации окон
        window_size = size
        id_column = "id"
        time_column = "time"

        print(f"Start with window_size {size}!")
        # 2. Генерация окон
        rolled = roll_time_series(
            long_dataset,
            column_id=id_column,
            column_sort=time_column,
            max_timeshift=window_size-1,
            min_timeshift=0,
            rolling_direction=1
        )

        # 3. Извлечение признаков из каждого окна
        new_features = extract_features(
            rolled,
            column_id=id_column,
            column_sort=time_column,
            n_jobs=6,
            default_fc_parameters=EfficientFCParameters()
        )
        # 4. Заполнение пропусков
        new_extracted_features_for_size = impute(new_features)

        
        new_extracted_features_for_size.index.names = ['feature', 'time']

        unstacked_frame = new_extracted_features_for_size.unstack(level=0)
        unstacked_frame.columns = ['__'.join(col).strip() for col in unstacked_frame.columns.values]
        unstacked_frame = unstacked_frame.reset_index().set_index('time')
        suf = f'__{size}M'
        unstacked_frame = unstacked_frame.add_suffix(suf)
            
        extracted_features_dataset = extracted_features_dataset.join(unstacked_frame,
                                                                     on='time',
                                                                     how='left')
        del rolled
        del new_features
        del new_extracted_features_for_size
        del unstacked_frame

        print(f"Finish with window_size {size}!")

    return extracted_features_dataset

In [6]:
%%time
extracted_features_dataset = tsfresh_feature_engineering(df)

Start with window_size 2!


Rolling: 100%|██████████| 20/20 [00:03<00:00,  5.94it/s]
Feature Extraction: 100%|██████████| 30/30 [03:33<00:00,  7.11s/it]


Finish with window_size 2!
Start with window_size 3!


Rolling: 100%|██████████| 20/20 [00:03<00:00,  5.96it/s]
Feature Extraction: 100%|██████████| 30/30 [03:57<00:00,  7.91s/it]


Finish with window_size 3!
Start with window_size 6!


Rolling: 100%|██████████| 20/20 [00:03<00:00,  5.93it/s]
Feature Extraction: 100%|██████████| 30/30 [04:08<00:00,  8.27s/it]


Finish with window_size 6!
CPU times: user 5min 42s, sys: 1min 38s, total: 7min 20s
Wall time: 16min 48s


In [7]:
min_var = df.drop(columns=["time"]).describe().loc["std"].min()
print(f"Минимальная дисперсия в исходных данных: {min_var}")
del min_var

Минимальная дисперсия в исходных данных: 0.1365060545643701


In [8]:
full_features_dataset = pd.merge(df, extracted_features_dataset, on="time")
full_features_dataset.shape[1] == df.shape[1] + extracted_features_dataset.shape[1]
del extracted_features_dataset
del full_features_dataset["inflation_rate"]

In [9]:
from sklearn.feature_selection import VarianceThreshold
features_numeric = full_features_dataset.select_dtypes(include='number')
# Применим фильтр по дисперсии
var_filter = VarianceThreshold(threshold=0.1)
features_filtered_array = var_filter.fit_transform(features_numeric)
selected_columns = features_numeric.columns[var_filter.get_support()]
features_filtered = pd.DataFrame(features_filtered_array, columns=selected_columns, index=features_numeric.index)
# Выведем количество удалённых признаков
print(features_filtered.shape, features_numeric.shape, full_features_dataset.shape)
del features_numeric

(292, 48243) (292, 398772) (292, 398773)


In [10]:
forecast_horizon = 6
df.set_index("time", inplace=True)

In [11]:
features_filtered["time"] = full_features_dataset["time"]
features_filtered.set_index("time", inplace=True)

In [None]:
inflation = df["inflation_rate"]
y = inflation.shift(-forecast_horizon)
y = y.dropna()
y = y[forecast_horizon - 1:]

selected_features = select_features(features_filtered.iloc[forecast_horizon - 1:-forecast_horizon], y)

print("Отобрано признаков:", selected_features.shape[1])
#print("Список отобранных признаков:")
#print(selected_features.columns.tolist())

Отобрано признаков: 14317


In [None]:
inflation = df["inflation_rate"]
y = inflation.shift(-forecast_horizon)
y = y.dropna()
y = y[forecast_horizon - 1:]

from sklearn.feature_selection import mutual_info_regression

# Расчёт взаимной информации
mi = mutual_info_regression(selected_features.reset_index(drop=True), y, random_state=42)
mi_scores = pd.Series(mi, index=selected_features.reset_index(drop=True).columns).sort_values(ascending=False)
# Берем топ-300 признаков по взаимной информации
top_features = mi_scores.head(300).index.tolist()

# Выводим имена и значения MI
# print("Top 300 признаков по взаимной информации:")
# print(mi_scores.head(300))

In [14]:
df["inflation_rate"]

time
2001-01-01    3.4
2001-02-01    3.7
2001-03-01    3.5
2001-04-01    2.9
2001-05-01    3.3
             ... 
2024-12-01    2.7
2025-01-01    2.9
2025-02-01    3.0
2025-03-01    2.8
2025-04-01    2.4
Name: inflation_rate, Length: 292, dtype: float64

In [17]:
X_selected = full_features_dataset[["time"] + top_features].copy()
X_selected = pd.merge(X_selected, df["inflation_rate"], on="time", how="left")

In [19]:
X_selected.columns = [col.replace('.', '_').replace(', ', '_').replace(')', '').replace('(', '') for col in X_selected.columns]
X_selected.to_csv("../data/interim/data_after_selection.csv")