In [75]:
import pandas as pd 

### Feature engineering

Features:
1) Number of alarms in the last day, per region.
2) Number of alarms in the last 12 hours, per region.
3) Number of alarms in the last 6 hours, per region.
4) Number of alarms in the last 3 hours, per region.
5) Number of alarms in the last hour, per region.

In [76]:
# Forward slashes resolve on Windows, Linux and macOS alike; the previous
# backslash-separated path could only be resolved on Windows.
data = pd.read_csv("../clean_data/final_data.csv")

In [77]:
# Remove the "Unnamed: 0" column (presumably a saved row index - TODO confirm).
# errors="ignore" keeps this cell re-runnable: without it a second run raises
# KeyError because the column has already been dropped.
data = data.drop(columns="Unnamed: 0", errors="ignore")

In [78]:
# Preview the first five rows of the loaded frame.
data.head(n=5)

Unnamed: 0,region_city_x,hour_datetimeEpoch,day_datetime,day_tempmax,day_tempmin,day_temp,day_dew,day_humidity,day_precip,day_precipcover,...,327,328,329,330,331,332,333,334,335,date
0,Луцьк,1645653600,2022-02-24,4.9,0.7,2.6,0.0,83.7,0.118,4.17,...,0.00075,0.000516,0.007649,0.003805,0.00783,0.009322,-0.011622,-0.011459,4.569509e-15,2022-02-24
1,Луцьк,1645657200,2022-02-24,4.9,0.7,2.6,0.0,83.7,0.118,4.17,...,0.00075,0.000516,0.007649,0.003805,0.00783,0.009322,-0.011622,-0.011459,4.569509e-15,2022-02-24
2,Луцьк,1645660800,2022-02-24,4.9,0.7,2.6,0.0,83.7,0.118,4.17,...,0.00075,0.000516,0.007649,0.003805,0.00783,0.009322,-0.011622,-0.011459,4.569509e-15,2022-02-24
3,Луцьк,1645664400,2022-02-24,4.9,0.7,2.6,0.0,83.7,0.118,4.17,...,0.00075,0.000516,0.007649,0.003805,0.00783,0.009322,-0.011622,-0.011459,4.569509e-15,2022-02-24
4,Луцьк,1645668000,2022-02-24,4.9,0.7,2.6,0.0,83.7,0.118,4.17,...,0.00075,0.000516,0.007649,0.003805,0.00783,0.009322,-0.011622,-0.011459,4.569509e-15,2022-02-24


In [79]:
# Decode the epoch-seconds column into a proper datetime column.
data = data.assign(
    hour_datetime=pd.to_datetime(data["hour_datetimeEpoch"], unit="s")
)

In [80]:
# Keep only the columns the alarm-count features need and index them by the
# epoch column, then turn that epoch-seconds index into datetimes so the frame
# can be resampled by time.
df_features = data.loc[
    :, ["region_city_x", "hour_datetimeEpoch", "is_alarm"]
].set_index("hour_datetimeEpoch")
df_features.index = pd.to_datetime(df_features.index, unit="s")

In [81]:
# Show the first three rows to check the datetime index.
df_features.iloc[:3]

Unnamed: 0_level_0,region_city_x,is_alarm
hour_datetimeEpoch,Unnamed: 1_level_1,Unnamed: 2_level_1
2022-02-23 22:00:00,Луцьк,0
2022-02-23 23:00:00,Луцьк,0
2022-02-24 00:00:00,Луцьк,0


In [82]:
def alarms_amount_for_last(df, time_delta, feature_name):
    """Return, per region, the alarm total of the previous ``time_delta`` bin.

    The frame is split by ``region_city_x``, ``is_alarm`` is summed over
    consecutive ``time_delta`` bins, and the totals are shifted forward by one
    bin inside each region, so every row holds the total of the bin that
    precedes it. The first bin of each region has no predecessor and is dropped.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame with a datetime index and the columns ``region_city_x`` and
        ``is_alarm``. It is not modified.
    time_delta : str or offset-like
        Bin width, passed straight to ``resample``.
    feature_name : str
        Name given to the resulting count column.

    Returns
    -------
    pandas.DataFrame
        Columns ``hour_datetime`` (bin label), ``feature_name`` and
        ``region_city_x``. Row labels are the positions the rows had before
        the leading bin of each region was dropped.

    Notes
    -----
    NOTE(review): rows are stamped with the label of a resampled bin, so an
    exact-timestamp join against finer-grained rows only matches rows that sit
    on a bin boundary. Confirm that this is what the consumer wants.
    """
    # Pick ``is_alarm`` explicitly instead of summing the whole frame: that way
    # the result does not depend on pandas dropping (or keeping) the string
    # region column during ``sum``, which differs between pandas versions.
    bin_counts = df.groupby("region_city_x")["is_alarm"].resample(time_delta).sum()

    # Shift inside each region so a bin never receives another region's total.
    previous_counts = bin_counts.groupby(level="region_city_x").shift(1)

    # The region travels with the values as an index level, so there is no
    # need to re-attach it by position afterwards.
    result = (
        previous_counts.rename(feature_name)
        .rename_axis(["region_city_x", "hour_datetime"])
        .reset_index()
        .dropna()
    )
    return result[["hour_datetime", feature_name, "region_city_x"]]

In [83]:
# One feature frame per look-back window. "D" and "h" are the canonical pandas
# offset aliases; the upper-case "H" spelling is deprecated in recent pandas.
lda = alarms_amount_for_last(df_features, "D", "last_day_alarms")
la_12h = alarms_amount_for_last(df_features, "12h", "last_12H_alarms")
la_6h = alarms_amount_for_last(df_features, "6h", "last_6H_alarms")
la_3h = alarms_amount_for_last(df_features, "3h", "last_3H_alarms")
la_1h = alarms_amount_for_last(df_features, "1h", "last_1H_alarms")

In [84]:
# Preview the first five rows of the daily feature frame.
lda.head(n=5)

Unnamed: 0,hour_datetime,last_day_alarms,region_city_x
1,2022-02-24,0.0,Івано-Франківськ
2,2022-02-25,0.0,Івано-Франківськ
3,2022-02-26,0.0,Івано-Франківськ
4,2022-02-27,0.0,Івано-Франківськ
5,2022-02-28,0.0,Івано-Франківськ


### Merge datasets and convert categorical columns to dummies

In [85]:
def merge_datasets_by(to_data, merge_data, on, how="left"):
    """Merge every frame in ``merge_data`` onto ``to_data``, one after another.

    Parameters
    ----------
    to_data : pandas.DataFrame
        Base frame; it is copied first and never modified.
    merge_data : iterable of pandas.DataFrame
        Frames to join, applied in iteration order.
    on : label or list of labels
        Key column(s) shared by all frames.
    how : str, default "left"
        Join type forwarded to the merge.

    Returns
    -------
    pandas.DataFrame
        The accumulated merge result; a plain copy of ``to_data`` when
        ``merge_data`` is empty.
    """
    merged = to_data.copy()
    for extra in merge_data:
        merged = pd.merge(merged, extra, how=how, on=on)
    return merged

In [86]:
# Join every alarm-count frame onto the hourly rows by timestamp and region.
features = [lda, la_12h, la_6h, la_3h, la_1h]
final_data = merge_datasets_by(
    data, features, on=["hour_datetime", "region_city_x"]
)
# Expand the region into indicator columns named after the region values
# (empty prefix and separator), then index the result by the epoch column.
final_data = pd.get_dummies(
    final_data, columns=["region_city_x"], prefix="", prefix_sep=""
).set_index("hour_datetimeEpoch")

In [87]:
# Forward slashes resolve on Windows, Linux and macOS alike; the previous
# backslash-separated path could only be resolved on Windows.
final_data.to_csv("../clean_data/final_data_v2.csv")