In [2]:
import pandas as pd
import datetime as dt
import pytz
import swifter

from dateutil import parser

# Never truncate DataFrame output: show every column and every row.
pd.options.display.max_columns = None
pd.options.display.max_rows = None

In [4]:
# Constants: input and output locations used by the helpers in this notebook.

# Main dataset, loaded by read_df().
LARGE_DATASET_LOC = 'data/4_all_data_preprocessed/all_hourly_weather_events_v2.csv'
# Output folder; save_df() uses it as a plain string prefix, so the trailing '/' matters.
FEATURE_ENGINEERING_FOLDER = 'data/3_feature_engineering/'

# Alarm intervals, loaded by read_alarms_only_df().
ALARMS_ONLY_DATASET = 'data/1_events/alarms.csv'
# Holiday calendar, loaded by read_holiday_df().
HOLIDAY_DATASET = 'data/1_holidays/holidays.csv'

# Base name of the generated feature file; a version suffix and '.csv' are appended when saving.
RESULT_DATASET = 'features_generated'

In [3]:
# Time zone that alarm timestamps are converted into.
tz = pytz.timezone('Europe/Kyiv')

def localize(datetime_str):
    """Convert a UTC datetime string to naive Europe/Kyiv wall-clock time.

    The string is parsed with dateutil, tagged as UTC, shifted into ``tz`` and
    finally stripped of its tzinfo so the result is a naive ``datetime``.

    NOTE(review): the parsed value is presumably naive; pytz's ``localize``
    is expected to reject a datetime that already carries tzinfo -- confirm
    against the input data.
    """
    parsed = parser.parse(datetime_str)
    as_utc = pytz.utc.localize(parsed)
    in_local_zone = as_utc.astimezone(tz)
    return in_local_zone.replace(tzinfo=None)

def get_row_date_time(row):
    """Build one datetime from a row's ``day_datetime`` and ``hour_datetime``.

    The two fields are joined as ``<day>T<hour>`` and parsed with dateutil.
    """
    day_part = row['day_datetime']
    hour_part = row['hour_datetime']
    return parser.parse(f"{day_part}T{hour_part}")

def read_df():
    """Load the main dataset (``LARGE_DATASET_LOC``, ';'-separated) as a DataFrame."""
    return pd.read_csv(LARGE_DATASET_LOC, sep=';')

def save_df(df, name):
    """Write ``df`` to ``FEATURE_ENGINEERING_FOLDER`` as ``<name>.csv`` (';'-separated).

    The DataFrame index is written as well (``to_csv`` default), so the file
    gains an unnamed leading column when it is read back.
    """
    destination = FEATURE_ENGINEERING_FOLDER + name + '.csv'
    df.to_csv(destination, sep=';')

def read_df_custom(filename):
    """Load the ';'-separated CSV at ``filename`` and return it as a DataFrame."""
    return pd.read_csv(filename, sep=';')

def read_alarms_only_df():
    """Load the alarms dataset and run its ``start``/``end`` columns through ``localize``.

    Returns the DataFrame read from ``ALARMS_ONLY_DATASET`` with both interval
    columns replaced by their localized values.
    """
    alarms = read_df_custom(ALARMS_ONLY_DATASET)
    interval_columns = ['start', 'end']

    def _localize_row(row):
        # One output value per input cell, in the same order as interval_columns.
        return pd.Series([localize(value) for value in row])

    alarms[interval_columns] = alarms[interval_columns].swifter.apply(_localize_row, axis=1)
    return alarms

def read_holiday_df():
    """Load the holiday calendar, indexed and sorted by date.

    The source file is a custom-made dataset of the most "important" Russian
    holidays. Its ``date`` column is parsed value by value with
    ``pd.to_datetime``; the result is sorted by that column, which then
    becomes the index.
    """
    raw_holidays = pd.read_csv(HOLIDAY_DATASET, sep=';')
    with_parsed_dates = raw_holidays.assign(date=raw_holidays['date'].apply(pd.to_datetime))
    return with_parsed_dates.sort_values(by=['date']).set_index('date')

In [7]:
# calculating features

def alarm_past_24_hours(df, df_alarms_only):
    """Add ``event_alarms_past_24``: alarms in the row's region that ended in the last 24 h.

    For every row of ``df`` this counts the rows of ``df_alarms_only`` whose
    ``region_title`` equals the row's ``region_alt`` and whose ``end`` lies
    strictly between ``datetime_cache - 24h`` and ``datetime_cache``.
    ``df`` is modified in place; nothing is returned.
    """
    def _ended_recently_count(row):
        moment = row['datetime_cache']
        same_region = df_alarms_only['region_title'] == row['region_alt']
        after_window_start = df_alarms_only['end'] > moment - pd.Timedelta(hours=24)
        before_moment = df_alarms_only['end'] < moment
        return df_alarms_only[same_region & after_window_start & before_moment].shape[0]

    lookup_columns = df[['datetime_cache', 'region_alt']]
    df['event_alarms_past_24'] = lookup_columns.swifter.apply(_ended_recently_count, axis=1)

# NOTE(review): dead code. The string literal below is never executed; it holds a
# row-by-row (iterrows) implementation with the same name as the function defined
# above. Candidate for deletion.
"""def alarm_past_24_hours(df, df_alarms_only):    
    df['event_alarms_past_24'] = 0
    
    for index, row in df.iterrows():
        datetime = get_row_date_time(row)
        filter1 = (df_alarms_only['region_title'] == row['region_alt']) & (df_alarms_only['end'] > datetime - pd.Timedelta(hours=24)) & (df_alarms_only['end'] < datetime)
        count = len(df_alarms_only[filter1].index)
        df['event_alarms_past_24'][index] = count
        
        if(index % 1000 == 0):
            print(index)"""


def count_alarm_overlap(df, df_alarms_only):
    """Add ``event_simultaneous_alarms``: alarms active at each row's ``datetime_cache``.

    An alarm counts as active when ``start < datetime_cache < end`` (both
    strict). No region filter is applied, so alarms from every row of
    ``df_alarms_only`` are counted. ``df`` is modified in place.
    """
    alarm_starts = df_alarms_only['start']
    alarm_ends = df_alarms_only['end']

    def _active_alarm_count(moment):
        is_active = (alarm_starts < moment) & (alarm_ends > moment)
        return is_active.sum()

    df['event_simultaneous_alarms'] = df['datetime_cache'].apply(_active_alarm_count)


# NOTE(review): dead code. The string literal below is never executed; it holds a
# row-by-row (iterrows) implementation with the same name as the function defined
# above. Candidate for deletion.
"""def count_alarm_overlap(df, df_alarms_only):    
    df['event_simultaneous_alarms'] = -1
    
    for index, row in df.iterrows():
        datetime = get_row_date_time(row)
        filter1 = (df_alarms_only['start'] < datetime) & (df_alarms_only['end'] > datetime)
        
        count = len(df_alarms_only[filter1].index)
        df['event_simultaneous_alarms'][index] = count
        
        if(index % 1000 == 0):
            print(index)"""

def hours_from_prev_alarm(df, df_alarms_only):
    """Add ``event_hours_from_last_alarm``: hours since the region's previous alarm ended.

    For every row of ``df`` the latest ``end`` in ``df_alarms_only`` is looked
    up among alarms whose ``region_title`` equals the row's ``region_alt`` and
    whose ``end`` is strictly before ``datetime_cache``. The elapsed time is
    stored in (fractional) hours. Rows with no earlier alarm in their region
    get NaN. ``df`` is modified in place; nothing is returned.
    """
    def _hours_since_last_alarm(row):
        moment = row['datetime_cache']
        earlier_in_region = (
            (df_alarms_only['region_title'] == row['region_alt'])
            & (df_alarms_only['end'] < moment)
        )
        last_end = df_alarms_only.loc[earlier_in_region, 'end'].max()
        # total_seconds() covers the whole gap. `.seconds` is only the sub-day
        # remainder, so a gap of 2 days 6 hours would be reported as 6.0.
        return (moment - last_end).total_seconds() / 3600

    lookup_columns = df[['datetime_cache', 'region_alt']]
    df['event_hours_from_last_alarm'] = lookup_columns.swifter.apply(_hours_since_last_alarm, axis=1)

# NOTE(review): dead code. The string literal below is never executed; it holds a
# row-by-row (iterrows) implementation with the same name as the function defined
# above. Candidate for deletion.
"""def hours_from_prev_alarm(df, df_alarms_only):
    df['event_hours_from_last_alarm'] = -1
    
    for index, row in df.iterrows():
        datetime = get_row_date_time(row)
        filter1 = (df_alarms_only['region_title'] == row['region_alt']) & (df_alarms_only['end'] < datetime)
        value = pd.Timedelta(datetime - df_alarms_only[filter1]['end'].max()).seconds / 3600;
        df['event_hours_from_last_alarm'][index] = value
        
        if(index % 1000 == 0):
            print(index)"""

def holiday_is_near(df, holiday_df):
    """Add ``event_holiday_is_near``: True when the row's day is within 3 days of a holiday.

    ``holiday_df`` must be indexed by holiday date (unique and sorted, as
    produced by ``read_holiday_df``). Each row's ``datetime_cache`` is
    truncated to its calendar day before the nearest holiday is looked up, so
    every hour of a given day receives the same flag. ``df`` is modified in
    place; nothing is returned.
    """
    holiday_dates = holiday_df.index
    if len(holiday_dates) == 0:
        # Empty calendar: no holiday can be near.
        df['event_holiday_is_near'] = False
        return

    # Compare whole days: with hour-level timestamps the "nearest" holiday can
    # switch to the following one part-way through a day and flip the flag.
    row_days = pd.to_datetime(df['datetime_cache']).dt.normalize()

    # Index.get_loc(..., method='nearest') no longer exists in pandas 2.x;
    # get_indexer performs the same nearest-match lookup for all rows at once.
    nearest_positions = holiday_dates.get_indexer(pd.DatetimeIndex(row_days), method='nearest')
    nearest_holidays = holiday_dates[nearest_positions].to_numpy()

    distance = (row_days - nearest_holidays).abs()
    df['event_holiday_is_near'] = distance <= pd.Timedelta(days=3)

# NOTE(review): dead code. The string literal below is never executed; it holds a
# row-by-row (iterrows) implementation with the same name as the function defined
# above. Because it is the final expression of the cell, the notebook echoes the
# whole string as the cell's output. Candidate for deletion.
"""def holiday_is_near(df, holiday_df):
    df['event_holiday_is_near'] = False
    
    for index, row in df.iterrows():
        datetime = parser.parse(row['day_datetime'])
        closest_holiday = holiday_df.index[holiday_df.index.get_loc(datetime, method='nearest')]
        value = abs(pd.Timedelta(datetime - closest_holiday).days) <= 3
        df['event_holiday_is_near'][index] = value
        
        if(index % 1000 == 0):
            print(index)"""

"def holiday_is_near(df, holiday_df):\n    df['event_holiday_is_near'] = False\n    \n    for index, row in df.iterrows():\n        datetime = parser.parse(row['day_datetime'])\n        closest_holiday = holiday_df.index[holiday_df.index.get_loc(datetime, method='nearest')]\n        value = abs(pd.Timedelta(datetime - closest_holiday).days) <= 3\n        df['event_holiday_is_near'][index] = value\n        \n        if(index % 1000 == 0):\n            print(index)"

In [None]:
# Original author's note: "20 min" -- presumably the approximate runtime of this cell.
# Load the three inputs: the main dataset, the alarm intervals and the holiday calendar.
df = read_df()
df_alarms_only = read_alarms_only_df()
holiday_df = read_holiday_df()

# Parse each row's day + hour into a single datetime once and keep it in a helper
# column, so the feature functions below can read it from `datetime_cache`.
df['datetime_cache'] = df[['day_datetime', 'hour_datetime']].swifter.apply(get_row_date_time, axis=1)

# Generate the alarm-based features; each call adds one column to `df` in place.
# The prints mark the start of each step.
print('start alarm_past_24_hours')
alarm_past_24_hours(df, df_alarms_only)
print('start count_alarm_overlap')
count_alarm_overlap(df, df_alarms_only)
print('start hours_from_prev_alarm')
hours_from_prev_alarm(df, df_alarms_only)


In [8]:
print('start holiday_is_near')
# Adds the `event_holiday_is_near` column; reads `datetime_cache`, so it must run
# before that helper column is dropped below.
holiday_is_near(df, holiday_df)

# Drop the helper column so it is not written to the output file.
# NOTE(review): the drop is in place, so this cell cannot be re-run on its own --
# the cell that creates `datetime_cache` has to be run again first.
df.drop('datetime_cache', axis=1, inplace=True)

# Write the result into FEATURE_ENGINEERING_FOLDER as '<RESULT_DATASET>_v1.csv'.
save_df(df, f'{RESULT_DATASET}_v1')

start holiday_is_near


In [5]:
# Reload the saved feature file and inspect rows 100-299 of the generated columns.
df = pd.read_csv(f'{FEATURE_ENGINEERING_FOLDER}{RESULT_DATASET}_v1.csv', sep=';')

df.iloc[100:300].loc[:, [
    'day_datetime',
    'hour_datetime',
    'event_start_hour',
    'event_end_hour',
    'event_alarms_past_24',
    'event_hours_from_last_alarm',
    'event_simultaneous_alarms',
    'event_holiday_is_near',
]]

Unnamed: 0,day_datetime,hour_datetime,event_start_hour,event_end_hour,event_alarms_past_24,event_hours_from_last_alarm,event_simultaneous_alarms,event_holiday_is_near
100,2022-02-28,03:00:00,,,2,5.115556,0,False
101,2022-02-28,04:00:00,,,2,6.115556,1,False
102,2022-02-28,05:00:00,,,2,7.115556,0,False
103,2022-02-28,06:00:00,,,2,8.115556,0,False
104,2022-02-28,07:00:00,,,2,9.115556,0,False
105,2022-02-28,08:00:00,,,2,10.115556,0,False
106,2022-02-28,09:00:00,,,2,11.115556,0,False
107,2022-02-28,10:00:00,,,2,12.115556,1,False
108,2022-02-28,11:00:00,,,2,13.115556,2,False
109,2022-02-28,12:00:00,,,2,14.115556,1,False
