In [110]:
import pandas as pd
from dateutil import parser

In [111]:
#constants
LARGE_DATASET_LOC = 'data/4_all_data_preprocessed/all_hourly_weather_events_v2.csv'
FEATURE_ENGINEERING_FOLDER = 'data/3_feature_engineering/'

ALARMS_ONLY_DATASET = 'data/1_events/alarms.csv'
HOLIDAY_DATASET = 'data/1_holidays/holidays.csv'

RESULT_DATASET = 'features_generated'

In [112]:
def read_df(): #reads main dataset
    df = pd.read_csv(LARGE_DATASET_LOC, sep=';')   
    return df

def save_df(df, name):
    df.to_csv(FEATURE_ENGINEERING_FOLDER + name + '.csv', sep=';')

def read_df_custom(filename):
    df = pd.read_csv(filename, sep=';')    
    return df

def read_alarms_only_df(): # read dataset with alarms
    df_alarms_only = read_df_custom(ALARMS_ONLY_DATASET)
    df_alarms_only[['start', 'end']] = df_alarms_only[['start', 'end']].apply(pd.to_datetime)
    return df_alarms_only

def read_holiday_df(): # custom made dataset with most "important" russian hollidays
    holiday_df = pd.read_csv(HOLIDAY_DATASET, sep=';')
    holiday_df['date'] = holiday_df['date'].apply(pd.to_datetime)
    holiday_df = holiday_df.sort_values(by=['date'])
    holiday_df = holiday_df.set_index('date')
    return holiday_df

In [113]:
def get_row_date_time(row):
    return parser.parse(row['day_datetime'] + ' ' + row['hour_datetime'])

In [114]:
# calculating features

def alarm_past_24_hours(df, df_alarms_only):
    df['event_alarms_past_24'] = df.apply(lambda row: df_alarms_only[(df_alarms_only['region_title'] == row['region_alt']) &
                                                                      (df_alarms_only['end'] > row['datetime_cache'] - pd.Timedelta(hours=24)) &
                                                                      (df_alarms_only['end'] < row['datetime_cache'])].shape[0], axis=1)

"""def alarm_past_24_hours(df, df_alarms_only):    
    df['event_alarms_past_24'] = 0
    
    for index, row in df.iterrows():
        datetime = get_row_date_time(row)
        filter1 = (df_alarms_only['region_title'] == row['region_alt']) & (df_alarms_only['end'] > datetime - pd.Timedelta(hours=24)) & (df_alarms_only['end'] < datetime)
        count = len(df_alarms_only[filter1].index)
        df['event_alarms_past_24'][index] = count
        
        if(index % 1000 == 0):
            print(index)"""


def count_alarm_overlap(df, df_alarms_only):
    df['event_simultaneous_alarms'] = df.apply(lambda row: ((df_alarms_only['start'] < row['datetime_cache']) & 
                                                              (df_alarms_only['end'] > row['datetime_cache'])).sum(), axis=1)


"""def count_alarm_overlap(df, df_alarms_only):    
    df['event_simultaneous_alarms'] = -1
    
    for index, row in df.iterrows():
        datetime = get_row_date_time(row)
        filter1 = (df_alarms_only['start'] < datetime) & (df_alarms_only['end'] > datetime)
        
        count = len(df_alarms_only[filter1].index)
        df['event_simultaneous_alarms'][index] = count
        
        if(index % 1000 == 0):
            print(index)"""

def hours_from_prev_alarm(df, df_alarms_only):
    df['event_hours_from_last_alarm'] = df.apply(lambda row: (row['datetime_cache'] - df_alarms_only[(df_alarms_only['region_title'] == row['region_alt']) &
                                                             (df_alarms_only['end'] < row['datetime_cache'])]['end'].max()).seconds / 3600, axis=1)

"""def hours_from_prev_alarm(df, df_alarms_only):
    df['event_hours_from_last_alarm'] = -1
    
    for index, row in df.iterrows():
        datetime = get_row_date_time(row)
        filter1 = (df_alarms_only['region_title'] == row['region_alt']) & (df_alarms_only['end'] < datetime)
        value = pd.Timedelta(datetime - df_alarms_only[filter1]['end'].max()).seconds / 3600;
        df['event_hours_from_last_alarm'][index] = value
        
        if(index % 1000 == 0):
            print(index)"""

def holiday_is_near(df, holiday_df):
    df['event_holiday_is_near'] = df.apply(lambda row: abs(pd.Timedelta(parser.parse(row['day_datetime']) - 
                                                holiday_df.index[holiday_df.index.get_loc(parser.parse(row['day_datetime']), method='nearest')]).days) <= 3, axis=1)

"""def holiday_is_near(df, holiday_df):
    df['event_holiday_is_near'] = False
    
    for index, row in df.iterrows():
        datetime = parser.parse(row['day_datetime'])
        closest_holiday = holiday_df.index[holiday_df.index.get_loc(datetime, method='nearest')]
        value = abs(pd.Timedelta(datetime - closest_holiday).days) <= 3
        df['event_holiday_is_near'][index] = value
        
        if(index % 1000 == 0):
            print(index)"""

"def holiday_is_near(df, holiday_df):\n    df['event_holiday_is_near'] = False\n    \n    for index, row in df.iterrows():\n        datetime = parser.parse(row['day_datetime'])\n        closest_holiday = holiday_df.index[holiday_df.index.get_loc(datetime, method='nearest')]\n        value = abs(pd.Timedelta(datetime - closest_holiday).days) <= 3\n        df['event_holiday_is_near'][index] = value\n        \n        if(index % 1000 == 0):\n            print(index)"

In [115]:
# 20 min
# read all datasets
df = read_df()
df_alarms_only = read_alarms_only_df()
holiday_df = read_holiday_df()

# cache row time
df['datetime_cache'] = df.apply(get_row_date_time, axis=1)

# generate features
print('start alarm_past_24_hours')
alarm_past_24_hours(df, df_alarms_only)
print('start count_alarm_overlap')
count_alarm_overlap(df, df_alarms_only)
print('start hours_from_prev_alarm')
hours_from_prev_alarm(df, df_alarms_only)
print('start holiday_is_near')
holiday_is_near(df, holiday_df)

# drop cached time
df.drop('datetime_cache', axis=1, inplace=True)

# save
save_df(df, f'{RESULT_DATASET}_v1')    

start alarm_past_24_hours
start count_alarm_overlap
start hours_from_prev_alarm
start holiday_is_near


  holiday_df.index[holiday_df.index.get_loc(parser.parse(row['day_datetime']), method='nearest')]).days) <= 3, axis=1)
