In [112]:
import datetime as dt
import pandas as pd
import numpy as np
from dateutil import parser

from scripts.weather_collection import get_weather
from scripts.data_collection import save_by_date
from scripts.data_preprocessing import get_report_lemm, get_report_tfidf_vector
import pickle

pd.set_option('display.max_columns', None)
pd.set_option('display.max_rows', None)

def isNaN(num):
    return num != num

HOLIDAY_DATASET = './data/1_holidays/holidays.csv'
REGIONS_DATASET = './data/0_meta/regions.csv'


In [113]:
city_dict = {
    'Lutsk': 1,
    'Zhytomyr': 2,
    'Rivne': 3,
    'Poltava': 4,
    'Ternopil': 5,
    'Uzhgorod': 6,
    'Donetsk': 7,
    'Zaporozhye': 8,
    'Sumy': 9,
    'Cherkasy': 10,
    'Vinnytsia': 11,
    'Lviv': 12,
    'Ivano-Frankivsk': 13,
    'Kherson': 14,
    'Kyiv': 15,
    'Dnipro': 16,
    'Chernivtsi': 17,
    'Kropyvnytskyi': 18,
    'Kharkiv': 19,
    'Mykolaiv': 20,
    'Khmelnytskyi': 21,
    'Odesa': 22,
    'Chernihiv': 23
}

df = []

df_regions = pd.read_csv(REGIONS_DATASET, sep=',')

for index, row in df_regions.iterrows():
    print(row['center_city_en'])
    try:
        weather = get_weather(row['center_city_en'])
        df_city = pd.DataFrame(weather["forecast"])
    except:
        weather = get_weather(row['center_city_en'] + '(UA)')
        df_city = pd.DataFrame(weather["forecast"])
    df_city['region_id'] = float(row['region_id'])
    df_city['city'] = row['center_city_en']
    df.append(df_city)

df = pd.concat(df, axis=0, ignore_index=True)

Vinnytsia
200
Lutsk
200
Dnipro
200
Donetsk
200
Zhytomyr
200
Uzhgorod
200
Zaporozhye
200
Ivano-Frankivsk
200
Kyiv
200
Kropyvnytskyi
200
Lviv
500
200
Mykolaiv
200
Odesa
200
Poltava
200
Rivne
200
Sumy
500
200
Ternopil
200
Kharkiv
200
Kherson
200
Khmelnytskyi
200
Cherkasy
200
Chernivtsi
200
Chernihiv
200


In [114]:
def read_holiday_df(): # custom made dataset with most "important" russian hollidays
    holiday_df = pd.read_csv(HOLIDAY_DATASET, sep=';')
    holiday_df['date'] = holiday_df['date'].apply(pd.to_datetime)
    holiday_df = holiday_df.sort_values(by=['date'])
    holiday_df = holiday_df.set_index('date')
    return holiday_df

def event_holiday_is_near(holiday_df, row):
    datetime = parser.parse(f"{row['day_datetime']} {row['hour_datetime']}")
    closest_holiday = holiday_df.index[holiday_df.index.get_loc(datetime, method='nearest')]
    value = abs(pd.Timedelta(datetime - closest_holiday).days) <= 3
    return 1.0 if value and not isNaN(value) else 0.0

holiday_df = read_holiday_df()
df['event_holiday_is_near'] = df.apply(lambda row: event_holiday_is_near(holiday_df, row), axis=1)
df['event_alarms_past_24'] = 0.0
df['event_simultaneous_alarms'] = 0.0
df['event_hours_from_last_alarm'] = 0.0

df.columns

  closest_holiday = holiday_df.index[holiday_df.index.get_loc(datetime, method='nearest')]


Index(['day_datetime', 'day_dew', 'day_humidity', 'day_moonphase',
       'day_precip', 'day_precipcover', 'day_solarenergy',
       'day_solarradiation', 'day_sunrise', 'day_sunset', 'day_temp',
       'day_tempmax', 'day_tempmin', 'day_uvindex', 'hour_cloudcover',
       'hour_conditions', 'hour_datetime', 'hour_dew', 'hour_humidity',
       'hour_precip', 'hour_precipprob', 'hour_preciptype', 'hour_pressure',
       'hour_severerisk', 'hour_snow', 'hour_snowdepth', 'hour_solarenergy',
       'hour_solarradiation', 'hour_temp', 'hour_uvindex', 'hour_visibility',
       'hour_winddir', 'hour_windgust', 'hour_windspeed', 'region_id', 'city',
       'event_holiday_is_near', 'event_alarms_past_24',
       'event_simultaneous_alarms', 'event_hours_from_last_alarm'],
      dtype='object')

In [115]:
# Convet time to float
df['day_sunset'] = df['day_sunset'].apply(lambda x: 
    (parser.parse(x) - dt.datetime.strptime("00:00:00", "%H:%M:%S")).total_seconds()
)
df['day_sunrise'] = df['day_sunrise'].apply(lambda x: 
    (parser.parse(x) - dt.datetime.strptime("00:00:00", "%H:%M:%S")).total_seconds()
)
df['datetime'] = df.apply(lambda row: f"{row['day_datetime']} {row['hour_datetime']}", axis=1)
df['hour_datetime'] = df['hour_datetime'].apply(lambda x:
    (parser.parse(x) - dt.datetime.strptime("00:00:00", "%H:%M:%S")).total_seconds()//3600
)

In [116]:
# Encode categorical values
df['hour_preciptype'] = df['hour_preciptype'].apply(lambda a: str(a) if a else np.nan)
le = pickle.load(open('./model/preciptype_encoder_v1.pkl', 'rb'))
df['hour_preciptype'] = le.transform(df['hour_preciptype']).astype(float)

le = le = pickle.load(open('./model/conditions_encoder_v1.pkl', 'rb'))
df['hour_conditions'] = le.transform(df['hour_conditions']).astype(float)

print(df.shape)

(276, 41)


In [117]:
cities = df['city'].unique()
city_dict = {cities[i]: i+1 for i in range(len(cities))}

In [118]:
def get_yesterday_report(day_str):
    date = parser.parse(day_str) - dt.timedelta(days=1)
    while 'Error' in save_by_date(date):
        date -= dt.timedelta(days=1)
    lemm = get_report_lemm(f"./Reports/{date.strftime('%Y-%m-%d')}.html")
    tfidf_vector = get_report_tfidf_vector(lemm)
    return pd.concat([pd.DataFrame([day_str], columns=['date_tomorrow_datetime']), tfidf_vector], axis=1)

df_tfidf = []

for day in df['day_datetime'].unique():
    df_tfidf.append(get_yesterday_report(day))

df_tfidf = pd.concat(df_tfidf, axis=0, ignore_index=True)

# Merge weather events dataset with yesterday report tfidf matrix (takes 2m to execute)
df = df.merge(df_tfidf.add_prefix("isw_"),
                                how="left",
                                left_on="day_datetime",
                                right_on="isw_date_tomorrow_datetime")

# Fillna
df.fillna(0.0, inplace=True)

# Normalize
df['region_id_int'] = df['region_id'].astype(int)
scaler = pickle.load(open('model/scaler_v1.pkl', 'rb'))
df_float_values = df[scaler.get_feature_names_out()]
df_float_values_scaled = pd.DataFrame(scaler.transform(df_float_values), columns=df_float_values.columns)
df = pd.concat([df[['datetime', 'region_id_int', 'day_datetime', 'city']], df_float_values_scaled], axis=1)

df['day_datetime'] = pd.to_datetime(df['day_datetime'])
df['year'] = df['day_datetime'].dt.year
df['month'] = df['day_datetime'].dt.month
df['day'] = df['day_datetime'].dt.day
df['day_of_week'] = df['day_datetime'].dt.dayofweek

df['season'] = (df['day_datetime'].dt.month % 12 // 3).replace({0: 'winter', 1: 'spring', 2: 'summer', 3: 'fall'})
df_seasons = pd.get_dummies(df, columns=['season']).reindex(columns=['season_fall', 'season_spring', 'season_summer', 'season_winter'], fill_value=0)
df = pd.concat([df.drop(['season'], axis=1), df_seasons], axis=1)

df['city_id'] = df['city'].map(city_dict)
df.drop(['city', 'day_datetime'], axis=1, inplace=True)

df.columns


2023-04-23
lennatizing
2023-04-24
2023-04-23
lennatizing




Index(['datetime', 'region_id_int', 'day_tempmax', 'day_tempmin', 'day_temp',
       'day_dew', 'day_humidity', 'day_precip', 'day_precipcover',
       'day_solarradiation',
       ...
       'isw_zone', 'year', 'month', 'day', 'day_of_week', 'season_winter',
       'season_spring', 'season_summer', 'season_fall', 'city_id'],
      dtype='object', length=767)

In [122]:
model = pickle.load(open('./model/6__AdaBoost__v2.pkl', 'rb'))
model.predict(df.drop(['datetime', 'region_id_int'], axis=1).values)



array([0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
       0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
       0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
       0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
       0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
       0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
       0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
       0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
       0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
       0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
       0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
       0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
       0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0])