In [16]:
import datetime
import numpy as np
import pandas as pd
import pickle
import os

from dotenv import load_dotenv
load_dotenv("../.path_env")

True

In [17]:
# Lift pandas' display limits so previews show every column and every row.
for display_option in ('display.max_columns', 'display.max_rows'):
    pd.set_option(display_option, None)

In [18]:
# Input CSV with per-report keywords; it is read below with sep=",".
REPORTS_DATA_FILE = "./results/tfidf.csv"

# Folder and file names for the two CSV files this notebook writes.
OUTPUT_FOLDER = "results"
ISW_OUTPUT_DATA_FILE = "all_isw.csv"
WEATHER_EVENTS_OUTPUT_DATA_FILE = "all_hourly_weather_events.csv"

# Folder plus name/version parts of the pickled model files. In the visible notebook
# these are only referenced from the commented-out "reading models" cell.
MODEL_FOLDER = "model"

tfidf_transformer_model = "tfidf_transformer"
count_vectorizer_model = "count_vectorizer"

tfidf_transformer_version = "v1"
count_vectorizer_version = "v1"

In [19]:
def isNaN(num):
    """Report whether ``num`` compares unequal to itself.

    Float NaN and pandas NaT are the usual values for which this is true;
    ordinary numbers, strings and None yield False.
    """
    differs_from_itself = num != num
    return differs_from_itself

## reading data

In [20]:
# Load the comma-separated keyword table for the ISW reports.
df_isw = pd.read_csv(REPORTS_DATA_FILE, sep=",")

In [21]:
# Preview the first five report rows as loaded from the CSV.
df_isw.head(5)

Unnamed: 0,Name,Date,Keywords
0,assessment-2022-02-24,2022-02-24,"{'airport': 0.2989225615551494, 'kyiv': 0.2142..."
1,assessment-2022-02-25,2022-02-25,"{'kyiv': 0.3623260669568973, 'local': 0.189735..."
2,assessment-2022-02-26,2022-02-26,"{'kyiv': 0.4540653793502749, 'zaprozhia': 0.15..."
3,assessment-2022-02-28,2022-02-28,"{'kyiv': 0.3075720489357905, 'asset': 0.176497..."
4,assessment-2022-03-01,2022-03-01,"{'kyiv': 0.374212983451302, 'chernihiv': 0.233..."


## preparing ISW reports

## reading models

In [22]:
# #load the content
# tfidf = pickle.load(open(f"{MODEL_FOLDER}/{tfidf_transformer_model}_{tfidf_transformer_version}.pkl", "rb"))
# cv = pickle.load(open(f"{MODEL_FOLDER}/{count_vectorizer_model}_{count_vectorizer_version}.pkl", "rb"))

In [23]:
# df_isw['keywords'] = df_isw['text_preprocessed_lemm'].apply(lambda x: tf_idf.conver_doc_to_vector(x,cv,tfidf))

In [24]:
# df_isw.head(5)

In [25]:
# Parse the textual "Date" column into real datetimes, stored as "date_datetime".
df_isw = df_isw.assign(date_datetime=pd.to_datetime(df_isw["Date"]))

In [26]:
# The day after each report date. Series arithmetic shifts the whole column at once
# instead of calling a Python lambda per row; missing dates (NaT) stay NaT.
df_isw['date_tomorrow_datetime'] = df_isw['date_datetime'] + datetime.timedelta(days=1)
# Placeholder column: every report gets a missing (NaN) event_time.
df_isw['event_time'] = np.nan

In [27]:
# Expose the parsed date under its final name, then persist the table as a
# semicolon-separated CSV without the index.
df_isw = df_isw.rename({"date_datetime": "report_date"}, axis="columns")
df_isw.to_csv("/".join((OUTPUT_FOLDER, ISW_OUTPUT_DATA_FILE)), sep=";", index=False)

In [28]:
# Preview the report table after adding report_date, date_tomorrow_datetime and event_time.
df_isw.head(5)

Unnamed: 0,Name,Date,Keywords,report_date,date_tomorrow_datetime,event_time
0,assessment-2022-02-24,2022-02-24,"{'airport': 0.2989225615551494, 'kyiv': 0.2142...",2022-02-24,2022-02-25,
1,assessment-2022-02-25,2022-02-25,"{'kyiv': 0.3623260669568973, 'local': 0.189735...",2022-02-25,2022-02-26,
2,assessment-2022-02-26,2022-02-26,"{'kyiv': 0.4540653793502749, 'zaprozhia': 0.15...",2022-02-26,2022-02-27,
3,assessment-2022-02-28,2022-02-28,"{'kyiv': 0.3075720489357905, 'asset': 0.176497...",2022-02-28,2022-03-01,
4,assessment-2022-03-01,2022-03-01,"{'kyiv': 0.374212983451302, 'chernihiv': 0.233...",2022-03-01,2022-03-02,


## prepare events data

In [29]:
# Location of the alarms CSV, taken from the ALARMS_DATA_FILE environment variable and
# resolved one directory above the notebook's working directory.
# os.getenv returns None for an unset variable, which would otherwise surface as an
# opaque "can only concatenate str" TypeError, so fail early with a clear message.
_alarms_data_file = os.getenv("ALARMS_DATA_FILE")
if not _alarms_data_file:
    raise RuntimeError(
        "Environment variable ALARMS_DATA_FILE is not set; "
        "check the env file passed to load_dotenv."
    )
EVENTS_DATA_FILE = "../" + _alarms_data_file

In [30]:
# Load the semicolon-separated alarm events file.
df_events = pd.read_csv(EVENTS_DATA_FILE, sep=";")

In [31]:
# Working copy of the events without the id columns, plus an all-NaN "event_time"
# column (any existing column of that name is overwritten).
df_events_v2 = df_events.drop(columns=["id", "region_id"]).assign(event_time=np.nan)

In [32]:
# Only the last expression of a cell is rendered: the saved output shows just the
# (rows, columns) tuple from .shape, and the head(5) preview above it is not displayed.
df_events_v2.head(5)
df_events_v2.shape

(19933, 8)

In [33]:
# df_events_v2["start_time"] = df_events_v2.apply(lambda x: x["start"] if not isNaN(x["start"]) else x["event_time"] , axis=1)
# df_events_v2["end_time"] = df_events_v2.apply(lambda x: x["end"] if not isNaN(x["end"]) else x["event_time"], axis=1)

In [34]:
# Parse the raw time columns into datetimes: start -> start_time, end -> end_time,
# and event_time converted in place.
for parsed_column, raw_column in (
    ("start_time", "start"),
    ("end_time", "end"),
    ("event_time", "event_time"),
):
    df_events_v2[parsed_column] = pd.to_datetime(df_events_v2[raw_column])

In [35]:
# Snap the event bounds to whole hours: the start is rounded down and the end is rounded
# up, so [start_hour, end_hour] covers the whole event; event_time goes to the nearest hour.
# The lowercase "h" alias is used because the uppercase "H" spelling is deprecated in
# pandas 2.2 and later.
df_events_v2["start_hour"] = df_events_v2['start_time'].dt.floor('h')
df_events_v2["end_hour"] = df_events_v2['end_time'].dt.ceil('h')
df_events_v2["event_hour"] = df_events_v2['event_time'].dt.round('h')

In [36]:
# Where a start/end hour is missing (NaT), fall back to the row's event_hour.
# fillna with an aligned Series does this column-wise, replacing the slow row-by-row apply.
# NOTE(review): event_time is set to NaN earlier in the visible notebook, so event_hour
# looks to be all-NaT and this fallback presumably changes nothing at the moment - confirm.
df_events_v2["start_hour"] = df_events_v2["start_hour"].fillna(df_events_v2["event_hour"])
df_events_v2["end_hour"] = df_events_v2["end_hour"].fillna(df_events_v2["event_hour"])

In [37]:
def _to_epoch_seconds(moment):
    """Return ``moment`` as whole seconds since the Unix epoch, or None when it is missing."""
    if isNaN(moment):
        return None
    return int(moment.timestamp())


# Calendar date on which the event started.
df_events_v2["day_date"] = df_events_v2["start_time"].dt.date

# Epoch-second versions of the hour bounds.
# NOTE(review): a tz-naive pandas Timestamp is converted by timestamp() as if it were UTC.
# Confirm that the start/end values really are UTC, since epoch values derived from the
# events are later matched against the weather data's hour_datetimeEpoch.
df_events_v2["start_hour_datetimeEpoch"] = df_events_v2['start_hour'].apply(_to_epoch_seconds)
df_events_v2["end_hour_datetimeEpoch"] = df_events_v2['end_hour'].apply(_to_epoch_seconds)

In [38]:
# df_events_v2[~(df_events_v2["type"]=="alarm")].shape

In [39]:
# df_events_v2[~(df_events_v2["type"]=="alarm")].head(5)

In [40]:
# df_events_v2[df_events_v2["type"]=="alarm"].shape

## prepare weather

In [41]:
# Location of the hourly weather CSV, taken from the WEATHER_DATA_FILE environment
# variable and resolved one directory above the notebook's working directory.
# os.getenv returns None for an unset variable, which would otherwise surface as an
# opaque "can only concatenate str" TypeError, so fail early with a clear message.
_weather_data_file = os.getenv("WEATHER_DATA_FILE")
if not _weather_data_file:
    raise RuntimeError(
        "Environment variable WEATHER_DATA_FILE is not set; "
        "check the env file passed to load_dotenv."
    )
WEATHER_DATA_FILE = "../" + _weather_data_file

In [42]:
# Load the weather table and parse its "day_datetime" column into datetimes.
# NOTE(review): the saved output of this cell is a FileNotFoundError whose path repeats
# "all_weather_by_hour.csv" twice - presumably the WEATHER_DATA_FILE value (or an earlier
# version of this cell) appended the file name a second time; confirm the env value.
df_weather = pd.read_csv(WEATHER_DATA_FILE).assign(
    day_datetime=lambda frame: pd.to_datetime(frame["day_datetime"])
)

FileNotFoundError: [Errno 2] No such file or directory: '../external_data/weather/all_weather_by_hour.csv/all_weather_by_hour.csv'

In [None]:
# (rows, columns) of the raw weather table.
df_weather.shape

In [None]:
# Preview the first 15 raw weather rows.
df_weather.head(15)

In [None]:
# len(clmns)

In [None]:
# Weather columns dropped before merging (order kept as originally listed).
weather_exclude = [
    # day-level and city-level fields
    "day_feelslikemax",
    "day_feelslikemin",
    "day_sunriseEpoch",
    "day_sunsetEpoch",
    "day_description",
    "city_latitude",
    "city_longitude",
    "city_address",
    "city_timezone",
    "city_tzoffset",
    "day_feelslike",
    "day_precipprob",
    "day_snow",
    "day_snowdepth",
    "day_windgust",
    "day_windspeed",
    "day_winddir",
    "day_pressure",
    "day_cloudcover",
    "day_visibility",
    "day_severerisk",
    "day_conditions",
    "day_icon",
    "day_source",
    "day_preciptype",
    "day_stations",
    # hour-level fields
    "hour_icon",
    "hour_source",
    "hour_stations",
    "hour_feelslike",
]

In [None]:
# new_list = [x for x in clmns if (x not in weather_exclude)]
# new_list

In [None]:
# Weather table without the columns listed in weather_exclude.
df_weather_v2 = df_weather.drop(columns=weather_exclude)

In [None]:
# City name = the text before the first comma of the resolved address, with the one
# region-style label mapped to its city name.
# The vectorized .str accessor leaves missing addresses as NaN instead of raising
# AttributeError the way a per-row x.split(...) does.
df_weather_v2["city"] = (
    df_weather_v2["city_resolvedAddress"]
    .str.split(",", n=1)
    .str[0]
    .replace('Хмельницька область', "Хмельницький")
)

In [None]:
# Preview the reduced weather table, including the derived "city" column.
df_weather_v2.head(5)

In [None]:
# (rows, columns) of the reduced weather table.
df_weather_v2.shape

## merging data

In [None]:
# Regions lookup table, read from a fixed location relative to the notebook.
REGIONS_DATA_FOLDER = "../external_data/additions"
REGIONS_DATA_FILE = "regions.csv"
df_regions = pd.read_csv("/".join((REGIONS_DATA_FOLDER, REGIONS_DATA_FILE)))

In [None]:
# Preview the first five rows of the regions table.
df_regions.head(5)

In [None]:
# Attach region attributes to each weather row by matching the weather "city" to the
# regions' "center_city_ua". This is an inner join, so weather rows whose city has no
# match in the regions table are dropped.
df_weather_reg = df_weather_v2.merge(
    df_regions,
    how="inner",
    left_on="city",
    right_on="center_city_ua",
)

In [None]:
# Preview the first ten weather rows with region columns attached.
df_weather_reg.head(10)

In [None]:
# (rows, columns) after the region merge.
df_weather_reg.shape

In [None]:
# (rows, columns) before the region merge; a lower row count after the merge would mean
# the inner join dropped weather rows.
df_weather_v2.shape

### Merging weather and events

In [None]:
# df_events_v2["start_hour_datetimeEpoch"] = df_events_v2['start_hour'].apply(lambda x: int(x.strftime('%s'))  if not isNaN(x) else 0)
# df_events_v2["end_hour_datetimeEpoch"] = df_events_v2['end_hour'].apply(lambda x: int(x.strftime('%s'))  if not isNaN(x) else 0)

In [None]:
# Column dtypes of the prepared events table.
df_events_v2.dtypes

In [None]:
# (rows, columns) of the prepared events table.
df_events_v2.shape

In [None]:
# Preview the first ten prepared events.
df_events_v2.head(10)

In [None]:
# df_events_v2_sample = df_events_v2.sample(10)
# df_events_v2_sample.shape

# One dict per event row (column name -> value), plus the accumulator that the
# hourly expansion fills.
events_dict = df_events_v2.to_dict(orient='records')
events_by_hour = list()

In [None]:
# Inspect the first event record (a dict of column name -> value).
events_dict[0]

In [None]:
# Expand every event into one record per hour between its start_hour and end_hour.
# The accumulator is reset here so that re-running this cell rebuilds the list instead
# of appending a second copy of every row.
events_by_hour = []
for event in events_dict:
    # date_range includes both endpoints. The lowercase "h" alias replaces "H", which is
    # deprecated in pandas 2.2 and later.
    # NOTE(review): end_hour is already rounded up, so the hour beginning at end_hour is
    # emitted as well - confirm that this trailing hour is intended.
    for hour_start in pd.date_range(start=event["start_hour"], end=event["end_hour"], freq='1h'):
        # Copy of the event's fields plus the specific hour this record stands for.
        events_by_hour.append({**event, "hour_level_event_time": hour_start})

In [None]:
def _hour_epoch_or_none(moment):
    """Return ``moment`` as whole epoch seconds, or None when it is missing."""
    if isNaN(moment):
        return None
    return int(moment.timestamp())


# One row per (event, hour) record, plus the hour expressed as epoch seconds.
df_events_v3 = pd.DataFrame(events_by_hour)
df_events_v3["hour_level_event_datetimeEpoch"] = df_events_v3["hour_level_event_time"].apply(_hour_epoch_or_none)


In [None]:
# (rows, columns) of the hourly-expanded events table.
df_events_v3.shape

In [None]:
# Preview the first 15 hourly event rows.
df_events_v3.head(15)

In [None]:
# Preview the weather-with-regions table that the events are merged into.
df_weather_reg.head(5)

In [None]:
# (rows, columns) of the weather-with-regions table.
df_weather_reg.shape

In [None]:
# By default a cell renders only its last expression, so just the weather preview is
# shown here; the events preview on the first line produces no visible output.
df_events_v3.head(10)
df_weather_reg.head(10)

In [None]:
# Prefix every events column with "event_" so the names are recognisable after the
# merge with the weather table. add_prefix already returns a new DataFrame, so a
# preceding .copy() would only duplicate the whole frame for no benefit.
df_events_v4 = df_events_v3.add_prefix('event_')
df_events_v4.head(10)


In [None]:
# Left join: every weather row is kept, and event columns are filled only where the
# region name and the hour's epoch value both match an hourly event record.
df_weather_v4 = pd.merge(
    df_weather_reg,
    df_events_v4,
    how="left",
    left_on=["region_alt", "hour_datetimeEpoch"],
    right_on=["event_region_title", "event_hour_level_event_datetimeEpoch"],
)

In [None]:
# Preview the first 300 merged rows. display.max_rows is set to None near the top of the
# notebook, so all 300 rows are rendered in full.
df_weather_v4.head(300)

In [None]:
# (rows, columns) of the merged weather/events table.
df_weather_v4.shape

In [None]:
# Persist the merged hourly weather/events table as a semicolon-separated CSV
# without the index.
df_weather_v4.to_csv(
    "/".join((OUTPUT_FOLDER, WEATHER_EVENTS_OUTPUT_DATA_FILE)),
    sep=";",
    index=False,
)