In [134]:
import datetime
import calendar
import numpy as np
import pandas as pd

import pickle

from sklearn.feature_extraction.text import TfidfTransformer

In [135]:
pd.set_option('display.max_columns', None)
pd.set_option('display.max_rows', None)

In [136]:
REPORTS_DATA_FOLDER = "data/2_isw_preprocessed"
REPORTS_DATA_FILE = "all_days.csv"

TFIDF_NUMBER = 100

EVENTS_DATA_FOLDER = "data/1_events"
EVENTS_DATA_FILE = "alarms.csv"

WEATHER_DATA_FOLDER = "data/1_weather"
WEATHER_DATA_FILE = "all_weather_by_hour_v2.csv"

REGIONS_DATA_FOLDER = "data/0_meta"
REGIONS_DATA_FILE = "regions.csv"

MODEL_FOLDER = "model"

OUTPUT_FOLDER = "data/4_all_data_preprocessed"
ISW_OUTPUT_DATA_FILE = "all_isw.csv"
WEATHER_EVENTS_OUTPUT_DATA_FILE = "all_hourly_weather_events_v2.csv"
WEATHER_EVENTS_KEYWORDS_OUTPUT_DATA_FILE = "all_hourly_weather_events_isw_v2.csv"

tfidf_transformer_model = "tfidf_transformer"
count_vectorizer_model = "count_vectorizer"

tfidf_transformer_version = "v4"
count_vectorizer_version = "v4"

In [137]:
def isNaN(num):
    return num != num

## Merging ISW reports, merging with TF-IDF

In [138]:
df_isw = pd.read_csv(f"{REPORTS_DATA_FOLDER}/{REPORTS_DATA_FILE}", sep=";")

In [139]:
df_isw.head(5)

Unnamed: 0,date,text,lemm,stemm
0,2022-02-24,(Ukraine local time) that Russian forces have...,ukrain local time russian forc not achiev bre...,ukrain local time russian forc not achiev bre...
1,2022-02-25,\n\n\n\n\n\n\n\n\nRussia-Ukraine Warning Updat...,russia ukrain warn updat russian offens campa...,russia ukrain warn updat russian offen campai...
2,2022-02-26,\n\n\n\n\n\n\n\n\nRussia-Ukraine Warning Updat...,russia ukrain warn updat russian offens campa...,russia ukrain warn updat russian offen campai...
3,2022-02-27,\n\n\n\n\n\n\n\n\nRussia-Ukraine Warning Updat...,russia ukrain warn updat russian offens campa...,russia ukrain warn updat russian offen campai...
4,2022-02-28,\n\n\n\n\n\n\n\n\nRussian Offensive Campaign A...,russian offens campaign ass februari twenti e...,russian offen campaign assess februari twenti...


## preparing ISW reports

## reading models

In [140]:
#load the content
tfidf = pickle.load(open(f"{MODEL_FOLDER}/{tfidf_transformer_model}_{tfidf_transformer_version}.pkl", "rb"))
cv = pickle.load(open(f"{MODEL_FOLDER}/{count_vectorizer_model}_{count_vectorizer_version}.pkl", "rb"))

In [141]:
def sort_coo(matrix):
    tuples = zip(matrix.col, matrix.data)
    return sorted(tuples, key=lambda x: (x[1], x[0]), reverse=True)

def extract_top_n_from_vector(feature_names, sorted_items, n):
    sorted_items = sorted_items[:n]

    score_vals = []
    feature_vals = []

    #word index
    for idx, score in sorted_items:
        score_vals.append(round(score, 3))
        feature_vals.append(feature_names[idx])

    results = {}
    for idx in range(len(feature_vals)):
        results[feature_vals[idx]] = score_vals[idx]

    return results

def convert_to_vector(doc, tfidf_transformer, count_vectorized):
    feature_names = count_vectorized.get_feature_names_out()
    tfidf_vector = tfidf_transformer.transform(count_vectorized.transform([doc]))

    sorted_items = sort_coo(tfidf_vector.tocoo())

    keywords = extract_top_n_from_vector(feature_names, sorted_items, TFIDF_NUMBER)

    return keywords

In [142]:
df_isw['keywords'] = df_isw['lemm'].apply(lambda x: convert_to_vector(x, tfidf, cv))

In [143]:
df_isw.head(5)


Unnamed: 0,date,text,lemm,stemm,keywords
0,2022-02-24,(Ukraine local time) that Russian forces have...,ukrain local time russian forc not achiev bre...,ukrain local time russian forc not achiev bre...,"{'odesa': 0.226, 'pm': 0.217, 'amphibi': 0.195..."
1,2022-02-25,\n\n\n\n\n\n\n\n\nRussia-Ukraine Warning Updat...,russia ukrain warn updat russian offens campa...,russia ukrain warn updat russian offen campai...,"{'februari': 0.329, 'pm': 0.323, 'kyiv': 0.311..."
2,2022-02-26,\n\n\n\n\n\n\n\n\nRussia-Ukraine Warning Updat...,russia ukrain warn updat russian offens campa...,russia ukrain warn updat russian offen campai...,"{'februari': 0.401, 'kyiv': 0.374, 'six': 0.24..."
3,2022-02-27,\n\n\n\n\n\n\n\n\nRussia-Ukraine Warning Updat...,russia ukrain warn updat russian offens campa...,russia ukrain warn updat russian offen campai...,"{'februari': 0.546, 'kyiv': 0.274, 'advanc': 0..."
4,2022-02-28,\n\n\n\n\n\n\n\n\nRussian Offensive Campaign A...,russian offens campaign ass februari twenti e...,russian offen campaign assess februari twenti...,"{'februari': 0.525, 'eight': 0.377, 'twenti': ..."


In [144]:
df_isw["date_datetime"] = pd.to_datetime(df_isw["date"])

In [145]:
df_isw['date_tomorrow_datetime'] = df_isw['date_datetime'].apply(lambda x: x+datetime.timedelta(days=1))

In [146]:
df_isw = df_isw.rename(columns = {"date_datetime":"report_date"})
df_isw.to_csv(f"{OUTPUT_FOLDER}/{ISW_OUTPUT_DATA_FILE}", sep=";", index=False)

In [147]:
df_isw.head(5)

Unnamed: 0,date,text,lemm,stemm,keywords,report_date,date_tomorrow_datetime
0,2022-02-24,(Ukraine local time) that Russian forces have...,ukrain local time russian forc not achiev bre...,ukrain local time russian forc not achiev bre...,"{'odesa': 0.226, 'pm': 0.217, 'amphibi': 0.195...",2022-02-24,2022-02-25
1,2022-02-25,\n\n\n\n\n\n\n\n\nRussia-Ukraine Warning Updat...,russia ukrain warn updat russian offens campa...,russia ukrain warn updat russian offen campai...,"{'februari': 0.329, 'pm': 0.323, 'kyiv': 0.311...",2022-02-25,2022-02-26
2,2022-02-26,\n\n\n\n\n\n\n\n\nRussia-Ukraine Warning Updat...,russia ukrain warn updat russian offens campa...,russia ukrain warn updat russian offen campai...,"{'februari': 0.401, 'kyiv': 0.374, 'six': 0.24...",2022-02-26,2022-02-27
3,2022-02-27,\n\n\n\n\n\n\n\n\nRussia-Ukraine Warning Updat...,russia ukrain warn updat russian offens campa...,russia ukrain warn updat russian offen campai...,"{'februari': 0.546, 'kyiv': 0.274, 'advanc': 0...",2022-02-27,2022-02-28
4,2022-02-28,\n\n\n\n\n\n\n\n\nRussian Offensive Campaign A...,russian offens campaign ass februari twenti e...,russian offen campaign assess februari twenti...,"{'februari': 0.525, 'eight': 0.377, 'twenti': ...",2022-02-28,2022-03-01


## prepare events data

In [148]:
df_events = pd.read_csv(f"{EVENTS_DATA_FOLDER}/{EVENTS_DATA_FILE}", sep=";")

In [149]:
df_events_v2 = df_events.drop(["id","region_id"],axis=1)

In [150]:
df_events_v2.head(5)

Unnamed: 0,region_title,region_city,all_region,start,end,clean_end,intersection_alarm_id
0,Вінниччина,Вінниця,0,2022-02-25 22:55:42,2022-02-25 23:41:53,2022-02-25 23:41:53,
1,Львівщина,Львів,0,2022-02-26 06:26:17,2022-02-26 07:15:28,2022-02-26 07:15:28,
2,Одещина,Одеса,0,2022-02-26 07:16:58,2022-02-26 07:47:03,2022-02-26 07:47:03,
3,Житомирщина,Житомир,0,2022-02-26 08:05:54,2022-02-26 09:36:36,2022-02-26 09:36:36,
4,Вінниччина,Вінниця,0,2022-02-26 08:39:39,2022-02-26 10:42:41,2022-02-26 10:42:41,


In [151]:
df_events_v2["start_time"] = pd.to_datetime(df_events_v2["start"])
df_events_v2["end_time"] = pd.to_datetime(df_events_v2["end"])

In [152]:
df_events_v2["start_hour"] = df_events_v2['start_time'].dt.floor('H')
df_events_v2["end_hour"] = df_events_v2['end_time'].dt.ceil('H')

In [153]:
df_events_v2["start_hour"] = df_events_v2.apply(lambda x: x["start_hour"] if not isNaN(x["start_hour"]) else x["event_hour"] , axis=1)
df_events_v2["end_hour"] = df_events_v2.apply(lambda x: x["end_hour"] if not isNaN(x["end_hour"]) else x["event_hour"] , axis=1)
df_events_v2.head(10)

Unnamed: 0,region_title,region_city,all_region,start,end,clean_end,intersection_alarm_id,start_time,end_time,start_hour,end_hour
0,Вінниччина,Вінниця,0,2022-02-25 22:55:42,2022-02-25 23:41:53,2022-02-25 23:41:53,,2022-02-25 22:55:42,2022-02-25 23:41:53,2022-02-25 22:00:00,2022-02-26 00:00:00
1,Львівщина,Львів,0,2022-02-26 06:26:17,2022-02-26 07:15:28,2022-02-26 07:15:28,,2022-02-26 06:26:17,2022-02-26 07:15:28,2022-02-26 06:00:00,2022-02-26 08:00:00
2,Одещина,Одеса,0,2022-02-26 07:16:58,2022-02-26 07:47:03,2022-02-26 07:47:03,,2022-02-26 07:16:58,2022-02-26 07:47:03,2022-02-26 07:00:00,2022-02-26 08:00:00
3,Житомирщина,Житомир,0,2022-02-26 08:05:54,2022-02-26 09:36:36,2022-02-26 09:36:36,,2022-02-26 08:05:54,2022-02-26 09:36:36,2022-02-26 08:00:00,2022-02-26 10:00:00
4,Вінниччина,Вінниця,0,2022-02-26 08:39:39,2022-02-26 10:42:41,2022-02-26 10:42:41,,2022-02-26 08:39:39,2022-02-26 10:42:41,2022-02-26 08:00:00,2022-02-26 11:00:00
5,Вінниччина,Вінниця,0,2022-02-26 10:58:23,2022-02-26 11:59:40,2022-02-26 11:59:40,,2022-02-26 10:58:23,2022-02-26 11:59:40,2022-02-26 10:00:00,2022-02-26 12:00:00
6,Львівщина,Львів,0,2022-02-26 13:44:44,2022-02-26 14:27:25,2022-02-26 14:27:25,,2022-02-26 13:44:44,2022-02-26 14:27:25,2022-02-26 13:00:00,2022-02-26 15:00:00
7,Рівненщина,Рівненська обл.,1,2022-02-26 15:54:53,2022-02-26 16:14:46,2022-02-26 16:14:46,,2022-02-26 15:54:53,2022-02-26 16:14:46,2022-02-26 15:00:00,2022-02-26 17:00:00
8,Волинь,Волинська обл.,1,2022-02-26 16:08:26,2022-02-26 16:39:26,2022-02-26 16:39:26,,2022-02-26 16:08:26,2022-02-26 16:39:26,2022-02-26 16:00:00,2022-02-26 17:00:00
9,Хмельниччина,Деражня,0,2022-02-26 16:10:29,2022-02-26 17:19:57,2022-02-26 17:19:57,,2022-02-26 16:10:29,2022-02-26 17:19:57,2022-02-26 16:00:00,2022-02-26 18:00:00


In [154]:
df_events_v2["day_date"] = df_events_v2["start_time"].dt.date

df_events_v2["start_hour_datetimeEpoch"] = df_events_v2['start_hour'].apply(lambda x: int(calendar.timegm(x.timetuple()))  if not isNaN(x) else None)
df_events_v2["end_hour_datetimeEpoch"] = df_events_v2['end_hour'].apply(lambda x: int(calendar.timegm(x.timetuple()))  if not isNaN(x) else None)

df_events_v2.head(10)

Unnamed: 0,region_title,region_city,all_region,start,end,clean_end,intersection_alarm_id,start_time,end_time,start_hour,end_hour,day_date,start_hour_datetimeEpoch,end_hour_datetimeEpoch
0,Вінниччина,Вінниця,0,2022-02-25 22:55:42,2022-02-25 23:41:53,2022-02-25 23:41:53,,2022-02-25 22:55:42,2022-02-25 23:41:53,2022-02-25 22:00:00,2022-02-26 00:00:00,2022-02-25,1645826400,1645833600
1,Львівщина,Львів,0,2022-02-26 06:26:17,2022-02-26 07:15:28,2022-02-26 07:15:28,,2022-02-26 06:26:17,2022-02-26 07:15:28,2022-02-26 06:00:00,2022-02-26 08:00:00,2022-02-26,1645855200,1645862400
2,Одещина,Одеса,0,2022-02-26 07:16:58,2022-02-26 07:47:03,2022-02-26 07:47:03,,2022-02-26 07:16:58,2022-02-26 07:47:03,2022-02-26 07:00:00,2022-02-26 08:00:00,2022-02-26,1645858800,1645862400
3,Житомирщина,Житомир,0,2022-02-26 08:05:54,2022-02-26 09:36:36,2022-02-26 09:36:36,,2022-02-26 08:05:54,2022-02-26 09:36:36,2022-02-26 08:00:00,2022-02-26 10:00:00,2022-02-26,1645862400,1645869600
4,Вінниччина,Вінниця,0,2022-02-26 08:39:39,2022-02-26 10:42:41,2022-02-26 10:42:41,,2022-02-26 08:39:39,2022-02-26 10:42:41,2022-02-26 08:00:00,2022-02-26 11:00:00,2022-02-26,1645862400,1645873200
5,Вінниччина,Вінниця,0,2022-02-26 10:58:23,2022-02-26 11:59:40,2022-02-26 11:59:40,,2022-02-26 10:58:23,2022-02-26 11:59:40,2022-02-26 10:00:00,2022-02-26 12:00:00,2022-02-26,1645869600,1645876800
6,Львівщина,Львів,0,2022-02-26 13:44:44,2022-02-26 14:27:25,2022-02-26 14:27:25,,2022-02-26 13:44:44,2022-02-26 14:27:25,2022-02-26 13:00:00,2022-02-26 15:00:00,2022-02-26,1645880400,1645887600
7,Рівненщина,Рівненська обл.,1,2022-02-26 15:54:53,2022-02-26 16:14:46,2022-02-26 16:14:46,,2022-02-26 15:54:53,2022-02-26 16:14:46,2022-02-26 15:00:00,2022-02-26 17:00:00,2022-02-26,1645887600,1645894800
8,Волинь,Волинська обл.,1,2022-02-26 16:08:26,2022-02-26 16:39:26,2022-02-26 16:39:26,,2022-02-26 16:08:26,2022-02-26 16:39:26,2022-02-26 16:00:00,2022-02-26 17:00:00,2022-02-26,1645891200,1645894800
9,Хмельниччина,Деражня,0,2022-02-26 16:10:29,2022-02-26 17:19:57,2022-02-26 17:19:57,,2022-02-26 16:10:29,2022-02-26 17:19:57,2022-02-26 16:00:00,2022-02-26 18:00:00,2022-02-26,1645891200,1645898400


## prepare weather

In [155]:
df_weather = pd.read_csv(f"{WEATHER_DATA_FOLDER}/{WEATHER_DATA_FILE}")
df_weather["day_datetime"] = pd.to_datetime(df_weather["day_datetime"])

print(df_weather.shape)
df_weather.head(10)

(182712, 67)


Unnamed: 0,city_latitude,city_longitude,city_resolvedAddress,city_address,city_timezone,city_tzoffset,day_datetime,day_datetimeEpoch,day_tempmax,day_tempmin,day_temp,day_feelslikemax,day_feelslikemin,day_feelslike,day_dew,day_humidity,day_precip,day_precipprob,day_precipcover,day_snow,day_snowdepth,day_windgust,day_windspeed,day_winddir,day_pressure,day_cloudcover,day_visibility,day_solarradiation,day_solarenergy,day_uvindex,day_severerisk,day_sunrise,day_sunriseEpoch,day_sunset,day_sunsetEpoch,day_moonphase,day_conditions,day_description,day_icon,day_source,day_preciptype,day_stations,hour_datetime,hour_datetimeEpoch,hour_temp,hour_feelslike,hour_humidity,hour_dew,hour_precip,hour_precipprob,hour_snow,hour_snowdepth,hour_preciptype,hour_windgust,hour_windspeed,hour_winddir,hour_pressure,hour_visibility,hour_cloudcover,hour_solarradiation,hour_solarenergy,hour_uvindex,hour_severerisk,hour_conditions,hour_icon,hour_source,hour_stations
0,50.7469,25.3263,"Луцьк, Луцький район, Україна","Lutsk,Ukraine",Europe/Kiev,2.0,2022-02-24,1645653600,4.9,0.7,2.6,4.0,-3.1,-0.2,0.0,83.7,0.118,100.0,4.17,0.1,0.1,32.4,15.5,252.7,1022.3,72.3,12.2,36.9,2.8,1.0,10.0,07:13:36,1645679616,17:51:06,1645717866,0.77,"Snow, Partially cloudy",Partly cloudy throughout the day with morning ...,snow,obs,snow,33177099999;UKLR;remote;33301099999,00:00:00,1645653600,2.4,-1.6,89.18,0.8,0.0,0.0,0.1,0.2,['snow'],31.3,15.5,275.6,1020.0,0.0,91.5,0.0,,0.0,10.0,Overcast,snow,obs,remote
1,50.7469,25.3263,"Луцьк, Луцький район, Україна","Lutsk,Ukraine",Europe/Kiev,2.0,2022-02-24,1645653600,4.9,0.7,2.6,4.0,-3.1,-0.2,0.0,83.7,0.118,100.0,4.17,0.1,0.1,32.4,15.5,252.7,1022.3,72.3,12.2,36.9,2.8,1.0,10.0,07:13:36,1645679616,17:51:06,1645717866,0.77,"Snow, Partially cloudy",Partly cloudy throughout the day with morning ...,snow,obs,snow,33177099999;UKLR;remote;33301099999,01:00:00,1645657200,2.4,-1.5,87.9,0.6,0.0,0.0,0.0,0.2,['snow'],27.7,14.8,280.3,1021.0,0.2,88.2,0.0,,0.0,10.0,Partially cloudy,fog,obs,remote
2,50.7469,25.3263,"Луцьк, Луцький район, Україна","Lutsk,Ukraine",Europe/Kiev,2.0,2022-02-24,1645653600,4.9,0.7,2.6,4.0,-3.1,-0.2,0.0,83.7,0.118,100.0,4.17,0.1,0.1,32.4,15.5,252.7,1022.3,72.3,12.2,36.9,2.8,1.0,10.0,07:13:36,1645679616,17:51:06,1645717866,0.77,"Snow, Partially cloudy",Partly cloudy throughout the day with morning ...,snow,obs,snow,33177099999;UKLR;remote;33301099999,02:00:00,1645660800,2.9,-0.8,88.58,1.2,0.0,0.0,0.0,0.1,['snow'],29.2,14.4,310.0,1022.0,10.0,100.0,,,,10.0,Overcast,cloudy,obs,33177099999
3,50.7469,25.3263,"Луцьк, Луцький район, Україна","Lutsk,Ukraine",Europe/Kiev,2.0,2022-02-24,1645653600,4.9,0.7,2.6,4.0,-3.1,-0.2,0.0,83.7,0.118,100.0,4.17,0.1,0.1,32.4,15.5,252.7,1022.3,72.3,12.2,36.9,2.8,1.0,10.0,07:13:36,1645679616,17:51:06,1645717866,0.77,"Snow, Partially cloudy",Partly cloudy throughout the day with morning ...,snow,obs,snow,33177099999;UKLR;remote;33301099999,03:00:00,1645664400,2.3,-1.3,86.63,0.3,0.0,0.0,0.0,0.1,['snow'],23.8,13.3,295.1,1021.0,0.1,92.0,0.0,,0.0,10.0,Overcast,fog,obs,remote
4,50.7469,25.3263,"Луцьк, Луцький район, Україна","Lutsk,Ukraine",Europe/Kiev,2.0,2022-02-24,1645653600,4.9,0.7,2.6,4.0,-3.1,-0.2,0.0,83.7,0.118,100.0,4.17,0.1,0.1,32.4,15.5,252.7,1022.3,72.3,12.2,36.9,2.8,1.0,10.0,07:13:36,1645679616,17:51:06,1645717866,0.77,"Snow, Partially cloudy",Partly cloudy throughout the day with morning ...,snow,obs,snow,33177099999;UKLR;remote;33301099999,04:00:00,1645668000,1.9,-1.8,87.85,0.1,0.0,0.0,0.0,0.1,['snow'],24.5,13.3,305.8,1021.0,0.0,93.8,0.0,,0.0,10.0,Overcast,cloudy,obs,remote
5,50.7469,25.3263,"Луцьк, Луцький район, Україна","Lutsk,Ukraine",Europe/Kiev,2.0,2022-02-24,1645653600,4.9,0.7,2.6,4.0,-3.1,-0.2,0.0,83.7,0.118,100.0,4.17,0.1,0.1,32.4,15.5,252.7,1022.3,72.3,12.2,36.9,2.8,1.0,10.0,07:13:36,1645679616,17:51:06,1645717866,0.77,"Snow, Partially cloudy",Partly cloudy throughout the day with morning ...,snow,obs,snow,33177099999;UKLR;remote;33301099999,05:00:00,1645671600,1.9,-1.3,91.66,0.6,0.0,0.0,0.0,0.1,,23.4,10.8,296.0,1022.5,10.0,100.0,0.0,,0.0,10.0,Overcast,cloudy,obs,33177099999;33301099999
6,50.7469,25.3263,"Луцьк, Луцький район, Україна","Lutsk,Ukraine",Europe/Kiev,2.0,2022-02-24,1645653600,4.9,0.7,2.6,4.0,-3.1,-0.2,0.0,83.7,0.118,100.0,4.17,0.1,0.1,32.4,15.5,252.7,1022.3,72.3,12.2,36.9,2.8,1.0,10.0,07:13:36,1645679616,17:51:06,1645717866,0.77,"Snow, Partially cloudy",Partly cloudy throughout the day with morning ...,snow,obs,snow,33177099999;UKLR;remote;33301099999,06:00:00,1645675200,2.0,-1.1,93.09,1.0,0.0,0.0,0.0,0.1,['snow'],20.9,10.8,300.0,1021.0,10.0,100.0,0.0,,0.0,10.0,Overcast,cloudy,obs,UKLR;33301099999
7,50.7469,25.3263,"Луцьк, Луцький район, Україна","Lutsk,Ukraine",Europe/Kiev,2.0,2022-02-24,1645653600,4.9,0.7,2.6,4.0,-3.1,-0.2,0.0,83.7,0.118,100.0,4.17,0.1,0.1,32.4,15.5,252.7,1022.3,72.3,12.2,36.9,2.8,1.0,10.0,07:13:36,1645679616,17:51:06,1645717866,0.77,"Snow, Partially cloudy",Partly cloudy throughout the day with morning ...,snow,obs,snow,33177099999;UKLR;remote;33301099999,07:00:00,1645678800,2.0,-1.1,93.09,1.0,0.0,0.0,0.0,0.1,['snow'],19.1,10.8,300.0,1022.0,10.0,100.0,0.0,,0.0,10.0,Overcast,cloudy,obs,UKLR;33301099999
8,50.7469,25.3263,"Луцьк, Луцький район, Україна","Lutsk,Ukraine",Europe/Kiev,2.0,2022-02-24,1645653600,4.9,0.7,2.6,4.0,-3.1,-0.2,0.0,83.7,0.118,100.0,4.17,0.1,0.1,32.4,15.5,252.7,1022.3,72.3,12.2,36.9,2.8,1.0,10.0,07:13:36,1645679616,17:51:06,1645717866,0.77,"Snow, Partially cloudy",Partly cloudy throughout the day with morning ...,snow,obs,snow,33177099999;UKLR;remote;33301099999,08:00:00,1645682400,1.8,-0.3,91.32,0.6,0.118,100.0,0.0,0.1,['snow'],16.9,7.2,303.0,1024.2,4.4,100.0,,,,10.0,"Snow, Overcast",rain,obs,33177099999;UKLR;33301099999
9,50.7469,25.3263,"Луцьк, Луцький район, Україна","Lutsk,Ukraine",Europe/Kiev,2.0,2022-02-24,1645653600,4.9,0.7,2.6,4.0,-3.1,-0.2,0.0,83.7,0.118,100.0,4.17,0.1,0.1,32.4,15.5,252.7,1022.3,72.3,12.2,36.9,2.8,1.0,10.0,07:13:36,1645679616,17:51:06,1645717866,0.77,"Snow, Partially cloudy",Partly cloudy throughout the day with morning ...,snow,obs,snow,33177099999;UKLR;remote;33301099999,09:00:00,1645686000,2.0,-1.1,93.09,1.0,0.0,0.0,0.0,0.1,,15.5,10.8,300.0,1024.0,2.0,100.0,15.0,0.1,0.0,10.0,Overcast,cloudy,obs,UKLR;33301099999


In [156]:
weather_exclude = [
    "day_feelslikemax",
    "day_feelslikemin",
    "day_sunriseEpoch",
    "day_sunsetEpoch",
    "day_description",
    "city_latitude",
    "city_longitude",
    "city_address",
    "city_timezone",
    "city_tzoffset",
    "day_feelslike",
    "day_precipprob",
    "day_snow",
    "day_snowdepth",
    "day_windgust",
    "day_windspeed",
    "day_winddir",
    "day_pressure",
    "day_cloudcover",
    "day_visibility",
    "day_severerisk",
    "day_conditions",
    "day_icon",
    "day_source",
    "day_preciptype",
    "day_stations",
    "hour_icon",
    "hour_source",
    "hour_stations",
    "hour_feelslike"
]

df_weather_v2 = df_weather.drop(weather_exclude, axis=1)

In [157]:
df_weather_v2["city"] = df_weather_v2["city_resolvedAddress"].apply(lambda x: x.split(",")[0])
df_weather_v2["city"] = df_weather_v2["city"].replace('Хмельницька область', "Хмельницький")

In [158]:
print(df_weather_v2.shape)
df_weather_v2.head(5)

(182712, 38)


Unnamed: 0,city_resolvedAddress,day_datetime,day_datetimeEpoch,day_tempmax,day_tempmin,day_temp,day_dew,day_humidity,day_precip,day_precipcover,day_solarradiation,day_solarenergy,day_uvindex,day_sunrise,day_sunset,day_moonphase,hour_datetime,hour_datetimeEpoch,hour_temp,hour_humidity,hour_dew,hour_precip,hour_precipprob,hour_snow,hour_snowdepth,hour_preciptype,hour_windgust,hour_windspeed,hour_winddir,hour_pressure,hour_visibility,hour_cloudcover,hour_solarradiation,hour_solarenergy,hour_uvindex,hour_severerisk,hour_conditions,city
0,"Луцьк, Луцький район, Україна",2022-02-24,1645653600,4.9,0.7,2.6,0.0,83.7,0.118,4.17,36.9,2.8,1.0,07:13:36,17:51:06,0.77,00:00:00,1645653600,2.4,89.18,0.8,0.0,0.0,0.1,0.2,['snow'],31.3,15.5,275.6,1020.0,0.0,91.5,0.0,,0.0,10.0,Overcast,Луцьк
1,"Луцьк, Луцький район, Україна",2022-02-24,1645653600,4.9,0.7,2.6,0.0,83.7,0.118,4.17,36.9,2.8,1.0,07:13:36,17:51:06,0.77,01:00:00,1645657200,2.4,87.9,0.6,0.0,0.0,0.0,0.2,['snow'],27.7,14.8,280.3,1021.0,0.2,88.2,0.0,,0.0,10.0,Partially cloudy,Луцьк
2,"Луцьк, Луцький район, Україна",2022-02-24,1645653600,4.9,0.7,2.6,0.0,83.7,0.118,4.17,36.9,2.8,1.0,07:13:36,17:51:06,0.77,02:00:00,1645660800,2.9,88.58,1.2,0.0,0.0,0.0,0.1,['snow'],29.2,14.4,310.0,1022.0,10.0,100.0,,,,10.0,Overcast,Луцьк
3,"Луцьк, Луцький район, Україна",2022-02-24,1645653600,4.9,0.7,2.6,0.0,83.7,0.118,4.17,36.9,2.8,1.0,07:13:36,17:51:06,0.77,03:00:00,1645664400,2.3,86.63,0.3,0.0,0.0,0.0,0.1,['snow'],23.8,13.3,295.1,1021.0,0.1,92.0,0.0,,0.0,10.0,Overcast,Луцьк
4,"Луцьк, Луцький район, Україна",2022-02-24,1645653600,4.9,0.7,2.6,0.0,83.7,0.118,4.17,36.9,2.8,1.0,07:13:36,17:51:06,0.77,04:00:00,1645668000,1.9,87.85,0.1,0.0,0.0,0.0,0.1,['snow'],24.5,13.3,305.8,1021.0,0.0,93.8,0.0,,0.0,10.0,Overcast,Луцьк


## merging data

In [159]:
df_regions = pd.read_csv(f"{REGIONS_DATA_FOLDER}/{REGIONS_DATA_FILE}")
df_regions.head(5)

Unnamed: 0,region,center_city_ua,center_city_en,region_alt,region_id
0,АР Крим,Сімферополь,Simferopol,Крим,1
1,Вінницька,Вінниця,Vinnytsia,Вінниччина,2
2,Волинська,Луцьк,Lutsk,Волинь,3
3,Дніпропетровська,Дніпро,Dnipro,Дніпропетровщина,4
4,Донецька,Донецьк,Donetsk,Донеччина,5


In [160]:
df_weather_reg = pd.merge(df_weather_v2, df_regions, left_on="city",right_on="center_city_ua")

print(df_weather_reg.shape)
df_weather_reg.head(5)

(182712, 43)


Unnamed: 0,city_resolvedAddress,day_datetime,day_datetimeEpoch,day_tempmax,day_tempmin,day_temp,day_dew,day_humidity,day_precip,day_precipcover,day_solarradiation,day_solarenergy,day_uvindex,day_sunrise,day_sunset,day_moonphase,hour_datetime,hour_datetimeEpoch,hour_temp,hour_humidity,hour_dew,hour_precip,hour_precipprob,hour_snow,hour_snowdepth,hour_preciptype,hour_windgust,hour_windspeed,hour_winddir,hour_pressure,hour_visibility,hour_cloudcover,hour_solarradiation,hour_solarenergy,hour_uvindex,hour_severerisk,hour_conditions,city,region,center_city_ua,center_city_en,region_alt,region_id
0,"Луцьк, Луцький район, Україна",2022-02-24,1645653600,4.9,0.7,2.6,0.0,83.7,0.118,4.17,36.9,2.8,1.0,07:13:36,17:51:06,0.77,00:00:00,1645653600,2.4,89.18,0.8,0.0,0.0,0.1,0.2,['snow'],31.3,15.5,275.6,1020.0,0.0,91.5,0.0,,0.0,10.0,Overcast,Луцьк,Волинська,Луцьк,Lutsk,Волинь,3
1,"Луцьк, Луцький район, Україна",2022-02-24,1645653600,4.9,0.7,2.6,0.0,83.7,0.118,4.17,36.9,2.8,1.0,07:13:36,17:51:06,0.77,01:00:00,1645657200,2.4,87.9,0.6,0.0,0.0,0.0,0.2,['snow'],27.7,14.8,280.3,1021.0,0.2,88.2,0.0,,0.0,10.0,Partially cloudy,Луцьк,Волинська,Луцьк,Lutsk,Волинь,3
2,"Луцьк, Луцький район, Україна",2022-02-24,1645653600,4.9,0.7,2.6,0.0,83.7,0.118,4.17,36.9,2.8,1.0,07:13:36,17:51:06,0.77,02:00:00,1645660800,2.9,88.58,1.2,0.0,0.0,0.0,0.1,['snow'],29.2,14.4,310.0,1022.0,10.0,100.0,,,,10.0,Overcast,Луцьк,Волинська,Луцьк,Lutsk,Волинь,3
3,"Луцьк, Луцький район, Україна",2022-02-24,1645653600,4.9,0.7,2.6,0.0,83.7,0.118,4.17,36.9,2.8,1.0,07:13:36,17:51:06,0.77,03:00:00,1645664400,2.3,86.63,0.3,0.0,0.0,0.0,0.1,['snow'],23.8,13.3,295.1,1021.0,0.1,92.0,0.0,,0.0,10.0,Overcast,Луцьк,Волинська,Луцьк,Lutsk,Волинь,3
4,"Луцьк, Луцький район, Україна",2022-02-24,1645653600,4.9,0.7,2.6,0.0,83.7,0.118,4.17,36.9,2.8,1.0,07:13:36,17:51:06,0.77,04:00:00,1645668000,1.9,87.85,0.1,0.0,0.0,0.0,0.1,['snow'],24.5,13.3,305.8,1021.0,0.0,93.8,0.0,,0.0,10.0,Overcast,Луцьк,Волинська,Луцьк,Lutsk,Волинь,3


In [161]:
df_weather_v2.shape

(182712, 38)

### Merging weather and events

In [162]:
df_events_v2.dtypes

region_title                        object
region_city                         object
all_region                           int64
start                               object
end                                 object
clean_end                           object
intersection_alarm_id              float64
start_time                  datetime64[ns]
end_time                    datetime64[ns]
start_hour                  datetime64[ns]
end_hour                    datetime64[ns]
day_date                            object
start_hour_datetimeEpoch             int64
end_hour_datetimeEpoch               int64
dtype: object

In [163]:
print(df_events_v2.shape)
df_events_v2.head(5)

(19933, 14)


Unnamed: 0,region_title,region_city,all_region,start,end,clean_end,intersection_alarm_id,start_time,end_time,start_hour,end_hour,day_date,start_hour_datetimeEpoch,end_hour_datetimeEpoch
0,Вінниччина,Вінниця,0,2022-02-25 22:55:42,2022-02-25 23:41:53,2022-02-25 23:41:53,,2022-02-25 22:55:42,2022-02-25 23:41:53,2022-02-25 22:00:00,2022-02-26 00:00:00,2022-02-25,1645826400,1645833600
1,Львівщина,Львів,0,2022-02-26 06:26:17,2022-02-26 07:15:28,2022-02-26 07:15:28,,2022-02-26 06:26:17,2022-02-26 07:15:28,2022-02-26 06:00:00,2022-02-26 08:00:00,2022-02-26,1645855200,1645862400
2,Одещина,Одеса,0,2022-02-26 07:16:58,2022-02-26 07:47:03,2022-02-26 07:47:03,,2022-02-26 07:16:58,2022-02-26 07:47:03,2022-02-26 07:00:00,2022-02-26 08:00:00,2022-02-26,1645858800,1645862400
3,Житомирщина,Житомир,0,2022-02-26 08:05:54,2022-02-26 09:36:36,2022-02-26 09:36:36,,2022-02-26 08:05:54,2022-02-26 09:36:36,2022-02-26 08:00:00,2022-02-26 10:00:00,2022-02-26,1645862400,1645869600
4,Вінниччина,Вінниця,0,2022-02-26 08:39:39,2022-02-26 10:42:41,2022-02-26 10:42:41,,2022-02-26 08:39:39,2022-02-26 10:42:41,2022-02-26 08:00:00,2022-02-26 11:00:00,2022-02-26,1645862400,1645873200


In [164]:
events_dict = df_events_v2.to_dict('records')
events_by_hour = []

events_dict[0]

{'region_title': 'Вінниччина',
 'region_city': 'Вінниця',
 'all_region': 0,
 'start': '2022-02-25 22:55:42',
 'end': '2022-02-25 23:41:53',
 'clean_end': '2022-02-25 23:41:53',
 'intersection_alarm_id': nan,
 'start_time': Timestamp('2022-02-25 22:55:42'),
 'end_time': Timestamp('2022-02-25 23:41:53'),
 'start_hour': Timestamp('2022-02-25 22:00:00'),
 'end_hour': Timestamp('2022-02-26 00:00:00'),
 'day_date': datetime.date(2022, 2, 25),
 'start_hour_datetimeEpoch': 1645826400,
 'end_hour_datetimeEpoch': 1645833600}

In [165]:
for event in events_dict:
    for d in pd.date_range(start=event["start_hour"], end=event["end_hour"], freq='1H'):
        et = event.copy()
        et["hour_level_event_time"] = d
        events_by_hour.append(et)

df_events_v3 = pd.DataFrame.from_dict(events_by_hour)
df_events_v3["hour_level_event_datetimeEpoch"] = df_events_v3["hour_level_event_time"].apply(lambda x: int(calendar.timegm(x.timetuple()))  if not isNaN(x) else None)

print(df_events_v3.shape)
df_events_v3.head(5)

(58860, 16)


Unnamed: 0,region_title,region_city,all_region,start,end,clean_end,intersection_alarm_id,start_time,end_time,start_hour,end_hour,day_date,start_hour_datetimeEpoch,end_hour_datetimeEpoch,hour_level_event_time,hour_level_event_datetimeEpoch
0,Вінниччина,Вінниця,0,2022-02-25 22:55:42,2022-02-25 23:41:53,2022-02-25 23:41:53,,2022-02-25 22:55:42,2022-02-25 23:41:53,2022-02-25 22:00:00,2022-02-26 00:00:00,2022-02-25,1645826400,1645833600,2022-02-25 22:00:00,1645826400
1,Вінниччина,Вінниця,0,2022-02-25 22:55:42,2022-02-25 23:41:53,2022-02-25 23:41:53,,2022-02-25 22:55:42,2022-02-25 23:41:53,2022-02-25 22:00:00,2022-02-26 00:00:00,2022-02-25,1645826400,1645833600,2022-02-25 23:00:00,1645830000
2,Вінниччина,Вінниця,0,2022-02-25 22:55:42,2022-02-25 23:41:53,2022-02-25 23:41:53,,2022-02-25 22:55:42,2022-02-25 23:41:53,2022-02-25 22:00:00,2022-02-26 00:00:00,2022-02-25,1645826400,1645833600,2022-02-26 00:00:00,1645833600
3,Львівщина,Львів,0,2022-02-26 06:26:17,2022-02-26 07:15:28,2022-02-26 07:15:28,,2022-02-26 06:26:17,2022-02-26 07:15:28,2022-02-26 06:00:00,2022-02-26 08:00:00,2022-02-26,1645855200,1645862400,2022-02-26 06:00:00,1645855200
4,Львівщина,Львів,0,2022-02-26 06:26:17,2022-02-26 07:15:28,2022-02-26 07:15:28,,2022-02-26 06:26:17,2022-02-26 07:15:28,2022-02-26 06:00:00,2022-02-26 08:00:00,2022-02-26,1645855200,1645862400,2022-02-26 07:00:00,1645858800


In [166]:
df_events_v4 = df_events_v3.copy().add_prefix('event_')
df_events_v4.head(5)

Unnamed: 0,event_region_title,event_region_city,event_all_region,event_start,event_end,event_clean_end,event_intersection_alarm_id,event_start_time,event_end_time,event_start_hour,event_end_hour,event_day_date,event_start_hour_datetimeEpoch,event_end_hour_datetimeEpoch,event_hour_level_event_time,event_hour_level_event_datetimeEpoch
0,Вінниччина,Вінниця,0,2022-02-25 22:55:42,2022-02-25 23:41:53,2022-02-25 23:41:53,,2022-02-25 22:55:42,2022-02-25 23:41:53,2022-02-25 22:00:00,2022-02-26 00:00:00,2022-02-25,1645826400,1645833600,2022-02-25 22:00:00,1645826400
1,Вінниччина,Вінниця,0,2022-02-25 22:55:42,2022-02-25 23:41:53,2022-02-25 23:41:53,,2022-02-25 22:55:42,2022-02-25 23:41:53,2022-02-25 22:00:00,2022-02-26 00:00:00,2022-02-25,1645826400,1645833600,2022-02-25 23:00:00,1645830000
2,Вінниччина,Вінниця,0,2022-02-25 22:55:42,2022-02-25 23:41:53,2022-02-25 23:41:53,,2022-02-25 22:55:42,2022-02-25 23:41:53,2022-02-25 22:00:00,2022-02-26 00:00:00,2022-02-25,1645826400,1645833600,2022-02-26 00:00:00,1645833600
3,Львівщина,Львів,0,2022-02-26 06:26:17,2022-02-26 07:15:28,2022-02-26 07:15:28,,2022-02-26 06:26:17,2022-02-26 07:15:28,2022-02-26 06:00:00,2022-02-26 08:00:00,2022-02-26,1645855200,1645862400,2022-02-26 06:00:00,1645855200
4,Львівщина,Львів,0,2022-02-26 06:26:17,2022-02-26 07:15:28,2022-02-26 07:15:28,,2022-02-26 06:26:17,2022-02-26 07:15:28,2022-02-26 06:00:00,2022-02-26 08:00:00,2022-02-26,1645855200,1645862400,2022-02-26 07:00:00,1645858800


In [167]:
df_weather_v4 = df_weather_reg.merge(df_events_v4, 
                                     how="left", 
                                     left_on=["region_alt","hour_datetimeEpoch"],
                                     right_on=["event_region_title","event_hour_level_event_datetimeEpoch"])

In [168]:
print(df_weather_v4.shape)
df_weather_v4.sample(5)

(195084, 59)


Unnamed: 0,city_resolvedAddress,day_datetime,day_datetimeEpoch,day_tempmax,day_tempmin,day_temp,day_dew,day_humidity,day_precip,day_precipcover,day_solarradiation,day_solarenergy,day_uvindex,day_sunrise,day_sunset,day_moonphase,hour_datetime,hour_datetimeEpoch,hour_temp,hour_humidity,hour_dew,hour_precip,hour_precipprob,hour_snow,hour_snowdepth,hour_preciptype,hour_windgust,hour_windspeed,hour_winddir,hour_pressure,hour_visibility,hour_cloudcover,hour_solarradiation,hour_solarenergy,hour_uvindex,hour_severerisk,hour_conditions,city,region,center_city_ua,center_city_en,region_alt,region_id,event_region_title,event_region_city,event_all_region,event_start,event_end,event_clean_end,event_intersection_alarm_id,event_start_time,event_end_time,event_start_hour,event_end_hour,event_day_date,event_start_hour_datetimeEpoch,event_end_hour_datetimeEpoch,event_hour_level_event_time,event_hour_level_event_datetimeEpoch
3246,"Луцьк, Луцький район, Україна",2022-07-05,1656968400,30.6,14.5,24.0,14.2,58.2,0.0,0.0,270.5,23.3,9.0,05:12:48,21:33:25,0.19,15:00:00,1657022400,30.0,35.01,12.9,0.0,0.0,0.0,0.0,,15.8,6.5,16.0,1014.9,26.3,60.0,823.0,3.0,8.0,10.0,Partially cloudy,Луцьк,Волинська,Луцьк,Lutsk,Волинь,3,,,,,,,,NaT,NaT,NaT,NaT,,,,NaT,
93317,"Хмельницька область, Україна",2022-09-02,1662066000,19.0,9.2,13.9,6.6,63.8,1.5,12.5,141.4,12.1,6.0,06:29:06,19:53:38,0.19,07:00:00,1662091200,9.3,77.67,5.6,0.0,0.0,0.0,0.0,,5.4,2.9,47.8,1021.0,24.1,86.3,6.0,0.0,0.0,10.0,Partially cloudy,Хмельницький,Хмельницька,Хмельницький,Khmelnytskyi,Хмельниччина,22,,,,,,,,NaT,NaT,NaT,NaT,,,,NaT,
171467,"Черкаси, Україна",2022-03-29,1648501200,19.0,5.2,11.3,-1.6,42.2,0.4,4.17,180.9,15.5,7.0,06:35:44,19:18:19,0.89,08:00:00,1648530000,6.2,44.2,-5.1,0.0,0.0,0.0,0.0,,29.5,17.6,224.9,1009.0,24.1,0.0,93.0,0.3,1.0,10.0,Clear,Черкаси,Черкаська,Черкаси,Cherkasy,Черкащина,23,Черкащина,Черкаська обл.,1.0,2022-03-29 03:31:57,2022-03-29 04:14:34,2022-03-29 04:14:34,,2022-03-29 03:31:57,2022-03-29 04:14:34,2022-03-29 03:00:00,2022-03-29 05:00:00,2022-03-29,1648523000.0,1648530000.0,2022-03-29 05:00:00,1648530000.0
135888,"Житомир, Україна",2022-10-30,1667077200,14.7,10.3,12.3,8.9,79.7,0.0,0.0,83.5,7.6,4.0,06:51:56,16:45:20,0.15,11:00:00,1667120400,12.9,72.12,8.0,0.0,0.0,0.0,0.0,,37.8,20.2,307.5,1018.0,24.1,75.1,355.0,1.3,4.0,10.0,Partially cloudy,Житомир,Житомирська,Житомир,Zhytomyr,Житомирщина,6,,,,,,,,NaT,NaT,NaT,NaT,,,,NaT,
135357,"Житомир, Україна",2022-10-08,1665176400,19.2,3.4,10.2,5.0,73.8,0.0,0.0,151.5,13.1,5.0,07:16:02,18:28:59,0.4,15:00:00,1665230400,19.2,38.97,4.9,0.0,0.0,0.0,0.0,,21.6,7.2,180.0,1021.6,10.0,0.0,482.0,1.7,5.0,10.0,Clear,Житомир,Житомирська,Житомир,Zhytomyr,Житомирщина,6,,,,,,,,NaT,NaT,NaT,NaT,,,,NaT,


In [169]:
df_weather_v4.dtypes

city_resolvedAddress                            object
day_datetime                            datetime64[ns]
day_datetimeEpoch                                int64
day_tempmax                                    float64
day_tempmin                                    float64
day_temp                                       float64
day_dew                                        float64
day_humidity                                   float64
day_precip                                     float64
day_precipcover                                float64
day_solarradiation                             float64
day_solarenergy                                float64
day_uvindex                                    float64
day_sunrise                                     object
day_sunset                                      object
day_moonphase                                  float64
hour_datetime                                   object
hour_datetimeEpoch                               int64
hour_temp 

In [130]:
df_weather_v4.to_csv(f"{OUTPUT_FOLDER}/{WEATHER_EVENTS_OUTPUT_DATA_FILE}", sep=";", index=False)

In [131]:
# df_weather_v5 = df_weather_v4.merge(df_isw[["date_tomorrow_datetime", "keywords"]].add_prefix('isw_'),
#                                     how="left",
#                                     left_on="day_datetime",
#                                     right_on="isw_date_tomorrow_datetime")
# df_weather_v5.head(10)

In [132]:
# df_weather_v5.head(5)

In [133]:
# df_weather_v5.to_csv(f"{OUTPUT_FOLDER}/{WEATHER_EVENTS_KEYWORDS_OUTPUT_DATA_FILE}", sep=";", index=False)