In [249]:
import datetime
import calendar
import numpy as np
import pandas as pd

import pickle

from sklearn.feature_extraction.text import TfidfTransformer

In [250]:
# Lift pandas' display limits so frames are rendered with all columns and rows
for display_option in ("display.max_columns", "display.max_rows"):
    pd.set_option(display_option, None)

In [251]:
# ISW reports dataset (input, semicolon-separated CSV)
REPORTS_DATA_FOLDER = "data/2_isw_preprocessed"
REPORTS_DATA_FILE = "all_days.csv"

# Number of top-scoring TF-IDF keywords kept per report
TFIDF_NUMBER = 100

# Events dataset (input, semicolon-separated CSV)
EVENTS_DATA_FOLDER = "data/1_events"
EVENTS_DATA_FILE = "alarms.csv"

# Hourly weather time series (input)
WEATHER_DATA_FOLDER = "data/1_weather"
WEATHER_DATA_FILE = "all_weather_by_hour_v2.csv"

# Regions lookup table (input)
REGIONS_DATA_FOLDER = "data/0_meta"
REGIONS_DATA_FILE = "regions.csv"

# Folder holding the pickled TF-IDF transformer and count vectorizer
MODEL_FOLDER = "model"

# Output folder and the file names written into it
OUTPUT_FOLDER = "data/4_all_data_preprocessed"
ISW_OUTPUT_DATA_FILE = "all_isw.csv"
WEATHER_EVENTS_OUTPUT_DATA_FILE = "all_hourly_weather_events_v2.csv"
WEATHER_EVENTS_KEYWORDS_OUTPUT_DATA_FILE = "all_hourly_weather_events_isw_v2.csv"

# Pickled models are loaded from "<MODEL_FOLDER>/<model name>_<version>.pkl"
tfidf_transformer_model = "tfidf_transformer"
count_vectorizer_model = "count_vectorizer"

tfidf_transformer_version = "v4"
count_vectorizer_version = "v4"

In [252]:
def isNaN(num):
    """Report whether ``num`` is a NaN-like value.

    NaN-like values (float NaN, ``pandas.NaT``) compare unequal to themselves,
    while ordinary values (including ``None``) do not.

    Returns:
        The result of ``num != num`` (a bool for plain scalars).
    """
    compares_unequal_to_itself = num != num
    return compares_unequal_to_itself

## prepare events data

In [253]:
# Load the events data (semicolon-separated CSV)
events_csv_path = f"{EVENTS_DATA_FOLDER}/{EVENTS_DATA_FILE}"
df_events = pd.read_csv(events_csv_path, sep=";")

# The events dataset uses different ids for the regions, so its id columns are discarded
df_events_v2 = df_events.drop(columns=["id", "region_id"])
df_events_v2.sample(n=5)

Unnamed: 0,region_title,region_city,all_region,start,end,clean_end,intersection_alarm_id
13529,Львівщина,Львівська обл.,1,2022-08-24 13:14:24,2022-08-24 13:37:59,2022-08-24 13:37:59,
7463,Київщина,Київ,0,2022-05-08 17:50:02,2022-05-08 19:03:10,2022-05-08 17:51:17,9754.0
8677,Буковина,Чернівецька обл.,1,2022-05-25 20:05:26,2022-05-25 21:53:52,2022-05-25 21:53:52,
12674,Одещина,Одеська обл.,1,2022-08-10 15:07:12,2022-08-10 15:42:12,2022-08-10 15:42:12,
10982,Запоріжжя,Запорізька обл.,1,2022-07-05 02:52:59,2022-07-05 04:45:06,2022-07-05 04:45:06,


In [254]:
# Calculate discrete (whole-hour) boundaries for each event's start and end
df_events_v2["start_time"] = pd.to_datetime(df_events_v2["start"])
df_events_v2["end_time"] = pd.to_datetime(df_events_v2["end"])

# Start is rounded down to the hour, end is rounded up to the hour
df_events_v2["start_hour"] = df_events_v2["start_time"].dt.floor('H')
df_events_v2["end_hour"] = df_events_v2["end_time"].dt.ceil('H')

# This frame has no separate event-hour column to fall back on, so a missing
# boundary is replaced by the other boundary of the same event (vectorized,
# no row-wise apply). Rows where both boundaries are missing stay NaT.
start_hour_filled = df_events_v2["start_hour"].fillna(df_events_v2["end_hour"])
end_hour_filled = df_events_v2["end_hour"].fillna(df_events_v2["start_hour"])
df_events_v2["start_hour"] = start_hour_filled
df_events_v2["end_hour"] = end_hour_filled

# Present the date as a separate column
df_events_v2["day_date"] = df_events_v2["start_time"].dt.date


def _hour_to_epoch(hour):
    """Return whole seconds since the Unix epoch for ``hour``, or None when it is NaT.

    ``calendar.timegm`` reads the (timezone-naive) time tuple as UTC.
    """
    if isNaN(hour):
        return None
    return int(calendar.timegm(hour.timetuple()))


# Present the time as epoch seconds (faster ops)
# NOTE(review): the event timestamps are converted as if they were UTC, while the
# weather hour_datetimeEpoch values shown in this notebook correspond to local time
# (00:00:00 on 2022-02-24 <-> 1645653600, i.e. 22:00 UTC the day before). If the
# event times are local rather than UTC, the merge on epoch is shifted by the
# UTC offset - confirm against the events data source.
df_events_v2["start_hour_datetimeEpoch"] = df_events_v2["start_hour"].apply(_hour_to_epoch)
df_events_v2["end_hour_datetimeEpoch"] = df_events_v2["end_hour"].apply(_hour_to_epoch)

df_events_v2.head(5)

Unnamed: 0,region_title,region_city,all_region,start,end,clean_end,intersection_alarm_id,start_time,end_time,start_hour,end_hour,day_date,start_hour_datetimeEpoch,end_hour_datetimeEpoch
0,Вінниччина,Вінниця,0,2022-02-25 22:55:42,2022-02-25 23:41:53,2022-02-25 23:41:53,,2022-02-25 22:55:42,2022-02-25 23:41:53,2022-02-25 22:00:00,2022-02-26 00:00:00,2022-02-25,1645826400,1645833600
1,Львівщина,Львів,0,2022-02-26 06:26:17,2022-02-26 07:15:28,2022-02-26 07:15:28,,2022-02-26 06:26:17,2022-02-26 07:15:28,2022-02-26 06:00:00,2022-02-26 08:00:00,2022-02-26,1645855200,1645862400
2,Одещина,Одеса,0,2022-02-26 07:16:58,2022-02-26 07:47:03,2022-02-26 07:47:03,,2022-02-26 07:16:58,2022-02-26 07:47:03,2022-02-26 07:00:00,2022-02-26 08:00:00,2022-02-26,1645858800,1645862400
3,Житомирщина,Житомир,0,2022-02-26 08:05:54,2022-02-26 09:36:36,2022-02-26 09:36:36,,2022-02-26 08:05:54,2022-02-26 09:36:36,2022-02-26 08:00:00,2022-02-26 10:00:00,2022-02-26,1645862400,1645869600
4,Вінниччина,Вінниця,0,2022-02-26 08:39:39,2022-02-26 10:42:41,2022-02-26 10:42:41,,2022-02-26 08:39:39,2022-02-26 10:42:41,2022-02-26 08:00:00,2022-02-26 11:00:00,2022-02-26,1645862400,1645873200


## prepare weather

In [255]:
# Load the hourly weather time series and parse the day column as datetime
weather_csv_path = f"{WEATHER_DATA_FOLDER}/{WEATHER_DATA_FILE}"
df_weather = pd.read_csv(weather_csv_path)
df_weather["day_datetime"] = pd.to_datetime(df_weather["day_datetime"])

weather_rows, weather_cols = df_weather.shape
print((weather_rows, weather_cols))
df_weather.head()

(182712, 67)


Unnamed: 0,city_latitude,city_longitude,city_resolvedAddress,city_address,city_timezone,city_tzoffset,day_datetime,day_datetimeEpoch,day_tempmax,day_tempmin,day_temp,day_feelslikemax,day_feelslikemin,day_feelslike,day_dew,day_humidity,day_precip,day_precipprob,day_precipcover,day_snow,day_snowdepth,day_windgust,day_windspeed,day_winddir,day_pressure,day_cloudcover,day_visibility,day_solarradiation,day_solarenergy,day_uvindex,day_severerisk,day_sunrise,day_sunriseEpoch,day_sunset,day_sunsetEpoch,day_moonphase,day_conditions,day_description,day_icon,day_source,day_preciptype,day_stations,hour_datetime,hour_datetimeEpoch,hour_temp,hour_feelslike,hour_humidity,hour_dew,hour_precip,hour_precipprob,hour_snow,hour_snowdepth,hour_preciptype,hour_windgust,hour_windspeed,hour_winddir,hour_pressure,hour_visibility,hour_cloudcover,hour_solarradiation,hour_solarenergy,hour_uvindex,hour_severerisk,hour_conditions,hour_icon,hour_source,hour_stations
0,50.7469,25.3263,"Луцьк, Луцький район, Україна","Lutsk,Ukraine",Europe/Kiev,2.0,2022-02-24,1645653600,4.9,0.7,2.6,4.0,-3.1,-0.2,0.0,83.7,0.118,100.0,4.17,0.1,0.1,32.4,15.5,252.7,1022.3,72.3,12.2,36.9,2.8,1.0,10.0,07:13:36,1645679616,17:51:06,1645717866,0.77,"Snow, Partially cloudy",Partly cloudy throughout the day with morning ...,snow,obs,snow,33177099999;UKLR;remote;33301099999,00:00:00,1645653600,2.4,-1.6,89.18,0.8,0.0,0.0,0.1,0.2,['snow'],31.3,15.5,275.6,1020.0,0.0,91.5,0.0,,0.0,10.0,Overcast,snow,obs,remote
1,50.7469,25.3263,"Луцьк, Луцький район, Україна","Lutsk,Ukraine",Europe/Kiev,2.0,2022-02-24,1645653600,4.9,0.7,2.6,4.0,-3.1,-0.2,0.0,83.7,0.118,100.0,4.17,0.1,0.1,32.4,15.5,252.7,1022.3,72.3,12.2,36.9,2.8,1.0,10.0,07:13:36,1645679616,17:51:06,1645717866,0.77,"Snow, Partially cloudy",Partly cloudy throughout the day with morning ...,snow,obs,snow,33177099999;UKLR;remote;33301099999,01:00:00,1645657200,2.4,-1.5,87.9,0.6,0.0,0.0,0.0,0.2,['snow'],27.7,14.8,280.3,1021.0,0.2,88.2,0.0,,0.0,10.0,Partially cloudy,fog,obs,remote
2,50.7469,25.3263,"Луцьк, Луцький район, Україна","Lutsk,Ukraine",Europe/Kiev,2.0,2022-02-24,1645653600,4.9,0.7,2.6,4.0,-3.1,-0.2,0.0,83.7,0.118,100.0,4.17,0.1,0.1,32.4,15.5,252.7,1022.3,72.3,12.2,36.9,2.8,1.0,10.0,07:13:36,1645679616,17:51:06,1645717866,0.77,"Snow, Partially cloudy",Partly cloudy throughout the day with morning ...,snow,obs,snow,33177099999;UKLR;remote;33301099999,02:00:00,1645660800,2.9,-0.8,88.58,1.2,0.0,0.0,0.0,0.1,['snow'],29.2,14.4,310.0,1022.0,10.0,100.0,,,,10.0,Overcast,cloudy,obs,33177099999
3,50.7469,25.3263,"Луцьк, Луцький район, Україна","Lutsk,Ukraine",Europe/Kiev,2.0,2022-02-24,1645653600,4.9,0.7,2.6,4.0,-3.1,-0.2,0.0,83.7,0.118,100.0,4.17,0.1,0.1,32.4,15.5,252.7,1022.3,72.3,12.2,36.9,2.8,1.0,10.0,07:13:36,1645679616,17:51:06,1645717866,0.77,"Snow, Partially cloudy",Partly cloudy throughout the day with morning ...,snow,obs,snow,33177099999;UKLR;remote;33301099999,03:00:00,1645664400,2.3,-1.3,86.63,0.3,0.0,0.0,0.0,0.1,['snow'],23.8,13.3,295.1,1021.0,0.1,92.0,0.0,,0.0,10.0,Overcast,fog,obs,remote
4,50.7469,25.3263,"Луцьк, Луцький район, Україна","Lutsk,Ukraine",Europe/Kiev,2.0,2022-02-24,1645653600,4.9,0.7,2.6,4.0,-3.1,-0.2,0.0,83.7,0.118,100.0,4.17,0.1,0.1,32.4,15.5,252.7,1022.3,72.3,12.2,36.9,2.8,1.0,10.0,07:13:36,1645679616,17:51:06,1645717866,0.77,"Snow, Partially cloudy",Partly cloudy throughout the day with morning ...,snow,obs,snow,33177099999;UKLR;remote;33301099999,04:00:00,1645668000,1.9,-1.8,87.85,0.1,0.0,0.0,0.0,0.1,['snow'],24.5,13.3,305.8,1021.0,0.0,93.8,0.0,,0.0,10.0,Overcast,cloudy,obs,remote


In [256]:
# Columns removed to reduce the number of features and duplicated data
weather_exclude = [
    "day_feelslikemax", "day_feelslikemin",
    "day_sunriseEpoch", "day_sunsetEpoch",
    "day_description",
    "city_latitude", "city_longitude", "city_address", "city_timezone", "city_tzoffset",
    "day_feelslike", "day_precipprob",
    "day_snow", "day_snowdepth",
    "day_windgust", "day_windspeed", "day_winddir",
    "day_pressure", "day_cloudcover", "day_visibility", "day_severerisk",
    "day_conditions", "day_icon", "day_source", "day_preciptype", "day_stations",
    "hour_icon", "hour_source", "hour_stations",
    "hour_feelslike",
]

df_weather_v2 = df_weather.drop(columns=weather_exclude)

# The resolved address has different formats for different cities: take the part
# before the first comma, then map the one region-style name to its city name
city_names = df_weather_v2["city_resolvedAddress"].map(lambda address: address.split(",")[0])
df_weather_v2["city"] = city_names.replace({'Хмельницька область': "Хмельницький"})

reduced_rows, reduced_cols = df_weather_v2.shape
print((reduced_rows, reduced_cols))
df_weather_v2.head()

(182712, 38)


Unnamed: 0,city_resolvedAddress,day_datetime,day_datetimeEpoch,day_tempmax,day_tempmin,day_temp,day_dew,day_humidity,day_precip,day_precipcover,day_solarradiation,day_solarenergy,day_uvindex,day_sunrise,day_sunset,day_moonphase,hour_datetime,hour_datetimeEpoch,hour_temp,hour_humidity,hour_dew,hour_precip,hour_precipprob,hour_snow,hour_snowdepth,hour_preciptype,hour_windgust,hour_windspeed,hour_winddir,hour_pressure,hour_visibility,hour_cloudcover,hour_solarradiation,hour_solarenergy,hour_uvindex,hour_severerisk,hour_conditions,city
0,"Луцьк, Луцький район, Україна",2022-02-24,1645653600,4.9,0.7,2.6,0.0,83.7,0.118,4.17,36.9,2.8,1.0,07:13:36,17:51:06,0.77,00:00:00,1645653600,2.4,89.18,0.8,0.0,0.0,0.1,0.2,['snow'],31.3,15.5,275.6,1020.0,0.0,91.5,0.0,,0.0,10.0,Overcast,Луцьк
1,"Луцьк, Луцький район, Україна",2022-02-24,1645653600,4.9,0.7,2.6,0.0,83.7,0.118,4.17,36.9,2.8,1.0,07:13:36,17:51:06,0.77,01:00:00,1645657200,2.4,87.9,0.6,0.0,0.0,0.0,0.2,['snow'],27.7,14.8,280.3,1021.0,0.2,88.2,0.0,,0.0,10.0,Partially cloudy,Луцьк
2,"Луцьк, Луцький район, Україна",2022-02-24,1645653600,4.9,0.7,2.6,0.0,83.7,0.118,4.17,36.9,2.8,1.0,07:13:36,17:51:06,0.77,02:00:00,1645660800,2.9,88.58,1.2,0.0,0.0,0.0,0.1,['snow'],29.2,14.4,310.0,1022.0,10.0,100.0,,,,10.0,Overcast,Луцьк
3,"Луцьк, Луцький район, Україна",2022-02-24,1645653600,4.9,0.7,2.6,0.0,83.7,0.118,4.17,36.9,2.8,1.0,07:13:36,17:51:06,0.77,03:00:00,1645664400,2.3,86.63,0.3,0.0,0.0,0.0,0.1,['snow'],23.8,13.3,295.1,1021.0,0.1,92.0,0.0,,0.0,10.0,Overcast,Луцьк
4,"Луцьк, Луцький район, Україна",2022-02-24,1645653600,4.9,0.7,2.6,0.0,83.7,0.118,4.17,36.9,2.8,1.0,07:13:36,17:51:06,0.77,04:00:00,1645668000,1.9,87.85,0.1,0.0,0.0,0.0,0.1,['snow'],24.5,13.3,305.8,1021.0,0.0,93.8,0.0,,0.0,10.0,Overcast,Луцьк


## merging data

In [257]:
# Regions lookup table: the region ids used from here on come from this dataset
regions_csv_path = f"{REGIONS_DATA_FOLDER}/{REGIONS_DATA_FILE}"
df_regions = pd.read_csv(regions_csv_path)
df_regions.head()

Unnamed: 0,region,center_city_ua,center_city_en,region_alt,region_id
0,АР Крим,Сімферополь,Simferopol,Крим,1
1,Вінницька,Вінниця,Vinnytsia,Вінниччина,2
2,Волинська,Луцьк,Lutsk,Волинь,3
3,Дніпропетровська,Дніпро,Dnipro,Дніпропетровщина,4
4,Донецька,Донецьк,Donetsk,Донеччина,5


In [258]:
# Attach region metadata to each weather row by matching the weather city
# against the region's centre city (default inner join)
df_weather_reg = df_weather_v2.merge(df_regions, left_on="city", right_on="center_city_ua")

merged_rows, merged_cols = df_weather_reg.shape
print((merged_rows, merged_cols))
df_weather_reg.head()

(182712, 43)


Unnamed: 0,city_resolvedAddress,day_datetime,day_datetimeEpoch,day_tempmax,day_tempmin,day_temp,day_dew,day_humidity,day_precip,day_precipcover,day_solarradiation,day_solarenergy,day_uvindex,day_sunrise,day_sunset,day_moonphase,hour_datetime,hour_datetimeEpoch,hour_temp,hour_humidity,hour_dew,hour_precip,hour_precipprob,hour_snow,hour_snowdepth,hour_preciptype,hour_windgust,hour_windspeed,hour_winddir,hour_pressure,hour_visibility,hour_cloudcover,hour_solarradiation,hour_solarenergy,hour_uvindex,hour_severerisk,hour_conditions,city,region,center_city_ua,center_city_en,region_alt,region_id
0,"Луцьк, Луцький район, Україна",2022-02-24,1645653600,4.9,0.7,2.6,0.0,83.7,0.118,4.17,36.9,2.8,1.0,07:13:36,17:51:06,0.77,00:00:00,1645653600,2.4,89.18,0.8,0.0,0.0,0.1,0.2,['snow'],31.3,15.5,275.6,1020.0,0.0,91.5,0.0,,0.0,10.0,Overcast,Луцьк,Волинська,Луцьк,Lutsk,Волинь,3
1,"Луцьк, Луцький район, Україна",2022-02-24,1645653600,4.9,0.7,2.6,0.0,83.7,0.118,4.17,36.9,2.8,1.0,07:13:36,17:51:06,0.77,01:00:00,1645657200,2.4,87.9,0.6,0.0,0.0,0.0,0.2,['snow'],27.7,14.8,280.3,1021.0,0.2,88.2,0.0,,0.0,10.0,Partially cloudy,Луцьк,Волинська,Луцьк,Lutsk,Волинь,3
2,"Луцьк, Луцький район, Україна",2022-02-24,1645653600,4.9,0.7,2.6,0.0,83.7,0.118,4.17,36.9,2.8,1.0,07:13:36,17:51:06,0.77,02:00:00,1645660800,2.9,88.58,1.2,0.0,0.0,0.0,0.1,['snow'],29.2,14.4,310.0,1022.0,10.0,100.0,,,,10.0,Overcast,Луцьк,Волинська,Луцьк,Lutsk,Волинь,3
3,"Луцьк, Луцький район, Україна",2022-02-24,1645653600,4.9,0.7,2.6,0.0,83.7,0.118,4.17,36.9,2.8,1.0,07:13:36,17:51:06,0.77,03:00:00,1645664400,2.3,86.63,0.3,0.0,0.0,0.0,0.1,['snow'],23.8,13.3,295.1,1021.0,0.1,92.0,0.0,,0.0,10.0,Overcast,Луцьк,Волинська,Луцьк,Lutsk,Волинь,3
4,"Луцьк, Луцький район, Україна",2022-02-24,1645653600,4.9,0.7,2.6,0.0,83.7,0.118,4.17,36.9,2.8,1.0,07:13:36,17:51:06,0.77,04:00:00,1645668000,1.9,87.85,0.1,0.0,0.0,0.0,0.1,['snow'],24.5,13.3,305.8,1021.0,0.0,93.8,0.0,,0.0,10.0,Overcast,Луцьк,Волинська,Луцьк,Lutsk,Волинь,3


### Merging weather and events

In [259]:
# Quick look at the prepared events frame before merging
event_rows, event_cols = df_events_v2.shape
print((event_rows, event_cols))
df_events_v2.head()

(19933, 14)


Unnamed: 0,region_title,region_city,all_region,start,end,clean_end,intersection_alarm_id,start_time,end_time,start_hour,end_hour,day_date,start_hour_datetimeEpoch,end_hour_datetimeEpoch
0,Вінниччина,Вінниця,0,2022-02-25 22:55:42,2022-02-25 23:41:53,2022-02-25 23:41:53,,2022-02-25 22:55:42,2022-02-25 23:41:53,2022-02-25 22:00:00,2022-02-26 00:00:00,2022-02-25,1645826400,1645833600
1,Львівщина,Львів,0,2022-02-26 06:26:17,2022-02-26 07:15:28,2022-02-26 07:15:28,,2022-02-26 06:26:17,2022-02-26 07:15:28,2022-02-26 06:00:00,2022-02-26 08:00:00,2022-02-26,1645855200,1645862400
2,Одещина,Одеса,0,2022-02-26 07:16:58,2022-02-26 07:47:03,2022-02-26 07:47:03,,2022-02-26 07:16:58,2022-02-26 07:47:03,2022-02-26 07:00:00,2022-02-26 08:00:00,2022-02-26,1645858800,1645862400
3,Житомирщина,Житомир,0,2022-02-26 08:05:54,2022-02-26 09:36:36,2022-02-26 09:36:36,,2022-02-26 08:05:54,2022-02-26 09:36:36,2022-02-26 08:00:00,2022-02-26 10:00:00,2022-02-26,1645862400,1645869600
4,Вінниччина,Вінниця,0,2022-02-26 08:39:39,2022-02-26 10:42:41,2022-02-26 10:42:41,,2022-02-26 08:39:39,2022-02-26 10:42:41,2022-02-26 08:00:00,2022-02-26 11:00:00,2022-02-26,1645862400,1645873200


In [260]:
# Represent the events as a list of dictionaries, one per row
events_dict = df_events_v2.to_dict(orient='records')
events_by_hour = list()

# Show the first record as a sample
events_dict[0]

{'region_title': 'Вінниччина',
 'region_city': 'Вінниця',
 'all_region': 0,
 'start': '2022-02-25 22:55:42',
 'end': '2022-02-25 23:41:53',
 'clean_end': '2022-02-25 23:41:53',
 'intersection_alarm_id': nan,
 'start_time': Timestamp('2022-02-25 22:55:42'),
 'end_time': Timestamp('2022-02-25 23:41:53'),
 'start_hour': Timestamp('2022-02-25 22:00:00'),
 'end_hour': Timestamp('2022-02-26 00:00:00'),
 'day_date': datetime.date(2022, 2, 25),
 'start_hour_datetimeEpoch': 1645826400,
 'end_hour_datetimeEpoch': 1645833600}

In [261]:
# Slice events into discrete samples, one row per hour between start_hour and end_hour.
# The list is rebuilt from scratch in this cell, so re-running the cell does not
# append a second copy of every row.
# NOTE(review): pd.date_range includes both endpoints and end_hour is the end time
# rounded up, so the hour that starts at end_hour is emitted as well (an alarm
# ending at 23:41 also yields a 00:00 row) - confirm this is intended.
events_by_hour = [
    {**event, "hour_level_event_time": hour_start}
    for event in events_dict
    for hour_start in pd.date_range(start=event["start_hour"], end=event["end_hour"], freq='1H')
]

df_events_v3 = pd.DataFrame(events_by_hour)

# Epoch seconds of each hourly sample (the naive timestamp is read as UTC); None for NaT
df_events_v3["hour_level_event_datetimeEpoch"] = df_events_v3["hour_level_event_time"].apply(
    lambda x: int(calendar.timegm(x.timetuple())) if not isNaN(x) else None
)

print(df_events_v3.shape)
df_events_v3.head(5)

(58860, 16)


Unnamed: 0,region_title,region_city,all_region,start,end,clean_end,intersection_alarm_id,start_time,end_time,start_hour,end_hour,day_date,start_hour_datetimeEpoch,end_hour_datetimeEpoch,hour_level_event_time,hour_level_event_datetimeEpoch
0,Вінниччина,Вінниця,0,2022-02-25 22:55:42,2022-02-25 23:41:53,2022-02-25 23:41:53,,2022-02-25 22:55:42,2022-02-25 23:41:53,2022-02-25 22:00:00,2022-02-26 00:00:00,2022-02-25,1645826400,1645833600,2022-02-25 22:00:00,1645826400
1,Вінниччина,Вінниця,0,2022-02-25 22:55:42,2022-02-25 23:41:53,2022-02-25 23:41:53,,2022-02-25 22:55:42,2022-02-25 23:41:53,2022-02-25 22:00:00,2022-02-26 00:00:00,2022-02-25,1645826400,1645833600,2022-02-25 23:00:00,1645830000
2,Вінниччина,Вінниця,0,2022-02-25 22:55:42,2022-02-25 23:41:53,2022-02-25 23:41:53,,2022-02-25 22:55:42,2022-02-25 23:41:53,2022-02-25 22:00:00,2022-02-26 00:00:00,2022-02-25,1645826400,1645833600,2022-02-26 00:00:00,1645833600
3,Львівщина,Львів,0,2022-02-26 06:26:17,2022-02-26 07:15:28,2022-02-26 07:15:28,,2022-02-26 06:26:17,2022-02-26 07:15:28,2022-02-26 06:00:00,2022-02-26 08:00:00,2022-02-26,1645855200,1645862400,2022-02-26 06:00:00,1645855200
4,Львівщина,Львів,0,2022-02-26 06:26:17,2022-02-26 07:15:28,2022-02-26 07:15:28,,2022-02-26 06:26:17,2022-02-26 07:15:28,2022-02-26 06:00:00,2022-02-26 08:00:00,2022-02-26,1645855200,1645862400,2022-02-26 07:00:00,1645858800


In [262]:
# Attach the hourly event samples to the hourly weather rows
df_events_v4 = df_events_v3.add_prefix('event_')

weather_join_keys = ["region_alt", "hour_datetimeEpoch"]
event_join_keys = ["event_region_title", "event_hour_level_event_datetimeEpoch"]

# Left join: every weather hour is kept; hours without a matching event get empty
# event_* columns, and a weather hour matched by several event rows is repeated
df_weather_v4 = pd.merge(
    df_weather_reg,
    df_events_v4,
    how="left",
    left_on=weather_join_keys,
    right_on=event_join_keys,
)

joined_rows, joined_cols = df_weather_v4.shape
print((joined_rows, joined_cols))
df_weather_v4.sample(n=5)

(195084, 59)


Unnamed: 0,city_resolvedAddress,day_datetime,day_datetimeEpoch,day_tempmax,day_tempmin,day_temp,day_dew,day_humidity,day_precip,day_precipcover,day_solarradiation,day_solarenergy,day_uvindex,day_sunrise,day_sunset,day_moonphase,hour_datetime,hour_datetimeEpoch,hour_temp,hour_humidity,hour_dew,hour_precip,hour_precipprob,hour_snow,hour_snowdepth,hour_preciptype,hour_windgust,hour_windspeed,hour_winddir,hour_pressure,hour_visibility,hour_cloudcover,hour_solarradiation,hour_solarenergy,hour_uvindex,hour_severerisk,hour_conditions,city,region,center_city_ua,center_city_en,region_alt,region_id,event_region_title,event_region_city,event_all_region,event_start,event_end,event_clean_end,event_intersection_alarm_id,event_start_time,event_end_time,event_start_hour,event_end_hour,event_day_date,event_start_hour_datetimeEpoch,event_end_hour_datetimeEpoch,event_hour_level_event_time,event_hour_level_event_datetimeEpoch
66912,"Одеса, Україна",2022-10-02,1664658000,20.3,13.7,17.8,13.1,75.1,5.0,8.33,60.0,5.3,3.0,06:56:35,18:35:34,0.2,13:00:00,1664704800,17.5,75.88,13.2,0.0,0.0,0.0,0.0,['rain'],49.0,23.8,310.9,1005.0,24.1,100.0,51.0,0.2,1.0,10.0,Overcast,Одеса,Одеська,Одеса,Odesa,Одещина,15,,,,,,,,NaT,NaT,NaT,NaT,,,,NaT,
156239,"Львів, Україна",2022-05-10,1652130000,18.7,4.9,12.4,-0.9,42.3,0.0,0.0,313.3,26.9,9.0,05:46:11,20:55:27,0.3,04:00:00,1652144400,6.2,58.66,-1.3,0.0,0.0,0.0,0.0,,10.8,4.3,34.4,1027.0,24.1,0.0,0.0,,0.0,10.0,Clear,Львів,Львівська,Львів,Lviv,Львівщина,13,,,,,,,,NaT,NaT,NaT,NaT,,,,NaT,
143839,"Тернопіль, Україна",2022-10-28,1666904400,15.9,5.4,9.4,7.7,89.6,0.2,4.17,81.4,6.9,3.0,07:59:21,18:02:47,0.08,10:00:00,1666940400,6.6,100.0,6.7,0.0,0.0,0.0,0.0,,6.1,3.2,204.3,1030.0,24.1,100.0,121.0,0.4,1.0,10.0,Overcast,Тернопіль,Тернопільська,Тернопіль,Ternopil,Тернопільщина,19,,,,,,,,NaT,NaT,NaT,NaT,,,,NaT,
118763,"Запоріжжя, Україна",2022-10-18,1666040400,23.2,4.7,10.4,6.1,77.1,0.0,0.0,142.3,12.2,5.0,07:02:25,17:45:54,0.74,21:00:00,1666116000,10.6,83.95,8.0,0.0,0.0,0.0,0.0,,5.4,3.6,174.7,1026.0,24.1,0.0,0.0,,0.0,10.0,Clear,Запоріжжя,Запорізька,Запоріжжя,Zaporozhye,Запоріжжя,8,,,,,,,,NaT,NaT,NaT,NaT,,,,NaT,
49177,"Чернівці, Україна",2022-08-21,1661029200,26.6,17.2,21.9,16.6,73.3,1.952,4.17,232.4,20.0,8.0,06:18:17,20:19:42,0.78,19:00:00,1661097600,25.5,57.16,16.4,0.0,0.0,0.0,0.0,,16.1,6.3,120.0,1008.6,13.0,70.9,231.0,0.8,2.0,10.0,Partially cloudy,Чернівці,Чернівецька,Чернівці,Chernivtsi,Буковина,24,,,,,,,,NaT,NaT,NaT,NaT,,,,NaT,


In [264]:
# Write the merged weather + events dataset (semicolon-separated, without the index)
weather_events_csv_path = f"{OUTPUT_FOLDER}/{WEATHER_EVENTS_OUTPUT_DATA_FILE}"
df_weather_v4.to_csv(weather_events_csv_path, sep=";", index=False)

### Calculate top keywords for ISW reports (deprecated for model training; EDA use only)

In [265]:
# Load the reports dataset (semicolon-separated CSV)
reports_csv_path = f"{REPORTS_DATA_FOLDER}/{REPORTS_DATA_FILE}"
df_isw = pd.read_csv(reports_csv_path, sep=";")
df_isw.head()

Unnamed: 0,date,text,lemm,stemm
0,2022-02-24,(Ukraine local time) that Russian forces have...,ukrain local time russian forc not achiev bre...,ukrain local time russian forc not achiev bre...
1,2022-02-25,\n\n\n\n\n\n\n\n\nRussia-Ukraine Warning Updat...,russia ukrain warn updat russian offens campa...,russia ukrain warn updat russian offen campai...
2,2022-02-26,\n\n\n\n\n\n\n\n\nRussia-Ukraine Warning Updat...,russia ukrain warn updat russian offens campa...,russia ukrain warn updat russian offen campai...
3,2022-02-27,\n\n\n\n\n\n\n\n\nRussia-Ukraine Warning Updat...,russia ukrain warn updat russian offens campa...,russia ukrain warn updat russian offen campai...
4,2022-02-28,\n\n\n\n\n\n\n\n\nRussian Offensive Campaign A...,russian offens campaign ass februari twenti e...,russian offen campaign assess februari twenti...


In [266]:
# Load the pickled TF-IDF transformer and count vectorizer.
# Each file is opened in a `with` block so its handle is closed right after loading.
# NOTE: pickle.load can execute arbitrary code - only load model files from a trusted source.
tfidf_pickle_path = f"{MODEL_FOLDER}/{tfidf_transformer_model}_{tfidf_transformer_version}.pkl"
count_vectorizer_pickle_path = f"{MODEL_FOLDER}/{count_vectorizer_model}_{count_vectorizer_version}.pkl"

with open(tfidf_pickle_path, "rb") as tfidf_file:
    tfidf = pickle.load(tfidf_file)

with open(count_vectorizer_pickle_path, "rb") as count_vectorizer_file:
    cv = pickle.load(count_vectorizer_file)

In [267]:

# Converts a document's lemmas to a keyword vector; returns the TFIDF_NUMBER top keywords as a dict
def convert_to_vector(doc, tfidf_transformer, count_vectorized):
    """Return the highest-scoring TF-IDF keywords of a single document.

    Args:
        doc: Document text; passed to ``count_vectorized.transform`` as a
            one-element list.
        tfidf_transformer: Object whose ``transform`` converts the count
            matrix into TF-IDF scores.
        count_vectorized: Object providing ``get_feature_names_out()`` and
            ``transform``.

    Returns:
        dict mapping keyword -> score rounded to 3 decimals, limited to the
        ``TFIDF_NUMBER`` best-scoring keywords, in descending score order.
    """
    vocabulary = count_vectorized.get_feature_names_out()
    term_counts = count_vectorized.transform([doc])
    scores = tfidf_transformer.transform(term_counts)

    # One row per vocabulary term, holding that term's score for this document
    score_table = pd.DataFrame(np.transpose(scores.todense()), index=vocabulary, columns=['tfidf'])
    ranked_scores = score_table['tfidf'].sort_values(ascending=False)
    top_scores = ranked_scores.head(TFIDF_NUMBER).round(3)

    return top_scores.to_dict()

In [268]:
# Add each report's top keywords (computed from its lemmatized text) to the dataset
def _top_keywords(report_lemmas):
    """Top TF-IDF keywords for one report, using the loaded transformer and vectorizer."""
    return convert_to_vector(report_lemmas, tfidf, cv)


df_isw['keywords'] = df_isw['lemm'].map(_top_keywords)
df_isw.head()

Unnamed: 0,date,text,lemm,stemm,keywords
0,2022-02-24,(Ukraine local time) that Russian forces have...,ukrain local time russian forc not achiev bre...,ukrain local time russian forc not achiev bre...,"{'odesa': 0.335, 'land': 0.285, 'air': 0.204, ..."
1,2022-02-25,\n\n\n\n\n\n\n\n\nRussia-Ukraine Warning Updat...,russia ukrain warn updat russian offens campa...,russia ukrain warn updat russian offen campai...,"{'februari': 0.429, 'kyiv': 0.405, 'twenti': 0..."
2,2022-02-26,\n\n\n\n\n\n\n\n\nRussia-Ukraine Warning Updat...,russia ukrain warn updat russian offens campa...,russia ukrain warn updat russian offen campai...,"{'februari': 0.466, 'kyiv': 0.434, 'six': 0.27..."
3,2022-02-27,\n\n\n\n\n\n\n\n\nRussia-Ukraine Warning Updat...,russia ukrain warn updat russian offens campa...,russia ukrain warn updat russian offen campai...,"{'februari': 0.634, 'kyiv': 0.318, 'advanc': 0..."
4,2022-02-28,\n\n\n\n\n\n\n\n\nRussian Offensive Campaign A...,russian offens campaign ass februari twenti e...,russian offen campaign assess februari twenti...,"{'februari': 0.591, 'eight': 0.425, 'twenti': ..."


In [269]:
# Parse the report date and derive the following day's date, used as the merge key
df_isw["date_datetime"] = pd.to_datetime(df_isw["date"])

one_day = datetime.timedelta(days=1)
df_isw['date_tomorrow_datetime'] = df_isw['date_datetime'] + one_day

# The parsed report date is exposed under the name "report_date"
df_isw = df_isw.rename(columns={"date_datetime": "report_date"})
df_isw.sample(n=5)

Unnamed: 0,date,text,lemm,stemm,keywords,report_date,date_tomorrow_datetime
244,2022-10-26,\nClick here to see ISW’s interactive map of t...,reuter investig document trove found abandon ...,reuter investig document trove found abandon ...,"{'octob': 0.478, 'six': 0.275, 'twenti': 0.253...",2022-10-26,2022-10-27
129,2022-07-03,\nClick here to see ISW's interactive map of t...,click see isw interact map russian invas ukra...,click see isw interact map russian inva ukrai...,"{'juli': 0.526, 'lysychansk': 0.349, 'luhansk'...",2022-07-03,2022-07-04
262,2022-11-13,\nClick here to see ISW’s interactive map of t...,click see isw interact map russian invas ukra...,click see isw interact map russian inva ukrai...,"{'putin': 0.286, 'offens': 0.233, 'counter': 0...",2022-11-13,2022-11-14
366,2023-02-28,\nClick here to see ISW’s interactive map of t...,click see isw interact map russian invas ukra...,click see isw interact map russian inva ukrai...,"{'februari': 0.481, 'eight': 0.235, 'twenti': ...",2023-02-28,2023-03-01
168,2022-08-11,\nClick here to see ISW's interactive map of t...,click see isw interact map russian invas ukra...,click see isw interact map russian inva ukrai...,"{'eleven': 0.398, 'znpp': 0.196, 'command': 0....",2022-08-11,2022-08-12


In [270]:
# Write the reports dataset with keywords and dates (semicolon-separated, without the index)
isw_csv_path = f"{OUTPUT_FOLDER}/{ISW_OUTPUT_DATA_FILE}"
df_isw.to_csv(isw_csv_path, sep=";", index=False)

In [271]:
# df_weather_v5 = df_weather_v4.merge(df_isw[["date_tomorrow_datetime", "keywords"]].add_prefix('isw_'),
#                                     how="left",
#                                     left_on="day_datetime",
#                                     right_on="isw_date_tomorrow_datetime")
# df_weather_v5.head(10)

In [272]:
# df_weather_v5.head(5)

In [273]:
# df_weather_v5.to_csv(f"{OUTPUT_FOLDER}/{WEATHER_EVENTS_KEYWORDS_OUTPUT_DATA_FILE}", sep=";", index=False)