In [1]:
import matplotlib.pyplot as plt
import pandas as pd
import numpy as np
import ast
from scipy.sparse import csr_matrix
import scipy.sparse as sp
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.model_selection import train_test_split
from sklearn.model_selection import TimeSeriesSplit
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import classification_report
from sklearn.metrics import confusion_matrix, precision_recall_curve
from sklearn.metrics import mean_absolute_error, mean_squared_error, r2_score
from sklearn.metrics import confusion_matrix, ConfusionMatrixDisplay
from sklearn.linear_model import LinearRegression
from sklearn.metrics import roc_curve, auc
from sklearn.ensemble import GradientBoostingClassifier
from sklearn.metrics import accuracy_score

In [3]:
# Weather/alarm table; its rows carry a `datetime_combined` timestamp column.
kyiv_new = pd.read_csv('reg10_kyiv.csv')
# Text reports stored as JSON Lines (one JSON object per line).
data = pd.read_json('data.json', lines=True)

In [4]:
# Show which columns the loaded weather/alarm table provides.
kyiv_new.columns

Index(['Unnamed: 0', 'day_tempmax', 'day_tempmin', 'day_temp', 'day_dew',
       'day_humidity', 'day_precip', 'day_precipcover', 'day_snow',
       'day_windgust', 'day_windspeed', 'day_winddir', 'day_pressure',
       'day_cloudcover', 'day_visibility', 'day_solarradiation',
       'day_solarenergy', 'day_uvindex', 'day_moonphase', 'hour_temp',
       'hour_humidity', 'hour_dew', 'hour_precip', 'hour_precipprob',
       'hour_snow', 'hour_snowdepth', 'hour_windgust', 'hour_windspeed',
       'hour_winddir', 'hour_pressure', 'hour_visibility', 'hour_cloudcover',
       'hour_solarradiation', 'hour_uvindex', 'hour_severerisk', 'region_id_x',
       'tf-idf', 'alarm', 'datetime_combined', 'date', 'time', 'day_of_week',
       'alarm_last_2hours', 'alarm_last_3hours', 'alarm_last_4hours',
       'alarm_in_vinnytsia', 'alarm_in_zhytomyr', 'alarm_in_poltava',
       'alarm_in_cherkasy', 'alarm_in_chernihiv'],
      dtype='object')

### clean dataframe 

In [5]:
# Work on a copy so `kyiv_new` stays exactly as loaded, then keep a single row
# per `datetime_combined` value and discard rows where that timestamp is missing.
kyiv = (
    kyiv_new.copy()
    .drop_duplicates(subset=['datetime_combined'])
    .dropna(subset=['datetime_combined'])
)

### add lemmatized text to main dataset

In [6]:
# Reduce both `date` columns to plain calendar dates so they can be used as a
# join key between the two tables.
data['date'] = pd.to_datetime(data['date']).dt.date
kyiv['date'] = pd.to_datetime(kyiv['date']).dt.date

# Keep only the rows whose date also appears in the report table.
has_report = kyiv['date'].isin(data['date'])
kyiv = kyiv.loc[has_report]

# Attach the report columns of the matching date to every remaining row.
weather_alarms = kyiv.merge(data, how='inner', on='date')

### drop columns and make weather sparse matrix

In [9]:
# One entry per line of the file; leading/trailing whitespace (including the
# newline) is stripped from each line.
with open('selected_columns.txt', 'r') as file:
    selected_columns = list(map(str.strip, file))

In [10]:
# Every column that is not named in `selected_columns` is removed; the columns
# that remain keep their original order. Missing values are replaced with 0.
columns_to_drop = set(weather_alarms.columns) - set(selected_columns)
weather = weather_alarms.drop(columns=columns_to_drop).fillna(0)
weather.columns

Index(['day_tempmax', 'day_tempmin', 'day_temp', 'day_dew', 'day_humidity',
       'day_precip', 'day_precipcover', 'day_snow', 'day_windgust',
       'day_windspeed', 'day_winddir', 'day_pressure', 'day_cloudcover',
       'day_visibility', 'day_solarradiation', 'day_solarenergy',
       'day_uvindex', 'day_moonphase', 'hour_temp', 'hour_humidity',
       'hour_dew', 'hour_precip', 'hour_precipprob', 'hour_snow',
       'hour_snowdepth', 'hour_windgust', 'hour_windspeed', 'hour_winddir',
       'hour_pressure', 'hour_visibility', 'hour_cloudcover',
       'hour_solarradiation', 'hour_uvindex', 'hour_severerisk',
       'day_of_week'],
      dtype='object')

In [11]:
# Column names are kept so features can be labelled later.
weather_columns = list(weather.columns)
# Skip the first 24 rows: the first day (24 Feb) has no report from the day
# before (23 Feb) to pair with.
# NOTE(review): this assumes that day contributes exactly 24 rows — confirm.
weather_matrix = weather.to_numpy()[24:]
sparse_weather = csr_matrix(weather_matrix)
sparse_weather.shape

(7775, 35)

### sparse matrix of words (tf-idf)

In [12]:
# Leave out the final 24 rows of report text. The weather rows and the labels
# start at row 24, so each of them ends up paired with the text found 24 rows
# earlier (the report is moved one day back).
lem_text_ser = weather_alarms['lemmatized_text'].iloc[:-24]

In [13]:
# Join the items of every entry into one space-separated string, which is the
# input TfidfVectorizer tokenizes.
lemmatized_text_str = lem_text_ser.map(' '.join)

# Terms that occur in fewer than 72 rows are ignored.
# NOTE(review): 72 is presumably 3 reports x 24 rows per report — confirm.
tfidf_vectorizer = TfidfVectorizer(min_df=72)
tfidf_matrix = tfidf_vectorizer.fit_transform(lemmatized_text_str)
# Vocabulary terms, in the column order of `tfidf_matrix`.
words = tfidf_vectorizer.get_feature_names_out()

tfidf_matrix.shape

(7775, 6099)

In [14]:
# Put the two feature blocks side by side: the TF-IDF term columns come first,
# the weather columns follow them.
combine_matrix = sp.hstack([tfidf_matrix, sparse_weather])
# Labels start at row 24, the same offset applied to the weather rows (the
# first day, 24 Feb, has no report from 23 Feb).
labels = weather_alarms['alarm'].iloc[24:]
combine_matrix.shape

(7775, 6134)

In [15]:
# Feature names, one per column of `combine_matrix` and in the same order as
# its columns. `combine_matrix` is sp.hstack((tfidf_matrix, sparse_weather)),
# so the TF-IDF terms come first and the weather columns last. Listing the
# weather columns first would attach the wrong name to every feature index.
# Index this list with positions taken directly from the model's per-feature
# arrays (no reversal).
all_words = list(words)
all_words.extend(weather_columns)

# Models

## GradientBoostingClassifier

In [None]:
# Hold out the last 20% of the rows instead of a shuffled sample.
# A shuffled split scatters rows of the same day over both the training and
# the test set. The report text is attached to rows by a 24-row offset, so rows
# of one day presumably carry near-identical text features. The model would
# then be scored on near-duplicates of rows it was trained on, which inflates
# every metric.
# NOTE(review): assumes the rows of `combine_matrix` / `labels` are in
# chronological order (the 24-row offset used to align reports relies on the
# same ordering) — confirm.
X_train_5, X_test_5, y_train_5, y_test_5 = train_test_split(
    combine_matrix, labels, test_size=0.20, shuffle=False
)

gb_clf1 = GradientBoostingClassifier(n_estimators=50, learning_rate=0.05, max_depth=3, random_state=42)
gb_clf1.fit(X_train_5, y_train_5)
y_pred_5 = gb_clf1.predict(X_test_5)
# One row per test sample, one probability column per class.
y_pred_proba = gb_clf1.predict_proba(X_test_5)

# Preview up to ten predictions; slicing keeps this safe for short test sets.
for i, (prediction, probability) in enumerate(zip(y_pred_5[:10], y_pred_proba[:10]), start=1):
    print("Example {}: Prediction: {}, Probability: {}".format(i, prediction, probability))

In [None]:
# Precision / recall / F1 per class on the held-out rows.
report_5 = classification_report(y_test_5, y_pred_5)
print(report_5)

# Confusion matrix of true vs. predicted labels, drawn with readable class names.
cm_5 = confusion_matrix(y_test_5, y_pred_5)
disp = ConfusionMatrixDisplay(cm_5, display_labels=['Not Alarm', 'Alarm'])
disp.plot()
plt.show()

In [None]:
# NOTE(review): with Jupyter's default inline backend a figure created here is
# closed when this cell finishes, so it may not be the figure a later cell
# draws on — confirm which backend is in use.
plt.figure(figsize=(17, 12))
colors = ['#FFAF45', '#FB6D48', '#D74B76', '#673F69']

# `feature_importances_` has one value per column of the matrix the model was
# fitted on, in that column order, so it must not be reversed.
feature_importances = gb_clf1.feature_importances_

# Names in the same column order as sp.hstack((tfidf_matrix, sparse_weather)):
# TF-IDF terms first, weather columns last. Built here from the same two
# pieces so that names and importances cannot drift apart.
feature_names = list(words) + list(weather_columns)
assert len(feature_names) == len(feature_importances), (
    "feature names and importances must describe the same columns"
)

# Indices of the 20 largest importances, largest first.
top_indices = np.argsort(feature_importances)[::-1][:20]
top_importances = feature_importances[top_indices]
top_words = [feature_names[idx] for idx in top_indices]

In [None]:
# Size the figure in the cell that draws it. Jupyter's inline backend closes
# figures at the end of each cell by default, so a size given to a figure
# created in an earlier cell does not carry over to this chart. plt.gcf()
# returns the current figure, creating one if none is open, and the size is
# then set explicitly.
fig = plt.gcf()
fig.set_size_inches(17, 12)

# One horizontal bar per selected feature; the colour list is cycled over the bars.
bar_positions = range(len(top_indices))
plt.barh(bar_positions, top_importances, color=colors)
plt.yticks(bar_positions, top_words)
plt.xlabel('Importance')
plt.ylabel('Feature Name')
plt.title(f'Top 20 Features and Their Importances ({"GradientBoostingClassifier"})')
# Flip the axis so the most important feature is at the top.
plt.gca().invert_yaxis()

plt.tight_layout()
plt.show()