In [212]:
import calendar
import datetime
import os
import pickle

import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import requests
import scipy.sparse as sp
from bs4 import BeautifulSoup
from scipy.sparse import csr_matrix, vstack
from sklearn.feature_extraction.text import TfidfVectorizer

In [139]:
import json
import datetime as dt
import re
import nltk
from nltk.tokenize import sent_tokenize
from nltk.corpus import stopwords
from nltk.corpus import wordnet
from sklearn.feature_extraction.text import TfidfVectorizer
# English stop words from the NLTK corpus, stored as a set; `lemmatize` uses it
# for membership tests when filtering tokens.
stop_words = set(stopwords.words('english'))

In [95]:
# Load the array saved in 'words.npy'.
# NOTE(review): despite its name, `tfidf_matrix` presumably holds the vocabulary
# words rather than a TF-IDF matrix — confirm and consider renaming.
# NOTE(security): allow_pickle=True lets np.load unpickle object arrays, which can
# execute arbitrary code; only load this file from a trusted source.
tfidf_matrix = np.load('words.npy', allow_pickle=True)
# Convert the NumPy array into plain Python objects.
all_words = tfidf_matrix.tolist()

### load model

In [215]:
# Deserialize the model object stored in 'model.pkl'.
# NOTE(security): pickle.load can execute arbitrary code while loading, so only
# open a model file that comes from a trusted source.
with open('model.pkl', 'rb') as file:
    loaded_model = pickle.load(file)


In [6]:
# Length of the model's feature_importances_ vector, reported below as the
# number of columns in the model.
num_columns = loaded_model.feature_importances_.shape[0]

print(f"Number of columns in the model: {num_columns}")

Number of columns in the model: 6134


### get isw words

In [2]:
# date
def get_date(soup):
    """Return the report's publication date as a ``datetime.date``.

    The timestamp is read from the ``content`` attribute of the ``<span>``
    nested inside ``<span class="submitted">`` and must look like
    ``2024-04-20T21:30:00-04:00``.

    Raises:
        AttributeError: if either ``<span>`` is missing from the page.
        ValueError: if the timestamp does not match the expected format.
    """
    submitted = soup.find('span', class_='submitted')
    timestamp = submitted.find('span').get('content')
    return datetime.datetime.strptime(timestamp, '%Y-%m-%dT%H:%M:%S%z').date()

In [3]:
# title
def get_title(soup):
    """Return the text of the page's ``<h1 class="title">`` element.

    Raises:
        AttributeError: if the page has no such heading.
    """
    return soup.find('h1', class_='title').text

In [4]:
# main_html
def get_html(soup):
    """Return the ``<div>`` that wraps the report body, or whatever ``find`` yields when it is absent."""
    return soup.find(
        'div',
        class_='field field-name-body field-type-text-with-summary field-label-hidden',
    )

def get_html_text(soup):
    """Return the result of calling ``decode()`` on the report body element.

    Raises:
        AttributeError: if ``get_html`` finds no body element.
    """
    return get_html(soup).decode()

In [5]:
# full_url
def get_url(soup):
    """Return the ``href`` of the page's ``<link rel="canonical">`` tag.

    Returns ``None`` when the page has no canonical link.
    """
    canonical = soup.find("link", rel="canonical")
    return canonical.get("href") if canonical else None

In [6]:
# text
def get_text(soup):
    """Return the report text as one string.

    Collects the text of every ``p``/``ul``/``ol``/``div`` found under the
    ``field-item even`` container, in document order, and stops at the first
    ``div`` that contains an ``<hr>``. The pieces are concatenated with no
    separator between them.

    Raises:
        AttributeError: if the body element or the container is missing.
    """
    container = get_html(soup).find('div', class_='field-item even')
    pieces = []
    for node in container.find_all(['p', 'ul', 'ol', 'div']):
        # A div holding an <hr> ends the part of the page that is collected.
        if node.name == 'div' and node.find('hr'):
            break
        pieces.append(node.text)
    return ''.join(pieces)

In [7]:
# data for dataframe
def get_data(soup):
    """Return one report as a list: date, title, URL, body HTML, body text.

    The order matches the dataframe columns
    ``['date', 'title', 'full_url', 'main_html', 'main_text']``.
    """
    extractors = (get_date, get_title, get_url, get_html_text, get_text)
    return [extract(soup) for extract in extractors]

In [8]:
# url for request
def build_url(days_to_substract):
    """Build the report URL for the day ``days_to_substract`` days before today.

    The date comes from the local clock (``datetime.datetime.today()``); the
    slug uses the lower-cased month name, the day without zero padding, and the
    four-digit year.
    """
    target = datetime.datetime.today() - datetime.timedelta(days=days_to_substract)
    month = calendar.month_name[target.month].lower()
    return (
        'https://www.understandingwar.org/backgrounder/'
        f'russian-offensive-campaign-assessment-{month}-{target.day}-{target.year}'
    )

In [9]:
# dataframe
def get_dataframe(max_days_back=30, timeout=30):
    """Download the most recent available report and return it as a one-row frame.

    Starting with yesterday and stepping back one day at a time, the report URL
    for each date is requested until one answers with HTTP 200.

    Args:
        max_days_back: how many days to try before giving up. The search used
            to be unbounded and would loop forever when the site was
            unreachable or the URL scheme changed.
        timeout: seconds to wait for each HTTP response, so a stalled
            connection cannot hang the notebook.

    Returns:
        DataFrame with columns ``date``, ``title``, ``full_url``,
        ``main_html`` and ``main_text`` and a single row at index 0.

    Raises:
        RuntimeError: if no report is found within ``max_days_back`` days.
        requests.RequestException: on network errors or timeouts.
    """
    for days_to_substract in range(1, max_days_back + 1):
        answer = requests.get(build_url(days_to_substract), timeout=timeout)
        if answer.status_code != 200:
            # No report published under this date; try the previous day.
            continue
        soup = BeautifulSoup(answer.text, 'lxml')
        df = pd.DataFrame(columns=['date', 'title', 'full_url', 'main_html', 'main_text'])
        df.loc[0] = get_data(soup)
        return df
    raise RuntimeError(
        f"No report page answered with HTTP 200 within the last {max_days_back} days"
    )

In [96]:
# Fetch the latest report that get_dataframe can find and display the resulting frame.
data_frame = get_dataframe()
data_frame

Unnamed: 0,date,title,full_url,main_html,main_text
0,2024-04-20,"Russian Offensive Campaign Assessment, April 2...",/backgrounder/russian-offensive-campaign-asses...,"<div class=""field field-name-body field-type-t...","Russian Offensive Campaign Assessment, April 2..."


### lemmatize words

In [31]:
def get_wordnet_pos(tag):
    """Translate a part-of-speech tag into the matching WordNet POS constant.

    Only the first letter of ``tag`` is inspected: ``J`` -> adjective,
    ``V`` -> verb, ``N`` -> noun, ``R`` -> adverb. Any other tag (including an
    empty string) yields ``None``.
    """
    by_initial = {
        'J': wordnet.ADJ,
        'V': wordnet.VERB,
        'N': wordnet.NOUN,
        'R': wordnet.ADV,
    }
    return by_initial.get(tag[:1])

In [32]:
def lemmatize(report):
    """Turn raw report text into a list of lemmatized tokens.

    Steps: lower-case the text, delete digit runs, delete every character that
    is neither a word character nor whitespace, tokenize, drop tokens found in
    ``stop_words``, POS-tag the rest, and lemmatize each token whose tag maps to
    a WordNet POS. Tokens with no mapping are kept unchanged.

    Args:
        report: the report text as a string.

    Returns:
        List of token strings in their original order.
    """
    without_digits = re.sub(r'\d+', '', report.lower())
    cleaned = re.sub(r'[^\w\s]', '', without_digits)

    kept = [tok for tok in nltk.word_tokenize(cleaned) if tok not in stop_words]

    wnl = nltk.WordNetLemmatizer()
    lemmas = []
    for tok, pos in nltk.pos_tag(kept):
        wn_pos = get_wordnet_pos(pos)
        # Tokens whose tag has no WordNet equivalent pass through untouched.
        lemmas.append(tok if wn_pos is None else wnl.lemmatize(str(tok), pos=wn_pos))
    return lemmas

In [97]:
# Store each report's lemmatized token list in a new 'lemmatized_text' column.
data_frame['lemmatized_text'] = data_frame['main_text'].apply(lemmatize)

### transform words to matrix (method #1)

In [None]:
# lem_text = data_frame['lemmatized_text']

# lem_text[0] = [word for word in lem_text[0] if word in all_words]
# data_frame['lemmatized_text'] = lem_text

In [127]:
# df_two = data_frame.copy()
# df_two.loc[len(data_frame)] = [None, None, None, None, None, all_words]

In [128]:
# lemmatized_text_str = df_two['lemmatized_text'].apply(lambda x: ' '.join(x))
# tfidf_vectorizer = TfidfVectorizer()
# tfidf_matrix = tfidf_vectorizer.fit_transform(lemmatized_text_str)
# words = tfidf_vectorizer.get_feature_names_out()

<2x6099 sparse matrix of type '<class 'numpy.float64'>'
	with 6954 stored elements in Compressed Sparse Row format>

In [130]:
# first_row = tfidf_matrix.getrow(0)

### transform words to matrix (method #2)

In [157]:
# NOTE(review): no live assignment of `first_row` is visible in this notebook; the
# only one (`tfidf_matrix.getrow(0)`) sits in a commented-out cell above. This cell
# therefore relies on leftover kernel state and would raise NameError after
# Restart & Run All — restore the cell that builds `first_row`.
first_row

<1x6099 sparse matrix of type '<class 'numpy.float64'>'
	with 855 stored elements in Compressed Sparse Row format>

### get weather

In [158]:
url_base_url = "https://weather.visualcrossing.com/VisualCrossingWebServices/rest/services/timeline"

# Credentials are read from the environment instead of being committed with the
# notebook. The key that used to be hard-coded here should be treated as leaked
# and rotated.
RSA_KEY = os.environ.get("VISUAL_CROSSING_API_KEY")
if not RSA_KEY:
    raise RuntimeError(
        "Set the VISUAL_CROSSING_API_KEY environment variable before running this cell."
    )
# NOTE(review): the request already authenticates through the `key` query
# parameter; this extra header token looks like a leftover — confirm it is needed.
API_TOKEN = os.environ.get("WEATHER_API_TOKEN", "")

location = "Kyiv"
date = "next24hours"
url_api_version = "v1"

# Hourly forecast for the next 24 hours in metric units, returned as JSON.
url = f"{url_base_url}/{location}/{date}?unitGroup=metric&include=hours&key={RSA_KEY}&contentType=json"

# Send the header only when a token has been configured.
headers = {'X-Api-Key': API_TOKEN} if API_TOKEN else {}

# A timeout keeps a stalled connection from hanging the notebook.
response = requests.get(url, headers=headers, timeout=30)

In [161]:
# On success print the raw response body; otherwise print the status code and
# the body returned with the error.
if response.status_code == requests.codes.ok:
    print(response.text)
else:
    print("Error:", response.status_code, response.text)

{"queryCost":1,"latitude":50.4506,"longitude":30.5243,"resolvedAddress":"Київ, Україна","address":"Kyiv","timezone":"Europe/Kiev","tzoffset":3.0,"days":[{"datetime":"2024-04-21","datetimeEpoch":1713646800,"tempmax":9.1,"tempmin":5.3,"temp":7.4,"feelslikemax":8.2,"feelslikemin":1.6,"feelslike":5.0,"dew":5.0,"humidity":85.1,"precip":27.8,"precipprob":100.0,"precipcover":83.33,"preciptype":["rain"],"snow":0.0,"snowdepth":0.0,"windgust":52.9,"windspeed":20.5,"winddir":345.3,"pressure":1005.5,"cloudcover":100.0,"visibility":11.9,"solarradiation":21.1,"solarenergy":1.8,"uvindex":1.0,"severerisk":10.0,"sunrise":"05:51:23","sunriseEpoch":1713667883,"sunset":"20:02:50","sunsetEpoch":1713718970,"moonphase":0.41,"conditions":"Rain, Overcast","description":"Cloudy skies throughout the day with a chance of rain throughout the day.","icon":"rain","stations":["remote"],"source":"comb","hours":[{"datetime":"00:00:00","datetimeEpoch":1713646800,"temp":9.1,"feelslike":8.2,"humidity":59.82,"dew":1.7,"pre

In [162]:
# Parse the JSON response body into Python objects.
response_json = json.loads(response.text)

In [163]:
# First hour to forecast: the hour after the current one on this machine's clock.
# NOTE(review): dt.datetime.now() uses the local timezone, while the hourly data
# presumably follows the requested location's timezone — confirm they agree.
curr_hour = dt.datetime.now().hour + 1
# Hour counter initialised to 0.
next_hour = 0

In [164]:
# Daily and hourly fields copied into every feature row. The tuple order fixes
# the column order of the feature matrix built from `result`, so do not reorder.
DAY_FIELDS = (
    'tempmax', 'tempmin', 'temp', 'dew', 'humidity', 'precip', 'precipcover',
    'snow', 'windgust', 'windspeed', 'winddir', 'pressure', 'cloudcover',
    'visibility', 'solarradiation', 'solarenergy', 'uvindex', 'moonphase',
)
HOUR_FIELDS = (
    'temp', 'humidity', 'dew', 'precip', 'precipprob', 'snow', 'snowdepth',
    'windgust', 'windspeed', 'winddir', 'pressure', 'visibility', 'cloudcover',
    'solarradiation', 'uvindex', 'severerisk',
)
# Number of consecutive forecast hours turned into rows.
HOURS_AHEAD = 12


def _weather_row(day, hour, day_of_week, run_date):
    """Build one feature row from a daily record and one of its hourly records."""
    row = {f'day_{field}': day[field] for field in DAY_FIELDS}
    row.update({f'hour_{field}': hour[field] for field in HOUR_FIELDS})
    row['day_of_week'] = day_of_week
    row['date1'] = run_date
    return row


def _collect_weather_rows(forecast, place, first_hour, hours_ahead=HOURS_AHEAD):
    """Return ``hours_ahead`` feature rows keyed by "<place>, <date>, <time>".

    Args:
        forecast: parsed API response with a ``days`` list whose items carry the
            daily fields and an ``hours`` list.
        place: location label used as the key prefix.
        first_hour: index of the first hour to include, counted from midnight of
            ``days[0]``; 24 and above rolls over into ``days[1]``.
        hours_ahead: number of consecutive hours to collect.

    Raises:
        KeyError / IndexError: if the response lacks a required field, day or hour.
    """
    now = dt.datetime.now()
    run_date = str(now.date())
    rows = {}
    for hour_number in range(first_hour, first_hour + hours_ahead):
        # Hours past 23 belong to the next entry in `days`.
        day_index, hour_index = divmod(hour_number, 24)
        day = forecast['days'][day_index]
        hour = day['hours'][hour_index]
        # NOTE(review): this is day-of-month modulo 7, not the calendar weekday.
        # It is kept so the value stays consistent with the two branches that
        # worked; the third branch called `dt.today().day()`, which raises
        # AttributeError because `dt` is the datetime module. Confirm against the
        # training pipeline whether `.weekday()` was intended.
        day_of_week = (now + dt.timedelta(days=day_index)).day % 7
        key = f"{place}, {day['datetime']}, {hour['datetime']}"
        rows[key] = _weather_row(day, hour, day_of_week, run_date)
    return rows


# `curr_hour` is only read here, never advanced, so re-running this cell yields
# the same rows.
result = _collect_weather_rows(response_json, location, curr_hour)
print(json.dumps(result, indent = 4))

{
    "Kyiv, 2024-04-21, 17:00:00": {
        "day_tempmax": 9.1,
        "day_tempmin": 5.3,
        "day_temp": 7.4,
        "day_dew": 5.0,
        "day_humidity": 85.1,
        "day_precip": 27.8,
        "day_precipcover": 83.33,
        "day_snow": 0.0,
        "day_windgust": 52.9,
        "day_windspeed": 20.5,
        "day_winddir": 345.3,
        "day_pressure": 1005.5,
        "day_cloudcover": 100.0,
        "day_visibility": 11.9,
        "day_solarradiation": 21.1,
        "day_solarenergy": 1.8,
        "day_uvindex": 1.0,
        "day_moonphase": 0.41,
        "hour_temp": 7.8,
        "hour_humidity": 89.61,
        "hour_dew": 6.2,
        "hour_precip": 0.0,
        "hour_precipprob": 93.5,
        "hour_snow": 0.0,
        "hour_snowdepth": 0.0,
        "hour_windgust": 33.5,
        "hour_windspeed": 13.3,
        "hour_winddir": 314.9,
        "hour_pressure": 1003.0,
        "hour_visibility": 18.8,
        "hour_cloudcover": 100.0,
        "hour_solarradiation":

In [206]:
# One row per forecast hour, in the insertion order of `result`; columns follow
# the key order of the row dicts. Building the frame straight from the dicts
# avoids the deprecated pd.read_json(<literal string>) round trip.
weather_df = (
    pd.DataFrame(list(result.values()))
    # 'date1' is a date string, not a model feature.
    .drop(columns=['date1'])
    # csr_matrix needs a numeric dtype, so force every column to float64.
    .astype('float64')
    # Missing readings become 0. The original called fillna(0) but discarded the
    # result, so NaNs were left in the frame.
    .fillna(0)
)
object_columns = weather_df.select_dtypes(include=['object']).columns

# Print the columns with object dtype
print("Columns with object dtype ('O'):", object_columns)

Columns with object dtype ('O'): Index([], dtype='object')


  df = pd.read_json(data)
  weather_df.fillna(0)


In [207]:
# Keep the weather feature names in column order.
weather_columns = weather_df.columns.tolist()
# Dense NumPy array of the weather features.
weather_matrix = weather_df.to_numpy()

# Convert to CSR so it can be stacked next to the sparse word features.
sparse_weather = csr_matrix(weather_matrix)
sparse_weather.shape

(12, 35)

In [210]:
# Repeat the single word-feature row 12 times so it has one copy per weather row.
new_sparse_matrix = vstack([first_row] * 12)
new_sparse_matrix

<12x6099 sparse matrix of type '<class 'numpy.float64'>'
	with 10260 stored elements in Compressed Sparse Row format>

In [213]:
# merge weather matrix with words matrix
combine_matrix = sp.hstack((new_sparse_matrix, sparse_weather))
combine_matrix.shape

(12, 6134)

In [216]:
# Predicted label and class probabilities for every row of the combined matrix.
y_pred_5 = loaded_model.predict(combine_matrix)
y_pred_proba = loaded_model.predict_proba(combine_matrix)

# Iterate over the predictions themselves rather than a hard-coded range(12), so
# the report follows whatever number of rows was scored.
for example_number, (prediction, probability) in enumerate(zip(y_pred_5, y_pred_proba), start=1):
    print("Example {}: Prediction: {}, Probability: {}".format(example_number, prediction, probability))

Example 1: Prediction: 0.0, Probability: [0.56615921 0.43384079]
Example 2: Prediction: 0.0, Probability: [0.62543823 0.37456177]
Example 3: Prediction: 0.0, Probability: [0.61909756 0.38090244]
Example 4: Prediction: 0.0, Probability: [0.61340789 0.38659211]
Example 5: Prediction: 0.0, Probability: [0.60820065 0.39179935]
Example 6: Prediction: 0.0, Probability: [0.60175399 0.39824601]
Example 7: Prediction: 0.0, Probability: [0.59176966 0.40823034]
Example 8: Prediction: 0.0, Probability: [0.52290551 0.47709449]
Example 9: Prediction: 0.0, Probability: [0.51617185 0.48382815]
Example 10: Prediction: 0.0, Probability: [0.51617185 0.48382815]
Example 11: Prediction: 0.0, Probability: [0.51617185 0.48382815]
Example 12: Prediction: 0.0, Probability: [0.52290551 0.47709449]


### receive info 