In [None]:
import requests
from bs4 import BeautifulSoup
import pandas as pd
from sklearn.feature_extraction.text import TfidfVectorizer
from datetime import datetime
import re

# Download the report page and extract the body text of the article.
url = 'https://www.understandingwar.org/backgrounder/russian-offensive-campaign-assessment-april-25-2025'
# Without a timeout, requests waits indefinitely on a stalled connection.
response = requests.get(url, timeout=30)
# Fail fast on 4xx/5xx responses instead of parsing an error page as if it
# were the report.
response.raise_for_status()
soup = BeautifulSoup(response.content, 'html.parser')

# NOTE(review): a multi-class string passed to `class_` is matched against the
# exact value of the element's class attribute, so this lookup breaks if the
# site reorders or adds classes on the body container -- confirm it still
# matches the live markup.
content_div = soup.find('div', class_='field field-name-body field-type-text-with-summary field-label-hidden')
if content_div is not None:
    # Join the text of every <p> inside the body container with single spaces.
    paragraphs = content_div.find_all('p')
    text = ' '.join(p.get_text() for p in paragraphs)
else:
    # Body container not found: fall back to an empty document.
    text = ''

def preprocess(text):
    """Reduce raw text to a space-separated string of lower-case content words.

    The input is lower-cased, then every character that is neither an ASCII
    letter ``a``-``z`` nor whitespace is deleted outright (not replaced by a
    space, so ``ukraine's`` becomes ``ukraines`` and ``russian-backed``
    becomes ``russianbacked``). The result is split on whitespace and the
    words in the built-in stop list are discarded.

    Args:
        text: The raw text to clean.

    Returns:
        The surviving words joined by single spaces; an empty string when no
        word survives.
    """
    ignored = frozenset((
        'the', 'and', 'to', 'of', 'in', 'a', 'that', 'is', 'for', 'on',
        'with', 'as', 'by', 'at', 'from', 'it', 'an', 'be', 'this', 'are',
        'was', 'or', 'which', 'but', 'has', 'have', 'not', 'were', 'their',
        'they', 'been', 'its', 'he', 'she', 'his', 'her', 'them', 'we', 'us',
        'our', 'you', 'your', 'i', 'me', 'my', 'mine',
    ))
    letters_only = re.sub(r'[^a-z\s]', '', text.lower())
    return ' '.join(
        token for token in letters_only.split() if token not in ignored
    )

# Clean the scraped text and turn it into a single-row TF-IDF vector.
cleaned_text = preprocess(text)

# An empty document makes TfidfVectorizer fail with an opaque
# "empty vocabulary" ValueError; raise the same exception type with a message
# that points at the real cause instead.
if not cleaned_text:
    raise ValueError(
        'No report text was extracted from the page; '
        'cannot compute TF-IDF features.'
    )

# NOTE: the vectorizer is fitted on this one document only, so every term has
# the same inverse document frequency and the resulting scores are effectively
# L2-normalised term counts.
vectorizer = TfidfVectorizer()
tfidf_matrix = vectorizer.fit_transform([cleaned_text])
feature_names = vectorizer.get_feature_names_out()
# The matrix has exactly one row (one document); take it as a dense 1-D array.
tfidf_scores = tfidf_matrix.toarray()[0]

# Terms whose scores are exported as feature columns. The list order is the
# column order of the output row, so do not reorder or deduplicate it casually.
core_words = [
    'russian',
    'ukrainian',
    'forces',
    'ukraine',
    'russia',
    'oblast',
    'military',
    'operations',
    'offensive',
    'bakhmut',
    'likely',
    'war',
    'defense',
    'reported',
    'claimed',
    'stated',
    'continued',
    'kremlin',
    'putin',
    'wagner',
    'president',
    'general',
    'city',
    'donetsk',
    'western',
]

# Build a one-row table: the report date plus one score column per core word.
#
# NOTE: the date is hard-coded and must be kept in sync with the date in the
# report URL. The datetime is naive, so .timestamp() interprets it in the
# machine's local timezone and the stored integer differs between hosts in
# different timezones.
data = {'date': [int(datetime.strptime('2025-04-25', '%Y-%m-%d').timestamp())]}

# vocabulary_ maps each fitted term to its column index, giving an O(1) lookup
# per word instead of rebuilding and scanning a list of all feature names.
vocabulary = vectorizer.vocabulary_
for word in core_words:
    index = vocabulary.get(word)
    # Words absent from the report get an explicit 0.0 score.
    data[word] = [0.0 if index is None else tfidf_scores[index]]

df = pd.DataFrame(data)

# Semicolon-separated output without the DataFrame index column; the target
# directory 'data/' must already exist.
df.to_csv('data/isw_latest_report.csv', sep=';', index=False)