In [59]:
from bs4 import BeautifulSoup
import requests
import numpy as np
import pandas as pd
import calendar
import datetime 

In [98]:
# date
def get_date(soup):
    date_span = soup.find('span', class_='submitted')
    date = date_span.find('span').get('content')
    date_object = datetime.datetime.strptime(date, '%Y-%m-%dT%H:%M:%S%z')
    return date_object.date()

In [85]:
# title
def get_title(soup):
    title_h1 = soup.find('h1', class_='title')
    return title_h1.text

In [86]:
# main_html
def get_html(soup):
    html_element = soup.find('div', class_='field field-name-body field-type-text-with-summary field-label-hidden')
    return html_element

def get_html_text(soup):
    html_element = get_html(soup)
    html_main = html_element.decode() 
    return html_main

In [87]:
# full_url
def get_url(soup):
    full_url_tag = soup.find("link", rel="canonical")
    if full_url_tag:
        full_url = full_url_tag.get("href")
        return full_url

In [88]:
# text
def get_text(soup):
    html_element = get_html(soup)
    html_div = html_element.find('div', class_='field-item even')
    elements = html_div.find_all(['p', 'ul', 'ol', 'div'])
    text = ''
    for element in elements:
        if element.name == 'div' and element.find('hr'):
            break
        text += element.text
    return text

In [89]:
# data for dataframe
def get_data(soup):
    data = []
    data.append(get_date(soup))
    data.append(get_title(soup))
    data.append(get_url(soup))
    data.append(get_html_text(soup))
    data.append(get_text(soup))
    return data

In [90]:
# url for request
def build_url(current_date):
    month_name = calendar.month_name[current_date.month].lower()
    if 26 <= current_date.day <= 27 and current_date.month == 2 and current_date.year == 2022:
        url = f'https://www.understandingwar.org/backgrounder/russia-ukraine-warning-update-russian-offensive-campaign-assessment-{month_name}-{current_date.day}'
    elif current_date.day == 28 and current_date.month == 2 and current_date.year == 2022:
        url = f'https://www.understandingwar.org/backgrounder/russian-offensive-campaign-assessment-{month_name}-{current_date.day}-{current_date.year}'        
    elif current_date.day == 24 and current_date.month == 2 and current_date.year == 2022:
        url = f'https://www.understandingwar.org/backgrounder/russia-ukraine-warning-update-initial-russian-offensive-campaign-assessment'        
    elif current_date.day == 25 and current_date.month == 2 and current_date.year == 2022:
        url = f'https://www.understandingwar.org/backgrounder/russia-ukraine-warning-update-russian-offensive-campaign-assessment-{month_name}-{current_date.day}-{current_date.year}'                 
    elif current_date.year == 2022:
        url = f'https://www.understandingwar.org/backgrounder/russian-offensive-campaign-assessment-{month_name}-{current_date.day}'
    else:
        url = f'https://www.understandingwar.org/backgrounder/russian-offensive-campaign-assessment-{month_name}-{current_date.day}-2023' 
    return url

In [91]:
# dataframe
def get_dataframe(start_date, end_date):
    df = pd.DataFrame(columns=['date', 'title', 'full_url', 'main_html', 'main_text'])
    current_date = start_date
    i = 0
    while current_date <= end_date:
        url = build_url(current_date)
        answer = requests.get(url)
        if not answer.status_code == 200: 
            current_date += datetime.timedelta(days=1)
            continue
        html_text = answer.text     
        soup = BeautifulSoup(html_text, 'lxml')
        df.loc[i] = get_data(soup)
        i += 1
        current_date += datetime.timedelta(days=1)
    return df

In [101]:
start_date = datetime.date(2022, 2, 24)
end_date = datetime.date(2023, 1, 25)
data_frame = get_dataframe(start_date, end_date)
data_frame.head(3)

Unnamed: 0,date,title,full_url,main_html,main_text
0,2022-02-24,Russia-Ukraine Warning Update: Initial Russian...,/backgrounder/russia-ukraine-warning-update-in...,"<div class=""field field-name-body field-type-t...","Mason Clark, George Barros, and Kateryna Step..."
1,2022-02-25,Russia-Ukraine Warning Update: Russian Offensi...,/backgrounder/russia-ukraine-warning-update-ru...,"<div class=""field field-name-body field-type-t...","Mason Clark, George Barros, and Kateryna Stepa..."
2,2022-02-26,Russia-Ukraine Warning Update: Russian Offensi...,/backgrounder/russia-ukraine-warning-update-ru...,"<div class=""field field-name-body field-type-t...","Mason Clark, George Barros, and Katya Stepanen..."


In [103]:
# to save dataframe as csv file
#data_frame.to_csv('data.csv1', index=False)