In [1]:
# The first part of this script collects the homepages from the Internet Archive and saves them in a .txt file.

import codecs
import json
import os
import time
import urllib.error
import urllib.request
from collections import Counter
from typing import Optional

import pandas as pd
from bs4 import BeautifulSoup

# The name of the website as it appears in the URL, for example nytimes, guardian, lastampa.

website_name = "arabic.rt"

# The precise homepage from which you want to collect news (this step is needed because
# the homepage can sometimes live under a sub-path, e.g. theguardian.com/uk).
url_website = "arabic.rt.com"

# Timestamp of the Wayback Machine snapshot to fetch; it is interpolated into the
# archive URL as /web/<timestamp>/ (the value looks like YYYYMMDDhhmmss).
yearmonthday_link = "20240406235320"

In [2]:
import urllib.request

def get_url_content(url: str, timeout: float = 60.0) -> Optional[str]:
    """Download ``url`` and return the response body as text.

    The request is sent with a desktop Chrome ``User-Agent`` header.

    Args:
        url: Absolute URL to fetch.
        timeout: Seconds to wait on the connection before giving up, so a
            stalled server cannot block the notebook forever.

    Returns:
        The decoded body, or ``None`` when the request fails with an
        ``HTTPError``, ``URLError`` or a lower-level ``OSError`` (the reason
        is printed).
    """
    # Define a Chrome user agent
    chrome_agent = 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/91.0.4472.124 Safari/537.36'

    # Use a private opener rather than install_opener(): installing replaces the
    # process-wide default opener on every call, which is an unnecessary global
    # side effect for a helper that only needs one custom header.
    opener = urllib.request.build_opener()
    opener.addheaders = [('User-Agent', chrome_agent)]

    try:
        with opener.open(url, timeout=timeout) as response:
            raw_body = response.read()
            # Prefer the charset announced by the server; fall back to UTF-8.
            charset = response.headers.get_content_charset() or 'utf-8'
    except urllib.error.HTTPError as e:
        print(f"HTTP Error {e.code}: {e.reason}")
        return None
    except urllib.error.URLError as e:
        print(f"URL Error: {e.reason}")
        return None
    except OSError as e:
        # Timeouts / connection resets while reading are not wrapped in URLError.
        print(f"Connection Error: {e}")
        return None

    try:
        # errors='replace' keeps the page usable when it contains a few bad bytes.
        return raw_body.decode(charset, errors='replace')
    except LookupError:
        # The server announced a charset name Python does not know.
        return raw_body.decode('utf-8', errors='replace')


def show_articles(article_list):
    """Print each article as a numbered block of ``key: value`` lines.

    Blocks are numbered from 1 and each one is terminated by a ``---`` line.
    """
    for position, entry in enumerate(article_list, start=1):
        block_lines = [f"Article {position}:"]
        block_lines.extend(f"{key}: {entry[key]}" for key in entry)
        block_lines.append("---")
        print("\n".join(block_lines))


### Download one page

In [3]:
# Build the Wayback Machine snapshot URL, e.g.
# https://web.archive.org/web/20240402230330/https://arabic.rt.com/
url = f"https://web.archive.org/web/{yearmonthday_link}/https://{url_website}"

# Fetch the snapshot; get_url_content gives None when the request fails.
html_content = get_url_content(url)
if html_content:
    # Show only the first 500 characters as a sanity check.
    preview = html_content[:500]
    print(preview)


<!DOCTYPE html>
<html prefix="og: http://ogp.me/ns#" lang="ar"><head><!-- is_embed=False -->
<script>
  const observer = new PerformanceObserver((list) => {
    list.getEntries().forEach((entry) => {
      console.log('%o', entry);
    })
  });
  observer.observe({type: "navigation", buffered: true});
</script>
<script src="//archive.org/includes/athena.js" type="text/javascript"></script>
<script type="text/javascript">window.addEventListener('DOMContentLoaded',function(){var v=archive_analytic


### Extract last-news_list
Use the following keywords (HTML fragments) to locate the two lists:

    list 1 keyword. <div class="last-news last-news_scroll"><ul class="last-news_list">
    list 2 keyword. <div class="tabs__content tabs__content_2"><div class="last-news last-news_scroll"><ul class="last-news_list">

In [4]:
def extract_last_news(html_content, selector="ul.last-news_list li a"):
    """Collect the news entries matched by ``selector`` in a page.

    Args:
        html_content: HTML source of the page.
        selector: CSS selector matching the ``<a>`` element of each entry.

    Returns:
        A list of dicts with the keys ``"link"`` (the anchor's ``href``),
        ``"title"`` (text of the anchor's first ``<span>``) and ``"time"``
        (text of the anchor's first ``<time>``). A missing ``<span>`` or
        ``<time>`` yields an empty string instead of raising.
    """
    def _clean(node):
        # Tag navigation returns None when the child tag is absent.
        if node is None:
            return ""
        # Collapse newlines and runs of whitespace into single spaces. The
        # previous code replaced the literal "/n" (a typo for "\n"), so
        # newlines were never actually removed.
        return " ".join(node.get_text().split())

    soup = BeautifulSoup(html_content, 'html.parser')
    return [
        {
            "link": art.get('href'),
            "title": _clean(art.span),
            "time": _clean(art.time),
        }
        for art in soup.select(selector)
    ]


def extract_news_article(relative_link: str, verbose: bool = False) -> str:
    """Download one archived article and return its paragraphs as plain text.

    Args:
        relative_link: The article ``href`` taken from the archived page. It
            is usually relative to ``https://web.archive.org`` (starting with
            ``/web/...``), but hrefs that are already absolute are accepted.
        verbose: When true, print the URL being downloaded and the extracted
            text.

    Returns:
        The text of every ``<p>`` under
        ``div.article div.js-mediator-article``, one paragraph per line.

    Raises:
        ValueError: If the page could not be downloaded.
    """
    # Generate the complete web.archive.org link. Some hrefs on the archived
    # page are already absolute; prefixing those would produce a broken URL.
    if relative_link.startswith(("http://", "https://")):
        link = relative_link
    else:
        link = f"https://web.archive.org{relative_link}"
    if verbose:
        print("downloading ", link)

    article = get_url_content(link)
    if article is None:
        # Fail with a clear message instead of an obscure parser TypeError.
        raise ValueError(f"no content downloaded from {link}")

    # extract article
    soup = BeautifulSoup(article, 'html.parser')
    selector = "div.article div.js-mediator-article p"
    article_detail = soup.select(selector)

    # clean the text, and return the plain article
    plain_article = "\n".join(p.text.replace("\n", "").strip() for p in article_detail)

    if verbose:
        print(plain_article)

    return plain_article


In [5]:
def extract_last_news_article(article_list, delay_seconds=10, verbose=True):
    """Download the body text of every article that does not have one yet.

    Each dict in ``article_list`` is updated in place: ``art['text']`` is set
    to the downloaded text, or to ``""`` when the download fails, so the
    ``text`` key always exists and a later call retries the failed ones.

    Args:
        article_list: List of dicts, each with a ``"link"`` key.
        delay_seconds: Pause between two consecutive downloads; the
            connection is refused if downloads are too frequent.
        verbose: Forwarded to ``extract_news_article``.

    Returns:
        The same ``article_list`` object.
    """
    downloads_started = 0
    for art in article_list:
        # skip the downloading if it is already downloaded
        existing_text = art.get("text")
        if isinstance(existing_text, str) and len(existing_text) > 1:
            continue

        # Throttle only between real downloads (not before the first one and
        # not after the last one).
        if downloads_started and delay_seconds > 0:
            time.sleep(delay_seconds)
        downloads_started += 1

        try:
            art['text'] = extract_news_article(art['link'], verbose=verbose)
        except Exception as exc:
            # `except Exception` (not a bare except) so Ctrl-C / kernel
            # interrupt still stops the loop; the reason is reported.
            print(f"error in downloading {art.get('link')}: {exc!r}")
            art['text'] = ""

    return article_list

def save_article_to_csv(article_list, yearmonthday_str, filename="last_news.csv"):
    """Write the articles to ``<cwd>/web/<yearmonthday_str>/<filename>`` as CSV.

    Args:
        article_list: List of article dicts; the columns ``title``, ``time``,
            ``link`` and ``text`` are written in that order, and keys that
            are missing become empty cells.
        yearmonthday_str: Snapshot timestamp used as the sub-folder name.
        filename: Name of the CSV file inside that folder.
    """
    columns = ["title", "time", "link", "text"]
    # reindex() tolerates missing columns (e.g. no article has "text" yet),
    # where plain column selection would raise KeyError.
    df = pd.DataFrame(article_list).reindex(columns=columns)

    # Use the function argument; the previous code ignored it and read the
    # notebook-level variable `yearmonthday_link` instead.
    folder = os.path.join(os.getcwd(), "web", str(yearmonthday_str))
    # exist_ok so re-running the cell does not fail on the existing folder.
    os.makedirs(folder, exist_ok=True)
    df.to_csv(os.path.join(folder, filename))

In [6]:
# Pipeline for the "last news" list of the downloaded homepage:
# 1) collect link/title/time of every entry,
# 2) download each article body (the dicts in article_list are updated in place, so
#    re-running this step after an interruption skips articles that already have text),
# 3) save the result under web/<snapshot timestamp>/last_news.csv.
article_list = extract_last_news(html_content, selector="ul.last-news_list li a")
extract_last_news_article(article_list)
save_article_to_csv(article_list, yearmonthday_link, filename="last_news.csv")

downloading  https://web.archive.org/web/20240406235320/https://arabic.rt.com/world/1553851-%D8%A2%D8%AE%D8%B1-%D8%AA%D8%B7%D9%88%D8%B1%D8%A7%D8%AA-%D8%A7%D9%84%D8%B9%D9%85%D9%84%D9%8A%D8%A9-%D8%A7%D9%84%D8%B9%D8%B3%D9%83%D8%B1%D9%8A%D8%A9-%D8%A7%D9%84%D8%B1%D9%88%D8%B3%D9%8A%D8%A9-%D9%81%D9%8A-%D8%A3%D9%88%D9%83%D8%B1%D8%A7%D9%86%D9%8A%D8%A7-05042024/
HTTP Error 403: Forbidden
error in downloading /web/20240406235320/https://arabic.rt.com/world/1553851-%D8%A2%D8%AE%D8%B1-%D8%AA%D8%B7%D9%88%D8%B1%D8%A7%D8%AA-%D8%A7%D9%84%D8%B9%D9%85%D9%84%D9%8A%D8%A9-%D8%A7%D9%84%D8%B9%D8%B3%D9%83%D8%B1%D9%8A%D8%A9-%D8%A7%D9%84%D8%B1%D9%88%D8%B3%D9%8A%D8%A9-%D9%81%D9%8A-%D8%A3%D9%88%D9%83%D8%B1%D8%A7%D9%86%D9%8A%D8%A7-05042024/
downloading  https://web.archive.org/web/20240406235320/https://arabic.rt.com/world/1554031-%D9%86%D8%A7%D8%A6%D8%A8-%D9%88%D8%B2%D9%8A%D8%B1-%D8%A7%D9%84%D8%AF%D9%81%D8%A7%D8%B9-%D8%A7%D9%84%D8%A8%D8%B1%D9%8A%D8%B7%D8%A7%D9%86%D9%8A-%D9%8A%D8%A4%D9%83%D8%AF-%D8%B9%D8%AF%D9%85-%

In [18]:
# Inspect the href of the third <a> element of the downloaded page.
# The anchors are selected right here so that this cell does not depend on
# `link_list` having been created by a cell that appears later in the notebook.
link_list = BeautifulSoup(html_content, 'html.parser').select("a")
link_list[2].get("href")


'https://web.archive.org/web/20240306235224/https://arabic.rt.com/'

In [23]:
# Gather every anchor of the page and keep the hrefs longer than 120 characters.
soup = BeautifulSoup(html_content, 'html.parser')

selector = "a"
link_list = soup.select(selector)

res = []
for link in link_list:
    sl = link.get('href')
    # Anchors without an href, or with a short one, are ignored.
    if sl is None or len(sl) <= 120:
        continue
    print(sl)
    res.append(sl)

/web/20240406235320/https://arabic.rt.com/prg/program/792233-%D8%A7%D8%B3%D8%AA%D9%88%D8%AF%D9%8A%D9%88-%D8%A8%D9%8A%D8%B1%D9%88%D8%AA/
/web/20240406235320/https://arabic.rt.com/prg/program/786853-%D8%A7%D8%B3%D8%AA%D9%88%D8%AF%D9%8A%D9%88-%D8%A7%D9%84%D9%82%D8%A7%D9%87%D8%B1%D8%A9/
/web/20240406235320/https://arabic.rt.com/prg/program/783733-%D8%A7%D8%B3%D8%AA%D9%88%D8%AF%D9%8A%D9%88-%D9%88%D8%A7%D8%B4%D9%86%D8%B7%D9%86/
https://web.archive.org/web/20240406235320/https://arabic.rt.com/prg/program/1514747-%D8%A7%D8%B3%D8%AA%D9%88%D8%AF%D9%8A%D9%88-%D8%A7%D9%84%D8%AC%D8%B2%D8%A7%D8%A6%D8%B1/
/web/20240406235320/https://arabic.rt.com/world/1554017-%D9%87%D9%8A%D8%A6%D8%A9-%D8%A8%D8%AD%D8%B1%D9%8A%D8%A9-%D8%A8%D8%B1%D9%8A%D8%B7%D8%A7%D9%86%D9%8A%D8%A9-%D8%AA%D9%84%D9%82%D9%8A%D9%86%D8%A7-%D8%A8%D9%84%D8%A7%D8%BA%D8%A7-%D8%B9%D9%86-%D8%AD%D8%A7%D8%AF%D8%AB%D8%A9-%D8%AC%D9%86%D9%88%D8%A8-%D8%BA%D8%B1%D8%A8-%D8%A7%D9%84%D8%AD%D8%AF%D9%8A%D8%AF%D8%A9-%D8%A7%D8%B3%D8%AA%D9%87%D8%AF%D8%A7%D9%81

In [24]:
# Number of hrefs longer than 120 characters that were collected in `res`.
len(res)

153

In [13]:
# Diagnostic: print the default CA bundle path exposed by requests.utils
# (presumably checked while debugging HTTPS/certificate problems — TODO confirm).
from requests.utils import DEFAULT_CA_BUNDLE_PATH
print(DEFAULT_CA_BUNDLE_PATH)

/home/zhou/miniconda3/lib/python3.9/site-packages/certifi/cacert.pem


### Download Ticker-list

    key:  <div class="ticker_list-wrap"><ul class="ticker_list">

### Download Important News
    key:  <div class="important-news" data-tb-region="Important news">

In [117]:
# Parse the downloaded page HTML into a BeautifulSoup tree.
soup = BeautifulSoup(html_content, 'html.parser')

In [None]:
def extract_last_news(html_content, selector="ul.last-news_list li a"):
    """Collect the news entries matched by ``selector`` in a page.

    Args:
        html_content: HTML source of the page.
        selector: CSS selector matching the ``<a>`` element of each entry.

    Returns:
        A list of dicts with the keys ``"link"`` (the anchor's ``href``),
        ``"title"`` (text of the anchor's first ``<span>``) and ``"time"``
        (text of the anchor's first ``<time>``). A missing ``<span>`` or
        ``<time>`` yields an empty string instead of raising.
    """
    def _clean(node):
        # Tag navigation returns None when the child tag is absent.
        if node is None:
            return ""
        # Collapse newlines and runs of whitespace into single spaces. The
        # previous code replaced the literal "/n" (a typo for "\n"), so
        # newlines were never actually removed.
        return " ".join(node.get_text().split())

    soup = BeautifulSoup(html_content, 'html.parser')
    return [
        {
            "link": art.get('href'),
            "title": _clean(art.span),
            "time": _clean(art.time),
        }
        for art in soup.select(selector)
    ]


In [67]:

def extract_articles(html_content):
    """Extract title and link of every ``div.last-telecast__item`` in a page.

    Args:
        html_content: HTML source of the page.

    Returns:
        A list of dicts with the keys ``'title'`` (the item's text with
        newlines, carriage returns and non-breaking spaces removed, or
        ``'No title'`` when nothing is left) and ``'link'`` (the ``href`` of
        the item's first ``<a>``, or ``None`` when the item has no anchor).
    """
    soup = BeautifulSoup(html_content, 'html.parser')
    articles = []

    # Find all article elements
    for article in soup.find_all('div', class_='last-telecast__item'):
        # Extract title
        title_text = article.text.replace("\n", "").replace("\xa0", "").replace("\r", "").strip()
        # The previous test (`is not None or len(...) > 0`) was always true for
        # a string, so the 'No title' fallback could never be used.
        title = title_text if title_text else 'No title'

        # Extract article link; an item without <a> must not abort the loop.
        anchor = article.a
        link_element = anchor.get("href") if anchor is not None else None

        articles.append({
            'title': title,
            'link': link_element
        })

    return articles

# Extract the "last telecast" articles from the downloaded page.
# NOTE(review): this rebinds `article_list`, the same name an earlier cell uses for
# the "last news" entries — confirm that overwriting it is intended.
article_list = extract_articles(html_content)



Article 1:
Title: بالفيديو من ضواحي موسكو.. فتاة تخاطر بحياتها لإنقاذ كلبها من بحيرة يغطيها الجليد
Link: /web/20240405235307/https://arabic.rt.com/videoclub/1553711-%D8%A8%D8%A7%D9%84%D9%81%D9%8A%D8%AF%D9%8A%D9%88-%D9%85%D9%86-%D8%B6%D9%88%D8%A7%D8%AD%D9%8A-%D9%85%D9%88%D8%B3%D9%83%D9%88-%D9%81%D8%AA%D8%A7%D8%A9-%D8%AA%D8%AE%D8%A7%D8%B7%D8%B1-%D8%A8%D8%AD%D9%8A%D8%A7%D8%AA%D9%87%D8%A7-%D9%84%D8%A5%D9%86%D9%82%D8%A7%D8%B0-%D9%83%D9%84%D8%A8%D9%87%D8%A7-%D9%85%D9%86-%D8%A8%D8%AD%D9%8A%D8%B1%D8%A9-%D9%8A%D8%BA%D8%B7%D9%8A%D9%87%D8%A7-%D8%A7%D9%84%D8%AC%D9%84%D9%8A%D8%AF/
---
Article 2:
Title: مشاهد من فيضانات وعمليات إجلاء في مقاطعة أورنبورغ الروسية
Link: /web/20240405235307/https://arabic.rt.com/videoclub/1553719-%D9%85%D8%B4%D8%A7%D9%87%D8%AF-%D9%85%D9%86-%D9%81%D9%8A%D8%B6%D8%A7%D9%86%D8%A7%D8%AA-%D9%88%D8%B9%D9%85%D9%84%D9%8A%D8%A7%D8%AA-%D8%A5%D8%AC%D9%84%D8%A7%D8%A1-%D9%81%D9%8A-%D9%85%D9%82%D8%A7%D8%B7%D8%B9%D8%A9-%D8%A3%D9%88%D8%B1%D9%86%D8%A8%D9%88%D8%B1%D8%BA-%D8%A7%D9%84%D8%B1%

In [None]:
   https://arabic.rt.com/world/1552786-%D9%83%D9%88%D8%B1%D9%8A%D8%A7-%D8%A7%D9%84%D8%B4%D9%85%D8%A7%D9%84%D9%8A%D8%A9-%D8%AA%D8%B9%D9%84%D9%86-%D8%B9%D9%86-%D8%A7%D8%AE%D8%AA%D8%A8%D8%A7%D8%B1-%D9%86%D8%A7%D8%AC%D8%AD-%D9%84%D8%B5%D8%A7%D8%B1%D9%88%D8%AE-%D9%81%D8%B1%D8%B7-%D8%B5%D9%88%D8%AA%D9%8A-%D9%85%D8%AA%D9%88%D8%B3%D8%B7-%D8%A7%D9%84%D9%85%D8%AF%D9%89/
 /web/20240405235307/https://arabic.rt.com/videoclub/1553711-%D8%A8%D8%A7%D9%84%D9%81%D9%8A%D8%AF%D9%8A%D9%88-%D9%85%D9%86-%D8%B6%D9%88%D8%A7%D8%AD%D9%8A-%D9%85%D9%88%D8%B3%D9%83%D9%88-%D9%81%D8%AA%D8%A7%D8%A9-%D8%AA%D8%AE%D8%A7%D8%B7%D8%B1-%D8%A8%D8%AD%D9%8A%D8%A7%D8%AA%D9%87%D8%A7-%D9%84%D8%A5%D9%86%D9%82%D8%A7%D8%B0-%D9%83%D9%84%D8%A8%D9%87%D8%A7-%D9%85%D9%86-%D8%A8%D8%AD%D9%8A%D8%B1%D8%A9-%D9%8A%D8%BA%D8%B7%D9%8A%D9%87%D8%A7-%D8%A7%D9%84%D8%AC%D9%84%D9%8A%D8%AF/