## Read from newsdata.io

In [None]:
import json

from datetime import datetime
from os import makedirs

from utils import get_articles

In [None]:
# Fetch one query's worth of articles and snapshot the raw response under
# today's dated directory (./data/YYYY-MM-DD/).
date = datetime.now().strftime("%Y-%m-%d")
makedirs(f"./data/{date}", exist_ok=True)

# Candidate search terms; only the one assigned to `q` is fetched per run.
queries = ["catastrophe", "disaster", "drought", "heatwave", "pollution", "rain"]
q = "catastrophe"

res = get_articles(q=q, cat="environment", n_articles=200)

# ensure_ascii=False writes non-ASCII characters verbatim, so the file
# encoding is pinned to UTF-8 rather than left to the platform's locale
# default (which may be unable to encode them).
with open(f"./data/{date}/newsdata_{q}.json", "w", encoding="utf-8") as ofp:
  json.dump(res, ofp, ensure_ascii=False)

## Read news from JSON

In [None]:
import json
import time

from datetime import datetime

from utils import get_articles_with_top_words, get_article_images_by_size
from utils import get_img_description

In [None]:
# Load today's saved response for one query and select articles by their
# top words.
date = datetime.now().strftime("%Y-%m-%d")

# Candidate search terms; only the one assigned to `q` is loaded per run.
queries = ["catastrophe", "disaster", "drought", "heatwave", "pollution", "rain"]
q = "catastrophe"

# JSON text is UTF-8 by specification; decode it explicitly instead of
# relying on the platform's locale default encoding.
with open(f"./data/{date}/newsdata_{q}.json", "r", encoding="utf-8") as ifp:
  newsdata_res = json.load(ifp)

# NOTE(review): art_idxs exposes `.values.reshape`, so it is presumably a
# pandas DataFrame of indices into newsdata_res -- confirm in utils.
art_idxs = get_articles_with_top_words(newsdata_res, n_words=8, n_articles=8)

display(art_idxs)

# Number of distinct selected values vs. number of loaded entries.
print(len(set(art_idxs.values.reshape(-1))), "/", len(newsdata_res))

In [None]:
# Substrings that disqualify an image when they occur in either part of its
# generated description (matching is case-sensitive).
blocked_terms = (
  "logo",
  "hologra",
  "branding",
  "line art",
  "typograph",
  "illustrat",
  "digital art",
  "graphic design",
  "graphic overlay",
)

# Article fields copied verbatim into each collected record.
article_fields = ("article_id", "title", "description", "source_name", "pubDate")

imgs_by_size = get_article_images_by_size(newsdata_res, art_idxs.values.reshape(-1), limit=32)

img_data = []

for candidate in imgs_by_size:
  picture = candidate["image"]
  width, height = picture.size

  # Reject portrait images and anything smaller than 640x480.
  if height > width or height < 480 or width < 640:
    continue

  # Scale taller images down to a height of 512, keeping the aspect ratio.
  img_512 = picture.resize((int(width/height*512), 512)) if height > 512 else picture

  # After scaling, the image must still be at least 640 wide.
  if img_512.size[0] < 640:
    continue

  description = get_img_description(img_512)
  content_text = description["content"]
  style_text = description["style"]
  # Pause after every description request (presumably rate limiting -- confirm).
  time.sleep(7)

  texts = (content_text, style_text)
  if (
    content_text == "" or style_text == "" or
    any(term in text for term in blocked_terms for text in texts)
  ):
    continue

  # Show a one-third-size preview followed by the article id it belongs to.
  small_w, small_h = img_512.size
  display(img_512.resize((small_w//3, small_h//3)))

  article = newsdata_res[candidate["idx"]]
  print(article["article_id"], "☝️")

  record = {field: article[field] for field in article_fields}
  record["image"] = img_512
  record["content"] = content_text
  record["style"] = style_text
  img_data.append(record)

len(img_data)

In [None]:
# Deny-list: article ids whose records should be dropped from img_data.
# Replace the empty placeholders with the ids to discard.
to_remove = [
  "",
  "",
]

filtered_img_data = [
  record
  for record in img_data
  if record["article_id"] not in to_remove
]

print(len(filtered_img_data))

In [None]:
# Allow-list: keep only the records whose article id is listed here.
# Assigns the same `filtered_img_data` name, so running this cell replaces
# any earlier filtering result.
to_keep = [
  "",
  "",
]

filtered_img_data = [
  record
  for record in img_data
  if record["article_id"] in to_keep
]

print(len(filtered_img_data))

## Push to HF

In [None]:
from datasets import Dataset, concatenate_datasets, load_dataset

## Create new dataset

In [None]:
# Build a dataset from the filtered records only and publish it as the
# "newsdata" split of the hub repository.
dataset = Dataset.from_list(filtered_img_data)
dataset.push_to_hub(
  repo_id="thiagohersan/newsdata-images",
  split="newsdata",
)

## Add rows

In [None]:
# Append the filtered records to the rows already published on the hub.
dataset_hf = load_dataset("thiagohersan/newsdata-images", split="newsdata")
new_rows = Dataset.from_list(filtered_img_data)
dataset = concatenate_datasets([dataset_hf, new_rows])

# De-duplicate by article_id: a repeated key overwrites the earlier entry, so
# the last row seen for each id is the one that survives.
id2article = {row["article_id"]: row for row in dataset}
unique_rows = list(id2article.values())
dataset = Dataset.from_list(unique_rows)

dataset.push_to_hub(
  repo_id="thiagohersan/newsdata-images",
  split="newsdata",
)

## Manually filter

In [None]:
# Pull the published split and show every image with its article id so that
# unwanted rows can be picked out by eye.
dataset_hf = load_dataset("thiagohersan/newsdata-images", split="newsdata")

print(len(dataset_hf))

for row in dataset_hf:
  display(row["image"])
  print(row["article_id"], "☝️")

In [None]:
# Deny-list for the published rows: article ids to drop from dataset_hf.
# Replace the empty placeholders with the ids to discard.
to_remove = [
  "",
  "",
]

filtered_img_data = [
  row
  for row in dataset_hf
  if row["article_id"] not in to_remove
]

len(filtered_img_data)

In [None]:
# Rebuild the dataset from the rows that survived manual filtering and
# publish it as the "newsdata" split of the hub repository.
dataset = Dataset.from_list(filtered_img_data)
dataset.push_to_hub(
  repo_id="thiagohersan/newsdata-images",
  split="newsdata",
)

## Bulk rename (with git)

In [None]:
from os import listdir
from time import sleep
import subprocess

# Directory holding the images to rename.
DIR = "../www/imgs/2025-10-22"

# Sorted names of the jpg files in DIR that begin with the 20251022 date stamp.
fs = sorted(
  name
  for name in listdir(DIR)
  if name.startswith("20251022") and name.endswith("jpg")
)
len(fs)

In [None]:
# Rename every listed file from the 20251022_ prefix to 20251021_ via
# `git mv`, so the move is recorded in the index.
OLD_PREFIX = "20251022_"
NEW_PREFIX = "20251021_"

for f in fs:
  # Only the leading date stamp is rewritten. Names that lack the full
  # prefix (including the underscore) would map onto themselves, so they
  # are skipped instead of being handed to git.
  if not f.startswith(OLD_PREFIX):
    print(f"skipped (no {OLD_PREFIX} prefix): {f}")
    continue

  nf = NEW_PREFIX + f[len(OLD_PREFIX):]
  result = subprocess.run(["git", "mv", f"{DIR}/{f}", f"{DIR}/{nf}"])

  # Report failures here: the exit status was previously discarded, so a
  # failed move went unnoticed.
  if result.returncode != 0:
    print(f"git mv failed ({result.returncode}): {f} -> {nf}")

  # Short pause between consecutive git invocations.
  sleep(0.2)