## Image Generation

In [None]:
# Install pinned versions of transformers and diffusers for this notebook.
!pip install transformers==4.49 diffusers==0.32.2

In [None]:
# Delete the imgs/ directory and everything in it (presumably output from earlier runs — confirm).
!rm -rf imgs/

In [None]:
!rm LandscapeGenerator.py utils.py

In [None]:
# Download utils.py and LandscapeGenerator.py from the main branch of the
# thiagohersan/media-landscape repository into the working directory.
!wget https://raw.githubusercontent.com/thiagohersan/media-landscape/refs/heads/main/python/utils.py
!wget https://raw.githubusercontent.com/thiagohersan/media-landscape/refs/heads/main/python/LandscapeGenerator.py

In [None]:
import os
# Paste your Gemini API key between the quotes before running; do not commit a real key.
# NOTE(review): set before the project import below, presumably because it reads the key — confirm.
os.environ["GEMINI_API_KEY"] = ""

from datasets import load_dataset
from LandscapeGenerator import LandscapeGenerator

In [None]:
# Load the "newsdata" split of the image dataset.
newsdata_ds = load_dataset("thiagohersan/newsdata-images", split="newsdata")

# Materialize every row into a plain list so it can be sorted in memory.
newsdata_data = list(newsdata_ds)

# Order rows by pixel area (width * height) of their "image", largest first.
newsdata_data_sorted = sorted(
  newsdata_data,
  key=lambda row: row["image"].size[0] * row["image"].size[1],
  reverse=True,
)

In [None]:
# Other model ids that have been tried here (swap into `inpainting_model`):
#   "runwayml/stable-diffusion-inpainting"
#   "stable-diffusion-v1-5/stable-diffusion-inpainting"
#   "stabilityai/stable-diffusion-2-inpainting"
inpainting_model = "stable-diffusion-v1-5/stable-diffusion-inpainting"

# Build the generator from the size-sorted image rows and the chosen model id.
mLG = LandscapeGenerator(newsdata_data_sorted, inpainting_model)

In [None]:
# Run the landscape generation with the generator built above.
# NOTE(review): the meaning of keep_width / size / n / label is defined in
# LandscapeGenerator.py, which is not visible here — confirm before tuning.
mLG.gen_landscape(keep_width=256, size=(1440, 512), n=4, label="test")

## Read from newsdata.io

In [None]:
import os
# Paste your newsdata.io API key between the quotes before running; do not commit a real key.
# NOTE(review): presumably read by utils.get_articles — confirm.
os.environ["NEWSDATA_API_KEY"] = ""

import json

from utils import get_articles

In [None]:
# Query terms of interest; presumably `q` below is set to one of these per run — confirm.
flavors = ["catastrophe", "disaster", "drought", "rain"]

q = "rain"
n_articles = 200

# Create the output directory up front so a missing folder cannot make the
# write fail after the articles have already been fetched.
os.makedirs("./data", exist_ok=True)

res = get_articles(q=q, cat="environment", n_articles=n_articles)

# ensure_ascii=False writes non-ASCII characters verbatim, so the file encoding
# is pinned to UTF-8 instead of relying on the platform default.
with open(f"./data/newsdata_{q}_{n_articles}.json", "w", encoding="utf-8") as ofp:
  json.dump(res, ofp, ensure_ascii=False)

## Read news from JSON

In [None]:
import os
# Paste your Gemini API key between the quotes before running; do not commit a real key.
# NOTE(review): presumably read by utils.get_img_description — confirm.
os.environ["GEMINI_API_KEY"] = ""

import json
import time

from utils import get_articles_with_top_words, get_article_images_by_size
from utils import get_img_description

In [None]:
# Read the saved articles back; the encoding is explicit because the JSON may
# contain raw non-ASCII characters and the platform default is not always UTF-8.
with open("./data/newsdata_rain_200.json", "r", encoding="utf-8") as ifp:
  newsdata_res = json.load(ifp)

# Select article indices via the top-words helper.
art_idxs = get_articles_with_top_words(newsdata_res, n_words=14, n_articles=4)

display(art_idxs)

# Number of distinct selected indices out of the total number of loaded articles.
print(len(set(art_idxs.values.reshape(-1))), "/", len(newsdata_res))

In [None]:
# Duplicate check: (total number of articles, number of distinct article ids).
article_ids = [x["article_id"] for x in newsdata_res]
len(article_ids), len(set(article_ids))

In [None]:
# Substrings that mark a description as a graphic rather than a photograph.
# Matching is done on lower-cased text so capitalized words are caught as well.
REJECTED_TERMS = (
  "logo",
  "hologra",
  "branding",
  "line art",
  "typograph",
  "illustrat",
  "digital art",
  "graphic design",
  "graphic overlay",
)

MIN_WIDTH = 640
MIN_HEIGHT = 480
TARGET_HEIGHT = 512

imgs_by_size = get_article_images_by_size(newsdata_res, art_idxs.values.reshape(-1), limit=30)

img_data = []

for img in imgs_by_size:
  iw, ih = img["image"].size

  # Keep only landscape-oriented images of at least MIN_WIDTH x MIN_HEIGHT.
  if ih > iw or ih < MIN_HEIGHT or iw < MIN_WIDTH:
    continue

  # Scale taller images down to TARGET_HEIGHT, preserving the aspect ratio.
  if ih > TARGET_HEIGHT:
    img_512 = img["image"].resize((int(iw / ih * TARGET_HEIGHT), TARGET_HEIGHT))
  else:
    img_512 = img["image"]

  # The resize can make the image too narrow; re-check the width.
  if img_512.size[0] < MIN_WIDTH:
    continue

  description = get_img_description(img_512)
  img_content = description["content"]
  img_style = description["style"]
  # Pause after every description request (presumably API rate limiting — confirm).
  time.sleep(7)

  # Skip images with a missing description.
  if img_content == "" or img_style == "":
    continue

  # Skip images whose content or style description mentions a rejected term.
  texts = (img_content.lower(), img_style.lower())
  if any(term in text for text in texts for term in REJECTED_TERMS):
    continue

  display(img_512)

  # Look the source article up once instead of once per field.
  article = newsdata_res[img["idx"]]

  img_data.append({
    "article_id": article["article_id"],
    "title": article["title"],
    "pubDate": article["pubDate"],
    "image": img_512,
    "content": img_content,
    "style": img_style,
  })

len(img_data)

## Push to Hugging Face Hub

In [None]:
from datasets import Dataset, concatenate_datasets, load_dataset

In [None]:
# Build a Dataset from the collected image records and push it as the "newsdata" split.
# NOTE(review): this pushes only `img_data`; the next cell instead merges with what is
# already on the Hub. Presumably only one of the two cells should be run — confirm.
dataset = Dataset.from_list(img_data)
dataset.push_to_hub("thiagohersan/newsdata-images", split="newsdata")

In [None]:
# Fetch the split that is currently on the Hub.
dataset_hf = load_dataset("thiagohersan/newsdata-images", split="newsdata")

# Append the newly collected image records to it and push the combined split back.
dataset_new = Dataset.from_list(img_data)
dataset = concatenate_datasets([dataset_hf, dataset_new])
dataset.push_to_hub("thiagohersan/newsdata-images", split="newsdata")