## Image Generation

In [None]:
!pip install transformers==4.49 diffusers==0.32.2

In [None]:
# !rm -rf imgs/

In [None]:
# !rm LandscapeGenerator.py utils.py

In [None]:
# !wget https://raw.githubusercontent.com/thiagohersan/media-landscape/refs/heads/main/python/utils.py
# !wget https://raw.githubusercontent.com/thiagohersan/media-landscape/refs/heads/main/python/LandscapeGenerator.py

In [None]:
import os

# Keep a key that is already exported to the kernel's environment; only fall
# back to the empty placeholder when none is set. Never commit a real key here.
os.environ.setdefault("GEMINI_API_KEY", "")

from datasets import load_dataset
from PIL import Image as PImage

from LandscapeGenerator import LandscapeGenerator

In [None]:
newsdata_ds = load_dataset("thiagohersan/newsdata-images", split="newsdata")

# Pull every record of the split into a plain Python list.
newsdata_data = list(newsdata_ds)

# Order a copy of the records by image area (width * height), largest first.
newsdata_data_sorted = list(newsdata_data)
newsdata_data_sorted.sort(
  key=lambda record: record["image"].size[0] * record["image"].size[1],
  reverse=True,
)

In [None]:
# "runwayml/stable-diffusion-inpainting"
# "stable-diffusion-v1-5/stable-diffusion-inpainting"
# "stabilityai/stable-diffusion-2-inpainting"
# Checkpoint id handed to the generator together with the size-sorted records.
inpainting_model_id = "stable-diffusion-v1-5/stable-diffusion-inpainting"
mLG = LandscapeGenerator(newsdata_data_sorted, inpainting_model_id)

### New landscape

In [None]:
# Start a new landscape from scratch under the label "test00".
new_landscape_args = dict(
  label="test00",
  n=8,
  size=(1440, 512),
  keep_width=256,
)
mLG.gen_landscape(**new_landscape_args)

### Continue with seed

In [None]:
# Use a previously generated frame as the seed image for the next run.
seed = PImage.open("./imgs/test01/test01_07.jpg")
mLG.gen_landscape(
  seed_img=seed,
  label="test02",
  n=8,
  size=(1440, 512),
  keep_width=256,
)

### Graft

In [None]:
# Two neighbouring frames to be joined by inpainting.
# NOTE(review): the original paths mixed a "test02" directory with "test01_"
# file names, unlike the imgs/<label>/<label>_<nn>.jpg layout used for the seed
# image in the previous section. Assuming the "test01" run was intended — confirm.
graft_label = "test01"
limg = PImage.open(f"./imgs/{graft_label}/{graft_label}_01.jpg")
rimg = PImage.open(f"./imgs/{graft_label}/{graft_label}_02.jpg")
display(rimg)

# prep_graft returns the prompt plus the input image and mask for the pipeline.
prompt,img_in,mask_in = mLG.prep_graft(limg, rimg, keep_width=256, right_offset=20)
display(img_in)
display(mask_in)

In [None]:
# Run the inpainting pipeline on the prepared graft image and mask, at the
# exact pixel size of the input image.
graft_w, graft_h = img_in.size
pipe_args = dict(
  prompt=prompt,
  negative_prompt="repetitive, distortion, glitch, borders, stretched, frames, breaks, multiple rows, gore, zombies, violence, splits, maps, diagrams, text, font, logos, branding",
  image=img_in,
  mask_image=mask_in,
  width=graft_w,
  height=graft_h,
  guidance_scale=12.0,
  num_inference_steps=32,
  num_images_per_prompt=1,
)
output = mLG.pipe(**pipe_args)
output.images[0]

In [None]:
grafted = output.images[0]
iw,ih = grafted.size

# Cut away the leftmost 256 px (the same value as keep_width passed to
# prep_graft) and save the remainder.
crop_box = (256, 0, iw, ih)
grafted.crop(crop_box).save("imgs/test01_grafted2.jpg")

## Read from newsdata.io

In [None]:
import json

from datetime import datetime
from os import makedirs

from utils import get_articles

In [None]:
# Results are stored per day under ./data/<YYYY-MM-DD>/.
date = datetime.now().strftime("%Y-%m-%d")
makedirs(f"./data/{date}", exist_ok=True)

# Query terms of interest; only `q` is fetched by this cell.
queries = ["catastrophe", "disaster", "drought", "rain"]
q = "rain"

res = get_articles(q=q, cat="environment", n_articles=200)

# ensure_ascii=False writes non-ASCII characters verbatim, so the file encoding
# is pinned to UTF-8 instead of depending on the platform's locale default.
with open(f"./data/{date}/newsdata_{q}.json", "w", encoding="utf-8") as ofp:
  json.dump(res, ofp, ensure_ascii=False)

## Read news from JSON

In [None]:
import json
import time

from datetime import datetime

from utils import get_articles_with_top_words, get_article_images_by_size
from utils import get_img_description

In [None]:
# Reads today's dump, i.e. ./data/<YYYY-MM-DD>/newsdata_<q>.json.
date = datetime.now().strftime("%Y-%m-%d")
queries = ["catastrophe", "disaster", "drought", "rain"]
q = "rain"

# The dump contains raw non-ASCII text (written with ensure_ascii=False), so
# decode it as UTF-8 explicitly rather than with the locale default.
with open(f"./data/{date}/newsdata_{q}.json", "r", encoding="utf-8") as ifp:
  newsdata_res = json.load(ifp)

art_idxs = get_articles_with_top_words(newsdata_res, n_words=10, n_articles=8)

display(art_idxs)

# Number of distinct selected article indices vs. total number of articles.
print(len(set(art_idxs.values.reshape(-1))), "/", len(newsdata_res))

In [None]:
# Lower-case substrings that disqualify an image when they occur in its
# generated content or style description.
REJECT_TERMS = (
  "logo", "hologra", "branding", "line art", "typograph",
  "illustrat", "digital art", "graphic design", "graphic overlay",
)

imgs_by_size = get_article_images_by_size(newsdata_res, art_idxs.values.reshape(-1), limit=30)

img_data = []

for img in imgs_by_size:
  iw,ih = img["image"].size
  # Skip portrait images and anything smaller than 640x480.
  if ih > iw or ih < 480 or iw < 640:
    continue

  # Scale images taller than 512 px down to a height of 512, keeping the aspect ratio.
  if ih > 512:
    img_512 = img["image"].resize((int(iw/ih*512), 512))
  else:
    img_512 = img["image"]

  # The resize may have made the image too narrow.
  if img_512.size[0] < 640:
    continue

  description = get_img_description(img_512)
  img_content = description["content"]
  img_style = description["style"]
  # Pause after every description request (presumably to respect a rate limit — confirm).
  time.sleep(7)

  if img_content == "" or img_style == "":
    continue

  # Match case-insensitively so capitalised words ("Logo", "Illustration") are caught too.
  searchable = (img_content.lower(), img_style.lower())
  if any(term in text for term in REJECT_TERMS for text in searchable):
    continue

  article = newsdata_res[img["idx"]]

  # Show a one-third-size preview with the article id so unwanted images can
  # be picked out by id afterwards.
  i512w,i512h = img_512.size
  display(img_512.resize((i512w//3, i512h//3)))
  print(article["article_id"])

  img_data.append({
    "article_id" : article["article_id"],
    "title": article["title"],
    "description": article["description"],
    "source_name": article["source_name"],
    "pubDate": article["pubDate"],

    "image": img_512,
    "content": img_content,
    "style": img_style,
  })

len(img_data)

In [None]:
# Article ids to exclude from img_data.
to_remove = [
  "2c2100e117f3836cd72fd13b63b26e3a",
  "00fcec63a1093d16d716eb5c4e2d6f02",
  "5a697149df423f3fd7f4a09a1dfff176",
  "ed9076bfb73e9eb1978067ded3d4be86",
  "e4f19539be2d1de1691fa057c74a40da"
]

# Keep every record whose article id is not in the list above.
filtered_img_data = list(
  filter(lambda record: record["article_id"] not in to_remove, img_data)
)

print(len(filtered_img_data))

In [None]:
# Article ids to retain from img_data; everything else is dropped.
to_keep = [
  "73d7e90a242e1a8d20f4003708e78d52",
  "f8e31ed86e3eb0e50fe26bc5300240db",
  "7d7325580f92627e3770edf9ed228ab8",
  "98fb216757839f99797f2b150902fd0c",
  "03330f8751f778873de18e3c7a8a39a9"
]

# Filters img_data directly, so this replaces any filtered_img_data assigned earlier.
filtered_img_data = list(
  filter(lambda record: record["article_id"] in to_keep, img_data)
)

print(len(filtered_img_data))

## Push to HF

In [None]:
from datasets import Dataset, concatenate_datasets, load_dataset

In [None]:
# Build a dataset from the selected records only and upload it as the "newsdata" split.
# NOTE(review): presumably this replaces whatever the split currently holds on
# the Hub — confirm before running.
dataset = Dataset.from_list(filtered_img_data)
dataset.push_to_hub(repo_id="thiagohersan/newsdata-images", split="newsdata")

In [None]:
# Append the selected records to the split that is already on the Hub.
dataset_hf = load_dataset("thiagohersan/newsdata-images", split="newsdata")

# Skip articles that are already in the split so re-running this cell does not
# append duplicate rows.
if "article_id" in dataset_hf.column_names:
  known_ids = set(dataset_hf["article_id"])
else:
  known_ids = set()
new_records = [x for x in filtered_img_data if x["article_id"] not in known_ids]

if new_records:
  dataset = concatenate_datasets([dataset_hf, Dataset.from_list(new_records)])
  dataset.push_to_hub("thiagohersan/newsdata-images", split="newsdata")
else:
  # Nothing new to upload; leave the remote split untouched.
  dataset = dataset_hf

print(len(new_records), "new /", len(dataset), "total")