diff --git a/ingest/crawlers/inspectorat-so-org/_entry.png b/ingest/crawlers/inspectorat-so-org/_entry.png new file mode 100644 index 00000000..05b42fa4 Binary files /dev/null and b/ingest/crawlers/inspectorat-so-org/_entry.png differ diff --git a/ingest/crawlers/inspectorat-so-org/_message.png b/ingest/crawlers/inspectorat-so-org/_message.png new file mode 100644 index 00000000..563878ec Binary files /dev/null and b/ingest/crawlers/inspectorat-so-org/_message.png differ diff --git a/ingest/crawlers/inspectorat-so-org/extractors.test.ts b/ingest/crawlers/inspectorat-so-org/extractors.test.ts new file mode 100644 index 00000000..c336de93 --- /dev/null +++ b/ingest/crawlers/inspectorat-so-org/extractors.test.ts @@ -0,0 +1,92 @@ +import { describe, expect, it, vi } from "vitest"; +import { extractPostDetails, extractPostLinks } from "./extractors"; + +interface MockPage { + evaluate: (fn: (...args: any[]) => T, ...args: any[]) => Promise; +} + +function createMockPage(mockEvaluate: any): MockPage { + return { + evaluate: mockEvaluate, + } as MockPage; +} + +describe("inspectorat-so-org/extractors", () => { + describe("extractPostLinks", () => { + it("extracts all valid news links", async () => { + const mockEvaluate = vi.fn().mockResolvedValue([ + { + url: "https://inspectorat-so.org/%D0%BD%D0%BE%D0%B2%D0%B8%D0%BD%D0%B8/?newsid=%D1%80%D0%B5%D0%BC%D0%BE%D0%BD%D1%82-%D0%BD%D0%B0-%D1%83%D0%BB%D0%B8%D1%86%D0%B0", + title: "Ремонт на улица", + date: "06апр.", + }, + { + url: "https://inspectorat-so.org/%D0%BD%D0%BE%D0%B2%D0%B8%D0%BD%D0%B8/?newsid=%D0%BF%D1%80%D0%B0%D0%B7%D0%BD%D0%B8%D0%BA-%D0%B2-%D0%BF%D0%B0%D1%80%D0%BA", + title: "Празник в парк", + date: "05апр.", + }, + ]); + + const page = createMockPage(mockEvaluate) as any; + const posts = await extractPostLinks(page); + + expect(posts).toHaveLength(2); + expect(posts[0].title).toBe("Ремонт на улица"); + expect(posts[1].title).toBe("Празник в парк"); + expect(posts[0].url).toContain("newsid="); + }); + + it("deduplicates by URL", async () => { + const mockEvaluate = vi.fn().mockResolvedValue([ + { + url: "https://inspectorat-so.org/%D0%BD%D0%BE%D0%B2%D0%B8%D0%BD%D0%B8/?newsid=%D0%BC%D0%B8%D0%B5%D0%BD%D0%B5-%D0%BD%D0%B0-%D0%BF%D0%BE%D0%B4%D0%BB%D0%B5%D0%B7", + title: "Първо", + date: "06апр.", + }, + { + url: "https://inspectorat-so.org/%D0%BD%D0%BE%D0%B2%D0%B8%D0%BD%D0%B8/?newsid=%D0%BC%D0%B8%D0%B5%D0%BD%D0%B5-%D0%BD%D0%B0-%D0%BF%D0%BE%D0%B4%D0%BB%D0%B5%D0%B7", + title: "Второ", + date: "06апр.", + }, + ]); + + const page = createMockPage(mockEvaluate) as any; + const posts = await extractPostLinks(page); + + expect(posts).toHaveLength(1); + expect(posts[0].title).toBe("Второ"); + }); + + it("returns empty array when there are no valid newsid URLs", async () => { + const mockEvaluate = vi.fn().mockResolvedValue([ + { + url: "https://inspectorat-so.org/%D0%BA%D0%BE%D0%BD%D1%82%D0%B0%D0%BA%D1%82%D0%B8", + title: "Контакти", + date: "01апр.", + }, + ]); + + const page = createMockPage(mockEvaluate) as any; + const posts = await extractPostLinks(page); + + expect(posts).toEqual([]); + }); + }); + + describe("extractPostDetails", () => { + it("extracts title and content", async () => { + const mockEvaluate = vi.fn().mockResolvedValue({ + title: "Миене на подлези с временно ограничение за движение", + dateText: "", + contentHtml: "
Текст на съобщението
", + }); + + const page = createMockPage(mockEvaluate) as any; + const details = await extractPostDetails(page); + + expect(details.title).toContain("Миене на подлези"); + expect(details.contentHtml).toContain("Текст на съобщението"); + expect(details.dateText).toBe(""); + }); + }); +}); diff --git a/ingest/crawlers/inspectorat-so-org/extractors.ts b/ingest/crawlers/inspectorat-so-org/extractors.ts new file mode 100644 index 00000000..3601c13c --- /dev/null +++ b/ingest/crawlers/inspectorat-so-org/extractors.ts @@ -0,0 +1,38 @@ +import type { Page } from "playwright"; +import type { PostLink } from "./types"; +import { SELECTORS } from "./selectors"; +import { + extractPostLinks as extractPostLinksShared, + extractPostDetailsGeneric, +} from "../shared/extractors"; + +export async function extractPostLinks(page: Page): Promise { + const posts = await extractPostLinksShared(page, SELECTORS, (url) => { + let decodedUrl = ""; + + try { + decodedUrl = decodeURIComponent(url).toLowerCase(); + } catch { + decodedUrl = url.toLowerCase(); + } + + // Keep all article links and let downstream AI stages decide relevance. + return decodedUrl.includes("inspectorat-so.org") && decodedUrl.includes("newsid="); + }); + + // Preserve latest duplicate (same behavior as nadezhda crawler). + return Array.from(new Map(posts.map((post) => [post.url, post])).values()); +} + +export async function extractPostDetails( + page: Page, +): Promise<{ title: string; dateText: string; contentHtml: string }> { + return extractPostDetailsGeneric(page, SELECTORS.POST, [ + "script", + "style", + "nav", + "footer", + ".breadcrumb", + ".article-info", + ]); +} diff --git a/ingest/crawlers/inspectorat-so-org/index.test.ts b/ingest/crawlers/inspectorat-so-org/index.test.ts new file mode 100644 index 00000000..d9e0a284 --- /dev/null +++ b/ingest/crawlers/inspectorat-so-org/index.test.ts @@ -0,0 +1,105 @@ +import { beforeEach, describe, expect, it, vi } from "vitest"; +import { crawl } from "./index"; +import { parseInspectoratDate } from "./index"; +import { extractPostDetails, extractPostLinks } from "./extractors"; +import { + crawlWordpressPage, + processWordpressPost, +} from "../shared/webpage-crawlers"; + +vi.mock("./extractors", () => ({ + extractPostLinks: vi.fn(), + extractPostDetails: vi.fn(), +})); + +vi.mock("../shared/webpage-crawlers", () => ({ + crawlWordpressPage: vi.fn(), + processWordpressPost: vi.fn(), +})); + +describe("inspectorat-so-org/index", () => { + beforeEach(() => { + vi.clearAllMocks(); + }); + + it("configures crawlWordpressPage with inspectorat settings", async () => { + const mockedCrawlWordpressPage = vi.mocked(crawlWordpressPage); + mockedCrawlWordpressPage.mockResolvedValueOnce(); + + await crawl(); + + expect(mockedCrawlWordpressPage).toHaveBeenCalledTimes(1); + + const [options] = mockedCrawlWordpressPage.mock.calls[0]; + + expect(options.indexUrl).toBe( + "https://inspectorat-so.org/%D0%BD%D0%BE%D0%B2%D0%B8%D0%BD%D0%B8", + ); + expect(options.sourceType).toBe("inspectorat-so-org"); + expect(options.delayBetweenRequests).toBe(2000); + expect(options.extractPostLinks).toBe(extractPostLinks); + expect(typeof options.processPost).toBe("function"); + }); + + it("delegates post processing to processWordpressPost", async () => { + const mockedCrawlWordpressPage = vi.mocked(crawlWordpressPage); + const mockedProcessWordpressPost = vi.mocked(processWordpressPost); + mockedCrawlWordpressPage.mockResolvedValueOnce(); + mockedProcessWordpressPost.mockResolvedValueOnce(); + + await crawl(); + + const [options] = mockedCrawlWordpressPage.mock.calls[0]; + + const browser = {} as any; + const db = {} as any; + const postLink = { + url: "https://inspectorat-so.org/%D0%BD%D0%BE%D0%B2%D0%B8%D0%BD%D0%B8/?newsid=%D0%BC%D0%B8%D0%B5%D0%BD%D0%B5-%D0%BD%D0%B0-%D0%BF%D0%BE%D0%B4%D0%BB%D0%B5%D0%B7", + title: "Миене на подлези", + date: "06апр.", + }; + + await options.processPost(browser, postLink, db); + + expect(mockedProcessWordpressPost).toHaveBeenCalledTimes(1); + expect(mockedProcessWordpressPost).toHaveBeenCalledWith( + browser, + postLink, + db, + "inspectorat-so-org", + "bg.sofia", + 2000, + extractPostDetails, + expect.any(Function), + ); + }); + + it("propagates crawlWordpressPage errors", async () => { + const mockedCrawlWordpressPage = vi.mocked(crawlWordpressPage); + mockedCrawlWordpressPage.mockRejectedValueOnce(new Error("crawl failed")); + + await expect(crawl()).rejects.toThrow("crawl failed"); + }); + + describe("parseInspectoratDate", () => { + it("parses short month dates as previous year when they are too far in the future", () => { + const referenceDate = new Date("2026-01-05T12:00:00+02:00"); + const iso = parseInspectoratDate("29 дек.", "", referenceDate); + const parsed = new Date(iso); + + expect(parsed.getUTCFullYear()).toBe(2025); + expect(parsed.getUTCMonth()).toBe(11); + expect(parsed.getUTCDate()).toBe(28); + }); + + it("keeps short month dates in current year when near future threshold", () => { + const referenceDate = new Date("2026-04-01T12:00:00+03:00"); + const iso = parseInspectoratDate("06 апр.", "", referenceDate); + const parsed = new Date(iso); + + expect(parsed.getUTCFullYear()).toBe(2026); + expect(parsed.getUTCMonth()).toBe(3); + expect(parsed.getUTCDate()).toBe(5); + }); + }); +}); diff --git a/ingest/crawlers/inspectorat-so-org/index.ts b/ingest/crawlers/inspectorat-so-org/index.ts new file mode 100644 index 00000000..1db893c8 --- /dev/null +++ b/ingest/crawlers/inspectorat-so-org/index.ts @@ -0,0 +1,138 @@ +#!/usr/bin/env node + +import dotenv from "dotenv"; +import { resolve } from "node:path"; +import { Browser } from "playwright"; +import type { OboDb } from "@oboapp/db"; +import { PostLink } from "./types"; +import { extractPostLinks, extractPostDetails } from "./extractors"; +import { + crawlWordpressPage, + processWordpressPost, +} from "../shared/webpage-crawlers"; +import { logger } from "@/lib/logger"; + +dotenv.config({ path: resolve(process.cwd(), ".env.local") }); + +const INDEX_URL = "https://inspectorat-so.org/%D0%BD%D0%BE%D0%B2%D0%B8%D0%BD%D0%B8"; +const SOURCE_TYPE = "inspectorat-so-org"; +const LOCALITY = "bg.sofia"; +const DELAY_BETWEEN_REQUESTS = 2000; // 2 seconds +const MAX_FUTURE_DAYS_FOR_SHORT_DATE = 7; + +const BG_MONTH_TO_NUMBER: Record = { + "ян": "01", + "фев": "02", + "март": "03", + "апр": "04", + "май": "05", + "юни": "06", + "юли": "07", + "авг": "08", + "сеп": "09", + "окт": "10", + "ное": "11", + "дек": "12", +}; + +function inferShortDateYear( + day: string, + month: string, + referenceDate: Date, +): number { + const currentYear = referenceDate.getFullYear(); + const candidateThisYear = new Date( + `${currentYear}-${month}-${day}T00:00:00+02:00`, + ); + + if (Number.isNaN(candidateThisYear.getTime())) { + return currentYear; + } + + const futureThresholdMs = + MAX_FUTURE_DAYS_FOR_SHORT_DATE * 24 * 60 * 60 * 1000; + const isTooFarInFuture = + candidateThisYear.getTime() - referenceDate.getTime() > futureThresholdMs; + + return isTooFarInFuture ? currentYear - 1 : currentYear; +} + +export function parseInspectoratDate( + dateText: string, + fallbackDateText?: string, + referenceDate = new Date(), +): string { + const candidate = (dateText || fallbackDateText || "").replace(/\s+/g, " ").trim(); + + const directMatch = candidate.match(/(\d{1,2})[./](\d{1,2})[./](\d{2,4})/); + if (directMatch) { + const [, dayRaw, monthRaw, yearRaw] = directMatch; + const day = dayRaw.padStart(2, "0"); + const month = monthRaw.padStart(2, "0"); + const year = yearRaw.length === 2 ? `20${yearRaw}` : yearRaw; + const parsed = new Date(`${year}-${month}-${day}T00:00:00+02:00`); + if (!Number.isNaN(parsed.getTime())) { + return parsed.toISOString(); + } + } + + const shortMonthMatch = candidate.match(/(\d{1,2})\s*([а-я]+)/i); + if (shortMonthMatch) { + const [, dayRaw, monthRaw] = shortMonthMatch; + const monthKey = monthRaw.toLowerCase().replace(/\.$/, ""); + const mappedMonth = BG_MONTH_TO_NUMBER[monthKey]; + + if (mappedMonth) { + const day = dayRaw.padStart(2, "0"); + const year = inferShortDateYear(day, mappedMonth, referenceDate); + const parsed = new Date(`${year}-${mappedMonth}-${day}T00:00:00+02:00`); + if (!Number.isNaN(parsed.getTime())) { + return parsed.toISOString(); + } + } + } + + logger.warn("Unable to parse inspectorat date, using current date", { + sourceType: SOURCE_TYPE, + dateText, + fallbackDateText: fallbackDateText || "", + }); + + return new Date().toISOString(); +} + +const processPost = ( + browser: Browser, + postLink: PostLink, + db: OboDb, +) => + processWordpressPost( + browser, + postLink, + db, + SOURCE_TYPE, + LOCALITY, + DELAY_BETWEEN_REQUESTS, + extractPostDetails, + (dateText) => parseInspectoratDate(dateText, postLink.date), + ); + +export async function crawl(): Promise { + await crawlWordpressPage({ + indexUrl: INDEX_URL, + sourceType: SOURCE_TYPE, + extractPostLinks, + processPost, + delayBetweenRequests: DELAY_BETWEEN_REQUESTS, + }); +} + +if (require.main === module) { + crawl().catch((error) => { + logger.error("Fatal error", { + error: error instanceof Error ? error.message : String(error), + sourceType: SOURCE_TYPE, + }); + process.exit(1); + }); +} diff --git a/ingest/crawlers/inspectorat-so-org/selectors.ts b/ingest/crawlers/inspectorat-so-org/selectors.ts new file mode 100644 index 00000000..627b528f --- /dev/null +++ b/ingest/crawlers/inspectorat-so-org/selectors.ts @@ -0,0 +1,17 @@ +/** + * CSS selectors for scraping inspectorat-so.org announcements. + */ +export const SELECTORS = { + INDEX: { + POST_CONTAINER: ".newsContent", + POST_LINK: '.titleLink[href*="newsid="]', + POST_DATE: ".dateCreated", + POST_TITLE: "h2", + }, + + POST: { + CONTENT: '[itemprop="articleBody"] > .col-md-12 > div[style*="font-size:14pt"]', + TITLE: '[itemprop="articleBody"] > .col-md-12 > h2', + DATE: '[itemprop="articleBody"] .dateCreated, [itemprop="datePublished"], time', + }, +} as const; diff --git a/ingest/crawlers/inspectorat-so-org/tsconfig.json b/ingest/crawlers/inspectorat-so-org/tsconfig.json new file mode 100644 index 00000000..d3d29cc5 --- /dev/null +++ b/ingest/crawlers/inspectorat-so-org/tsconfig.json @@ -0,0 +1,9 @@ +{ + "extends": "../../tsconfig.json", + "compilerOptions": { + "module": "ESNext", + "target": "ES2022", + "moduleResolution": "node", + "esModuleInterop": true + } +} diff --git a/ingest/crawlers/inspectorat-so-org/types.ts b/ingest/crawlers/inspectorat-so-org/types.ts new file mode 100644 index 00000000..16335fc1 --- /dev/null +++ b/ingest/crawlers/inspectorat-so-org/types.ts @@ -0,0 +1,7 @@ +import { BaseSourceDocument, PostLink } from "../shared/types"; + +export interface SourceDocument extends BaseSourceDocument { + sourceType: "inspectorat-so-org"; +} + +export type { PostLink }; diff --git a/ingest/lib/source-trust.ts b/ingest/lib/source-trust.ts index f14072e1..b9d40e9b 100644 --- a/ingest/lib/source-trust.ts +++ b/ingest/lib/source-trust.ts @@ -38,6 +38,7 @@ const SOURCE_TRUST: Record = { "krasna-polyana-org": { trust: 0.8, geometryQuality: 2 }, "vrabnitsa-org": { trust: 0.8, geometryQuality: 2 }, "nadezhda-org": { trust: 0.8, geometryQuality: 2 }, + "inspectorat-so-org": { trust: 0.8, geometryQuality: 2 }, }; const DEFAULT_TRUST: SourceTrustEntry = { trust: 0.5, geometryQuality: 0 }; diff --git a/ingest/terraform/main.tf b/ingest/terraform/main.tf index 3dccdd84..a7b8407b 100644 --- a/ingest/terraform/main.tf +++ b/ingest/terraform/main.tf @@ -324,6 +324,12 @@ locals { timeout = "1800s" description = "Crawl Nadezhda district website" } + inspectorat-so = { + source = "inspectorat-so-org" + memory = "1Gi" + timeout = "1800s" + description = "Crawl Stolichen inspektorat news" + } nimh-severe-weather = { source = "nimh-severe-weather" memory = "512Mi" diff --git a/shared/src/index.ts b/shared/src/index.ts index a04d169b..287bab21 100644 --- a/shared/src/index.ts +++ b/shared/src/index.ts @@ -1,5 +1,6 @@ // Main entry point for @oboapp/shared package export * from "./schema"; +export * from "./sources"; export * from "./message-id-utils"; export * from "./bounds"; export * from "./coordinate-utils"; diff --git a/shared/src/sources.ts b/shared/src/sources.ts index f4ae5eae..d1468c96 100644 --- a/shared/src/sources.ts +++ b/shared/src/sources.ts @@ -121,6 +121,12 @@ export const SOURCES: readonly SourceDefinition[] = [ name: 'Столична община, Район "Надежда"', localities: ["bg.sofia"], }, + { + id: "inspectorat-so-org", + url: "https://inspectorat-so.org/%D0%BD%D0%BE%D0%B2%D0%B8%D0%BD%D0%B8", + name: "Столичен инспекторат", + localities: ["bg.sofia"], + }, { id: "sensor-community", url: "https://sensor.community/", diff --git a/web/public/sources/inspectorat-so-org.png b/web/public/sources/inspectorat-so-org.png new file mode 100644 index 00000000..4b4beb5a Binary files /dev/null and b/web/public/sources/inspectorat-so-org.png differ