Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
Binary file added ingest/crawlers/inspectorat-so-org/_entry.png
Loading
Sorry, something went wrong. Reload?
Sorry, we cannot display this file.
Sorry, this file is invalid so it cannot be displayed.
Binary file added ingest/crawlers/inspectorat-so-org/_message.png
Loading
Sorry, something went wrong. Reload?
Sorry, we cannot display this file.
Sorry, this file is invalid so it cannot be displayed.
92 changes: 92 additions & 0 deletions ingest/crawlers/inspectorat-so-org/extractors.test.ts
Original file line number Diff line number Diff line change
@@ -0,0 +1,92 @@
import { describe, expect, it, vi } from "vitest";
import { extractPostDetails, extractPostLinks } from "./extractors";

// Minimal structural stand-in for a Playwright `Page`: only `evaluate` is
// used by the extractors under test. `any[]` keeps the mock permissive so
// vi.fn() mocks remain assignable — NOTE(review): intentionally looser than
// Playwright's real `Page.evaluate` signature.
interface MockPage {
  evaluate: <T>(fn: (...args: any[]) => T, ...args: any[]) => Promise<T>;
}

/**
 * Wraps a mock `evaluate` implementation in a `MockPage`.
 *
 * Typing the parameter as `MockPage["evaluate"]` (instead of `any`) lets the
 * compiler check the shape structurally, which also makes the previous
 * `as MockPage` assertion unnecessary — the literal now satisfies the
 * interface on its own.
 */
function createMockPage(mockEvaluate: MockPage["evaluate"]): MockPage {
  return {
    evaluate: mockEvaluate,
  };
}

// Unit tests for the inspectorat-so.org extractor wrappers. Playwright's
// page.evaluate is mocked, so these exercise only the Node-side logic in
// extractors.ts (URL filtering and deduplication), not the in-browser DOM
// scraping itself.
describe("inspectorat-so-org/extractors", () => {
  describe("extractPostLinks", () => {
    it("extracts all valid news links", async () => {
      // Two distinct percent-encoded `?newsid=` article URLs; both should
      // survive the filter.
      const mockEvaluate = vi.fn().mockResolvedValue([
        {
          url: "https://inspectorat-so.org/%D0%BD%D0%BE%D0%B2%D0%B8%D0%BD%D0%B8/?newsid=%D1%80%D0%B5%D0%BC%D0%BE%D0%BD%D1%82-%D0%BD%D0%B0-%D1%83%D0%BB%D0%B8%D1%86%D0%B0",
          title: "Ремонт на улица",
          date: "06апр.",
        },
        {
          url: "https://inspectorat-so.org/%D0%BD%D0%BE%D0%B2%D0%B8%D0%BD%D0%B8/?newsid=%D0%BF%D1%80%D0%B0%D0%B7%D0%BD%D0%B8%D0%BA-%D0%B2-%D0%BF%D0%B0%D1%80%D0%BA",
          title: "Празник в парк",
          date: "05апр.",
        },
      ]);

      const page = createMockPage(mockEvaluate) as any;
      const posts = await extractPostLinks(page);

      expect(posts).toHaveLength(2);
      expect(posts[0].title).toBe("Ремонт на улица");
      expect(posts[1].title).toBe("Празник в парк");
      expect(posts[0].url).toContain("newsid=");
    });

    it("deduplicates by URL", async () => {
      // Same URL twice with different titles — the later entry must win.
      const mockEvaluate = vi.fn().mockResolvedValue([
        {
          url: "https://inspectorat-so.org/%D0%BD%D0%BE%D0%B2%D0%B8%D0%BD%D0%B8/?newsid=%D0%BC%D0%B8%D0%B5%D0%BD%D0%B5-%D0%BD%D0%B0-%D0%BF%D0%BE%D0%B4%D0%BB%D0%B5%D0%B7",
          title: "Първо",
          date: "06апр.",
        },
        {
          url: "https://inspectorat-so.org/%D0%BD%D0%BE%D0%B2%D0%B8%D0%BD%D0%B8/?newsid=%D0%BC%D0%B8%D0%B5%D0%BD%D0%B5-%D0%BD%D0%B0-%D0%BF%D0%BE%D0%B4%D0%BB%D0%B5%D0%B7",
          title: "Второ",
          date: "06апр.",
        },
      ]);

      const page = createMockPage(mockEvaluate) as any;
      const posts = await extractPostLinks(page);

      expect(posts).toHaveLength(1);
      expect(posts[0].title).toBe("Второ");
    });

    it("returns empty array when there are no valid newsid URLs", async () => {
      // A site page without `newsid=` (the contacts page) must be dropped.
      const mockEvaluate = vi.fn().mockResolvedValue([
        {
          url: "https://inspectorat-so.org/%D0%BA%D0%BE%D0%BD%D1%82%D0%B0%D0%BA%D1%82%D0%B8",
          title: "Контакти",
          date: "01апр.",
        },
      ]);

      const page = createMockPage(mockEvaluate) as any;
      const posts = await extractPostLinks(page);

      expect(posts).toEqual([]);
    });
  });

  describe("extractPostDetails", () => {
    it("extracts title and content", async () => {
      // dateText may legitimately be empty — the crawler falls back to the
      // index-page date in that case.
      const mockEvaluate = vi.fn().mockResolvedValue({
        title: "Миене на подлези с временно ограничение за движение",
        dateText: "",
        contentHtml: "<div>Текст на съобщението</div>",
      });

      const page = createMockPage(mockEvaluate) as any;
      const details = await extractPostDetails(page);

      expect(details.title).toContain("Миене на подлези");
      expect(details.contentHtml).toContain("Текст на съобщението");
      expect(details.dateText).toBe("");
    });
  });
});
38 changes: 38 additions & 0 deletions ingest/crawlers/inspectorat-so-org/extractors.ts
Original file line number Diff line number Diff line change
@@ -0,0 +1,38 @@
import type { Page } from "playwright";
import type { PostLink } from "./types";
import { SELECTORS } from "./selectors";
import {
extractPostLinks as extractPostLinksShared,
extractPostDetailsGeneric,
} from "../shared/extractors";

/**
 * Collects links to news posts from the inspectorat-so.org index page.
 *
 * Keeps every link that points at a `?newsid=` article on the site; topical
 * relevance is decided by downstream AI stages, so no keyword filtering
 * happens here. Duplicate URLs are collapsed, keeping the last occurrence
 * (same behavior as the nadezhda crawler).
 */
export async function extractPostLinks(page: Page): Promise<PostLink[]> {
  const isNewsArticleUrl = (url: string): boolean => {
    let normalized: string;
    try {
      // Decode percent-encoded Cyrillic so the substring checks are stable.
      normalized = decodeURIComponent(url).toLowerCase();
    } catch {
      // Malformed escape sequence — fall back to the raw URL.
      normalized = url.toLowerCase();
    }
    return (
      normalized.includes("inspectorat-so.org") &&
      normalized.includes("newsid=")
    );
  };

  const posts = await extractPostLinksShared(page, SELECTORS, isNewsArticleUrl);

  // Deduplicate by URL; later entries overwrite earlier ones.
  const byUrl = new Map<string, PostLink>();
  for (const post of posts) {
    byUrl.set(post.url, post);
  }
  return [...byUrl.values()];
}

/**
 * Extracts title, date text, and content HTML from a single post page.
 *
 * Delegates to the generic extractor, first removing navigation chrome and
 * inline metadata blocks from the article body.
 */
export async function extractPostDetails(
  page: Page,
): Promise<{ title: string; dateText: string; contentHtml: string }> {
  // Elements stripped before the content HTML is captured.
  const elementsToStrip = [
    "script",
    "style",
    "nav",
    "footer",
    ".breadcrumb",
    ".article-info",
  ];
  return extractPostDetailsGeneric(page, SELECTORS.POST, elementsToStrip);
}
105 changes: 105 additions & 0 deletions ingest/crawlers/inspectorat-so-org/index.test.ts
Original file line number Diff line number Diff line change
@@ -0,0 +1,105 @@
import { beforeEach, describe, expect, it, vi } from "vitest";
import { crawl } from "./index";
import { parseInspectoratDate } from "./index";
import { extractPostDetails, extractPostLinks } from "./extractors";
import {
crawlWordpressPage,
processWordpressPost,
} from "../shared/webpage-crawlers";

// Replace the extractor and shared-crawler modules with mocks so `crawl()`
// can be exercised without launching a browser or touching the network.
vi.mock("./extractors", () => ({
  extractPostLinks: vi.fn(),
  extractPostDetails: vi.fn(),
}));

vi.mock("../shared/webpage-crawlers", () => ({
  crawlWordpressPage: vi.fn(),
  processWordpressPost: vi.fn(),
}));

// Tests for the crawler entry point. The extractor and shared-crawler modules
// are mocked above, so these verify only the wiring/configuration this
// crawler passes to the shared WordPress-style crawl pipeline, plus the
// date-parsing helper.
describe("inspectorat-so-org/index", () => {
  beforeEach(() => {
    vi.clearAllMocks();
  });

  it("configures crawlWordpressPage with inspectorat settings", async () => {
    const mockedCrawlWordpressPage = vi.mocked(crawlWordpressPage);
    mockedCrawlWordpressPage.mockResolvedValueOnce();

    await crawl();

    expect(mockedCrawlWordpressPage).toHaveBeenCalledTimes(1);

    const [options] = mockedCrawlWordpressPage.mock.calls[0];

    // URL-encoded Cyrillic "новини" (news) index page.
    expect(options.indexUrl).toBe(
      "https://inspectorat-so.org/%D0%BD%D0%BE%D0%B2%D0%B8%D0%BD%D0%B8",
    );
    expect(options.sourceType).toBe("inspectorat-so-org");
    expect(options.delayBetweenRequests).toBe(2000);
    expect(options.extractPostLinks).toBe(extractPostLinks);
    expect(typeof options.processPost).toBe("function");
  });

  it("delegates post processing to processWordpressPost", async () => {
    const mockedCrawlWordpressPage = vi.mocked(crawlWordpressPage);
    const mockedProcessWordpressPost = vi.mocked(processWordpressPost);
    mockedCrawlWordpressPage.mockResolvedValueOnce();
    mockedProcessWordpressPost.mockResolvedValueOnce();

    await crawl();

    const [options] = mockedCrawlWordpressPage.mock.calls[0];

    const browser = {} as any;
    const db = {} as any;
    const postLink = {
      url: "https://inspectorat-so.org/%D0%BD%D0%BE%D0%B2%D0%B8%D0%BD%D0%B8/?newsid=%D0%BC%D0%B8%D0%B5%D0%BD%D0%B5-%D0%BD%D0%B0-%D0%BF%D0%BE%D0%B4%D0%BB%D0%B5%D0%B7",
      title: "Миене на подлези",
      date: "06апр.",
    };

    await options.processPost(browser, postLink, db);

    // The crawler must forward its own source type, locality, and delay, plus
    // a date-parsing closure (hence expect.any(Function)).
    expect(mockedProcessWordpressPost).toHaveBeenCalledTimes(1);
    expect(mockedProcessWordpressPost).toHaveBeenCalledWith(
      browser,
      postLink,
      db,
      "inspectorat-so-org",
      "bg.sofia",
      2000,
      extractPostDetails,
      expect.any(Function),
    );
  });

  it("propagates crawlWordpressPage errors", async () => {
    const mockedCrawlWordpressPage = vi.mocked(crawlWordpressPage);
    mockedCrawlWordpressPage.mockRejectedValueOnce(new Error("crawl failed"));

    await expect(crawl()).rejects.toThrow("crawl failed");
  });

  describe("parseInspectoratDate", () => {
    it("parses short month dates as previous year when they are too far in the future", () => {
      // Reference: early January. "29 дек." in the current year would be
      // ~11 months in the future, so it must roll back a year.
      const referenceDate = new Date("2026-01-05T12:00:00+02:00");
      const iso = parseInspectoratDate("29 дек.", "", referenceDate);
      const parsed = new Date(iso);

      expect(parsed.getUTCFullYear()).toBe(2025);
      expect(parsed.getUTCMonth()).toBe(11);
      // Midnight at +02:00 renders as 22:00 UTC on the previous day.
      expect(parsed.getUTCDate()).toBe(28);
    });

    it("keeps short month dates in current year when near future threshold", () => {
      // Five days ahead is within the 7-day tolerance, so the current year
      // is kept.
      const referenceDate = new Date("2026-04-01T12:00:00+03:00");
      const iso = parseInspectoratDate("06 апр.", "", referenceDate);
      const parsed = new Date(iso);

      expect(parsed.getUTCFullYear()).toBe(2026);
      expect(parsed.getUTCMonth()).toBe(3);
      expect(parsed.getUTCDate()).toBe(5);
    });
  });
});
138 changes: 138 additions & 0 deletions ingest/crawlers/inspectorat-so-org/index.ts
Original file line number Diff line number Diff line change
@@ -0,0 +1,138 @@
#!/usr/bin/env node

import dotenv from "dotenv";
import { resolve } from "node:path";
import { Browser } from "playwright";
import type { OboDb } from "@oboapp/db";
import { PostLink } from "./types";
import { extractPostLinks, extractPostDetails } from "./extractors";
import {
crawlWordpressPage,
processWordpressPost,
} from "../shared/webpage-crawlers";
import { logger } from "@/lib/logger";

// Load local environment variables (DB credentials etc.) before any setup.
dotenv.config({ path: resolve(process.cwd(), ".env.local") });

// News index page — URL-encoded Cyrillic "новини" (news).
const INDEX_URL = "https://inspectorat-so.org/%D0%BD%D0%BE%D0%B2%D0%B8%D0%BD%D0%B8";
// Identifier stored with every crawled post and attached to log metadata.
const SOURCE_TYPE = "inspectorat-so-org";
// Locality tag for crawled posts (Sofia, Bulgaria).
const LOCALITY = "bg.sofia";
const DELAY_BETWEEN_REQUESTS = 2000; // 2 seconds between requests (politeness delay)
// A year-less short date ("06 апр.") more than this many days in the future
// is assumed to belong to the previous year — see inferShortDateYear.
const MAX_FUTURE_DAYS_FOR_SHORT_DATE = 7;

// Bulgarian month abbreviations (as scraped; a trailing "." is stripped by
// the caller before lookup) mapped to zero-padded month numbers.
// NOTE(review): assumes the site uses exactly these spellings (short forms
// like "ян"/"апр" but full "март"/"май"/"юни"/"юли"); any other variant falls
// through to the unparsed-date fallback — verify against live listings.
const BG_MONTH_TO_NUMBER: Record<string, string> = {
  "ян": "01",
  "фев": "02",
  "март": "03",
  "апр": "04",
  "май": "05",
  "юни": "06",
  "юли": "07",
  "авг": "08",
  "сеп": "09",
  "окт": "10",
  "ное": "11",
  "дек": "12",
};

/**
 * Infers the year for an index-page date that omits it (e.g. "06 апр.").
 *
 * If placing the day/month in the reference year would land more than
 * MAX_FUTURE_DAYS_FOR_SHORT_DATE days in the future, the post is assumed to
 * be from the previous year.
 */
function inferShortDateYear(
  day: string,
  month: string,
  referenceDate: Date,
): number {
  const referenceYear = referenceDate.getFullYear();
  // Fixed +02:00 offset (Bulgarian standard time); day-level precision only.
  const sameYearCandidate = new Date(
    `${referenceYear}-${month}-${day}T00:00:00+02:00`,
  );

  // Unparseable day/month combination: default to the reference year.
  if (Number.isNaN(sameYearCandidate.getTime())) {
    return referenceYear;
  }

  const allowedFutureMs =
    MAX_FUTURE_DAYS_FOR_SHORT_DATE * 24 * 60 * 60 * 1000;
  const overshootMs = sameYearCandidate.getTime() - referenceDate.getTime();
  return overshootMs > allowedFutureMs ? referenceYear - 1 : referenceYear;
}

/**
 * Parses a scraped inspectorat-so.org date string into an ISO 8601 timestamp.
 *
 * Supported inputs:
 * - Numeric "DD.MM.YYYY" / "DD/MM/YY" style dates (two-digit years are
 *   assumed to be 20xx).
 * - Bulgarian short-month listings like "06 апр." — the year is inferred
 *   relative to `referenceDate` because the index page omits it.
 *
 * @param dateText Primary date text (usually from the post page).
 * @param fallbackDateText Secondary date text (from the index listing).
 * @param referenceDate "Now" used for year inference and the last-resort
 *   fallback; injectable so results are deterministic in tests/replays.
 * @returns ISO timestamp; `referenceDate` when nothing could be parsed.
 */
export function parseInspectoratDate(
  dateText: string,
  fallbackDateText?: string,
  referenceDate = new Date(),
): string {
  // Prefer the post-page date; collapse whitespace so the regexes stay simple.
  const candidate = (dateText || fallbackDateText || "").replace(/\s+/g, " ").trim();

  // Numeric form, e.g. "06.04.2025" or "06/04/25".
  const directMatch = candidate.match(/(\d{1,2})[./](\d{1,2})[./](\d{2,4})/);
  if (directMatch) {
    const [, dayRaw, monthRaw, yearRaw] = directMatch;
    const day = dayRaw.padStart(2, "0");
    const month = monthRaw.padStart(2, "0");
    const year = yearRaw.length === 2 ? `20${yearRaw}` : yearRaw;
    // Fixed +02:00 offset (Bulgarian standard time); a one-hour DST drift is
    // irrelevant at day granularity.
    const parsed = new Date(`${year}-${month}-${day}T00:00:00+02:00`);
    if (!Number.isNaN(parsed.getTime())) {
      return parsed.toISOString();
    }
  }

  // Short-month form, e.g. "06 апр." — year must be inferred.
  const shortMonthMatch = candidate.match(/(\d{1,2})\s*([а-я]+)/i);
  if (shortMonthMatch) {
    const [, dayRaw, monthRaw] = shortMonthMatch;
    // Strip a trailing dot before the month lookup ("апр." -> "апр").
    const monthKey = monthRaw.toLowerCase().replace(/\.$/, "");
    const mappedMonth = BG_MONTH_TO_NUMBER[monthKey];

    if (mappedMonth) {
      const day = dayRaw.padStart(2, "0");
      const year = inferShortDateYear(day, mappedMonth, referenceDate);
      const parsed = new Date(`${year}-${mappedMonth}-${day}T00:00:00+02:00`);
      if (!Number.isNaN(parsed.getTime())) {
        return parsed.toISOString();
      }
    }
  }

  logger.warn("Unable to parse inspectorat date, using current date", {
    sourceType: SOURCE_TYPE,
    dateText,
    fallbackDateText: fallbackDateText || "",
  });

  // Fall back to referenceDate (not a fresh `new Date()`): when callers inject
  // a reference date, the fallback is now deterministic too. The default
  // behavior (no referenceDate argument) is unchanged.
  return referenceDate.toISOString();
}

// Processes one post via the shared WordPress-style pipeline, binding this
// crawler's source type, locality, and request delay.
const processPost = (
  browser: Browser,
  postLink: PostLink,
  db: OboDb,
) => {
  // Resolve the publish date from the post page, falling back to the date
  // captured on the index listing.
  const parsePostDate = (dateText: string) =>
    parseInspectoratDate(dateText, postLink.date);

  return processWordpressPost(
    browser,
    postLink,
    db,
    SOURCE_TYPE,
    LOCALITY,
    DELAY_BETWEEN_REQUESTS,
    extractPostDetails,
    parsePostDate,
  );
};

/**
 * Entry point: runs the shared WordPress-style crawl loop against the
 * inspectorat-so.org news index with this crawler's link extraction, post
 * processing, and politeness delay.
 */
export async function crawl(): Promise<void> {
  const crawlOptions = {
    indexUrl: INDEX_URL,
    sourceType: SOURCE_TYPE,
    extractPostLinks,
    processPost,
    delayBetweenRequests: DELAY_BETWEEN_REQUESTS,
  };

  await crawlWordpressPage(crawlOptions);
}

// Run the crawler when this file is executed directly (CLI usage).
// NOTE(review): `require.main === module` assumes CommonJS output; under
// native ESM `require` is undefined — confirm the build target.
if (require.main === module) {
  crawl().catch((error) => {
    logger.error("Fatal error", {
      error: error instanceof Error ? error.message : String(error),
      sourceType: SOURCE_TYPE,
    });
    // Non-zero exit so schedulers/orchestrators notice the failed run.
    process.exit(1);
  });
}
17 changes: 17 additions & 0 deletions ingest/crawlers/inspectorat-so-org/selectors.ts
Original file line number Diff line number Diff line change
@@ -0,0 +1,17 @@
/**
 * CSS selectors for scraping inspectorat-so.org announcements.
 */
export const SELECTORS = {
  // Selectors applied on the news index (listing) page.
  INDEX: {
    POST_CONTAINER: ".newsContent",
    // Only anchors carrying a `newsid=` query parameter are article links.
    POST_LINK: '.titleLink[href*="newsid="]',
    POST_DATE: ".dateCreated",
    POST_TITLE: "h2",
  },

  // Selectors applied on an individual post page.
  POST: {
    CONTENT: '[itemprop="articleBody"] > .col-md-12 > div[style*="font-size:14pt"]',
    TITLE: '[itemprop="articleBody"] > .col-md-12 > h2',
    DATE: '[itemprop="articleBody"] .dateCreated, [itemprop="datePublished"], time',
  },
} as const;
Loading
Loading