In [None]:
# --------------------------
# Construct classification training set
# --------------------------
import pandas as pd
import time
from serpapi import GoogleSearch
import requests

import os

# Read the SerpApi key from the environment so the real credential never has to
# be typed into (and committed with) the notebook. The original placeholder is
# kept as the fallback, so behavior is unchanged when the variable is not set.
SERP_API_KEY = os.environ.get("SERP_API_KEY", "API key")

# Adverse news data
# Each category label maps to the Google News queries used to collect
# example articles for that label.
_queries_by_category = {
    "Money Laundering": [
        "money laundering charges",
        "arrested for money laundering",
        "laundered funds",
        "money laundering network",
    ],
    "Terrorist Financing": [
        "funding terrorism",
        "money trail terrorism",
        "financing extremist groups",
        "donations linked to terrorism",
    ],
    "Sanctions Violations": [
        "sanctions breach",
        "bypassed sanctions",
        "trading with sanctioned entities",
        "UN/EU/OFAC sanctions violation",
    ],
    "Fraud": [
        "financial fraud case",
        "bank fraud investigation",
        "wire fraud indictment",
        "fraudulent transactions",
    ],
    "Tax Evasion": [
        "underreported income",
        "offshore tax evasion",
        "tax haven investigation",
        "illicit tax schemes",
    ],
    "Bribery and Corruption": [
        "accused of corruption",
        "corruption investigation",
        "illegal kickbacks",
        "public official bribery",
    ],
    "Insider Trading": [
        "trading on non-public information",
        "securities fraud case",
        "insider trading investigation",
        "market manipulation scheme",
    ],
    "Ponzi and Pyramid Schemes": [
        "pyramid scheme charges",
        "investment scam",
        "high-return fraud",
        "ponzi fraud case",
    ],
    "Trade-Based Money Laundering": [
        "over-invoicing scheme",
        "under-invoicing fraud",
        "illicit trade transactions",
        "false trade documents",
    ],
}

# Expand the mapping into the list-of-dicts shape that the search loop iterates
# over; dict insertion order keeps the categories in the order written above.
search_config = [
    {"category": label, "search_queries": list(terms)}
    for label, terms in _queries_by_category.items()
]


# Collect adverse-news articles: one Google News search (via SerpApi) per
# configured query, each hit tagged with the category the query belongs to.
results = []

# Column order of the output CSV. Passing it explicitly means the file still
# gets a header row when no article was collected, so a later pd.read_csv()
# can load it instead of failing on an empty file.
adverse_columns = ["category", "query", "title", "snippet", "source", "link", "date"]

# Search each query
for config in search_config:
    category = config["category"]
    for query in config["search_queries"]:
        print(f"Searching: '{query}' under '{category}'")
        try:
            params = {
                "engine": "google_news",
                "q": query,
                "hl": "en",
                "api_key": SERP_API_KEY
            }
            response = GoogleSearch(params).get_dict()

            # Surface failures reported inside the response body instead of
            # silently treating them as "no results".
            # NOTE(review): assumes SerpApi reports them under an "error" key — confirm.
            if response.get("error"):
                print(f"Error: {response['error']}")

            for article in response.get("news_results", []):
                results.append({
                    "category": category,
                    "query": query,
                    "title": article.get("title"),
                    "snippet": article.get("snippet"),
                    "source": article.get("source"),
                    "link": article.get("link"),
                    "date": article.get("date")
                })
        except Exception as e:
            # Best effort: a failed query is reported and skipped so the
            # remaining queries still run.
            print(f"Error: {e}")
        finally:
            # Pause after every request, including failed ones, so an error
            # does not lead to back-to-back calls.
            time.sleep(2)  # Respect rate limits

# Save to CSV
df = pd.DataFrame(results, columns=adverse_columns)
df.to_csv("serpapi_adverse_news.csv", index=False)
print(f"✅ Saved {len(df)} articles.")


# General news data
# Neutral financial-news queries; every article collected here is written with
# the single general-news category label below.
queries = [
  "global financial news",
  "stock market today",
  "economic news updates",
  "financial markets update",
  "latest business news",
  "financial trends 2025",
  "corporate earnings reports",
  "investment news",
  "economic outlook 2025",
  "financial news analysis",
  "real estate market update",
  "cryptocurrency news",
  "central bank interest rate news",
  "stock market trends",
  "business mergers and acquisitions",
  "companies earnings announcement",
  "global trade news",
  "commodity price update",
  "currency market news",
  "venture capital funding news"
]

# Column order of the output CSV. Passing it explicitly means the file still
# gets a header row when no article was collected.
general_columns = ["category", "query", "title", "snippet", "source", "link", "date"]

results = []
for query in queries:
    print(f"Searching: '{query}'")
    try:
        params = {
            "engine": "google_news",
            "q": query,
            "hl": "en",
            "api_key": SERP_API_KEY
        }
        response = GoogleSearch(params).get_dict()

        # Surface failures reported inside the response body instead of
        # silently treating them as "no results".
        # NOTE(review): assumes SerpApi reports them under an "error" key — confirm.
        if response.get("error"):
            print(f"Error: {response['error']}")

        for article in response.get("news_results", []):
            results.append({
                # NOTE(review): "Fiancial" is a typo for "Financial". The label is
                # kept byte-for-byte because other cells may match on this exact
                # string; fix it together with every consumer.
                "category": "General Fiancial News",
                "query": query,
                "title": article.get("title"),
                "snippet": article.get("snippet"),
                "source": article.get("source"),
                "link": article.get("link"),
                "date": article.get("date")
            })
    except Exception as e:
        # Same best-effort handling as the other search loops: one failed
        # request must not discard the articles already collected.
        print(f"Error: {e}")
    finally:
        time.sleep(2)  # Respect rate limits

df = pd.DataFrame(results, columns=general_columns)
df.to_csv("serpapi_general_news.csv", index=False)
print(f"✅ Saved {len(df)} articles.")



In [None]:
# --------------------------
# Construct simulated production data
# --------------------------

import pandas as pd
import time
from serpapi import GoogleSearch
import requests

import os

# Read the SerpApi key from the environment so the real credential never has to
# be typed into (and committed with) the notebook. The original placeholder is
# kept as the fallback, so behavior is unchanged when the variable is not set.
SERP_API_KEY = os.environ.get("SERP_API_KEY", "API key")

# Search terms for the simulated production run; each one is sent to
# Google News unchanged.
queries = [
    "Prateek Gupta",
    "Macy's",
    "Simpson Thacher",
    "Dmitrii Ovsiannikov",
    "Bob Menendez's",
    "STMicroelectronics",
]

# Collect Google News hits (via SerpApi) for each query; unlike the training
# set, these rows carry no category and no snippet.
results = []

# Column order of the output CSV. Passing it explicitly means the file still
# gets a header row when no article was collected.
testcase_columns = ["query", "title", "source", "link", "date"]

# Search each query
for query in queries:
    print(f"Searching: '{query}'")
    try:
        params = {
            "engine": "google_news",
            "q": query,
            "hl": "en",
            "api_key": SERP_API_KEY
        }
        response = GoogleSearch(params).get_dict()

        # Surface failures reported inside the response body instead of
        # silently treating them as "no results".
        # NOTE(review): assumes SerpApi reports them under an "error" key — confirm.
        if response.get("error"):
            print(f"Error: {response['error']}")

        for article in response.get("news_results", []):
            results.append({
                "query": query,
                "title": article.get("title"),
                "source": article.get("source"),
                "link": article.get("link"),
                "date": article.get("date")
            })
    except Exception as e:
        # Best effort: a failed query is reported and skipped so the
        # remaining queries still run.
        print(f"Error: {e}")
    finally:
        # Pause after every request, including failed ones.
        time.sleep(2)  # Respect rate limits

# Save to CSV
df = pd.DataFrame(results, columns=testcase_columns)
df.to_csv("serpapi_adverse_news_testcase.csv", index=False)
print(f"✅ Saved {len(df)} articles.")


In [None]:
# Load the collected article sets from disk.
# NOTE(review): "serpapi_adverse_news_1.csv" is not written by any cell visible
# in this notebook — presumably an earlier collection run; confirm it exists
# before running this cell on a fresh checkout.
adverse_df_1 = pd.read_csv("serpapi_adverse_news_1.csv")
adverse_df = pd.read_csv("serpapi_adverse_news.csv")
general_df = pd.read_csv("serpapi_general_news.csv")

# Stack the three sets and shuffle the rows. The fixed seed makes the shuffled
# order identical on every Restart & Run All, so anything derived from row
# order downstream is reproducible.
news_df = (
    pd.concat([adverse_df, adverse_df_1, general_df], ignore_index=True)
    .sample(frac=1, random_state=42)
    .reset_index(drop=True)
)