In [3]:
import pandas as pd
# Load the raw SerpApi news export and preview its first five rows.
df = pd.read_csv(filepath_or_buffer="serpapi_news.csv")
df.head(n=5)

Unnamed: 0.1,Unnamed: 0,category,query,title,snippet,source,link,date
0,0,Terrorist Financing,terrorist financing,Blockchain and Bloodshed: The Role of Cryptocu...,,"{'name': 'The Soufan Center', 'icon': 'https:/...",https://thesoufancenter.org/intelbrief-2024-oc...,"10/16/2024, 07:00 AM, +0000 UTC"
1,1,Insider Trading,insider trading charges,Securities Enforcement 2024 Year-End Update,,"{'name': 'Gibson Dunn', 'icon': 'https://encry...",https://www.gibsondunn.com/securities-enforcem...,"01/30/2025, 08:00 AM, +0000 UTC"
2,2,Ponzi and Pyramid Schemes,investment scam,"Here's how to avoid risky investment scams, wi...",,"{'name': 'WKYC', 'icon': 'https://encrypted-tb...",https://www.wkyc.com/article/news/verify/how-t...,"03/26/2025, 07:00 AM, +0000 UTC"
3,3,Ponzi and Pyramid Schemes,investment scam,ASX plunges more than $50 billion as stark war...,,"{'name': 'Yahoo', 'icon': 'https://encrypted-t...",https://au.finance.yahoo.com/news/asx-set-to-l...,"04/08/2025, 10:51 PM, +0000 UTC"
4,4,General Fiancial News,global financial news,Korean won starts day at weakest level since g...,,"{'name': 'Korea JoongAng Daily', 'icon': 'http...",https://koreajoongangdaily.joins.com/news/2025...,"04/09/2025, 01:04 AM, +0000 UTC"


In [4]:
import pandas as pd
import random
import fasttext
from sklearn.metrics import classification_report, accuracy_score
from sklearn.model_selection import train_test_split

In [25]:
import re
import string


def preprocess_title(title: str) -> str:
    """
    Normalise a raw news title for text classification.

    The title is lower-cased, URL-like tokens (anything starting with
    ``http`` or ``www`` up to the next whitespace) are removed, and every
    remaining character that is not a lowercase ASCII letter, a digit or
    whitespace is replaced by a single space. Runs of spaces are not
    collapsed.

    Args:
        title (str): Raw news title.

    Returns:
        str: Cleaned title.
    """
    # Lowercase
    title = title.lower()

    # Remove URLs and special chars.
    # The pattern is a raw string, so regex escapes take a single backslash;
    # with doubled backslashes the URL branches could never match.
    title = re.sub(r"http\S+|www\S+|[^a-z0-9\s]", " ", title)

    return title

# Load dataset (replace with your file)
df = pd.read_csv("serpapi_news.csv")

# Use the title alone as the model input (the snippet column is not used).
# Optional cleaning step, currently disabled:
# df["text"] = df["title"].fillna('').apply(lambda x: preprocess_title(x))
df["text"] = df["title"].fillna('')

# Normalise the misspelled category label present in the raw data
df["category"] = df["category"].replace({"General Fiancial News": "General Financial News"})

# Drop rows with a missing category or no usable text. Missing titles were
# turned into '' above, so blank text has to be tested explicitly: dropna()
# on "text" would never remove anything.
df = df.dropna(subset=["category"])
df = df[df["text"].str.strip() != ""].copy()

# Convert labels to FastText format (label prefix required)
df["fasttext_label"] = "__label__" + df["category"].str.replace(" ", "_")

# Train/test split, stratified so every category keeps its share in both parts
train_df, test_df = train_test_split(df, test_size=0.2, random_state=42, stratify=df["category"])

# Save train.txt and test.txt in FastText format
def save_fasttext_file(df, path):
    """
    Write ``df`` to ``path`` in fastText supervised-training format.

    Each row becomes one line: ``<fasttext_label> <text>``.

    Whitespace runs inside the text (including embedded newlines and tabs)
    are collapsed to single spaces and the ends are trimmed, because the
    file is line-oriented: a newline inside a title would otherwise split
    one example into two malformed lines.

    Args:
        df: DataFrame with ``fasttext_label`` and ``text`` string columns.
        path: Destination file path; an existing file is overwritten.
    """
    with open(path, "w", encoding="utf-8") as f:
        for label, text in zip(df["fasttext_label"], df["text"]):
            single_line = " ".join(text.split())
            f.write(f"{label} {single_line}\n")

save_fasttext_file(train_df, "train.txt")
save_fasttext_file(test_df, "test.txt")

# Train model
# test_df = pd.read_csv('test_df.csv')
model = fasttext.train_supervised(input="train.txt", lr=1.0, epoch=25, wordNgrams=2, verbose=2)

# Evaluate on test set.
# Collapse whitespace first: fastText's predict() works on one line at a time
# and rejects text containing a newline.
test_texts = [" ".join(text.split()) for text in test_df['text']]
preds = model.predict(test_texts)
y_pred = [labels[0] for labels in preds[0]]

# Remove FastText label prefix for readability
y_true_labels = test_df['category'].tolist()
y_pred_labels = [label.replace("__label__", "").replace("_"," ") for label in y_pred]

# assign() builds a new frame instead of writing into the split result,
# which avoids pandas' SettingWithCopyWarning.
test_df = test_df.assign(
    pred_category=y_pred_labels,
    pred_score=[float(probs[0]) for probs in preds[1]],
)
# Print metrics
print("\nAccuracy:", accuracy_score(y_true_labels, y_pred_labels))
print("\nClassification Report:\n")
print(classification_report(y_true_labels, y_pred_labels))
# index=False keeps the row index out of the file, so reloading it does not
# create an extra "Unnamed: 0" column.
test_df.to_csv('test_df.csv', index=False)

model.save_model('fasttext_model.bin')


Read 0M words
Number of words:  12614
Number of labels: 10
Progress: 100.0% words/sec/thread: 1128837 lr:  0.000000 avg.loss:  0.097685 ETA:   0h 0m 0s



Accuracy: 0.866504854368932

Classification Report:

                              precision    recall  f1-score   support

      Bribery and Corruption       0.89      0.85      0.87       100
                       Fraud       0.71      0.73      0.72        99
      General Financial News       0.89      0.99      0.94       393
             Insider Trading       0.94      0.81      0.87       100
            Money Laundering       0.88      0.81      0.84       100
   Ponzi and Pyramid Schemes       0.75      0.77      0.76        99
        Sanctions Violations       0.98      0.92      0.95        89
                 Tax Evasion       0.92      0.87      0.90        94
         Terrorist Financing       0.92      0.90      0.91        99
Trade-Based Money Laundering       0.62      0.54      0.58        63

                    accuracy                           0.87      1236
                   macro avg       0.85      0.82      0.83      1236
                weighted avg      

In [26]:
test_df_sample = pd.read_csv('test_df_sample.csv')

# Keep only rows with a usable review label. This must happen BEFORE
# predicting so that predictions line up one-to-one with the remaining rows;
# predicting first leaves the prediction list longer than the frame as soon
# as any row is filtered out.
usable_review = ~test_df_sample['review_category'].isin(['Not Clear', 'Not Financial News'])
test_df_sample = test_df_sample[usable_review].copy()

# Empty text is read back from CSV as NaN; predict() needs strings.
preds = model.predict(test_df_sample['text'].fillna('').tolist())
y_pred = [labels[0] for labels in preds[0]]

# Remove FastText label prefix for readability
y_true_labels = test_df_sample['review_category'].tolist()
y_pred_labels = [label.replace("__label__", "").replace("_"," ") for label in y_pred]

test_df_sample['pred_category'] = y_pred_labels
test_df_sample['pred_score'] = [float(probs[0]) for probs in preds[1]]
# Print metrics
print("\nAccuracy:", accuracy_score(y_true_labels, y_pred_labels))
print("\nClassification Report:\n")
print(classification_report(y_true_labels, y_pred_labels))



Accuracy: 0.8214285714285714

Classification Report:

                              precision    recall  f1-score   support

      Bribery and Corruption       0.88      0.88      0.88         8
                       Fraud       0.64      1.00      0.78         7
      General Financial News       0.97      0.72      0.83        40
             Insider Trading       1.00      1.00      1.00         6
            Money Laundering       1.00      0.88      0.93         8
   Ponzi and Pyramid Schemes       1.00      0.50      0.67         4
        Sanctions Violations       0.83      1.00      0.91         5
                 Tax Evasion       0.50      1.00      0.67         3
         Terrorist Financing       0.43      1.00      0.60         3
Trade-Based Money Laundering       0.00      0.00      0.00         0

                    accuracy                           0.82        84
                   macro avg       0.72      0.80      0.73        84
                weighted avg     

  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))


In [27]:
# Write the scored review sample back to its CSV, without the row index.
test_df_sample.to_csv(path_or_buf='test_df_sample.csv', index=False)

In [32]:
testcase = pd.read_csv('serpapi_adverse_news_testcase.csv')

# Missing titles load as NaN; treat them as empty text (as done for the
# training data) and collapse whitespace so every entry is one line of text.
titles = [" ".join(title.split()) for title in testcase['title'].fillna('').astype(str)]
preds = model.predict(titles)
y_pred = [labels[0] for labels in preds[0]]

# Store the top label (prefix removed, underscores back to spaces) and its score
testcase['fasttext_pred_category'] = [label.replace('__label__', '').replace('_', ' ') for label in y_pred]
testcase['fasttext_pred_score'] = [float(probs[0]) for probs in preds[1]]


In [33]:
# Display the scored test cases (long frames are shown truncated).
testcase

Unnamed: 0,query,title,source,link,date,fasttext_pred_category,fasttext_pred_score
0,Prateek Gupta,Defendant in Trafigura fraud case agrees to di...,"{'name': 'MSN', 'icon': 'https://encrypted-tbn...",https://www.msn.com/en-us/money/companies/defe...,"04/09/2025, 06:10 PM, +0000 UTC",Fraud,0.927947
1,Prateek Gupta,Trafigura wins access to bank records in nicke...,"{'name': 'Global Trade Review (GTR)', 'icon': ...",https://www.gtreview.com/news/global/trafigura...,"02/05/2025, 08:00 AM, +0000 UTC",Fraud,0.979434
2,Prateek Gupta,Gupta granted delay on document disclosure in ...,"{'name': 'Reuters', 'icon': 'https://encrypted...",https://www.reuters.com/markets/commodities/gu...,"12/13/2024, 08:00 AM, +0000 UTC",Bribery and Corruption,0.299181
3,Prateek Gupta,Detroit St. Patrick's Parade,"{'name': 'The Detroit News', 'icon': 'https://...",https://www.detroitnews.com/picture-gallery/me...,"03/16/2025, 07:00 AM, +0000 UTC",General Financial News,0.959083
4,Prateek Gupta,Trafigura asks to subpoena banks in search for...,"{'name': 'Mining.com', 'icon': 'https://encryp...",https://www.mining.com/web/trafigura-asks-to-s...,"01/30/2025, 08:00 AM, +0000 UTC",Trade-Based Money Laundering,0.355079
...,...,...,...,...,...,...,...
506,STMicroelectronics,STMicroelectronics launches high-performance N...,"{'name': 'nfcw.com', 'icon': 'https://encrypte...",https://www.nfcw.com/2025/03/14/390263/stmicro...,"03/14/2025, 07:00 AM, +0000 UTC",General Financial News,0.984010
507,STMicroelectronics,"STMicroelectronics Acquires 427,820 Shares in ...","{'name': 'Stock Titan', 'icon': 'https://encry...",https://www.stocktitan.net/news/STM/st-microel...,"03/10/2025, 07:00 AM, +0000 UTC",General Financial News,0.958961
508,STMicroelectronics,STMicroelectronics signs 15-year PPA with Tota...,"{'name': 'Data Center Dynamics', 'icon': 'http...",https://www.datacenterdynamics.com/en/news/stm...,"01/28/2025, 08:00 AM, +0000 UTC",General Financial News,0.919032
509,STMicroelectronics,[News] Sanan-STMicroelectronics SiC Device Fab...,"{'name': 'TrendForce', 'icon': 'https://encryp...",https://www.trendforce.com/news/2025/03/07/new...,"03/07/2025, 08:00 AM, +0000 UTC",Trade-Based Money Laundering,0.575204


In [34]:
from src.llm_news_analyzer import LLMNewsAnalyzer
import time
analyzer = LLMNewsAnalyzer()
llm_preds = []
llm_pred_scores = []
for row_idx, row in test_df_sample.iterrows():
    # Default to "no prediction" so one failed call cannot abort the whole
    # run or leave the two lists with different lengths.
    category, score = None, None
    try:
        analyzer.classify_news(row['text'])
        top_result = analyzer.classification_result[0]
        category, score = top_result['category'], top_result['confidence_score']
    except Exception as e:
        print(f"Exception when calling LLM service on row {row_idx}: {e}, text:", row['text'])
    llm_preds.append(category)
    llm_pred_scores.append(score)
    time.sleep(1)  # pause between requests

# assign() returns a new frame, avoiding writes into a filtered slice
test_df_sample = test_df_sample.assign(
    llm_pred_category=llm_preds,
    llm_pred_score=llm_pred_scores,
)

# Score only the rows for which the LLM returned a prediction
scored_rows = test_df_sample[test_df_sample['llm_pred_category'].notna()]
y_true_labels = scored_rows['review_category'].tolist()
y_pred_labels = scored_rows['llm_pred_category'].tolist()

# Print metrics
print("\nAccuracy:", accuracy_score(y_true_labels, y_pred_labels))
print("\nClassification Report:\n")
print(classification_report(y_true_labels, y_pred_labels))


Accuracy: 0.75

Classification Report:

                           precision    recall  f1-score   support

   Bribery and Corruption       0.89      1.00      0.94         8
                    Fraud       0.46      0.86      0.60         7
   General Financial News       1.00      0.57      0.73        40
          Insider Trading       1.00      0.83      0.91         6
         Money Laundering       0.78      0.88      0.82         8
       Non Financial News       0.00      0.00      0.00         0
Ponzi and Pyramid Schemes       1.00      0.75      0.86         4
     Sanctions Violations       0.71      1.00      0.83         5
              Tax Evasion       0.50      1.00      0.67         3
      Terrorist Financing       0.60      1.00      0.75         3

                 accuracy                           0.75        84
                macro avg       0.69      0.79      0.71        84
             weighted avg       0.87      0.75      0.77        84



  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))


In [35]:
# Write the review sample, now including the LLM columns, back to its CSV.
test_df_sample.to_csv(path_or_buf='test_df_sample.csv', index=False)

In [40]:
# index=False: this file is read back with read_csv, and writing the row
# index would add another "Unnamed: 0" column on every round trip.
testcase.to_csv('serpapi_adverse_news_testcase.csv', index=False)
# Fixed seed so the same sample can be drawn again; n is capped so frames
# with fewer than 100 rows do not raise.
testcase_sample = testcase.sample(n=min(100, len(testcase)), random_state=42)

In [44]:
from src.llm_news_analyzer import LLMNewsAnalyzer
import time
analyzer = LLMNewsAnalyzer()
llm_preds = []
llm_pred_scores = []
for row_idx, row in testcase_sample.iterrows():
    # Read both values before appending either: if the second lookup failed
    # after the first had already been appended, the except path would add
    # another entry and the two lists would fall out of step with the rows.
    category, score = None, None
    try:
        analyzer.classify_news(row['title'])
        top_result = analyzer.classification_result[0]
        category, score = top_result['category'], top_result['confidence_score']
        time.sleep(1)
    except Exception as e:
        # Keep going; the row is recorded with no prediction (None).
        print(f"Exception when calling LLM service on row {row_idx}: {e}, title:", row['title'])
    llm_preds.append(category)
    llm_pred_scores.append(score)

testcase_sample['llm_pred_category'] = llm_preds
testcase_sample['llm_pred_score'] = llm_pred_scores

Exception when calling LLM service on row 216: Error code: 400 - {'error': {'message': "The response was filtered due to the prompt triggering Azure OpenAI's content management policy. Please modify your prompt and retry. To learn more about our content filtering policies please read our documentation: https://go.microsoft.com/fwlink/?linkid=2198766", 'type': None, 'param': 'prompt', 'code': 'content_filter', 'status': 400, 'innererror': {'code': 'ResponsibleAIPolicyViolation', 'content_filter_result': {'hate': {'filtered': False, 'severity': 'safe'}, 'jailbreak': {'filtered': False, 'detected': False}, 'self_harm': {'filtered': False, 'severity': 'safe'}, 'sexual': {'filtered': True, 'severity': 'medium'}, 'violence': {'filtered': False, 'severity': 'safe'}}}}}, title: Simpson Thacher Snaps Up Sidley Austin Leveraged Finance Duo


In [45]:
# Check how many prediction entries were collected by the loop.
len(llm_preds)

100

In [46]:
# Save the sampled test cases with their LLM columns, without the row index.
testcase_sample.to_csv(path_or_buf='serpapi_adverse_news_testcase_sample.csv', index=False)

In [61]:
# Reload the saved sample and keep only rows usable for scoring: those that
# have an LLM prediction and whose review_category is not 'Non Financial News'.
# NOTE(review): review_category is not created in this notebook -- presumably
# added to the CSV by manual review; confirm.
testcase_sample = pd.read_csv('serpapi_adverse_news_testcase_sample.csv')
has_llm_prediction = testcase_sample['llm_pred_category'].notna()
not_excluded = testcase_sample['review_category'] != 'Non Financial News'
testcase_sample = testcase_sample[has_llm_prediction & not_excluded]
testcase_sample.head()

Unnamed: 0,query,title,source,link,date,fasttext_pred_category,fasttext_pred_score,llm_pred_category,llm_pred_score,review_category
0,STMicroelectronics,[News] STMicroelectronics and Onsemi Struggle ...,"{'name': 'TrendForce', 'icon': 'https://encryp...",https://www.trendforce.com/news/2025/02/26/new...,"02/26/2025, 08:00 AM, +0000 UTC",General Financial News,0.995784,General Financial News,0.95,General Financial News
1,Prateek Gupta,How one Gupta accused of fraud knows another G...,"{'name': 'Financial Times', 'icon': 'https://e...",https://www.ft.com/content/ab817970-499a-473d-...,"02/14/2023, 08:00 AM, +0000 UTC",Fraud,0.712454,Fraud,0.95,Fraud
2,Bob Menendez's,Prosecutors seek 15 years in prison for former...,"{'name': 'WHYY', 'icon': 'https://encrypted-tb...",https://whyy.org/articles/prosecutors-15-years...,"01/10/2025, 08:00 AM, +0000 UTC",Bribery and Corruption,0.999603,Bribery and Corruption,0.95,Bribery and Corruption
3,Prateek Gupta,Businessman accused by Trafigura living in Dub...,"{'name': 'FBC News', 'icon': 'https://encrypte...",https://www.fbcnews.com.fj/world/businessman-a...,"03/09/2023, 08:00 AM, +0000 UTC",General Financial News,0.355001,Fraud,0.85,Fraud
7,Simpson Thacher,Simpson Thacher partner takes finance practice...,"{'name': 'Reuters', 'icon': 'https://encrypted...",https://www.reuters.com/legal/litigation/simps...,"09/04/2024, 07:00 AM, +0000 UTC",General Financial News,0.76123,Non Financial News,0.9,General Financial News


In [62]:
# (rows, columns) remaining after the filtering above.
testcase_sample.shape

(65, 10)

In [63]:
# Compare the LLM's categories with the reviewed labels.
y_true_labels, y_pred_labels = (
    testcase_sample[column].tolist()
    for column in ('review_category', 'llm_pred_category')
)

# Print metrics
llm_accuracy = accuracy_score(y_true_labels, y_pred_labels)
print("\nLLM Accuracy:", llm_accuracy)
print("\nLLM Classification Report:\n")
print(classification_report(y_true_labels, y_pred_labels))


LLM Accuracy: 0.7538461538461538

LLM Classification Report:

                        precision    recall  f1-score   support

Bribery and Corruption       0.93      0.88      0.90        16
                 Fraud       1.00      0.86      0.92        14
General Financial News       0.90      0.61      0.73        31
       Insider Trading       1.00      1.00      1.00         1
      Money Laundering       1.00      1.00      1.00         1
    Non Financial News       0.00      0.00      0.00         0
  Sanctions Violations       1.00      1.00      1.00         2

              accuracy                           0.75        65
             macro avg       0.83      0.76      0.79        65
          weighted avg       0.94      0.75      0.83        65



  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))


In [65]:
# Compare the fastText categories with the reviewed labels.
y_true_labels = testcase_sample['review_category'].tolist()
y_pred_labels = testcase_sample['fasttext_pred_category'].tolist()

# Print metrics ("\n" rather than the invalid escape "\F", which printed a
# literal backslash in front of the heading)
print("\nFastText Accuracy:", accuracy_score(y_true_labels, y_pred_labels))
print("\nFastText Classification Report:\n")
print(classification_report(y_true_labels, y_pred_labels))

\FastText Accuracy: 0.7230769230769231
\FastText Classification Report:

                              precision    recall  f1-score   support

      Bribery and Corruption       0.82      0.88      0.85        16
                       Fraud       1.00      0.36      0.53        14
      General Financial News       0.84      0.84      0.84        31
             Insider Trading       0.00      0.00      0.00         1
            Money Laundering       0.00      0.00      0.00         1
   Ponzi and Pyramid Schemes       0.00      0.00      0.00         0
        Sanctions Violations       0.67      1.00      0.80         2
                 Tax Evasion       0.00      0.00      0.00         0
         Terrorist Financing       0.00      0.00      0.00         0
Trade-Based Money Laundering       0.00      0.00      0.00         0

                    accuracy                           0.72        65
                   macro avg       0.33      0.31      0.30        65
               

  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))


In [None]:
[{"COMPANY":{"entity_name":"Brady Aune and Joe Anderson Safety Bill","variations":["Safety Bill"]},"PERSON":{"entity_name":"Joe Anderson","variations":["Joe Anderson"]},"LOCATION":{"entity_name":"Minnesota","variations":["Minnesota"]},"SECTOR":{"entity_name":"lake weed clearing","variations":["lake weeds","lake weeds clearing"]}},{"COMPANY":{"entity_name":"Retirement Financial Group","variations":["Retirement Financial Group","RFG"]},"PERSON":{"entity_name":"Joe Anderson","variations":["Joe Anderson"]},"SECTOR":{"entity_name":"wealth management","variations":["wealth management","financial planning"]}},{"COMPANY":{"entity_name":"Arthur Andersen LLP","variations":["Arthur Andersen LLP","Arthur Andersen"]},"SECTOR":{"entity_name":"accounting","variations":["public accounting"]}},{"PERSON":{"entity_name":"Joe Anderson","variations":["Joe Anderson"]},"LEGAL_ACTION":{"entity_name":"corruption charges","variations":["bribery charges","misconduct charges"]},"LOCATION":{"entity_name":"Liverpool","variations":["Liverpool"]}},{"PERSON":{"entity_name":"Derek Hatton","variations":["Derek Hatton"]},"LEGAL_ACTION":{"entity_name":"bribery charges","variations":["bribery charges","misconduct charges"]},"LOCATION":{"entity_name":"Liverpool","variations":["Liverpool"]}},{"PERSON":{"entity_name":"Derek Hatton","variations":["Derek Hatton"]},"LEGAL_ACTION":{"entity_name":"bribery charges","variations":["bribery charges"]},"LOCATION":{"entity_name":"Liverpool","variations":["Liverpool"]}},{"PERSON":{"entity_name":"Joe Anderson","variations":["Joe Anderson"]},"LEGAL_ACTION":{"entity_name":"corruption charges","variations":["corruption charges"]}},{"REGULATORY_BODY":{"entity_name":"Government imposed Commissioners","variations":["Government imposed Commissioners","Government Commissioners"]},"LEGAL_ACTION":{"entity_name":"claims of pervasive and rotten culture","variations":["claims of pervasive and rotten culture"]},"LOCATION":{"entity_name":"Liverpool City 
Council","variations":["Liverpool City Council"]},"DATE":{"entity_name":"2021","variations":["2021"]}},{"PERSON":{"entity_name":"Derek Hatton","variations":["Derek Hatton"]},"LEGAL_ACTION":{"entity_name":"bribery accusations","variations":["bribing Liverpool Council officer"]},"COMPANY":{"entity_name":"Liverpool Council","variations":["Liverpool Council"]}},{},{"PERSON":{"entity_name":"Joe Anderson","variations":["Joe Anderson"]},"LEGAL_ACTION":{"entity_name":"corruption probe","variations":["corruption probe"]},"LOCATION":{"entity_name":"Liverpool","variations":["Liverpool"]}},{"PERSON":{"entity_name":"Joe Anderson","variations":["Joe Anderson"]},"LEGAL_ACTION":{"entity_name":"corruption charges","variations":["threatening letters"]},"LOCATION":{"entity_name":"Liverpool","variations":["Liverpool"]}},{"PERSON":{"entity_name":"Nick Kavanagh","variations":["Nick Kavanagh"]},"LEGAL_ACTION":{"entity_name":"three charges","variations":["court charges","misconduct charges"]},"LOCATION":{"entity_name":"Liverpool","variations":["Liverpool"]}},{"REGULATORY_BODY":{"entity_name":"Intellectual Property Protection","variations":["Intellectual Property Protection"]},"ENFORCEMENT_ACTION":{"entity_name":"enforcement","variations":["enforcement","protection actions"]}}]