Vendor Analysis: Lending Scorecard

In [2]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
import os 
import sys 
sys.path.append(os.path.abspath(os.path.join('..')))
from transformers import pipeline
from scripts.vendor_scorecard_engine import score_vendors

# Read the scraped Telegram export into `df` and preview its first rows.
df = pd.read_csv(filepath_or_buffer="../data/telegram_data.csv")
df.head()

Unnamed: 0,Channel Title,Channel Username,ID,Message,Date,Media Path
0,Sheger online-store,@Shageronlinestore,7383,💥Miralux Hot plate\n ባለሁለት ምድጃ ስቶቭ\n\n 💯o...,2025-06-19 06:31:31+00:00,data/photos/@Shageronlinestore_7383.jpg
1,Sheger online-store,@Shageronlinestore,7382,💥7pcs glass water set\n\n✔️ አንድ ማራኪ ጆግና 6 መጠጫ ...,2025-06-18 11:19:11+00:00,data/photos/@Shageronlinestore_7382.jpg
2,Sheger online-store,@Shageronlinestore,7381,,2025-06-18 11:19:11+00:00,data/photos/@Shageronlinestore_7381.jpg
3,Sheger online-store,@Shageronlinestore,7380,,2025-06-18 11:19:11+00:00,data/photos/@Shageronlinestore_7380.jpg
4,Sheger online-store,@Shageronlinestore,7379,,2025-06-18 11:19:11+00:00,data/photos/@Shageronlinestore_7379.jpg


In [3]:
from transformers import pipeline

# Build a token-classification (NER) pipeline from a Hugging Face checkpoint name.
# NOTE(review): `ner_pipeline` is assigned again further down in this notebook.
ner_pipeline = pipeline(
    task="token-classification",
    model="Davlan/bert-base-multilingual-cased-ner-hrl",
    aggregation_strategy="simple",
)

To support symlinks on Windows, you either need to activate Developer Mode or to run Python as an administrator. In order to activate developer mode, see this article: https://docs.microsoft.com/en-us/windows/apps/get-started/enable-your-device-for-development
Device set to use cpu


In [3]:

# Read the scraped Telegram messages into `df`; the cells below reshape this frame.
df = pd.read_csv(filepath_or_buffer="../data/telegram_data.csv")

In [5]:

# Lower-case the column labels and trim surrounding whitespace, then show the result.
normalized_columns = df.columns.str.lower().str.strip()
df.columns = normalized_columns
print("Columns after load:", list(df.columns))


Columns after load: ['channel title', 'channel username', 'id', 'message', 'date', 'media path']


In [6]:
# Map the loaded column names onto the names score_vendors is expected to use.
column_renames = {
    'channel title': 'vendor',
    'message': 'text',
    'date': 'timestamp',
}
df.rename(columns=column_renames, inplace=True)

# The data has no 'views' column, so fill one with seeded pseudo-random integers
# (100 inclusive, 5000 exclusive) as a stand-in. These are NOT real view counts.
np.random.seed(42)
row_count = len(df)
df['views'] = np.random.randint(low=100, high=5000, size=row_count)

In [7]:

# Parse timestamps and drop rows with invalid timestamps.
# utc=True makes pandas return a single tz-aware datetime column even when the raw
# strings carry different (or no) UTC offsets; without it such input can come back
# as an object column, and the `.dt` accessor below would then fail.
df['timestamp'] = pd.to_datetime(df['timestamp'], errors='coerce', utc=True)

# Unparseable values were coerced to NaT above; compute the mask once and reuse it.
invalid_timestamps = df['timestamp'].isna()
if invalid_timestamps.any():
    print(f"Dropping {invalid_timestamps.sum()} rows with invalid timestamps")
    # .copy() gives an independent frame, so the column assignment below does not
    # write into a filtered selection (avoids SettingWithCopyWarning on some pandas versions).
    df = df.loc[~invalid_timestamps].copy()

# Remove timezone info (to avoid warnings in score_vendors); values stay as UTC wall time.
df['timestamp'] = df['timestamp'].dt.tz_localize(None)

In [9]:
import warnings

from transformers import AutoTokenizer, AutoModelForTokenClassification, pipeline

# Load your model (replace with your model path or Hugging Face model name)
model_name = "Davlan/afro-xlmr-base"  # or "models/saved_model_dir" for local fine-tuned model

tokenizer = AutoTokenizer.from_pretrained(model_name)

# Request the loading report so we can tell whether every weight came from the
# checkpoint. Weights listed under "missing_keys" were not found in the checkpoint
# and are freshly initialised instead.
model, loading_info = AutoModelForTokenClassification.from_pretrained(
    model_name, output_loading_info=True
)
missing_keys = loading_info.get("missing_keys", [])
if missing_keys:
    # A checkpoint without a trained token-classification head still loads, but its
    # entity predictions are untrained; make that explicit before scoring with it.
    warnings.warn(
        f"Checkpoint '{model_name}' did not provide weights for {sorted(missing_keys)}; "
        "these were newly initialised, so NER predictions from this model are untrained. "
        "Point model_name at a checkpoint fine-tuned for token classification before "
        "using the resulting vendor scores."
    )

# Now define the NER pipeline
ner_pipeline = pipeline(
    "token-classification",
    model=model,
    tokenizer=tokenizer,
    aggregation_strategy="simple"
)


Some weights of XLMRobertaForTokenClassification were not initialized from the model checkpoint at Davlan/afro-xlmr-base and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
Device set to use cpu


In [None]:
# Keep only the rows whose text is present and not just whitespace.
has_text = df['text'].notna()
is_not_blank = df['text'].str.strip().ne('')
df_with_text = df[has_text & is_not_blank]

# Score the vendors from the filtered messages, save the result as CSV, and preview it.
score_df = score_vendors(df_with_text, ner_pipeline)
score_df.to_csv("../data/vendor_scorecard.csv", index=False)
print(score_df.head())

