# FinTech Vendor Scorecard for Micro-Lending

#### Load Files

In [1]:
from google.colab import files

uploaded = files.upload()


Saving qnashcom_data.jsonl to qnashcom_data.jsonl
Saving marakibrand_data.jsonl to marakibrand_data.jsonl
Saving ethio_brand_collection_data.jsonl to ethio_brand_collection_data.jsonl
Saving MerttEka_data.jsonl to MerttEka_data.jsonl
Saving Leyueqa_data.jsonl to Leyueqa_data.jsonl


In [2]:
import pandas as pd
import json

def load_jsonl(file_path):
    with open(file_path, 'r', encoding='utf-8') as f:
        return [json.loads(line) for line in f]


file_names = ["qnashcom_data.jsonl", "marakibrand_data.jsonl", "ethio_brand_collection_data.jsonl", "MerttEka_data.jsonl", "Leyueqa_data.jsonl"]
all_data = []
for file in file_names:
    all_data.extend(load_jsonl(file))

df = pd.DataFrame(all_data)


#### Preprocess Timestamp


In [3]:
df['timestamp'] = pd.to_datetime(df['timestamp'])


In [4]:
from google.colab import drive
drive.mount('/content/drive')


Mounted at /content/drive


In [5]:
from transformers import AutoTokenizer, AutoModelForTokenClassification
import torch

# Load model and tokenizer
model_path = "/content/drive/MyDrive/amharic-ner"
tokenizer = AutoTokenizer.from_pretrained(model_path)
model = AutoModelForTokenClassification.from_pretrained(model_path)
model.eval()


XLMRobertaForTokenClassification(
  (roberta): XLMRobertaModel(
    (embeddings): XLMRobertaEmbeddings(
      (word_embeddings): Embedding(250002, 768, padding_idx=1)
      (position_embeddings): Embedding(514, 768, padding_idx=1)
      (token_type_embeddings): Embedding(1, 768)
      (LayerNorm): LayerNorm((768,), eps=1e-05, elementwise_affine=True)
      (dropout): Dropout(p=0.1, inplace=False)
    )
    (encoder): XLMRobertaEncoder(
      (layer): ModuleList(
        (0-11): 12 x XLMRobertaLayer(
          (attention): XLMRobertaAttention(
            (self): XLMRobertaSdpaSelfAttention(
              (query): Linear(in_features=768, out_features=768, bias=True)
              (key): Linear(in_features=768, out_features=768, bias=True)
              (value): Linear(in_features=768, out_features=768, bias=True)
              (dropout): Dropout(p=0.1, inplace=False)
            )
            (output): XLMRobertaSelfOutput(
              (dense): Linear(in_features=768, out_features=768

#### Apply NER to Extract PRICE & PRODUCT

In [6]:
def extract_entities(text):
    inputs = tokenizer(text, return_tensors="pt", truncation=True).to(model.device)
    with torch.no_grad():
        outputs = model(**inputs).logits

    tokens = tokenizer.tokenize(text)
    predictions = torch.argmax(outputs, dim=-1)[0].cpu().numpy()
    entities = {}

    for token, label_id in zip(tokens, predictions):
        label = model.config.id2label[label_id]
        if label != 'O':
            ent_type = label.split("-")[-1]
            entities.setdefault(ent_type, []).append(token)

    return entities

df["entities"] = df["text_clean"].apply(extract_entities)


#### Compute Metrics per Vendor

In [9]:
from collections import defaultdict

vendor_metrics = []

for vendor, group in df.groupby("channel_username"):
    num_posts = len(group)
    time_span = (group["timestamp"].max() - group["timestamp"].min()).days
    posts_per_week = num_posts / (time_span / 7) if time_span > 0 else num_posts

    avg_views = group.get("views", pd.Series([0]*num_posts)).mean()

    # Average Price
    prices = []
    for row in group["entities"]:
        price_tokens = row.get("PRICE", [])
        for tok in price_tokens:
            try:
                prices.append(float(tok))
            except:
                pass
    avg_price = sum(prices) / len(prices) if prices else 0

    # Top Performing Post
    top_row = group.loc[group["views"].idxmax()] if "views" in group else group.iloc[0]
    top_product = top_row["entities"].get("PRODUCT", ["N/A"])[0]
    top_price = top_row["entities"].get("PRICE", ["N/A"])[0]

    # Lending Score (custom formula)
    score = (avg_views * 0.5) + (posts_per_week * 0.3) + (avg_price * 0.2)

    vendor_metrics.append({
        "Vendor": vendor,
        "Avg Views/Post": round(avg_views, 2),
        "Posts/Week": round(posts_per_week, 2),
        "Avg Price (ETB)": round(avg_price, 2),
        "Lending Score": round(score, 2),
        "Top Price": top_price
    })


#### Generate Scorecard Table

In [10]:
scorecard_df = pd.DataFrame(vendor_metrics)
scorecard_df.sort_values("Lending Score", ascending=False, inplace=True)
scorecard_df


Unnamed: 0,Vendor,Avg Views/Post,Posts/Week,Avg Price (ETB),Lending Score,Top Price
4,@qnashcom,0.0,15.42,682.98,141.22,፦
1,@MerttEka,0.0,40.0,486.79,109.36,
0,@Leyueqa,0.0,42.17,308.43,74.34,፦
3,@marakibrand,0.0,21.74,0.0,6.52,
2,@ethio_brand_collection,0.0,10.61,0.0,3.18,
