###REQUIREMENTS


In [5]:
import pandas as pd
import numpy as np
import re

from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.linear_model import LogisticRegression, Ridge
from sklearn.metrics import accuracy_score, r2_score

###LOAD DATA

In [6]:
train_df = pd.read_csv("/content/train_complaints.csv")
test_df  = pd.read_csv("/content/test_complaints.csv")

print("Train shape:", train_df.shape)
print("Test shape:", test_df.shape)

Train shape: (2999, 5)
Test shape: (499, 2)


In [7]:
train_df.head()

Unnamed: 0,complaint_id,complaint_text,primary_category,secondary_category,severity
0,1634299,Back into XXXX of 2010 during this mortgage cr...,Mortgage,"Loan modification,collection,foreclosure",2
1,5505088,I checked my credit report and I am upset on w...,"Credit reporting, credit repair services, or o...",Problem with a credit reporting company's inve...,1
2,10979675,I am writing to dispute the accuracy of the in...,Credit reporting or other personal consumer re...,Problem with a company's investigation into an...,1
3,7520351,A transaction from XXXX XXXX XXXX submitted a ...,Checking or savings account,Managing an account,1
4,5847870,I was recently alerted to an account in collec...,Debt collection,Attempts to collect debt not owed,5


###PRIMARY CLASSIFIER

In [8]:
def clean_text(text):
    text = text.lower()
    text = " ".join(text.split())
    return text


In [9]:
train_df["clean_text"] = train_df["complaint_text"].apply(clean_text)


In [10]:
from sklearn.feature_extraction.text import TfidfVectorizer

primary_vectorizer = TfidfVectorizer(
    ngram_range=(1, 2),
    min_df=5,
    max_df=0.9,
    stop_words="english",
    sublinear_tf=True
)


In [11]:
from sklearn.linear_model import LogisticRegression

primary_clf = LogisticRegression(
    max_iter=2000,
    class_weight="balanced",
    multi_class="multinomial",
    solver="lbfgs",
    # C=0.7,          # NEW: stronger regularization
    n_jobs=-1
)


In [12]:
def normalize_primary(label):
    if label in {
        "Credit reporting",
        "Credit reporting or other personal consumer reports",
        "Credit reporting, credit repair services, or other personal consumer reports",
    }:
        return "Credit reporting (all)"
    return label


In [13]:
train_df["primary_norm"] = train_df["primary_category"].apply(normalize_primary)


In [14]:
train_split = train_df.copy()
train_split["primary_norm"] = train_split["primary_category"].apply(normalize_primary)


X_train = primary_vectorizer.fit_transform(train_split["clean_text"])
y_train = train_split["primary_norm"]

primary_clf.fit(X_train, y_train)



###Enforce Hard Constraints (NO ML)

In [15]:
PRIMARY_TO_SECONDARY = {
    "Debt collection": {
        "Attempts to collect debt not owed",
        "Written notification about debt",
        "Problem with a company's investigation into an existing problem",
    },

    "Mortgage": {
        "Loan modification,collection,foreclosure",
        "Problem with a company's investigation into an existing problem",
    },

    "Checking or savings account": {
        "Managing an account",
    },

    "Credit card": {
        "Trouble during payment process",
        "Managing an account",
        "Problem with a company's investigation into an existing problem",
    },

    "Credit card or prepaid card": {
        "Trouble during payment process",
        "Managing an account",
        "Problem with a company's investigation into an existing problem",
    },

    "Vehicle loan or lease": {
        "Loan modification,collection,foreclosure",
        "Problem with a company's investigation into an existing problem",
    },

    "Credit reporting (all)": {
        "Incorrect information on credit report",
        "Incorrect information on your report",
        "Improper use of your report",
        "Problem with a credit reporting company's investigation into an existing problem",
    },
}


###SECONDARY CLASSIFICATION (ML + RULE OVERRIDES)

In [16]:
SECONDARY_RULES = {
    "Debt collection": [
        {
            "keywords": [
                "not mine", "never owed", "already paid", "paid in full",
                "wrong person", "identity theft"
            ],
            "secondary": "Attempts to collect debt not owed"
        },
        {
            "keywords": [
                "letter", "mail", "written notice", "validation notice"
            ],
            "secondary": "Written notification about debt"
        }
    ],

    "Credit reporting (all)": [
        {
            "keywords": [
                "unauthorized inquiry", "hard inquiry",
                "pulled my credit", "accessed without permission"
            ],
            "secondary": "Improper use of your report"
        },
        {
            "keywords": [
                "dispute", "investigation", "failed to investigate",
                "no response", "did not fix"
            ],
            "secondary": "Problem with a credit reporting company's investigation into an existing problem"
        }
    ],

    "Credit card": [
        {
            "keywords": [
                "payment failed", "declined", "charged twice",
                "late fee", "interest charged"
            ],
            "secondary": "Trouble during payment process"
        }
    ],

    "Credit card or prepaid card": [
        {
            "keywords": [
                "payment failed", "declined", "charged twice",
                "late fee", "interest charged"
            ],
            "secondary": "Trouble during payment process"
        }
    ],

    "Mortgage": [
        {
            "keywords": [
                "foreclosure", "loan modification", "repossession"
            ],
            "secondary": "Loan modification,collection,foreclosure"
        }
    ],

    "Vehicle loan or lease": [
        {
            "keywords": [
                "repossession", "loan modification"
            ],
            "secondary": "Loan modification,collection,foreclosure"
        }
    ],
}


In [17]:
def apply_secondary_rules(text, primary_pred):
    """
    Returns a secondary label if a high-precision rule fires.
    Otherwise returns None.
    """
    text = text.lower()
    rules = SECONDARY_RULES.get(primary_pred, [])

    for rule in rules:
        for kw in rule["keywords"]:
            if kw in text:
                return rule["secondary"]

    return None


###transfomer


In [18]:
!pip install sentence-transformers




In [19]:
from sentence_transformers import SentenceTransformer
import numpy as np


In [20]:
embedder = SentenceTransformer("all-MiniLM-L6-v2")


The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


modules.json:   0%|          | 0.00/349 [00:00<?, ?B/s]

config_sentence_transformers.json:   0%|          | 0.00/116 [00:00<?, ?B/s]



README.md: 0.00B [00:00, ?B/s]

sentence_bert_config.json:   0%|          | 0.00/53.0 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/612 [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/90.9M [00:00<?, ?B/s]

Loading weights:   0%|          | 0/103 [00:00<?, ?it/s]

BertModel LOAD REPORT from: sentence-transformers/all-MiniLM-L6-v2
Key                     | Status     |  | 
------------------------+------------+--+-
embeddings.position_ids | UNEXPECTED |  | 

Notes:
- UNEXPECTED	:can be ignored when loading from different task/architecture; not ok if you expect identical arch.


tokenizer_config.json:   0%|          | 0.00/350 [00:00<?, ?B/s]

vocab.txt: 0.00B [00:00, ?B/s]

tokenizer.json: 0.00B [00:00, ?B/s]

special_tokens_map.json:   0%|          | 0.00/112 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/190 [00:00<?, ?B/s]

In [21]:
train_texts = train_df["complaint_text"].tolist()
train_embeddings = embedder.encode(
    train_texts,
    batch_size=64,
    show_progress_bar=True,
    convert_to_numpy=True,
    normalize_embeddings=True
)


Batches:   0%|          | 0/47 [00:00<?, ?it/s]

In [22]:
np.save("train_embeddings.npy", train_embeddings)


In [23]:
from sklearn.metrics.pairwise import cosine_similarity
from collections import defaultdict
import numpy as np

clean_prototypes = defaultdict(dict)

for primary in train_df["primary_norm"].unique():
    subset = train_df[train_df["primary_norm"] == primary]

    for secondary in subset["secondary_category"].unique():
        idx = subset[subset["secondary_category"] == secondary].index
        embs = train_embeddings[idx]

        # Mean embedding
        mean_emb = embs.mean(axis=0, keepdims=True)

        # Similarity to mean
        sims = cosine_similarity(embs, mean_emb).flatten()

        # Keep top 60% most central samples
        k = max(1, int(0.6 * len(embs)))
        top_idx = sims.argsort()[-k:]

        # Clean prototype
        proto = embs[top_idx].mean(axis=0)
        proto = proto / np.linalg.norm(proto)

        clean_prototypes[primary][secondary] = proto


In [24]:
def predict_secondary_embedding(text, primary_pred):
    emb = embedder.encode(
        [text],
        convert_to_numpy=True,
        normalize_embeddings=True
    )[0]

    # candidates = secondary_prototypes.get(primary_pred)
    candidates = clean_prototypes.get(primary_pred)
    if candidates is None:
        return None

    best_sec = None
    best_sim = -1

    for sec, proto in candidates.items():
        sim = np.dot(emb, proto)   # cosine similarity
        if sim > best_sim:
            best_sim = sim
            best_sec = sec

    return best_sec


In [25]:
def predict_secondary(text, primary_pred, rule_threshold=0.45):

    # Deterministic primary
    if primary_pred == "Checking or savings account":
        return "Managing an account"

    # ---- RULE OVERRIDE (only when ML was unsure) ----
    # NOTE: rules stay the same as before
    rule_hit = apply_secondary_rules(text, primary_pred)
    if rule_hit is not None:
        return rule_hit

    # ---- EMBEDDING-BASED DECISION ----
    sec = predict_secondary_embedding(text, primary_pred)
    return sec


###RISK


In [26]:
pip install sentence-transformers torch




In [27]:
from sentence_transformers import SentenceTransformer, InputExample, losses
from torch.utils.data import DataLoader
import torch
import pandas as pd
import numpy as np
from collections import Counter


In [28]:
FINETUNE_PRIMARIES = {
    "Credit reporting (all)",
    "Debt collection",
    "Mortgage"
}


In [30]:
import torch
import torch.nn as nn
from torch.utils.data import DataLoader, Dataset
from sentence_transformers import SentenceTransformer
from collections import Counter


In [31]:
class SecondaryDataset(Dataset):
    def __init__(self, texts, labels):
        self.texts = texts
        self.labels = labels

    def __len__(self):
        return len(self.texts)

    def __getitem__(self, idx):
        return self.texts[idx], self.labels[idx]


In [34]:
from sentence_transformers.models import Transformer, Pooling


In [35]:
class SentenceClassifier(nn.Module):
    def __init__(self, encoder, num_labels):
        super().__init__()
        self.encoder = encoder
        self.classifier = nn.Linear(
            encoder.get_sentence_embedding_dimension(),
            num_labels
        )

    def forward(self, sentences):
        features = self.encoder.tokenize(sentences)
        features = {k: v.to(next(self.parameters()).device) for k, v in features.items()}

        output = self.encoder.forward(features)
        embeddings = output["sentence_embedding"]   # ✅ grad-enabled

        return self.classifier(embeddings)


In [36]:
secondary_models_ft = {}

device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

for primary in FINETUNE_PRIMARIES:
    subset = train_df[train_df["primary_norm"] == primary]

    if len(subset) < 200:
        continue

    labels = sorted(subset["secondary_category"].unique())
    label2id = {l: i for i, l in enumerate(labels)}
    id2label = {i: l for l, i in label2id.items()}

    print(f"\nTraining secondary model for PRIMARY = {primary}")
    print("Label distribution:", Counter(subset["secondary_category"]))

    texts = subset["complaint_text"].tolist()
    y = [label2id[l] for l in subset["secondary_category"]]

    dataset = SecondaryDataset(texts, y)
    dataloader = DataLoader(dataset, batch_size=16, shuffle=True)

    encoder = SentenceTransformer("all-MiniLM-L6-v2")
    model = SentenceClassifier(encoder, len(labels)).to(device)

    optimizer = torch.optim.AdamW(model.parameters(), lr=2e-5)
    loss_fn = nn.CrossEntropyLoss()

    model.train()
    for epoch in range(2):   # 🔥 DO NOT INCREASE
        for batch_texts, batch_labels in dataloader:
            batch_labels = torch.tensor(batch_labels).to(device)

            optimizer.zero_grad()
            logits = model(batch_texts)
            loss = loss_fn(logits, batch_labels)
            loss.backward()
            optimizer.step()

    secondary_models_ft[primary] = {
        "model": model,
        "label2id": label2id,
        "id2label": id2label
    }



Training secondary model for PRIMARY = Debt collection
Label distribution: Counter({'Attempts to collect debt not owed': 295, 'Written notification about debt': 284})


Loading weights:   0%|          | 0/103 [00:00<?, ?it/s]

BertModel LOAD REPORT from: sentence-transformers/all-MiniLM-L6-v2
Key                     | Status     |  | 
------------------------+------------+--+-
embeddings.position_ids | UNEXPECTED |  | 

Notes:
- UNEXPECTED	:can be ignored when loading from different task/architecture; not ok if you expect identical arch.
  batch_labels = torch.tensor(batch_labels).to(device)



Training secondary model for PRIMARY = Mortgage
Label distribution: Counter({'Trouble during payment process': 309, 'Loan modification,collection,foreclosure': 301, 'Incorrect information on your report': 1})


Loading weights:   0%|          | 0/103 [00:00<?, ?it/s]

BertModel LOAD REPORT from: sentence-transformers/all-MiniLM-L6-v2
Key                     | Status     |  | 
------------------------+------------+--+-
embeddings.position_ids | UNEXPECTED |  | 

Notes:
- UNEXPECTED	:can be ignored when loading from different task/architecture; not ok if you expect identical arch.



Training secondary model for PRIMARY = Credit reporting (all)
Label distribution: Counter({'Incorrect information on credit report': 308, "Problem with a credit reporting company's investigation into an existing problem": 307, "Problem with a company's investigation into an existing problem": 305, 'Incorrect information on your report': 295, 'Improper use of your report': 291})


Loading weights:   0%|          | 0/103 [00:00<?, ?it/s]

BertModel LOAD REPORT from: sentence-transformers/all-MiniLM-L6-v2
Key                     | Status     |  | 
------------------------+------------+--+-
embeddings.position_ids | UNEXPECTED |  | 

Notes:
- UNEXPECTED	:can be ignored when loading from different task/architecture; not ok if you expect identical arch.


In [37]:
# def predict_secondary(text, primary_pred):
#     # Deterministic case
#     if primary_pred == "Checking or savings account":
#         return "Managing an account"

#     # Rule override (high precision)
#     rule_hit = apply_secondary_rules(text, primary_pred)
#     if rule_hit is not None:
#         return rule_hit

#     # Fine-tuned model exists
#     if primary_pred in secondary_models_ft:
#         pack = secondary_models_ft[primary_pred]
#         model = pack["model"]
#         id2label = pack["id2label"]

#         emb = model.encode([text], convert_to_numpy=True)
#         logits = model._first_module().auto_model(
#             input_ids=None
#         )

#         # Sentence-Transformers shortcut
#         probs = model.predict([text])
#         pred_id = int(np.argmax(probs))
#         return id2label[pred_id]

#     # Fallback (should be rare)
#     return None


In [38]:
def predict_secondary(text, primary_pred):
    if primary_pred == "Checking or savings account":
        return "Managing an account"

    rule_hit = apply_secondary_rules(text, primary_pred)
    if rule_hit is not None:
        return rule_hit

    if primary_pred in secondary_models_ft:
        pack = secondary_models_ft[primary_pred]
        model = pack["model"]
        id2label = pack["id2label"]

        model.eval()
        with torch.no_grad():
            logits = model([text])
            pred_id = int(torch.argmax(logits, dim=1).item())

        return id2label[pred_id]

    return None


###SEVERITY

In [39]:
SEVERITY_5_KEYWORDS = [
    "fraud",
    "identity theft",
    "unauthorized transaction"
]


def is_severity_5(text):
    text = text.lower()
    return any(kw in text for kw in SEVERITY_5_KEYWORDS)


In [40]:
from sklearn.linear_model import Ridge
from sklearn.feature_extraction.text import TfidfVectorizer

severity_vec = TfidfVectorizer(
    ngram_range=(1, 2),
    min_df=5,
    max_df=0.9,
    stop_words="english",
    sublinear_tf=True
)

X = severity_vec.fit_transform(train_df["complaint_text"])
y = train_df["severity"]

severity_reg = Ridge(alpha=5.0)
severity_reg.fit(X, y)


In [41]:
import numpy as np

def predict_severity(text):
    if is_severity_5(text):
        return 5

    Xs = severity_vec.transform([text])
    pred = severity_reg.predict(Xs)[0]

    # HARD conservative bias
    pred = np.clip(pred, 1, 3)
    pred = int(round(pred))

    # Collapse 2 → 1 (IMPORTANT)
    if pred == 2:
        return 1

    return pred



In [42]:
def unmerge_credit_reporting(secondary):
    if secondary == "Problem with a credit reporting company's investigation into an existing problem":
        return "Credit reporting, credit repair services, or other personal consumer reports"

    if secondary in {
        "Incorrect information on credit report",
        "Incorrect information on your report",
        "Improper use of your report"
    }:
        return "Credit reporting or other personal consumer reports"

    # fallback (rare)
    return "Credit reporting"


In [43]:
def map_primary_for_submission(primary_internal, secondary_pred):
    if primary_internal != "Credit reporting (all)":
        return primary_internal

    return unmerge_credit_reporting(secondary_pred)


In [46]:
test_predictions = []

for _, row in test_df.iterrows():
    text = row["complaint_text"]

    primary_pred = primary_clf.predict(
        primary_vectorizer.transform([text])
    )[0]

    secondary_pred = predict_secondary(text, primary_pred)
    severity_pred = predict_severity(text)

    primary_out = map_primary_for_submission(
        primary_pred,
        secondary_pred
    )

    test_predictions.append({
        "complaint_id": row["complaint_id"],
        "primary_category": primary_out,
        "secondary_category": secondary_pred,
        "severity": severity_pred
    })


In [47]:
submission_df = pd.DataFrame(test_predictions)

submission_df = submission_df[
    ["complaint_id", "primary_category", "secondary_category", "severity"]
]

submission_df.to_csv("submission.csv", index=False)


In [48]:
submission_df.isnull().sum()


Unnamed: 0,0
complaint_id,0
primary_category,0
secondary_category,0
severity,0
