# Multi-Output Classifier Workflow

This notebook demonstrates a complete workflow for building a multi-output classifier for review data. The steps include data loading, preprocessing, feature engineering, train-test splitting, model training, hyperparameter tuning, evaluation, and error analysis.

### Dependencies

## Step 1: Import Dependencies
Import all required libraries and install any missing packages needed for data processing, modeling, and evaluation.

In [1]:
import yaml
import os
import json

! pip install tldextract
import re
import tldextract

from transformers import pipeline
from tqdm import tqdm

! pip install textblob
from textblob import TextBlob
import pandas as pd

import torch
from transformers import pipeline

from sklearn.model_selection import train_test_split
! pip install iterative-stratification
from iterstrat.ml_stratifiers import MultilabelStratifiedKFold
import numpy as np

import torch
from torch.utils.data import IterableDataset, DataLoader
import torch.nn as nn
from datasets import load_dataset, Dataset
from transformers import AutoTokenizer, AutoModel
from torch.optim import AdamW
from sklearn.metrics import precision_recall_fscore_support, average_precision_score



Collecting tldextract
  Downloading tldextract-5.3.0-py3-none-any.whl.metadata (11 kB)
Collecting requests-file>=1.4 (from tldextract)
  Downloading requests_file-2.1.0-py2.py3-none-any.whl.metadata (1.7 kB)
Downloading tldextract-5.3.0-py3-none-any.whl (107 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m107.4/107.4 kB[0m [31m8.7 MB/s[0m eta [36m0:00:00[0m
[?25hDownloading requests_file-2.1.0-py2.py3-none-any.whl (4.2 kB)
Installing collected packages: requests-file, tldextract
Successfully installed requests-file-2.1.0 tldextract-5.3.0
Collecting iterative-stratification
  Downloading iterative_stratification-0.1.9-py3-none-any.whl.metadata (1.3 kB)
Downloading iterative_stratification-0.1.9-py3-none-any.whl (8.5 kB)
Installing collected packages: iterative-stratification
Successfully installed iterative-stratification-0.1.9


## Step 2: Load Data
Read the datasets and display basic information about missing values.

In [2]:
with open("config.yaml", "r") as f:
    config = yaml.safe_load(f)


labeled_input_folder = config['labeled_input']

full_df = pd.read_csv('data/labeled/all_reviews_with_labels_normalised.csv')
full_df.isnull().sum()

Unnamed: 0,0
review_text,0
rating,253
has_photo,0
author_name,0
user_review_count,253
business_name,0
category,0
source,0
review_id,0
comprehensive_review,0


In [3]:
synthetic_combined_df = pd.read_csv('data/labeled/synthetic_combined.csv')
synthetic_combined_df.isnull().sum()

Unnamed: 0,0
review_text,0
rating,0
has_photo,0
author_name,0
user_review_count,0
business_name,0
category,0
source,0
review_id,0
is_ad,0


In [4]:
# Save as JSON
output_json_path = os.path.join(labeled_input_folder, "full_df.json")
full_df.to_json(output_json_path, orient="records", lines=True, force_ascii=False)
print(f"JSON file saved to: {output_json_path}")

# Save as Parquet
output_parquet_path = os.path.join(labeled_input_folder, "full_df.parquet")
full_df.to_parquet(output_parquet_path, index=False)
print(f"Parquet file saved to: {output_parquet_path}")

JSON file saved to: ./data/labeled/full_df.json
Parquet file saved to: ./data/labeled/full_df.parquet


In [74]:
to_clean_df = full_df.dropna(subset=['review_text', 'is_ad', 'is_relevant', 'is_rant', 'is_legit'])
to_clean_df = to_clean_df.drop(columns='is_legit')
to_clean_df.head()

Unnamed: 0,review_text,rating,has_photo,author_name,user_review_count,business_name,category,source,review_id,comprehensive_review,is_ad,is_relevant,is_rant
0,Love the convenience of this neighborhood carw...,4.0,False,Doug Schmidt,1.0,"Auto Spa Speedy Wash - Harvester, MO",['Car wash'],google,1001,"[Business] Auto Spa Speedy Wash - Harvester, M...",False,True,False
1,"2 bathrooms (for a large 2 story building), 1 ...",2.0,False,Duf Duftopia,1.0,Kmart,"['Discount store', 'Appliance store', 'Baby st...",google,1002,[Business] Kmart | [Category] ['Discount store...,True,True,True
2,My favorite pizza shop hands down!,5.0,False,Andrew Phillips,1.0,Papa’s Pizza,"['Pizza restaurant', 'Chicken wings restaurant...",google,1003,[Business] Papa’s Pizza | [Category] ['Pizza r...,False,True,False
3,BOTCHED INSTRUMENT REPAIR IS COSTING US HUNDRE...,1.0,False,Julie Heiland,1.0,The Music Place,['Musical instrument store'],google,1004,[Business] The Music Place | [Category] ['Musi...,False,True,True
4,Very unprofessional!!!!!,1.0,False,Alan Khasanov,1.0,Park Motor Cars Inc,['Used car dealer'],google,1005,[Business] Park Motor Cars Inc | [Category] ['...,False,True,True


In [75]:
synthetic_to_clean_df = synthetic_combined_df.dropna(subset=['review_text', 'is_ad', 'is_relevant', 'is_rant', 'is_legit'])
synthetic_to_clean_df = synthetic_to_clean_df.drop(columns='is_legit')
synthetic_to_clean_df.head()

Unnamed: 0,review_text,rating,has_photo,author_name,user_review_count,business_name,category,source,review_id,is_ad,is_rant,is_relevant
0,Had a fantastic meal at The Gourmet Place! The...,5,True,Sophia Turner,23,The Gourmet Place,restaurant,Google,20615,True,False,True
1,Great experience at FitLife Gym! The equipment...,4,False,Michael Brown,15,FitLife Gym,gym,Yelp,20616,True,False,True
2,Staying at Oceanview Hotel was a dream! The vi...,5,True,Emily Davis,30,Oceanview Hotel,hotel,Google,20617,True,False,True
3,The Family Clinic is amazing! They really take...,4,False,Liam Wilson,10,Family Clinic,clinic,Yelp,20618,True,False,True
4,I love shopping at Trendy Mall! They have ever...,5,True,Olivia Johnson,28,Trendy Mall,shopping mall,Google,20619,True,False,True


In [76]:
print(to_clean_df.shape)
print(to_clean_df.isnull().sum())

(11920, 13)
review_text               0
rating                  253
has_photo                 0
author_name               0
user_review_count       253
business_name             0
category                  0
source                    0
review_id                 0
comprehensive_review      0
is_ad                     0
is_relevant               0
is_rant                   0
dtype: int64


In [77]:
print(synthetic_to_clean_df.shape)
print(synthetic_to_clean_df.isnull().sum())

(714, 12)
review_text          0
rating               0
has_photo            0
author_name          0
user_review_count    0
business_name        0
category             0
source               0
review_id            0
is_ad                0
is_rant              0
is_relevant          0
dtype: int64


In [78]:
# Save as JSON
output_json_path = os.path.join(labeled_input_folder, "to_clean_df.json")
to_clean_df.to_json(output_json_path, orient="records", lines=True, force_ascii=False)
print(f"JSON file saved to: {output_json_path}")

# Save as Parquet
output_parquet_path = os.path.join(labeled_input_folder, "to_clean_df.parquet")
to_clean_df.to_parquet(output_parquet_path, index=False)
print(f"Parquet file saved to: {output_parquet_path}")

JSON file saved to: ./data/labeled/to_clean_df.json
Parquet file saved to: ./data/labeled/to_clean_df.parquet


## Step 3: Preprocess DataFrames
Clean the review text, extract features, and perform sentiment analysis to enrich the dataset for modeling.

##### 2.1 Cleaning Functions

In [79]:
def normalize_whitespace(text):
    return re.sub(r'\s+', ' ', text).strip()

def clean_urls(text):
    url_pattern = re.compile(r'https?://[^\s]+')
    urls = url_pattern.findall(text)
    domains = [tldextract.extract(u).domain for u in urls]
    text_cleaned = url_pattern.sub(' '.join(domains), text)
    return text_cleaned

def clean_text(text):
    if pd.isna(text):
        return ""
    text = str(text)
    text = clean_urls(text)
    text = normalize_whitespace(text)
    return text

##### 2.2 Compute Basic Signals

In [80]:
def compute_basic_signals(text):
    url_count = len(re.findall(r'https?://\S+', text))
    phone_count = len(re.findall(r'\+?\d[\d\s-]{7,}\d', text))
    caps_ratio = sum(1 for c in text if c.isupper()) / max(len(text), 1)
    return url_count, phone_count, caps_ratio

##### 2.3 Sentiment Analysis

In [81]:
def add_textblob_sentiment(df, text_col="review_text", positive_threshold=0.9, negative_threshold=-0.9):
    def get_sentiment(text):
        if pd.isna(text) or not isinstance(text, str) or text.strip() == "":
            return 0.0, 0.0
        try:
            analysis = TextBlob(text)
            return analysis.sentiment.polarity, analysis.sentiment.subjectivity
        except Exception:
            return 0.0, 0.0

    sentiment_results = df[text_col].apply(get_sentiment)
    df["sentiment_polarity"], df["sentiment_subjectivity"] = zip(*sentiment_results)

    df["is_extreme_sentiment"] = df["sentiment_polarity"].apply(
        lambda x: 1 if x >= positive_threshold or x <= negative_threshold else 0
    )

    return df

##### Apply to Dataframe

In [82]:
def preprocess_reviews(df):
    df["clean_text"] = df["review_text"].apply(clean_text)
    signals = df["clean_text"].apply(compute_basic_signals)
    df["url_count"], df["phone_count"], df["caps_ratio"] = zip(*signals)
    df = add_textblob_sentiment(df)
    return df

cleaned_df = preprocess_reviews(to_clean_df)
cleaned_df.shape

(11920, 20)

In [83]:
cleaned_df.head()

Unnamed: 0,review_text,rating,has_photo,author_name,user_review_count,business_name,category,source,review_id,comprehensive_review,is_ad,is_relevant,is_rant,clean_text,url_count,phone_count,caps_ratio,sentiment_polarity,sentiment_subjectivity,is_extreme_sentiment
0,Love the convenience of this neighborhood carw...,4.0,False,Doug Schmidt,1.0,"Auto Spa Speedy Wash - Harvester, MO",['Car wash'],google,1001,"[Business] Auto Spa Speedy Wash - Harvester, M...",False,True,False,Love the convenience of this neighborhood carw...,0,0,0.02,0.5,0.6,0
1,"2 bathrooms (for a large 2 story building), 1 ...",2.0,False,Duf Duftopia,1.0,Kmart,"['Discount store', 'Appliance store', 'Baby st...",google,1002,[Business] Kmart | [Category] ['Discount store...,True,True,True,"2 bathrooms (for a large 2 story building), 1 ...",0,0,0.016949,-0.026339,0.353571,0
2,My favorite pizza shop hands down!,5.0,False,Andrew Phillips,1.0,Papa’s Pizza,"['Pizza restaurant', 'Chicken wings restaurant...",google,1003,[Business] Papa’s Pizza | [Category] ['Pizza r...,False,True,False,My favorite pizza shop hands down!,0,0,0.029412,0.152778,0.644444,0
3,BOTCHED INSTRUMENT REPAIR IS COSTING US HUNDRE...,1.0,False,Julie Heiland,1.0,The Music Place,['Musical instrument store'],google,1004,[Business] The Music Place | [Category] ['Musi...,False,True,True,BOTCHED INSTRUMENT REPAIR IS COSTING US HUNDRE...,0,0,0.042589,0.016936,0.550466,0
4,Very unprofessional!!!!!,1.0,False,Alan Khasanov,1.0,Park Motor Cars Inc,['Used car dealer'],google,1005,[Business] Park Motor Cars Inc | [Category] ['...,False,True,True,Very unprofessional!!!!!,0,0,0.041667,0.610352,0.3,0


In [84]:
synthetic_cleaned_df = preprocess_reviews(synthetic_to_clean_df)
synthetic_cleaned_df.shape

(714, 19)

In [85]:
synthetic_cleaned_df.head()

Unnamed: 0,review_text,rating,has_photo,author_name,user_review_count,business_name,category,source,review_id,is_ad,is_rant,is_relevant,clean_text,url_count,phone_count,caps_ratio,sentiment_polarity,sentiment_subjectivity,is_extreme_sentiment
0,Had a fantastic meal at The Gourmet Place! The...,5,True,Sophia Turner,23,The Gourmet Place,restaurant,Google,20615,True,False,True,Had a fantastic meal at The Gourmet Place! The...,0,0,0.045455,0.484722,0.684444,0
1,Great experience at FitLife Gym! The equipment...,4,False,Michael Brown,15,FitLife Gym,gym,Yelp,20616,True,False,True,Great experience at FitLife Gym! The equipment...,0,1,0.041958,0.68875,0.595,0
2,Staying at Oceanview Hotel was a dream! The vi...,5,True,Emily Davis,30,Oceanview Hotel,hotel,Google,20617,True,False,True,Staying at Oceanview Hotel was a dream! The vi...,0,0,0.041667,1.0,1.0,1
3,The Family Clinic is amazing! They really take...,4,False,Liam Wilson,10,Family Clinic,clinic,Yelp,20618,True,False,True,The Family Clinic is amazing! They really take...,0,0,0.037879,0.483333,0.533333,0
4,I love shopping at Trendy Mall! They have ever...,5,True,Olivia Johnson,28,Trendy Mall,shopping mall,Google,20619,True,False,True,I love shopping at Trendy Mall! They have ever...,0,0,0.053763,0.71875,0.75,0


## Step 4: Train-Test Split with Multi-Label Stratification
Split the cleaned data into training and testing sets using multi-label stratification to preserve label distribution.

In [87]:
meta_cols = ["url_count","phone_count","caps_ratio","rating","has_photo","user_review_count","sentiment_polarity","sentiment_subjectivity"]
label_cols = ["is_ad","is_relevant","is_rant"]

X = cleaned_df.drop(columns=label_cols)
y = cleaned_df[label_cols].values

mskf = MultilabelStratifiedKFold(n_splits=5, shuffle=True, random_state=42)
train_idx, test_idx = next(mskf.split(X, y))

train_df = cleaned_df.iloc[train_idx].reset_index(drop=True)
test_df = cleaned_df.iloc[test_idx].reset_index(drop=True)

In [88]:
train_df.shape

(9536, 20)

In [89]:
train_df = pd.concat([train_df, synthetic_cleaned_df], ignore_index=True)

In [90]:
train_df.shape

(10250, 20)

In [91]:
# Save train_df and test_df as CSV files
train_df.to_csv('data/labeled/train_df.csv', index=False)
test_df.to_csv('data/labeled/test_df.csv', index=False)
print('train_df and test_df saved to data/labeled/train_df.csv and data/labeled/test_df.csv')

train_df and test_df saved to data/labeled/train_df.csv and data/labeled/test_df.csv


## Step 5: Tokenisation and Dataset Preparation
Tokenize the review text and prepare PyTorch datasets and dataloaders for model training and evaluation.

In [92]:
tokenizer = AutoTokenizer.from_pretrained("distilbert-base-uncased")
max_len = 128

class ReviewDataset(IterableDataset):
    def __init__(self, df, tokenizer, max_len, meta_cols, label_cols):
        self.df = df
        self.tokenizer = tokenizer
        self.max_len = max_len
        self.meta_cols = meta_cols
        self.label_cols = label_cols

    def __iter__(self):
        for _, row in self.df.iterrows():
            enc = self.tokenizer(
                row["clean_text"],
                truncation=True,
                padding="max_length",
                max_length=self.max_len,
                return_tensors="pt"
            )
            meta = torch.tensor([row[c] for c in self.meta_cols], dtype=torch.float32)
            labels = torch.tensor([row[c] for c in self.label_cols], dtype=torch.float32)

            yield {
                "input_ids": enc["input_ids"].squeeze(0),
                "attention_mask": enc["attention_mask"].squeeze(0),
                "meta": meta,
                "labels": labels
            }

train_dataset = ReviewDataset(train_df, tokenizer, max_len, meta_cols, label_cols)
test_dataset = ReviewDataset(test_df, tokenizer, max_len, meta_cols, label_cols)

train_loader = DataLoader(train_dataset, batch_size=8)
test_loader = DataLoader(test_dataset, batch_size=8)


## Step 6: Multi-Output Classifier
Build and train a multi-output classifier using Random Forests to predict multiple labels for each review.

In [93]:
# run this to remove review_category_similarity whenever we need to rerun
if 'review_category_similarity' in train_df.columns and 'review_category_similarity' in test_df.columns:
    train_df = train_df.drop(columns=['review_category_similarity'])
    test_df = test_df.drop(columns=['review_category_similarity'])

In [94]:
meta_cols = ["url_count","phone_count","caps_ratio","rating","has_photo","user_review_count","sentiment_polarity","sentiment_subjectivity","review_category_similarity"]

In [None]:
#! pip install sentence-transformers

from sentence_transformers import SentenceTransformer
from sklearn.metrics.pairwise import cosine_similarity
import numpy as np

# Load the MiniLM model
embedder = SentenceTransformer('sentence-transformers/all-MiniLM-L6-v2')

# Ensure category is a string (if it's a list, join it)
train_df['category'] = train_df['category'].apply(lambda x: ' '.join(x) if isinstance(x, list) else str(x))
test_df['category'] = test_df['category'].apply(lambda x: ' '.join(x) if isinstance(x, list) else str(x))

# Compute embeddings for all clean_text and category in both train and test
train_text_emb = embedder.encode(train_df['clean_text'].tolist(), convert_to_numpy=True, show_progress_bar=True)
train_cat_emb = embedder.encode(train_df['category'].tolist(), convert_to_numpy=True, show_progress_bar=True)
test_text_emb = embedder.encode(test_df['clean_text'].tolist(), convert_to_numpy=True, show_progress_bar=True)
test_cat_emb = embedder.encode(test_df['category'].tolist(), convert_to_numpy=True, show_progress_bar=True)

# Compute cosine similarity for each row
train_df['review_category_similarity'] = np.array([cosine_similarity([a], [b])[0,0] for a, b in zip(train_text_emb, train_cat_emb)])
test_df['review_category_similarity'] = np.array([cosine_similarity([a], [b])[0,0] for a, b in zip(test_text_emb, test_cat_emb)])

Batches:   0%|          | 0/321 [00:00<?, ?it/s]

Batches:   0%|          | 0/321 [00:00<?, ?it/s]

Batches:   0%|          | 0/75 [00:00<?, ?it/s]

Batches:   0%|          | 0/75 [00:00<?, ?it/s]

In [96]:
train_df[1:10]

Unnamed: 0,review_text,rating,has_photo,author_name,user_review_count,business_name,category,source,review_id,comprehensive_review,...,is_relevant,is_rant,clean_text,url_count,phone_count,caps_ratio,sentiment_polarity,sentiment_subjectivity,is_extreme_sentiment,review_category_similarity
1,"2 bathrooms (for a large 2 story building), 1 ...",2.0,False,Duf Duftopia,1.0,Kmart,"['Discount store', 'Appliance store', 'Baby st...",google,1002,[Business] Kmart | [Category] ['Discount store...,...,True,True,"2 bathrooms (for a large 2 story building), 1 ...",0,0,0.016949,-0.026339,0.353571,0,0.461022
2,My favorite pizza shop hands down!,5.0,False,Andrew Phillips,1.0,Papa’s Pizza,"['Pizza restaurant', 'Chicken wings restaurant...",google,1003,[Business] Papa’s Pizza | [Category] ['Pizza r...,...,True,False,My favorite pizza shop hands down!,0,0,0.029412,0.152778,0.644444,0,0.615682
3,BOTCHED INSTRUMENT REPAIR IS COSTING US HUNDRE...,1.0,False,Julie Heiland,1.0,The Music Place,['Musical instrument store'],google,1004,[Business] The Music Place | [Category] ['Musi...,...,True,True,BOTCHED INSTRUMENT REPAIR IS COSTING US HUNDRE...,0,0,0.042589,0.016936,0.550466,0,0.343129
4,Very unprofessional!!!!!,1.0,False,Alan Khasanov,1.0,Park Motor Cars Inc,['Used car dealer'],google,1005,[Business] Park Motor Cars Inc | [Category] ['...,...,True,True,Very unprofessional!!!!!,0,0,0.041667,0.610352,0.3,0,0.064715
5,good Donuts!,4.0,False,Mar De Mundo,1.0,Dunkin',"['Coffee shop', 'Bagel shop', 'Bakery', 'Break...",google,1006,[Business] Dunkin' | [Category] ['Coffee shop'...,...,True,False,good Donuts!,0,0,0.083333,0.875,0.6,0,0.228043
6,(Translated by Google) The food is very tasty....,5.0,False,Felix Toledo,1.0,Fire Pit Barbecue,"['Barbecue restaurant', 'Portuguese restaurant']",google,1008,[Business] Fire Pit Barbecue | [Category] ['Ba...,...,True,False,(Translated by Google) The food is very tasty....,0,0,0.064103,0.2875,0.525,0,0.324582
7,It's a wegmans. If you've been to one you've b...,4.0,False,Francis C,1.0,Wegmans,"['Grocery store', 'Supermarket']",google,1009,[Business] Wegmans | [Category] ['Grocery stor...,...,True,False,It's a wegmans. If you've been to one you've b...,0,0,0.033333,0.5,0.6,0,0.302135
8,Ok,5.0,False,Michael Dicellis,1.0,Starbucks,"['Coffee shop', 'Breakfast restaurant', 'Cafe'...",google,1012,[Business] Starbucks | [Category] ['Coffee sho...,...,True,False,Ok,0,0,0.5,0.5,0.5,0,0.066131
9,"Nice easy place to hike for the kids, lots of ...",5.0,True,Glenn Gaerlan,1.0,Great Swamp National Wildlife Refuge,"['Wildlife refuge', 'Tourist attraction']",google,1013,[Business] Great Swamp National Wildlife Refug...,...,True,False,"Nice easy place to hike for the kids, lots of ...",0,0,0.020619,0.511111,0.777778,0,0.564347


In [97]:
test_df.head()

Unnamed: 0,review_text,rating,has_photo,author_name,user_review_count,business_name,category,source,review_id,comprehensive_review,...,is_relevant,is_rant,clean_text,url_count,phone_count,caps_ratio,sentiment_polarity,sentiment_subjectivity,is_extreme_sentiment,review_category_similarity
0,Excellent fresh seafood. You can get a delicio...,5.0,False,Marti Helmick,1.0,Anthony's Seafood,['Seafood restaurant'],google,1007,[Business] Anthony's Seafood | [Category] ['Se...,...,True,False,Excellent fresh seafood. You can get a delicio...,0,0,0.017007,0.261111,0.538889,0,0.531162
1,High-end mall,5.0,False,Tomer Laufman,1.0,Menlo Park Mall,['Shopping mall'],google,1010,[Business] Menlo Park Mall | [Category] ['Shop...,...,True,False,High-end mall,0,0,0.076923,0.0,0.0,0,0.74091
2,Wonderful ambiance and food. You can't beat t...,5.0,False,George Levites,1.0,Pantagis Renaissance,['Wedding venue'],google,1011,[Business] Pantagis Renaissance | [Category] [...,...,True,False,Wonderful ambiance and food. You can't beat th...,0,0,0.034483,1.0,1.0,1,0.112861
3,My wife and I decided to give this place a try...,3.0,True,Samuel Thomas,1.0,Cara Mia,"['Italian restaurant', 'Restaurant']",google,1017,[Business] Cara Mia | [Category] ['Italian res...,...,True,False,My wife and I decided to give this place a try...,0,0,0.020666,0.073144,0.587454,0,0.38059
4,Very good and professional services. Very nice...,5.0,False,Asem Shamah,1.0,Ramsey Subaru Service Center,"['Auto repair shop', 'Auto air conditioning se...",google,1026,[Business] Ramsey Subaru Service Center | [Cat...,...,True,False,Very good and professional services. Very nice...,0,0,0.028986,0.5225,0.6575,0,0.252677


In [98]:
print(meta_cols)

['url_count', 'phone_count', 'caps_ratio', 'rating', 'has_photo', 'user_review_count', 'sentiment_polarity', 'sentiment_subjectivity', 'review_category_similarity']


In [None]:
from sklearn.ensemble import RandomForestClassifier
from sklearn.multioutput import MultiOutputClassifier
from sklearn.metrics import accuracy_score, f1_score
from sklearn.feature_extraction.text import TfidfVectorizer
import numpy as np

train_text_emb = train_text_emb.astype(np.float32)
test_text_emb = test_text_emb.astype(np.float32)

X_train_meta = train_df[meta_cols].fillna(0).astype(np.float32)
X_test_meta  = test_df[meta_cols].fillna(0).astype(np.float32)

X_train_combined = np.hstack([X_train_meta, train_text_emb])
X_test_combined  = np.hstack([X_test_meta, test_text_emb])

y_train = train_df[label_cols]
y_train_int = y_train.astype(int)
y_test = test_df[label_cols]
y_test_int = y_test.astype(int)

In [None]:
rf_classifier = MultiOutputClassifier(RandomForestClassifier(n_estimators=100, random_state=42))
rf_classifier.fit(X_train_combined, y_train_int)

y_pred = rf_classifier.predict(X_test_combined)

In [None]:
from sklearn.metrics import precision_score, recall_score
for i, label in enumerate(label_cols):
    f1 = f1_score(y_test_int.iloc[:, i], y_pred[:, i])
    prec = precision_score(y_test_int.iloc[:, i], y_pred[:, i], zero_division=0)
    rec = recall_score(y_test_int.iloc[:, i], y_pred[:, i], zero_division=0)
    print(f"{label} F1-score: {f1:.3f}, Precision: {prec:.3f}, Recall: {rec:.3f}")

is_ad F1-score: 0.000, Precision: 0.000, Recall: 0.000
is_relevant F1-score: 0.983, Precision: 0.967, Recall: 1.000
is_rant F1-score: 0.208, Precision: 0.605, Recall: 0.126


In [None]:
import pickle

filename = 'random_forest_model.pkl'
pickle.dump(rf_classifier, open(filename, 'wb'))
print(f"Model saved as {filename}")
print(rf_classifier.get_params())

Model saved as random_forest_model.pkl
{'estimator__bootstrap': True, 'estimator__ccp_alpha': 0.0, 'estimator__class_weight': None, 'estimator__criterion': 'gini', 'estimator__max_depth': None, 'estimator__max_features': 'sqrt', 'estimator__max_leaf_nodes': None, 'estimator__max_samples': None, 'estimator__min_impurity_decrease': 0.0, 'estimator__min_samples_leaf': 1, 'estimator__min_samples_split': 2, 'estimator__min_weight_fraction_leaf': 0.0, 'estimator__monotonic_cst': None, 'estimator__n_estimators': 100, 'estimator__n_jobs': None, 'estimator__oob_score': False, 'estimator__random_state': 42, 'estimator__verbose': 0, 'estimator__warm_start': False, 'estimator': RandomForestClassifier(random_state=42), 'n_jobs': None}


## Step 7: Hyperparameter Tuning with RandomizedSearchCV
Use RandomizedSearchCV to tune hyperparameters of the multi-output Random Forest classifier for improved performance.

In [117]:
from sklearn.model_selection import RandomizedSearchCV, KFold
from sklearn.metrics import make_scorer, f1_score

param_dist = {
    'estimator__n_estimators': np.arange(100, 1001, 100),
    'estimator__max_depth': [None] + list(np.arange(5, 51, 5)),
    'estimator__min_samples_split': [2, 5, 10],
    'estimator__min_samples_leaf': [1, 2, 4, 6],
    'estimator__max_features': ['sqrt', 'log2']
}

# Make F1 scorer for multilabel
f1_macro = make_scorer(f1_score, average='macro', zero_division=0)

# Multi-output random forest
rf_multi = MultiOutputClassifier(RandomForestClassifier(random_state=42))

# Randomized search
random_search = RandomizedSearchCV(
    estimator=rf_multi,
    param_distributions=param_dist,
    n_iter=50,
    cv=KFold(n_splits=5, shuffle=True, random_state=42),
    n_jobs=-1,
    random_state=42,
    scoring=f1_macro
)

In [116]:
import numpy as np
print(np.unique(y_train_int, return_counts=True))

(array([0, 1]), array([19672, 11078]))


In [None]:
random_search.fit(X_train_combined, y_train_int)

In [None]:
print("Best Parameters:", random_search.best_params_)
print("Best Score:", random_search.best_score_)

In [None]:
best_model = random_search.best_estimator_
best_model.fit(X_train_combined, y_train)
y_pred = best_model.predict(X_test_combined)

In [None]:
import pickle

filename = 'finetuned_random_forest_model.pkl'
pickle.dump(best_model, open(filename, 'wb'))
print(f"Model saved as {filename}")
print(best_model.get_params())

In [None]:
from sklearn.metrics import precision_score, recall_score

for i, label in enumerate(label_cols):
    f1 = f1_score(y_test.iloc[:, i], y_pred[:, i])
    prec = precision_score(y_test.iloc[:, i], y_pred[:, i])
    rec = recall_score(y_test.iloc[:, i], y_pred[:, i])
    print(f"{label} F1-score: {f1:.3f}, Precision: {prec:.3f}, Recall: {rec:.3f}")

## Step 8: Model Evaluation
Evaluate the best model's performance on the test set using F1-score, precision, and recall for each label.

In [None]:
# Save y_test and y_pred as separate CSV files
import pandas as pd
y_test_reset = y_test.reset_index(drop=True)
y_pred_df = pd.DataFrame(y_pred, columns=label_cols)
y_test_reset.to_csv('data/labeled/y_test.csv')
y_pred_df.to_csv('data/labeled/y_pred.csv')
print('y_test and y_pred saved to data/labeled/y_test.csv and data/labeled/y_pred.csv')

## Step 9: Save Predictions
Save the true and predicted labels to CSV files for further analysis and error inspection.

In [None]:
# Print rows where y_pred does not match y_test (wrong predictions for any label)
import numpy as np
mismatch_mask = (y_pred_df.values != y_test_reset.values).any(axis=1)
wrong_preds = y_test_reset.copy()
for col in label_cols:
    wrong_preds[col + '_pred'] = y_pred_df[col]
wrong_rows = wrong_preds[mismatch_mask]
print(f"Number of wrong predictions: {wrong_rows.shape[0]}")
wrong_rows.head()