In [2]:
import os

import numpy as np
import pandas as pd
from sklearn.model_selection import train_test_split
from transformers import AutoTokenizer, TFAutoModel

In [3]:
# -----------------------------
# Load data
# -----------------------------
# The CSV location can be overridden with the SCAM_JOB_DATA_PATH environment
# variable so the notebook can run on machines other than the one this path
# was written for. When the variable is unset, the original path is used.
data_path = os.environ.get(
    "SCAM_JOB_DATA_PATH",
    '/Users/jonathankipping/code/syeda-tabassum-rahaman/scam-job-detector/raw_data/data_cleaned.csv',
)
df = pd.read_csv(data_path)

# Features are every column except the target; the target is "fraudulent".
X = df.drop(columns=["fraudulent"])
y = df["fraudulent"]

# -----------------------------
# Combine text columns
# -----------------------------
text_columns = [
    "title",
    "company_profile",
    "description",
    "requirements",
    "benefits"
]

# Missing values become empty strings and every cell is cast to str before
# joining: " ".join raises TypeError on non-string items, which would happen
# if pandas parsed one of these columns as numeric.
combined_text = (
    X[text_columns]
    .fillna("")
    .astype(str)
    .agg(" ".join, axis=1)
    .to_list()
)

# -----------------------------
# Train-test split
# -----------------------------
# 80/20 split with a fixed seed; stratify=y is passed so the split is
# stratified on the target labels.
X_train, X_test, y_train, y_test = train_test_split(
    combined_text, y,
    test_size=0.2,
    random_state=42,
    stratify=y
)

# -----------------------------
# Tokenizer
# -----------------------------
# A single checkpoint name and a single set of tokenizer settings are shared
# by the train encoding, the test encoding and the model load below, so the
# three cannot drift apart when one of them is edited.
MODEL_NAME = "prajjwal1/bert-tiny"
TOKENIZER_KWARGS = {
    "padding": "max_length",   # pad every sequence to max_length
    "truncation": True,        # cut sequences longer than max_length
    "max_length": 256,
    "return_tensors": "np",    # NumPy arrays, sliceable for batching
}

tokenizer = AutoTokenizer.from_pretrained(MODEL_NAME)

train_tokens = tokenizer(X_train, **TOKENIZER_KWARGS)
test_tokens = tokenizer(X_test, **TOKENIZER_KWARGS)

# -----------------------------
# Load Tiny BERT (base model, not classifier)
# -----------------------------
# from_pt=True: build the TensorFlow model from the checkpoint's PyTorch weights.
model = TFAutoModel.from_pretrained(
    MODEL_NAME,
    from_pt=True
)

# -----------------------------
# Feature Extraction
# -----------------------------
def get_bert_features(tokens, batch_size=64, bert_model=None):
    """Extract the [CLS] embedding of every tokenized sequence.

    The sequences are pushed through the encoder in mini-batches rather than
    in one forward pass, so peak memory is bounded by ``batch_size`` instead
    of growing with the size of the whole split.

    Parameters
    ----------
    tokens : mapping
        Tokenizer output providing ``"input_ids"`` and ``"attention_mask"``.
        Both must support ``len()`` and slicing along the first axis.
    batch_size : int, default 64
        Number of sequences per forward pass. Must be >= 1.
    bert_model : callable, optional
        Encoder to use. Defaults to the module-level ``model``. It is called
        with ``input_ids=`` and ``attention_mask=`` keyword arguments and must
        return an object exposing ``last_hidden_state``.

    Returns
    -------
    numpy.ndarray
        2-D array with one row per input sequence, holding the last hidden
        state at position 0 (the CLS token).

    Raises
    ------
    ValueError
        If ``batch_size`` is smaller than 1.
    """
    if batch_size < 1:
        raise ValueError(
            f"batch_size must be a positive integer, got {batch_size!r}"
        )

    encoder = model if bert_model is None else bert_model
    input_ids = tokens["input_ids"]
    attention_mask = tokens["attention_mask"]
    n_samples = len(input_ids)

    cls_chunks = []
    # max(n_samples, 1) guarantees one forward pass even for empty input, so
    # that case is delegated to the encoder exactly as a single un-batched
    # call would do.
    for start in range(0, max(n_samples, 1), batch_size):
        stop = start + batch_size
        outputs = encoder(
            input_ids=input_ids[start:stop],
            attention_mask=attention_mask[start:stop],
        )

        # CLS token -> outputs.last_hidden_state[:, 0, :]
        cls_chunks.append(outputs.last_hidden_state[:, 0, :].numpy())

    return np.concatenate(cls_chunks, axis=0)

# Embed both splits with the feature extractor defined in this cell.
X_train_features = get_bert_features(tokens=train_tokens)
X_test_features = get_bert_features(tokens=test_tokens)

# Sanity check: show the shape of each embedding matrix.
for _caption, _embeddings in (
    ("Train embeddings shape:", X_train_features),
    ("Test embeddings shape:", X_test_features),
):
    print(_caption, _embeddings.shape)


TensorFlow and JAX classes are deprecated and will be removed in Transformers v5. We recommend migrating to PyTorch classes or pinning your version of Transformers.
Some weights of the PyTorch model were not used when initializing the TF 2.0 model TFBertModel: ['cls.predictions.transform.dense.weight', 'bert.embeddings.position_ids', 'cls.predictions.transform.dense.bias', 'cls.predictions.transform.LayerNorm.bias', 'cls.predictions.decoder.bias', 'cls.seq_relationship.bias', 'cls.predictions.bias', 'cls.seq_relationship.weight', 'cls.predictions.transform.LayerNorm.weight', 'cls.predictions.decoder.weight']
- This IS expected if you are initializing TFBertModel from a PyTorch model trained on another task or with another architecture (e.g. initializing a TFBertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing TFBertModel from a PyTorch model that you expect to be exactly identical (e.g. initializing a TFBertForSequenceClas

: 

In [9]:
# Score the held-out predictions first, then render the report in one print.
_recall = recall_score(y_test, y_pred)
_precision = precision_score(y_test, y_pred)
_balanced_acc = balanced_accuracy_score(y_test, y_pred)
_f1 = f1_score(y_test, y_pred)

print(f"""
Model Performance
Recall:             {_recall:.4f}
Precision:          {_precision:.4f}
Balanced Accuracy:  {_balanced_acc:.4f}
F1 Score:           {_f1:.4f}
""")


Model Performance
Recall:             0.7977
Precision:          0.9517
Balanced Accuracy:  0.8978
F1 Score:           0.8679

