In [25]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here are several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
# Print the full path of every file found under the Kaggle input directory.
for folder, _, names in os.walk('/kaggle/input'):
    for name in names:
        print(os.path.join(folder, name))

# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

/kaggle/input/masked-email/masked_emails (1).csv
/kaggle/input/email-type/Email-type.csv


In [None]:
# 1) Install & download spaCy models
# Install required libraries
!pip install datasets presidio-analyzer presidio-anonymizer accelerate==0.27.2
!pip install transformers==4.39.3 peft==0.10.0
!pip install -q evaluate


!python -m spacy download en_core_web_lg --quiet
!python -m spacy download de_core_news_md --quiet
!python -m spacy download es_core_news_md --quiet
!python -m spacy download fr_core_news_md --quiet
!python -m spacy download pt_core_news_md --quiet

In [None]:
import pandas as pd
import re
from presidio_analyzer import (
    RecognizerRegistry,
    AnalyzerEngine,
    PatternRecognizer,
    Pattern,
)
from presidio_analyzer.predefined_recognizers import (
    EmailRecognizer,
    PhoneRecognizer,
    DateRecognizer,
    CreditCardRecognizer,
    SpacyRecognizer,
)
from presidio_analyzer.context_aware_enhancers import LemmaContextAwareEnhancer
from presidio_anonymizer import AnonymizerEngine
from presidio_anonymizer.entities import OperatorConfig
from presidio_analyzer.nlp_engine import NlpEngineProvider



def clean_text(text):
    """Normalise raw email text for downstream processing.

    Steps, in order: strip HTML-style tags, drop every character that is not
    a letter, digit, whitespace or one of ``. , ! ?``, collapse whitespace
    runs to a single space, trim, and lowercase.

    Letters are matched Unicode-aware, so accented characters used by the
    supported languages (e.g. "ß", "é", "ñ") are preserved instead of being
    silently deleted.

    Args:
        text: The raw text, or ``None``.

    Returns:
        The cleaned, lowercased string; ``""`` when ``text`` is ``None``.
    """
    if text is None:
        return ""

    without_tags = re.sub(r"<[^>]+>", "", text)

    # For str patterns `\w` covers Unicode letters and digits. It also matches
    # "_", which was never kept, so the underscore is removed explicitly.
    without_symbols = re.sub(r"[^\w\s.,!?]|_", "", without_tags)

    collapsed = re.sub(r"\s+", " ", without_symbols).strip()
    return collapsed.lower()



# Presidio NLP engine configuration: one spaCy pipeline per language code.
nlp_conf = dict(
    nlp_engine_name="spacy",
    models=[
        dict(lang_code=lang_code, model_name=spacy_model)
        for lang_code, spacy_model in {
            "en": "en_core_web_lg",
            "de": "de_core_news_md",
            "es": "es_core_news_md",
            "fr": "fr_core_news_md",
            "pt": "pt_core_news_md",
            "nl": "nl_core_news_md",
            "it": "it_core_news_md",
        }.items()
    ],
)
# Build the engine from the configuration above.
nlp_engine = NlpEngineProvider(nlp_configuration=nlp_conf).create_engine()

# Registry that collects every recognizer added further down in this cell.
registry = RecognizerRegistry()

# Languages the registry is declared to support (same codes as in nlp_conf).
registry.supported_languages = ["en", "de", "es", "fr", "pt", "nl", "it"]

# Context enhancer later handed to the AnalyzerEngine.
# NOTE(review): parameter meanings below follow their names — confirm against
# the Presidio documentation before tuning.
enhancer = LemmaContextAwareEnhancer(
    context_similarity_factor=0.35,  # presumably the score boost when a context word is nearby
    min_score_with_context_similarity=0.4,  # presumably the floor for a context-boosted score
    context_prefix_count=3,  # presumably tokens inspected before a match
    context_suffix_count=3,  # presumably tokens inspected after a match
)

# Context keywords per language code, grouped by the kind of recognizer they
# are passed to ("person", "email", "phone", "date", "card").
# NOTE(review): several entries are multi-word phrases (e.g. "my name is").
# If the context enhancer compares keywords against single tokens, such
# phrases can never match — confirm against the Presidio documentation.
language_contexts = {
    "en": {
        "person": ["name", "called", "i am", "my name is"],
        "email": ["email", "e-mail", "mail address"],
        "phone": ["phone", "tel", "mobile number"],
        "date": ["born", "dob", "birthdate", "birth date"],
        "card": ["card", "credit", "debit", "cvv", "expiry"],
    },
    "de": {
        "person": ["name", "heiße", "ich bin", "mein name ist", "herr", "frau"],
        "email": ["e-mail", "mail", "email-adresse"],
        "phone": ["telefon", "handy", "telefonnummer"],
        "date": ["geburtsdatum", "geburtstag", "datum"],
        "card": ["karte", "kreditkarte", "debitkarte", "ablaufdatum", "cvv"],
    },
    "es": {
        "person": ["nombre", "me llamo", "soy", "mi nombre es", "señor", "señora"],
        "email": ["correo", "electrónico", "correo electrónico"],
        "phone": ["teléfono", "móvil", "número de teléfono"],
        "date": ["nacimiento", "fecha de nacimiento"],
        "card": ["tarjeta", "crédito", "débito", "cvv", "fecha de vencimiento"],
    },
    "fr": {
        "person": [
            "nom",
            "je suis",
            "je m’appelle",
            "mon nom est",
            "monsieur",
            "madame",
        ],
        "email": ["courriel", "email", "adresse électronique"],
        "phone": ["téléphone", "portable", "numéro de téléphone"],
        "date": ["naissance", "date de naissance"],
        "card": ["carte", "crédit", "débit", "cvv", "date d’expiration"],
    },
    "pt": {
        "person": ["nome", "me chamo", "sou", "meu nome é"],
        "email": ["email", "correio", "endereço de email"],
        "phone": ["telefone", "celular", "número de telefone"],
        "date": ["nascimento", "data de nascimento"],
        "card": ["cartão", "crédito", "débito", "cvv", "validade"],
    },
    "it": {
        "person": ["nome", "mi chiamo", "sono", "il mio nome è", "signor", "signora"],
        "email": ["email", "indirizzo email", "posta elettronica"],
        "phone": ["telefono", "numero di telefono", "cellulare"],
        "date": ["nascita", "data di nascita", "compleanno"],
        "card": [
            "carta",
            "carta di credito",
            "carta di debito",
            "numero carta",
            "cvv",
            "scadenza",
        ],
    },
    "nl": {
        "person": [
            "naam",
            "ik ben",
            "mijn naam is",
            "voornaam",
            "achternaam",
            "dhr",
            "mevrouw",
        ],
        "email": ["e-mail", "e-mailadres", "emailadres"],
        "phone": ["telefoon", "telefoonnummer", "mobiel nummer", "mobiele telefoon"],
        "date": ["geboortedatum", "verjaardag", "datum van geboorte"],
        "card": [
            "kaart",
            "creditcard",
            "debetkaart",
            "pinpas",
            "bankkaart",
            "kaartnummer",
            "cvv",
            "vervaldatum",
        ],
    },
}

# For every language, register the same five recognizers, each primed with
# that language's context keywords.
for lang, ctx in language_contexts.items():
    # SpacyRecognizer restricted to the PERSON entity.
    registry.add_recognizer(
        SpacyRecognizer(
            supported_language=lang,
            supported_entities=["PERSON"],
            context=ctx["person"],
        )
    )
    # The remaining recognizers share one constructor shape; the tuple order
    # fixes the registration order (email, phone, date, card).
    for recognizer_cls, ctx_key in (
        (EmailRecognizer, "email"),
        (PhoneRecognizer, "phone"),
        (DateRecognizer, "date"),
        (CreditCardRecognizer, "card"),
    ):
        registry.add_recognizer(
            recognizer_cls(supported_language=lang, context=ctx[ctx_key])
        )


# Custom regex-based recognizers for entities without a recognizer of their own here.

# Aadhaar number: three groups of four digits, optionally separated by a
# space or hyphen. The official spelling "aadhaar" is listed too because
# "aadhar" is not a substring of it.
registry.add_recognizer(
    PatternRecognizer(
        supported_entity="IN_AADHAAR",
        patterns=[Pattern("aadhar", r"\b\d{4}[\s-]?\d{4}[\s-]?\d{4}\b", 0.8)],
        context=["aadhar", "aadhaar", "uidai"],
    )
)

# Card security code. The regex matches ANY 3-4 digit number (years, amounts,
# ticket ids), so the base score is deliberately kept below the analyzer's 0.3
# threshold: a bare number is discarded, and only a match boosted by a nearby
# context keyword is reported.
# NOTE(review): "security" is added as a single-token keyword in case
# multi-word keywords such as "security code" are never matched — confirm
# against the Presidio context-enhancement documentation.
registry.add_recognizer(
    PatternRecognizer(
        supported_entity="CVV_NO",
        patterns=[Pattern("cvv", r"\b\d{3,4}\b", 0.1)],
        context=["cvv", "cvc", "security", "security code"],
    )
)

# Card expiry date written as MM/YY or MM/YYYY (month 01-12).
registry.add_recognizer(
    PatternRecognizer(
        supported_entity="EXPIRY_NO",
        patterns=[Pattern("expiry", r"\b(0[1-9]|1[0-2])/(?:\d{2}|\d{4})\b", 0.7)],
        context=["expiry", "valid thru", "valide"],
    )
)


# Analyzer wired to the custom registry, the spaCy NLP engine and the context
# enhancer; detections scoring below 0.3 are dropped by default.
analyzer = AnalyzerEngine(
    nlp_engine=nlp_engine,
    registry=registry,
    context_aware_enhancer=enhancer,
    supported_languages=["en", "de", "es", "fr", "pt", "nl", "it"],
    default_score_threshold=0.3,
)
# Engine that rewrites the text from the analyzer's detections.
anonymizer = AnonymizerEngine()


# Presidio entity type -> placeholder token written into the masked text.
pres_map = dict(
    PERSON="[full_name]",
    EMAIL_ADDRESS="[email]",
    PHONE_NUMBER="[phone_number]",
    DATE_TIME="[dob]",
    IN_AADHAAR="[aadhar_num]",
    CREDIT_CARD="[credit_debit_no]",
    CVV_NO="[cvv_no]",
    EXPIRY_NO="[expiry_no]",
)



def merge_spans(spans):
    """Resolve overlapping detections, keeping one span per overlapping group.

    Spans are processed in order of ``start``. When a span overlaps the one
    most recently kept, the longer of the two wins; for equal lengths the one
    with the higher ``score`` wins (a missing score counts as 0).

    Args:
        spans: Iterable of objects exposing ``start`` and ``end`` (exclusive)
            and, optionally, ``score``.

    Returns:
        A list of the surviving spans, ordered by ``start``, in which no two
        spans overlap.
    """

    def _rank(span):
        # Longer spans win; score breaks ties between equally long spans.
        return (span.end - span.start, getattr(span, "score", None) or 0.0)

    merged = []
    for span in sorted(spans, key=lambda s: s.start):
        # `end` is exclusive (spans are sliced as text[start:end]), so a span
        # starting exactly where the previous one ends is adjacent, not
        # overlapping, and must be kept.
        if merged and span.start < merged[-1].end:
            if _rank(span) > _rank(merged[-1]):
                merged[-1] = span
        else:
            merged.append(span)
    return merged



def mask_pii(text: str, languages=None):
    """Detect PII in ``text`` and replace each detection with a placeholder.

    The text is analysed once per language, the detections are pooled and
    de-overlapped with ``merge_spans``, and the anonymizer replaces each
    surviving span with the token that ``pres_map`` assigns to its entity type.

    Args:
        text: The text to mask.
        languages: Language codes to analyse with. Defaults to every language
            configured in this notebook.

    Returns:
        A ``(masked_text, entities)`` tuple. ``entities`` holds one dict per
        surviving span with the keys ``"position"`` (``[start, end]`` offsets
        into ``text``), ``"classification"`` (the placeholder without its
        brackets) and ``"entity"`` (the original substring). Empty or ``None``
        input yields ``("", [])``.
    """
    # Nothing to analyse; skip the per-language NLP passes.
    if not text:
        return "", []

    if languages is None:
        languages = ["en", "de", "es", "fr", "pt", "nl", "it"]

    wanted_entities = list(pres_map.keys())
    detections = []
    for lang in languages:
        detections.extend(
            analyzer.analyze(
                text=text,
                language=lang,
                entities=wanted_entities,
                score_threshold=0.3,
            )
        )

    # The same PII is usually found by several languages; keep one span each.
    spans = merge_spans(detections)

    operators = {
        ent: OperatorConfig("replace", {"new_value": tok})
        for ent, tok in pres_map.items()
    }
    result = anonymizer.anonymize(
        text=text,
        analyzer_results=spans,
        operators=operators,
    )

    # Built after anonymisation, as before, so the reported spans are read at
    # the same point in the flow.
    entities = [
        {
            "position": [s.start, s.end],
            "classification": pres_map[s.entity_type].strip("[]"),
            "entity": text[s.start : s.end],
        }
        for s in spans
    ]

    return result.text, entities


# Model Training

In [None]:
from datasets import load_dataset
from transformers import AutoTokenizer, AutoModelForSequenceClassification
from transformers import TrainingArguments, Trainer
import numpy as np
import evaluate

# 1. Load the original (unmasked) emails used for training.
# NOTE(review): training uses raw emails, while classify_email feeds the
# classifier PII-masked text. Consider training on masked text so both match.
ds = load_dataset(
    "csv",
    data_files="/kaggle/input/email-type/Email-type.csv",
)["train"]

# 2. Label encoding: ids follow the order in which labels appear in the data.
labels = ds.unique("type")
id2label = dict(enumerate(labels))
label2id = {name: idx for idx, name in id2label.items()}

# 3. Tokenizer + classification head sized to the number of labels.
model_name = "xlm-roberta-base"
tokenizer = AutoTokenizer.from_pretrained(model_name)
model = AutoModelForSequenceClassification.from_pretrained(
    model_name,
    num_labels=len(labels),
    label2id=label2id,
    id2label=id2label,
)

# 4. Preprocess
def preprocess(batch):
    """Tokenise a batch of emails and attach their integer class ids.

    Reads the ``"email"`` and ``"type"`` columns of ``batch``. Every email is
    truncated or padded to exactly 256 tokens.

    Returns:
        The tokenizer output with an added ``"labels"`` entry.
    """
    encoded = tokenizer(
        batch["email"],
        max_length=256,
        padding="max_length",
        truncation=True,
    )
    encoded["labels"] = list(map(label2id.__getitem__, batch["type"]))
    return encoded

# Tokenise the whole dataset in batches.
tok_ds = ds.map(function=preprocess, batched=True)

# 5. Hold out 20% for evaluation; the seed keeps the split reproducible.
split = tok_ds.train_test_split(test_size=0.2, seed=42)
metric = evaluate.load("accuracy")

def compute_metrics(p):
    """Compute accuracy for a Trainer evaluation pass.

    Args:
        p: Object exposing ``predictions`` (per-class scores) and
            ``label_ids`` (reference class ids).

    Returns:
        The result of ``metric.compute`` for the arg-max predictions.
    """
    predicted_ids = np.argmax(p.predictions, axis=1)
    scores = metric.compute(references=p.label_ids, predictions=predicted_ids)
    return scores


In [None]:
from transformers import TrainingArguments, Trainer
from transformers.utils import logging as hf_logging


# Show info-level library logs and progress bars in the notebook output.
hf_logging.set_verbosity_info()
hf_logging.enable_progress_bar()


# Hyper-parameters and bookkeeping options for the Trainer.
training_args = TrainingArguments(
    output_dir="./results",  # checkpoints are written here
    num_train_epochs=3,
    per_device_train_batch_size=8,
    gradient_accumulation_steps=2,  # two accumulation steps per optimizer update
    learning_rate=2e-5,
    evaluation_strategy="steps",    
    eval_steps=200,                 # evaluate every 200 steps
    logging_strategy="steps",       
    logging_steps=100,              # log the training loss every 100 steps
    save_strategy="epoch",  # one checkpoint per epoch
    report_to="none",               # no external experiment tracker
    disable_tqdm=False              # keep progress bars enabled
)

PyTorch: setting up devices


In [None]:
# Fine-tune on the training split, evaluating on the held-out split with
# the accuracy metric.
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=split["train"],
    eval_dataset=split["test"],
    compute_metrics=compute_metrics
)
trainer.train()  
# Persist the fine-tuned model to the "email_classifier" directory.
trainer.save_model("email_classifier")


The following columns in the training set don't have a corresponding argument in `XLMRobertaForSequenceClassification.forward` and have been ignored: type, email. If type, email are not expected by `XLMRobertaForSequenceClassification.forward`,  you can safely ignore this message.
***** Running training *****
  Num examples = 19,200
  Num Epochs = 3
  Instantaneous batch size per device = 8
  Training with DataParallel so batch size has been adjusted to: 16
  Total train batch size (w. parallel, distributed & accumulation) = 32
  Gradient Accumulation steps = 2
  Total optimization steps = 1,800
  Number of trainable parameters = 278,046,724


Step,Training Loss,Validation Loss,Accuracy
200,0.602,0.537806,0.748125
400,0.5336,0.51402,0.755625
600,0.5096,0.509888,0.758333
800,0.4546,0.481865,0.76875
1000,0.4532,0.47485,0.773333
1200,0.4492,0.489799,0.772917
1400,0.4115,0.466206,0.776042
1600,0.3877,0.466128,0.77625
1800,0.4024,0.462707,0.7775


The following columns in the evaluation set don't have a corresponding argument in `XLMRobertaForSequenceClassification.forward` and have been ignored: type, email. If type, email are not expected by `XLMRobertaForSequenceClassification.forward`,  you can safely ignore this message.
***** Running Evaluation *****
  Num examples = 4800
  Batch size = 16
The following columns in the evaluation set don't have a corresponding argument in `XLMRobertaForSequenceClassification.forward` and have been ignored: type, email. If type, email are not expected by `XLMRobertaForSequenceClassification.forward`,  you can safely ignore this message.
***** Running Evaluation *****
  Num examples = 4800
  Batch size = 16
The following columns in the evaluation set don't have a corresponding argument in `XLMRobertaForSequenceClassification.forward` and have been ignored: type, email. If type, email are not expected by `XLMRobertaForSequenceClassification.forward`,  you can safely ignore this message.
***** 

In [None]:
# Save the model and its tokenizer side by side so that the
# "email_classifier" directory holds both.
model.save_pretrained("email_classifier")
tokenizer.save_pretrained("email_classifier")


Configuration saved in email_classifier/config.json
Model weights saved in email_classifier/model.safetensors
tokenizer config file saved in email_classifier/tokenizer_config.json
Special tokens file saved in email_classifier/special_tokens_map.json


('email_classifier/tokenizer_config.json',
 'email_classifier/special_tokens_map.json',
 'email_classifier/sentencepiece.bpe.model',
 'email_classifier/added_tokens.json',
 'email_classifier/tokenizer.json')

In [None]:
from transformers import pipeline

# Load the fine-tuned classifier saved in "email_classifier".
import torch

clf = pipeline(
    "text-classification",
    model="email_classifier",
    tokenizer=tokenizer,
    # Use the first GPU when one is available; -1 runs the pipeline on CPU,
    # so this cell also works in a CPU-only session.
    device=0 if torch.cuda.is_available() else -1,
)

def classify_email(input_email_body: str):
    """Mask PII in an email body and classify the masked text.

    Args:
        input_email_body: The raw email text.

    Returns:
        A dict with the keys ``"input_email_body"`` (the input, unchanged),
        ``"list_of_masked_entities"`` (entity dicts from ``mask_pii``),
        ``"masked_email"`` (the text with PII replaced by placeholders) and
        ``"category_of_the_email"`` (the predicted label).
    """
    # a) Mask PII and collect entities
    masked, entities = mask_pii(input_email_body)

    # b) Classify the masked text. Truncate to the 256 tokens used during
    #    training; without truncation a long email exceeds the model's
    #    maximum sequence length and the pipeline call fails.
    pred = clf(masked, truncation=True, max_length=256)[0]

    # c) Return required JSON structure
    return {
        "input_email_body": input_email_body,
        "list_of_masked_entities": entities,
        "masked_email": masked,
        "category_of_the_email": pred["label"]
    }



loading configuration file email_classifier/config.json
Model config XLMRobertaConfig {
  "_name_or_path": "email_classifier",
  "architectures": [
    "XLMRobertaForSequenceClassification"
  ],
  "attention_probs_dropout_prob": 0.1,
  "bos_token_id": 0,
  "classifier_dropout": null,
  "eos_token_id": 2,
  "hidden_act": "gelu",
  "hidden_dropout_prob": 0.1,
  "hidden_size": 768,
  "id2label": {
    "0": "Incident",
    "1": "Request",
    "2": "Problem",
    "3": "Change"
  },
  "initializer_range": 0.02,
  "intermediate_size": 3072,
  "label2id": {
    "Change": 3,
    "Incident": 0,
    "Problem": 2,
    "Request": 1
  },
  "layer_norm_eps": 1e-05,
  "max_position_embeddings": 514,
  "model_type": "xlm-roberta",
  "num_attention_heads": 12,
  "num_hidden_layers": 12,
  "output_past": true,
  "pad_token_id": 1,
  "position_embedding_type": "absolute",
  "problem_type": "single_label_classification",
  "torch_dtype": "float32",
  "transformers_version": "4.39.3",
  "type_vocab_size": 1

In [None]:
# Example usage: run the full mask-then-classify flow on one sample email.
result = classify_email(
    "Subject: Customer Support Inquiry Seeking information on digital strategies that can aid in brand growth and details on the available services. Looking forward to learning more to help our business grow My name is Elena Ivanova.. Thank you, and I look forward to hearing from you soon. You can reach me at fatima.farsi@help.com"
)


# Wrap the single result dict in a one-row DataFrame for tabular display.
df = pd.DataFrame([result])

df.head()

Unnamed: 0,input_email_body,list_of_masked_entities,masked_email,category_of_the_email
0,"Subject: Customer Support Inquiry Seeking information on digital strategies that can aid in brand growth and details on the available services. Looking forward to learning more to help our business grow My name is Elena Ivanova.. Thank you, and I look forward to hearing from you soon. You can reach me at fatima.farsi@help.com","[{'position': [214, 227], 'classification': 'PERSON', 'entity': 'Elena Ivanova'}, {'position': [230, 239], 'classification': 'PERSON', 'entity': 'Thank you'}, {'position': [306, 327], 'classification': 'EMAIL_ADDRESS', 'entity': 'fatima.farsi@help.com'}]","Subject: Customer Support Inquiry Seeking information on digital strategies that can aid in brand growth and details on the available services. Looking forward to learning more to help our business grow My name is [full_name].. [full_name], and I look forward to hearing from you soon. You can reach me at [email]",Request


In [31]:
import shutil

# Zip the contents of the "email_classifier" directory into
# email_classifier.zip in the working directory.
shutil.make_archive("email_classifier", 'zip', "email_classifier")


'/kaggle/working/email_classifier.zip'

In [32]:
from IPython.display import FileLink

# Display a link to the zipped model archive.
FileLink(r'email_classifier.zip')
