In [1]:
# Import necessary libraries
import pandas as pd
import re
from bs4 import BeautifulSoup
from transformers import AutoTokenizer, AutoModelForSequenceClassification, Trainer, TrainingArguments
import torch
from torch.utils.data import Dataset
import os
from sklearn.metrics import accuracy_score, classification_report
from sklearn.model_selection import train_test_split

In [2]:

# Define constants
COLUMN_NAMES = [
    "id", "category_label", "unknown1", "unknown2", "unknown3",
    "category", "json_content", "date", "policy_url", "policy_text"
]

# Function to process annotations from the CSV file
def process_annotations(csv_file_path):
    """Load and process annotations from a CSV file.
    Args:
        csv_file_path (str): Path to the CSV file.
    Returns:
        list: List of dictionaries with 'id', 'category', and 'policy_text'.
    """
    data = pd.read_csv(csv_file_path, names=["id", "category", "policy_text"])
    annotations = [
        {"id": row["id"], "category": row["category"], "policy_text": row["policy_text"]}
        for _, row in data.iterrows()
    ]
    return annotations

# Function to extract policy text from an HTML file
def extract_policy_text(html_file_path):
    """Extract all visible text from an HTML file.
    Args:
        html_file_path (str): Path to the HTML file.
    Returns:
        str: Extracted text.
    """
    with open(html_file_path, 'r', encoding='utf-8') as f:
        html_content = f.read()
    soup = BeautifulSoup(html_content, 'html.parser')
    policy_text = ' '.join(soup.stripped_strings)
    return policy_text

# Provide file paths
csv_file_path = r'./content/20_theatlantic.com.csv'
html_file_path = r'./content/20_theatlantic.com.html'

# Call functions
annotations = process_annotations(csv_file_path)
policy_text = extract_policy_text(html_file_path)

print(annotations)
print(policy_text)

[{'id': '{"Other Type": {"endIndexInSegment": 762, "startIndexInSegment": 100, "selectedText": "At the Atlantic Monthly Group, Inc. (\\"The Atlantic\\"), we want you to enjoy and benefit from our websites and online services secure in the knowledge that we have implemented fair information practices designed to protect your privacy. Our privacy policy is applicable to The Atlantic, and The Atlantics affiliates and subsidiaries whose websites, mobile applications and other online services are directly linked (the Sites). The privacy policy describes the kinds of information we may gather during your visit to these Sites, how we use your information, when we might disclose your personally identifiable information, and how you can manage your information.", "value": "Introductory/Generic"}}', 'category': '1/1/15', 'policy_text': 'http://www.theatlantic.com/privacy-policy/'}, {'id': '{"Other Type": {"endIndexInSegment": 762, "startIndexInSegment": 100, "selectedText": "At the Atlantic Mont

In [20]:
tokenizer = AutoTokenizer.from_pretrained("bert-base-uncased")

# Define label mapping
label_mapping = {
    "better": 0,
    "not better": 1
}
class PrivacyPolicyDataset(Dataset):
    """Dataset for preparing privacy policies for fine-tuning."""
    def __init__(self, annotations, label_mapping):
        self.annotations = annotations
        self.tokenizer = AutoTokenizer.from_pretrained("bert-base-uncased")
        self.label_mapping = label_mapping
        self.data = self._prepare_data()

    def _prepare_data(self):
        """Prepare data by encoding text and labels."""
        data = []
        for annotation in annotations: # this used to be self.annoations
            text = annotation["policy_text"]
            category = annotation["category"]
            label = self.label_mapping.get(category, -1)
            data.append((text, label))
        return data

    def __len__(self):
        return len(self.data)

    def __getitem__(self, idx):
        text, label = self.data[idx]
        text = str(text)
        encoding = self.tokenizer(
            text,
            truncation=True,
            padding="max_length",
            max_length=512,
            return_tensors="pt",
        )
        item = {key: val.squeeze(0) for key, val in encoding.items()}
        item["labels"] = torch.tensor(label, dtype=torch.long)
        return item


In [24]:

def fine_tune_model_optimized(dataset, tokenizer, label_mapping, output_dir="./fine_tuned_model"):
    """Fine-tune a pre-trained BERT model on the given dataset.
    Args:
        dataset (PrivacyPolicyDataset): Dataset for training and evaluation.
        tokenizer (AutoTokenizer): Tokenizer for the model.
        output_dir (str): Directory to save the fine-tuned model.
    Returns:
        AutoModelForSequenceClassification: Fine-tuned model.
    """
    model = AutoModelForSequenceClassification.from_pretrained(
        "bert-base-uncased",
        num_labels=len(label_mapping)
    )

    training_args = TrainingArguments(
        output_dir=output_dir,
        evaluation_strategy="epoch",
        learning_rate=2e-5,
        per_device_train_batch_size=16,
        per_device_eval_batch_size=16,
        num_train_epochs=3,
        weight_decay=0.01,
        logging_dir="./logs",
        save_total_limit=2,
        fp16=True,
        dataloader_num_workers=4
    )

    trainer = Trainer(
        model=model,
        args=training_args,
        train_dataset=dataset,
        eval_dataset=dataset,
        tokenizer=tokenizer,
    )

    trainer.train()
    model.save_pretrained(output_dir)
    dataset.tokenizer.save_pretrained(output_dir)
    return model


In [25]:
def predict_policy(policy_text, model, tokenizer, label_mapping):
    """Predict the category of a given policy text.
    Args:
        policy_text (str): The text of the policy.
        model (AutoModelForSequenceClassification): Fine-tuned model.
        tokenizer (AutoTokenizer): Tokenizer for the model.
        label_mapping (dict): Mapping of labels to indices.
    Returns:
        str: Predicted category label.
    """
    inputs = tokenizer(
        policy_text,
        truncation=True,
        padding="max_length",
        max_length=512,
        return_tensors="pt"
    )
    outputs = model(**inputs)
    probabilities = torch.nn.functional.softmax(outputs.logits, dim=-1)
    predicted_class = torch.argmax(probabilities).item()
    category = {idx: label for label, idx in label_mapping.items()}
    return category.get(predicted_class, "Unknown")


# Prepare dataset
csv_file_path = r"./content/20_theatlantic.com.csv"
annotations = process_annotations(csv_file_path)

# Create a list of policy texts
policy_texts = [annotation["policy_text"] for annotation in annotations if annotation["policy_text"]]

# Initialize the model, tokenizer, and label mapping
model_name = "bert-base-uncased"
model = AutoModelForSequenceClassification.from_pretrained(model_name, num_labels=2)
tokenizer = AutoTokenizer.from_pretrained(model_name)
label_mapping = {
    "better": 0,
    "not better": 1
}

# Make predictions
predicted_categories = []
for policy_text in policy_texts:
    predicted_category = predict_policy(policy_text, model, tokenizer, label_mapping)
    predicted_categories.append(predicted_category)

# Print the predicted categories
print(predicted_categories)


# Split dataset into training and testing sets
train_size = int(0.8 * len(policy_texts))
test_size = len(policy_texts) - train_size
train_texts, test_texts = policy_texts[:train_size], policy_texts[train_size:]



Some weights of BertForSequenceClassification were not initialized from the model checkpoint at bert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


['better', 'better', 'better', 'better', 'better', 'better', 'better', 'better', 'better', 'better', 'better', 'better', 'better', 'better', 'better', 'better', 'better', 'better', 'better', 'better', 'better', 'better', 'better', 'better', 'better', 'better', 'better', 'better', 'better', 'better', 'better', 'better', 'better', 'better', 'better', 'better', 'better', 'better', 'better', 'better', 'better', 'better', 'better', 'better', 'better', 'better', 'better', 'better', 'better', 'better', 'better', 'better', 'better', 'better', 'better', 'better', 'better', 'better', 'better', 'better', 'better', 'better', 'better', 'better', 'better', 'better', 'better', 'better', 'better', 'better', 'better', 'better', 'better', 'better', 'better', 'better', 'better', 'better', 'better', 'better', 'better', 'better', 'better', 'better', 'better', 'better', 'better', 'better', 'better', 'better', 'better', 'better', 'better', 'better', 'better', 'better', 'better', 'better', 'better', 'better',

In [26]:
"""print(type(annotations))
for annotation in annotations:
  print(annotation)"""
# Create datasets for training and testing
train_dataset = PrivacyPolicyDataset(train_texts, label_mapping)
test_dataset = PrivacyPolicyDataset(test_texts, label_mapping)

# Fine-tune model
output_dir = "./fine_tuned_model"
model = fine_tune_model_optimized(train_dataset, tokenizer, label_mapping, output_dir)

# Evaluate model on test set
test_pred = []
test_labels = []
for policy_text in test_texts:
    pred = predict_policy(policy_text, model, tokenizer, label_mapping)
    test_pred.append(pred)
    test_labels.append(0)

# Calculate evaluation metrics
accuracy = accuracy_score(test_labels, [label_mapping.get(label, -1) for label in test_pred])
print("Accuracy:", accuracy)
print("Classification Report:")
print(classification_report(test_labels, [label_mapping.get(label, -1) for label in test_pred]))


# Use the model to make predictions
html_file_path = r'./content/20_theatlantic.com.html'
policy_text = extract_policy_text(html_file_path)
predicted_category = predict_policy(policy_text, model, tokenizer, label_mapping)
print("Predicted category:", predicted_category)
print("\n\nend of run.")

Some weights of BertForSequenceClassification were not initialized from the model checkpoint at bert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
  trainer = Trainer(
[34m[1mwandb[0m: Using wandb-core as the SDK backend.  Please refer to https://wandb.me/wandb-core for more information.


<IPython.core.display.Javascript object>

[34m[1mwandb[0m: Logging into wandb.ai. (Learn how to deploy a W&B server locally: https://wandb.me/wandb-server)
[34m[1mwandb[0m: You can find your API key in your browser here: https://wandb.ai/authorize
wandb: Paste an API key from your profile and hit enter, or press ctrl+c to quit:

 ··········


[34m[1mwandb[0m: Appending key for api.wandb.ai to your netrc file: /root/.netrc




RuntimeError: CUDA error: device-side assert triggered
CUDA kernel errors might be asynchronously reported at some other API call, so the stacktrace below might be incorrect.
For debugging consider passing CUDA_LAUNCH_BLOCKING=1
Compile with `TORCH_USE_CUDA_DSA` to enable device-side assertions.
