In [2]:
import torch
import nltk
from transformers import AutoTokenizer, AutoModelForSequenceClassification
from nltk.tokenize import sent_tokenize

In [10]:
# Download the 'punkt_tab' NLTK resource (unpacked under tokenizers/).
# Presumably this is the data sent_tokenize needs for sentence splitting -
# verify against the installed NLTK version.
nltk.download('punkt_tab')

[nltk_data] Downloading package punkt_tab to /root/nltk_data...
[nltk_data]   Unzipping tokenizers/punkt_tab.zip.


True

In [33]:
# Load the fine-tuned sequence-classification checkpoint and its tokenizer.
# Run on the GPU when PyTorch can see one, otherwise fall back to the CPU.
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

# Checkpoint identifier handed to from_pretrained. The name suggests a
# DistilBERT model fine-tuned on terms-of-service text - confirm on the model card.
MODEL_NAME = "adeeteya/distilbert_base_uncased_finetuned_tos"
tokenizer = AutoTokenizer.from_pretrained(MODEL_NAME)
model = AutoModelForSequenceClassification.from_pretrained(MODEL_NAME)
model.to(device)
# Define the labels
# NOTE(review): classify_sentence indexes this list with the predicted class id,
# so the order is assumed to match the checkpoint's id-to-label mapping - TODO confirm.
LABELS = ['Risk', 'Safe', 'Information']

# Minimum softmax probability for a class-0 ('Risk') prediction to be kept;
# classify_sentence accepts other classes down to roughly 0.1 below this value.
CONFIDENCE_THRESHOLD = 0.7

In [5]:
# Turn raw text into the tensor inputs the classifier expects
def preprocess(sentence):
    """Encode ``sentence`` with the module-level tokenizer.

    The text is truncated to at most 512 tokens, padded, and returned as
    PyTorch tensors (``return_tensors='pt'``).
    """
    encode_options = {
        "truncation": True,
        "padding": True,
        "max_length": 512,
        "return_tensors": 'pt',
    }
    return tokenizer(sentence, **encode_options)

In [46]:
# Function to classify a single sentence
def classify_sentence(sentence):
    """Classify a sentence into Risk, Safe, or Information.

    Args:
        sentence: Text of a single sentence.

    Returns:
        A ``(label, confidence)`` tuple, where ``label`` is an entry of
        ``LABELS`` and ``confidence`` is the softmax probability of the
        predicted class as a float. Returns ``(None, None)`` when the
        prediction is not confident enough to keep.
    """
    # The tokenizer produces CPU tensors while the model was moved to
    # `device`; move the inputs too, otherwise the forward pass raises a
    # device-mismatch error whenever CUDA is in use.
    inputs = {name: tensor.to(device) for name, tensor in preprocess(sentence).items()}
    with torch.no_grad():
        outputs = model(**inputs)

    # One sentence in, so row 0 is the only probability distribution. Taking
    # the argmax on that row keeps the index a valid class id.
    probabilities = torch.nn.functional.softmax(outputs.logits, dim=-1)[0]
    predicted_class = torch.argmax(probabilities).item()
    confidence = probabilities[predicted_class].item()

    # Skip sentences with low confidence. Class 0 (the first entry of LABELS)
    # must reach the full threshold; every other class is accepted with up to
    # 0.1 less confidence.
    if predicted_class == 0:
        too_uncertain = confidence < CONFIDENCE_THRESHOLD
    else:
        too_uncertain = confidence + 0.1 < CONFIDENCE_THRESHOLD
    if too_uncertain:
        return None, None
    return LABELS[predicted_class], confidence

In [47]:
# Sentence-level classification of a whole document
def classify_document(document):
    """Classify every distinct sentence of ``document``.

    The text is split with ``sent_tokenize``. Exact duplicate sentences are
    classified only once (first occurrence wins), and sentences for which
    ``classify_sentence`` returns no label are dropped.

    Returns:
        A dict with the keys "Risk", "Safe" and "Information", each mapping
        to a list of ``(sentence, confidence)`` tuples in document order.
    """
    buckets = {"Risk": [], "Safe": [], "Information": []}
    # dict.fromkeys keeps the first occurrence of each sentence, in order.
    for unique_sentence in dict.fromkeys(sent_tokenize(document)):
        label, score = classify_sentence(unique_sentence)
        if not label:
            continue
        buckets[label].append((unique_sentence, score))
    return buckets

In [43]:
def assign_grade(classified_sentences):
    """Map the number of Risk and Safe sentences to a letter grade.

    Only the lengths of ``classified_sentences['Risk']`` and
    ``classified_sentences['Safe']`` are used. Tiers are tried in order and
    the first match wins:

        A: fewer than 3 risks and at least 1 safe sentence
        B: fewer than 10 risks and more than 5 safe sentences
        C: fewer than 15 risks and more than 2 safe sentences
        D: fewer than 20 risks (any number of safe sentences)
        F: 20 or more risks

    Note that an input with no Safe sentences can only be graded D or F.
    """
    n_risk = len(classified_sentences['Risk'])
    n_safe = len(classified_sentences['Safe'])

    # (grade, exclusive upper bound on risks, exclusive lower bound on safes)
    tiers = (
        ("A", 3, 0),
        ("B", 10, 5),
        ("C", 15, 2),
    )
    for letter, risk_limit, safe_floor in tiers:
        if n_risk < risk_limit and n_safe > safe_floor:
            return letter

    # The last two grades depend on the risk count alone.
    return "D" if n_risk < 20 else "F"

In [51]:
# Read the policy text to analyse. The encoding is pinned to UTF-8 so decoding
# does not depend on the platform's locale default.
with open('Facebook_CookiePolicy.txt', encoding='utf-8') as f:
    input_document = f.read()

In [52]:
# Classify the loaded document, grade it, then list every kept sentence
# grouped by label.
classified_output = classify_document(input_document)
grade = assign_grade(classified_output)

# NOTE(review): the "Classified Sentences:" header is emitted before the
# grade, not directly above the sentence list - confirm this is intended.
print("Classified Sentences:")
print(f"\nService Grade: {grade}\n")

for label, sentences in classified_output.items():
    # Labels with no sentences get neither a heading nor any bullet lines.
    if not sentences:
        continue
    print(f"\n{label}:")
    for sentence, confidence in sentences:
        print(f"  - {sentence} (Confidence: {confidence:.2f})")

Classified Sentences:

Service Grade: F


Risk:
  - other storage technologies Cookies are small pieces of text used to store information on web browsers. (Confidence: 0.71)
  - Cookies are used to store and receive identifiers and other information on computers, phones and other devices. (Confidence: 0.77)
  - Other technologies, including data we store on your web browser or device, identifiers associated with your device, and other software, are used for similar purposes. (Confidence: 0.73)
  - Cookies enable Facebook to offer the Facebook Products to you and to understand the information we receive about you, including information about your use of other websites and apps, whether or not you are registered or logged in. (Confidence: 0.80)
  - Cookies help us provide, protect and improve the Facebook Products, such as by personalising content, tailoring and measuring ads, and providing a safer experience. (Confidence: 0.80)
  - The cookies that we use include session cookies, which 