# basic analysis



In [None]:
import pandas as pd
dataframe = pd.read_csv("Other.csv")
dataframe.drop('category', axis=1, inplace=True)
value_counts = dataframe['Other Type'].value_counts()
print(value_counts)
dataframe.head()
duplicates_across_a = dataframe['segment'].nunique()
print(duplicates_across_a)

Other Type
Introductory/Generic           1376
Practice not covered            788
Other                           735
Privacy contact information     649
Name: count, dtype: int64
1741


In [None]:
df_deduplicated = dataframe[~dataframe.duplicated(subset=['segment', 'Other Type'])]

value_counts = df_deduplicated['Other Type'].value_counts()
print(value_counts)

Other Type
Introductory/Generic           798
Practice not covered           641
Other                          580
Privacy contact information    322
Name: count, dtype: int64


In [None]:
duplicates_across_a = df_deduplicated.groupby('segment')['Other Type'].nunique()
count = duplicates_across_a[duplicates_across_a > 1].sum()

print(f"Number of rows duplicated across column_a but with different values in column_b: {count}")


Number of rows duplicated across column_a but with different values in column_b: 1123


In [None]:

# Count unique values of column_b for each value in column_a
unique_count = df_deduplicated.groupby('segment')['Other Type'].nunique()

# Filter to find values in column_a with more than one unique column_b value
multiple_b = unique_count[unique_count > 1]

# Rows corresponding to these values in column_a
rows_with_multiple_b = df_deduplicated[df_deduplicated['segment'].isin(multiple_b.index)]

print("Number of rows where column_a has different values in column_b:")
print(len(rows_with_multiple_b))


Number of rows where column_a has different values in column_b:
1123


In [None]:
print("\nRows where column_a has different values in column_b:")
rows_with_multiple_b.head(20)


Rows where column_a has different values in column_b:


Unnamed: 0,annotationID,segmentID,segment,Other Type
7,20600,10,Your Consent and Rights <br> <br> By using Sci...,Introductory/Generic
11,20253,10,Your Consent and Rights <br> <br> By using Sci...,Other
16,20423,2,<strong> Information Collection and Use </stro...,Practice not covered
17,20442,10,<strong> This privacy statement covers the use...,Practice not covered
18,20446,14,"Clear Gifs can ""work with"" existing cookies on...",Other
20,20455,19,Customer Service <br> <br> We communicate with...,Practice not covered
23,20464,27,Users of our site are always notified when the...,Practice not covered
28,20449,41,Contact Information <br> <br>,Introductory/Generic
37,20125,10,<strong> This privacy statement covers the use...,Other
39,20133,14,"Clear Gifs can ""work with"" existing cookies on...",Introductory/Generic


In [None]:
df_deduplicated.shape

(2341, 4)

In [None]:
# 1. Count rows duplicated across columns A and B
duplicates_ab = df_deduplicated.duplicated(subset=['segment', 'Other Type'], keep=False).sum()

# 2. Count rows unique across columns A and B
unique_ab = len(df_deduplicated) - duplicates_ab

# 3. Count rows where A is duplicated but has different values in B
a_groups = df_deduplicated.groupby('segment')['Other Type'].nunique()
duplicates_a_diff_b = a_groups[a_groups > 1].sum()

df_repeated_seg = df_deduplicated[df_deduplicated.duplicated(subset=['segment'], keep=False)]
repeated_seg= df_deduplicated.duplicated(subset=['segment'], keep=False).sum()
unique_values_in_A = df_repeated_seg['segment'].nunique()


print(f"Rows duplicated across A and B: {duplicates_ab}")
print(f"Rows unique across A and B: {unique_ab}")
print(f"Rows duplicated in A but with different values in B: {duplicates_a_diff_b}")
print(df_deduplicated.shape)
print("number of rows just with duplicated segment is")
print(repeated_seg)
print("number of rows with one value:")
print( (len(df_deduplicated) - repeated_seg))
print("number of rows that will have more than value:")
print(unique_values_in_A)


Rows duplicated across A and B: 0
Rows unique across A and B: 2341
Rows duplicated in A but with different values in B: 1123
(2341, 4)
number of rows just with duplicated segment is
1123
number of rows with one value:
1218
number of rows that will have more than value:
523


In [None]:
df_repeated_seg.head()

Unnamed: 0,annotationID,segmentID,segment,Other Type
7,20600,10,Your Consent and Rights <br> <br> By using Sci...,Introductory/Generic
11,20253,10,Your Consent and Rights <br> <br> By using Sci...,Other
16,20423,2,<strong> Information Collection and Use </stro...,Practice not covered
17,20442,10,<strong> This privacy statement covers the use...,Practice not covered
18,20446,14,"Clear Gifs can ""work with"" existing cookies on...",Other


In [None]:
duplicates_ab = dataframe.duplicated(subset=['segment'], keep=False).sum()
duplicates_ab

2889

In [None]:
# Get the most frequent value and its count
most_frequent_value = dataframe['segment'].value_counts().idxmax()
most_frequent_count = dataframe['segment'].value_counts().max()

print(f"Most frequent value: {most_frequent_value}")
print(f"Number of occurrences: {most_frequent_count}")


Most frequent value: Privacy Policy <br> <br>
Number of occurrences: 30


# Attempt to reimplment the data pipeline

In [None]:
# THIS IS THE MAIN JUICY PART THAT WORKS
import numpy as np
import pandas as pd
dataframe = pd.read_csv("Other.csv")
dataframe.drop('category', axis=1, inplace=True)
dataframe = dataframe[~dataframe.duplicated(subset=['segment', 'Other Type'])]
print((dataframe['Other Type'].iloc[0]))


def label_to_vector(label, labels, count):
    """

    Returns a vector representing the label passed as an input.

    Args:
        label: string, label that we want to transform into a vector.
        labels: dictionary, dictionary with the labels as the keys and indexes as the values.
    Returns:
        vector: np.array, 1-D array of lenght 12.

    """

    vector = np.zeros((count), dtype=np.int64)
    try:

        index = labels[label]

        vector[index] = 1

    except KeyError:

        vector = np.zeros((count), dtype=np.int64)

    return vector

labels = {'Introductory/Generic': 0,'Practice not covered': 1,'Other': 2,'Privacy contact information': 3}


dataframe['Other Type'] = dataframe['Other Type'].apply(lambda x: label_to_vector(x, labels, 4)) # returns one hot encoding in a 1D-12D vector

labels_data = dataframe[[ 'segment', 'Other Type']]
labels = labels_data.groupby("segment").sum() # since segments can have many labels, it sums label vectors of the same segment, resuling in a vector with all the labels of the segment marked as 1, other values as 0 (summing together one hot encodings)

labels = labels.reset_index()
labels.head() # reindex this
labels.shape
# print(labels.columns)
labels['Other Type'] = labels['Other Type'].apply(lambda x: x.tolist()) # convert to list to be able to work with it later


labels.to_csv("Other2.csv", index=False)

labels.head()
print(type(labels['Other Type'][0]))

Introductory/Generic
<class 'list'>


In [None]:
filtered_df = dataframe[dataframe['segmentID'] == 0]
filtered_df

Unnamed: 0,annotationID,segmentID,segment,Other Type
0,20137,0,Privacy Policy <br> <br> Sci-News.com is commi...,"[1, 0, 0, 0]"
14,20421,0,"RedOrbit, Inc. Privacy Statement <br> <br>","[1, 0, 0, 0]"
58,13305,0,<strong> AOL Privacy Policy </strong> <br> <br...,"[1, 0, 0, 0]"
118,20842,0,"STATEMENT OF PRIVACY ( 12/23/2014, UNITED STAT...","[1, 0, 0, 0]"
157,9602,0,Amazon.com Privacy Notice <br> <br>,"[1, 0, 0, 0]"
...,...,...,...,...
3264,9754,0,Privacy Policy <br> <br> This is the web site ...,"[1, 0, 0, 0]"
3325,14872,0,Internet Brands Privacy Highlights <br> Privac...,"[1, 0, 0, 0]"
3394,14072,0,Internet Brands Privacy Highlights <br> Privac...,"[0, 0, 1, 0]"
3433,9955,0,lynda.com Privacy Policy <br> <br>,"[1, 0, 0, 0]"


In [None]:
segments.head()

Unnamed: 0_level_0,segment
segmentID,Unnamed: 1_level_1
0,Privacy Policy <br> <br> Sci-News.com is commi...
4,Sci-News.com does not knowingly collect or sol...
6,During the course of any visit to the Sci-News...
7,Please note that during or after your visits t...
11,We will remove you and your personally identif...


In [None]:
# ideas to preprocess data correctly

# go inside directory where files or
# for a single file,
  # only keeps rows where category is other
  # apply json format to that file
  # only keep relevant columns that you get out of json
  # now do the thing here with the one hot encoding


In [None]:
# how to repeat the process above for multi task
# divide dataframes into a single label dataframe and then recombine them in the end

# Other training script

In [None]:
!pip install datasets
!pip install evaluate



In [None]:
import wandb
import pandas as pd
import ast

wandb.login()

# HYPERPAREMETERS
LEARNING_RATE = 5e-5
EPOCHS = 5
BATCH_SIZE = 8
run = wandb.init(
    # Set the project where this run will be logged
    project="Data Security Project", name= "Distill Bert new dataset, First legit training attempt",
    # Track hyperparameters and run metadata
    config={
        "learning_rate": 0.01,
        "Batch_size": 8,
        "epochs": 15,
    },
)

df = pd.read_csv("Other2.csv")
df.rename(columns={'Other Type': 'label'}, inplace=True)
df['label'] = df['label'].apply(ast.literal_eval)

df['label'] = df['label'].apply(lambda x: [float(i) for i in x])

df.head()
print(type(df['label'][0]))
print(df['label'][0])



<class 'list'>
[0.0, 0.0, 1.0, 0.0]


In [None]:
# you might need to convert to float


import torch
from transformers import DistilBertTokenizer, DistilBertForSequenceClassification, Trainer, TrainingArguments, DataCollatorWithPadding
from sklearn.model_selection import train_test_split
from datasets import Dataset
import evaluate
import numpy as np
from sklearn.metrics import accuracy_score, precision_recall_fscore_support
import numpy as np
from sklearn.metrics import accuracy_score, f1_score, hamming_loss



train_df, test_df = train_test_split(df, test_size=0.2)
train_dataset = Dataset.from_pandas(train_df)
test_dataset = Dataset.from_pandas(test_df)
tokenizer = DistilBertTokenizer.from_pretrained("distilbert-base-uncased")

# Tokenize the text data
def tokenize_function(examples):
    return tokenizer(examples['segment'], padding="max_length", truncation=True)

train_dataset_tokenized = train_dataset.map(tokenize_function, batched=True)
test_dataset_tokenized = test_dataset.map(tokenize_function, batched=True)

# Load DistilBERT model for sequence classification with 3 labels (multi-label classification)
model = DistilBertForSequenceClassification.from_pretrained("distilbert-base-uncased",
                                                            num_labels=4,
                                                            problem_type="multi_label_classification",
                                                            )

# Data Collator
data_collator = DataCollatorWithPadding(tokenizer=tokenizer)



# Define training arguments

training_args = TrainingArguments(
    output_dir="30_epochs_model",
    eval_strategy="epoch",               # Evaluate after every epoch
    report_to="wandb",
    num_train_epochs= EPOCHS,                 # Total number of epochs
    save_strategy="epoch",               # Save the best model at the end of each epoch
    load_best_model_at_end=True,         # Load the best model based on validation loss
    learning_rate= 5e-5,                  # Learning rate for the optimizer
    weight_decay=0.01,                   # Weight decay for regularization
    per_device_train_batch_size= 8,      # Training batch size
    per_device_eval_batch_size= 8,       # Evaluation batch size
    gradient_accumulation_steps=4,       # Accumulate gradients for fewer backward passes
    logging_strategy="epoch",            # Log metrics at intervals of steps
    log_level="info",                    # Log level (e.g., "info" or "error")
    log_level_replica="warning",        # Adjust logs for distributed training replicas
    logging_dir="./logs",               # Directory for storing logs
    logging_steps = 100,
    metric_for_best_model="eval_loss",  # Metric to track the best model
    fp16=True,  # Enable mixed precision

    )






def compute_metrics(eval_pred):
    """
    Compute metrics for multilabel classification.
    :param eval_pred: Tuple (predictions, labels)
    :return: Dictionary with metric values
    """
    logits, labels = eval_pred
    # Apply sigmoid to logits for multilabel classification
    probs = 1 / (1 + np.exp(-logits))
    # Convert probabilities to binary predictions (0 or 1)
    preds = (probs > 0.5).astype(int)

    # Exact match ratio: Proportion of samples with all labels correct
    exact_match = np.all(preds == labels, axis=1).mean()

    # Multilabel accuracy: Average accuracy across all labels
    multilabel_accuracy = (preds == labels).mean()

    # F1 Score (macro and micro)
    f1_macro = f1_score(labels, preds, average="macro")
    f1_micro = f1_score(labels, preds, average="micro")

    # Hamming loss
    hamming = hamming_loss(labels, preds)

    return {
        "exact_match": exact_match,
        "multilabel_accuracy": multilabel_accuracy,
        "f1_macro": f1_macro,
        "f1_micro": f1_micro,
        "hamming_loss": hamming,
    }


loading file vocab.txt from cache at /root/.cache/huggingface/hub/models--distilbert-base-uncased/snapshots/12040accade4e8a0f71eabdb258fecc2e7e948be/vocab.txt
loading file added_tokens.json from cache at None
loading file special_tokens_map.json from cache at None
loading file tokenizer_config.json from cache at /root/.cache/huggingface/hub/models--distilbert-base-uncased/snapshots/12040accade4e8a0f71eabdb258fecc2e7e948be/tokenizer_config.json
loading file tokenizer.json from cache at /root/.cache/huggingface/hub/models--distilbert-base-uncased/snapshots/12040accade4e8a0f71eabdb258fecc2e7e948be/tokenizer.json
loading file chat_template.jinja from cache at None
loading configuration file config.json from cache at /root/.cache/huggingface/hub/models--distilbert-base-uncased/snapshots/12040accade4e8a0f71eabdb258fecc2e7e948be/config.json
Model config DistilBertConfig {
  "_name_or_path": "distilbert-base-uncased",
  "activation": "gelu",
  "architectures": [
    "DistilBertForMaskedLM"
  ]

Map:   0%|          | 0/1392 [00:00<?, ? examples/s]

Map:   0%|          | 0/349 [00:00<?, ? examples/s]

loading configuration file config.json from cache at /root/.cache/huggingface/hub/models--distilbert-base-uncased/snapshots/12040accade4e8a0f71eabdb258fecc2e7e948be/config.json
Model config DistilBertConfig {
  "activation": "gelu",
  "architectures": [
    "DistilBertForMaskedLM"
  ],
  "attention_dropout": 0.1,
  "dim": 768,
  "dropout": 0.1,
  "hidden_dim": 3072,
  "id2label": {
    "0": "LABEL_0",
    "1": "LABEL_1",
    "2": "LABEL_2",
    "3": "LABEL_3"
  },
  "initializer_range": 0.02,
  "label2id": {
    "LABEL_0": 0,
    "LABEL_1": 1,
    "LABEL_2": 2,
    "LABEL_3": 3
  },
  "max_position_embeddings": 512,
  "model_type": "distilbert",
  "n_heads": 12,
  "n_layers": 6,
  "pad_token_id": 0,
  "problem_type": "multi_label_classification",
  "qa_dropout": 0.1,
  "seq_classif_dropout": 0.2,
  "sinusoidal_pos_embds": false,
  "tie_weights_": true,
  "transformers_version": "4.47.1",
  "vocab_size": 30522
}

loading weights file model.safetensors from cache at /root/.cache/huggingf

In [None]:
# Define Trainer
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=train_dataset_tokenized,
    eval_dataset=test_dataset_tokenized,
    compute_metrics=compute_metrics,
    data_collator=data_collator,


)

# Fine-tune the model
trainer.train()

# Evaluate the model
eval_results = trainer.evaluate()
print(eval_results)

# Save the model
model.save_pretrained("./other_model_saved")
tokenizer.save_pretrained("./other_model_saved")

Using auto half precision backend
The following columns in the training set don't have a corresponding argument in `DistilBertForSequenceClassification.forward` and have been ignored: __index_level_0__, segment. If __index_level_0__, segment are not expected by `DistilBertForSequenceClassification.forward`,  you can safely ignore this message.
***** Running training *****
  Num examples = 1,392
  Num Epochs = 5
  Instantaneous batch size per device = 8
  Total train batch size (w. parallel, distributed & accumulation) = 32
  Gradient Accumulation steps = 4
  Total optimization steps = 215
  Number of trainable parameters = 66,956,548
Automatic Weights & Biases logging enabled, to disable set os.environ["WANDB_DISABLED"] = "true"


Epoch,Training Loss,Validation Loss,Exact Match,Multilabel Accuracy,F1 Macro,F1 Micro,Hamming Loss,Runtime,Samples Per Second,Steps Per Second
1,2.1446,0.466385,0.461318,0.786533,0.572015,0.63835,0.213467,1.7246,202.37,25.514
2,1.7013,0.437182,0.467049,0.795129,0.615772,0.64951,0.204871,1.5814,220.684,27.823
3,1.4749,0.413809,0.498567,0.801576,0.652443,0.680507,0.198424,1.7112,203.951,25.713
4,1.1633,0.429145,0.498567,0.795845,0.676608,0.679415,0.204155,1.8321,190.494,24.016


The following columns in the evaluation set don't have a corresponding argument in `DistilBertForSequenceClassification.forward` and have been ignored: __index_level_0__, segment. If __index_level_0__, segment are not expected by `DistilBertForSequenceClassification.forward`,  you can safely ignore this message.

***** Running Evaluation *****
  Num examples = 349
  Batch size = 8
Saving model checkpoint to 30_epochs_model/checkpoint-44
Configuration saved in 30_epochs_model/checkpoint-44/config.json
Model weights saved in 30_epochs_model/checkpoint-44/model.safetensors
The following columns in the evaluation set don't have a corresponding argument in `DistilBertForSequenceClassification.forward` and have been ignored: __index_level_0__, segment. If __index_level_0__, segment are not expected by `DistilBertForSequenceClassification.forward`,  you can safely ignore this message.

***** Running Evaluation *****
  Num examples = 349
  Batch size = 8
Saving model checkpoint to 30_epochs_mo

Configuration saved in ./other_model_saved/config.json


{'eval_loss': 0.4138087034225464, 'eval_exact_match': 0.498567335243553, 'eval_multilabel_accuracy': 0.8015759312320917, 'eval_f1_macro': 0.6524432503194383, 'eval_f1_micro': 0.6805074971164936, 'eval_hamming_loss': 0.1984240687679083, 'eval_runtime': 1.6285, 'eval_samples_per_second': 214.309, 'eval_steps_per_second': 27.019, 'epoch': 4.896551724137931}


Model weights saved in ./other_model_saved/model.safetensors
tokenizer config file saved in ./other_model_saved/tokenizer_config.json
Special tokens file saved in ./other_model_saved/special_tokens_map.json


('./other_model_saved/tokenizer_config.json',
 './other_model_saved/special_tokens_map.json',
 './other_model_saved/vocab.txt',
 './other_model_saved/added_tokens.json')

# do not track

In [None]:
import numpy as np
import pandas as pd
dataframe = pd.read_csv("Do_Not_Track.csv")
value_counts = dataframe['Do Not Track policy'].value_counts()
duplicates_across_a = dataframe['segment'].nunique()
print(value_counts)
print(duplicates_across_a)
print("\n\n")
df_deduplicated = dataframe[~dataframe.duplicated(subset=['segment', 'Do Not Track policy'])]

value_counts = df_deduplicated['Do Not Track policy'].value_counts()
print(value_counts)

Do Not Track policy
Not honored                          75
Mentioned, but unclear if honored     8
Other                                 3
Honored                               3
Not mentioned                         1
Name: count, dtype: int64
32



Do Not Track policy
Not honored                          29
Mentioned, but unclear if honored     4
Other                                 2
Not mentioned                         1
Honored                               1
Name: count, dtype: int64


In [None]:
# import numpy as np
# import pandas as pd
# dataframe = pd.read_csv("Do_Not_Track.csv")
dataframe.drop('category', axis=1, inplace=True)
dataframe = dataframe[~dataframe.duplicated(subset=['segment', 'Do Not Track policy'])]


def label_to_vector(label, labels, count):
    """

    Returns a vector representing the label passed as an input.

    Args:
        label: string, label that we want to transform into a vector.
        labels: dictionary, dictionary with the labels as the keys and indexes as the values.
    Returns:
        vector: np.array, 1-D array of lenght 12.

    """

    vector = np.zeros((count), dtype=np.int64)
    try:

        index = labels[label]

        vector[index] = 1

    except KeyError:

        vector = np.zeros((count), dtype=np.int64)

    return vector

labels = {'Introductory/Generic': 0,'Practice not covered': 1,'Other': 2,'Privacy contact information': 3}


dataframe['Do Not Track policy'] = dataframe['Do Not Track policy'].apply(lambda x: label_to_vector(x, labels, 4)) # returns one hot encoding in a 1D-12D vector

labels_data = dataframe[[ 'segment', 'Do Not Track policy']]
labels = labels_data.groupby("segment").sum() # since segments can have many labels, it sums label vectors of the same segment, resuling in a vector with all the labels of the segment marked as 1, other values as 0 (summing together one hot encodings)

labels = labels.reset_index()
labels.head() # reindex this
labels.shape
# print(labels.columns)
labels['Do Not Track policy'] = labels['Do Not Track policy'].apply(lambda x: x.tolist()) # convert to list to be able to work with it later


labels.to_csv("Other2.csv", index=False)

labels.head()
print(type(labels['Do Not Track policy'][0]))

# Data Security

In [None]:
import numpy as np
import pandas as pd
dataframe = pd.read_csv("Data_Security.csv")

# data distribution
value_counts = dataframe['Security Measure'].value_counts()
duplicates_across_a = dataframe['segment'].nunique()
print(value_counts)
print(duplicates_across_a)
print("\n\n")
df_deduplicated = dataframe[~dataframe.duplicated(subset=['segment', 'Security Measure'])]

value_counts = df_deduplicated['Security Measure'].value_counts()
print(value_counts)



Security Measure
Generic                       395
Data access limitation        136
Secure data transfer          135
Other                         128
Privacy/Security program       66
Secure data storage            53
Privacy review/audit           37
Secure user authentication     36
Unspecified                    12
Privacy training               10
Name: count, dtype: int64
372



Security Measure
Generic                       215
Other                          99
Data access limitation         81
Privacy/Security program       55
Secure data transfer           52
Secure data storage            36
Privacy review/audit           28
Secure user authentication     21
Unspecified                    11
Privacy training                7
Name: count, dtype: int64


In [None]:
# does multilabel apply?
duplicates_across_a = df_deduplicated.groupby('segment')['Security Measure'].nunique()
count = duplicates_across_a[duplicates_across_a > 1].sum()

print(f"Number of rows duplicated across column_a but with different values in column_b: {count}")


# Count unique values of column_b for each value in column_a
unique_count = df_deduplicated.groupby('segment')['Security Measure'].nunique()

# Filter to find values in column_a with more than one unique column_b value
multiple_b = unique_count[unique_count > 1]

# Rows corresponding to these values in column_a
rows_with_multiple_b = df_deduplicated[df_deduplicated['segment'].isin(multiple_b.index)]

print("Number of rows where column_a has different values in column_b:")
print(len(rows_with_multiple_b))

print("\nRows where column_a has different values in column_b:")
rows_with_multiple_b.head(20)

Number of rows duplicated across column_a but with different values in column_b: 365
Number of rows where column_a has different values in column_b:
365

Rows where column_a has different values in column_b:


Unnamed: 0,annotationID,segmentID,category,segment,Security Measure
1,20467,32,Data Security,When our registration/order form asks users to...,Secure data transfer
2,20468,32,Data Security,When our registration/order form asks users to...,Generic
3,20476,33,Data Security,While we use SSL encryption to protect sensiti...,Secure data storage
4,20477,33,Data Security,While we use SSL encryption to protect sensiti...,Privacy training
5,20480,33,Data Security,While we use SSL encryption to protect sensiti...,Privacy/Security program
8,20359,33,Data Security,While we use SSL encryption to protect sensiti...,Secure data transfer
9,20360,33,Data Security,While we use SSL encryption to protect sensiti...,Data access limitation
15,13298,34,Data Security,<strong> Our Commitment to Security </strong> ...,Secure data storage
16,13299,34,Data Security,<strong> Our Commitment to Security </strong> ...,Secure user authentication
17,13300,34,Data Security,<strong> Our Commitment to Security </strong> ...,Data access limitation


In [None]:
# 1. Count rows duplicated across columns A and B
duplicates_ab = df_deduplicated.duplicated(subset=['segment', 'Security Measure'], keep=False).sum()

# 2. Count rows unique across columns A and B
unique_ab = len(df_deduplicated) - duplicates_ab

# 3. Count rows where A is duplicated but has different values in B
a_groups = df_deduplicated.groupby('segment')['Security Measure'].nunique()
duplicates_a_diff_b = a_groups[a_groups > 1].sum()

df_repeated_seg = df_deduplicated[df_deduplicated.duplicated(subset=['segment'], keep=False)]
repeated_seg= df_deduplicated.duplicated(subset=['segment'], keep=False).sum()
unique_values_in_A = df_repeated_seg['segment'].nunique()


print(f"Rows duplicated across A and B: {duplicates_ab}")
print(f"Rows unique across A and B: {unique_ab}")
print(f"Rows duplicated in A but with different values in B: {duplicates_a_diff_b}")
print("number of rows just with duplicated segment is")
print(repeated_seg)
print("number of rows with one value:")
print( (len(df_deduplicated) - repeated_seg))
print("number of rows that will have more than value:")
print(unique_values_in_A)
print(df_deduplicated.shape)

Rows duplicated across A and B: 0
Rows unique across A and B: 605
Rows duplicated in A but with different values in B: 365
number of rows just with duplicated segment is
365
number of rows with one value:
240
number of rows that will have more than value:
132
(605, 5)


# Inter audiences

In [None]:
import numpy as np
import pandas as pd
dataframe = pd.read_csv("International_and_Specific_Audiences.csv")

# data distribution
value_counts = dataframe['Audience Type'].value_counts()
duplicates_across_a = dataframe['segment'].nunique()
print(value_counts)
print(duplicates_across_a)
print("\n\n")
df_deduplicated = dataframe[~dataframe.duplicated(subset=['segment', 'Audience Type'])]

value_counts = df_deduplicated['Audience Type'].value_counts()
print(value_counts)



Audience Type
Children                         520
Californians                     199
Citizens from other countries    125
Europeans                         59
Other                             36
Name: count, dtype: int64
348



Audience Type
Children                         191
Californians                      73
Citizens from other countries     69
Europeans                         31
Other                             27
Name: count, dtype: int64


In [None]:
# does multilabel apply?
duplicates_across_a = df_deduplicated.groupby('segment')['Audience Type'].nunique()
count = duplicates_across_a[duplicates_across_a > 1].sum()

print(f"Number of rows duplicated across column_a but with different values in column_b: {count}")


# Count unique values of column_b for each value in column_a
unique_count = df_deduplicated.groupby('segment')['Audience Type'].nunique()

# Filter to find values in column_a with more than one unique column_b value
multiple_b = unique_count[unique_count > 1]

# Rows corresponding to these values in column_a
rows_with_multiple_b = df_deduplicated[df_deduplicated['segment'].isin(multiple_b.index)]

print("Number of rows where column_a has different values in column_b:")
print(len(rows_with_multiple_b))

print("\nRows where column_a has different values in column_b:")
rows_with_multiple_b.head(20)

Number of rows duplicated across column_a but with different values in column_b: 84
Number of rows where column_a has different values in column_b:
84

Rows where column_a has different values in column_b:


Unnamed: 0,annotationID,segmentID,category,segment,Audience Type
7,13649,39,International and Specific Audiences,AOL Inc. complies with the U.S.-EU Safe Harbor...,Europeans
12,12258,39,International and Specific Audiences,AOL Inc. complies with the U.S.-EU Safe Harbor...,Citizens from other countries
17,9117,27,International and Specific Audiences,In compliance with the US-EU and US-Swiss Safe...,Europeans
18,9120,28,International and Specific Audiences,"Under the Safe Harbor program, any unresolved ...",Europeans
84,10218,47,International and Specific Audiences,<strong> How Do We Respect Children's Privacy-...,Children
85,10199,51,International and Specific Audiences,This Privacy Statement applies to the operatio...,Citizens from other countries
86,10798,2,International and Specific Audiences,This Privacy Statement answers the following q...,Children
87,10800,2,International and Specific Audiences,This Privacy Statement answers the following q...,Europeans
91,11087,51,International and Specific Audiences,This Privacy Statement applies to the operatio...,Europeans
92,12282,47,International and Specific Audiences,<strong> How Do We Respect Children's Privacy-...,Californians


In [None]:
# 1. Count rows duplicated across columns A and B
duplicates_ab = df_deduplicated.duplicated(subset=['segment', 'Audience Type'], keep=False).sum()

# 2. Count rows unique across columns A and B
unique_ab = len(df_deduplicated) - duplicates_ab

# 3. Count rows where A is duplicated but has different values in B
a_groups = df_deduplicated.groupby('segment')['Audience Type'].nunique()
duplicates_a_diff_b = a_groups[a_groups > 1].sum()

df_repeated_seg = df_deduplicated[df_deduplicated.duplicated(subset=['segment'], keep=False)]
repeated_seg= df_deduplicated.duplicated(subset=['segment'], keep=False).sum()
unique_values_in_A = df_repeated_seg['segment'].nunique()


print(f"Rows duplicated across A and B: {duplicates_ab}")
print(f"Rows unique across A and B: {unique_ab}")
print(f"Rows duplicated in A but with different values in B: {duplicates_a_diff_b}")
print("number of rows just with duplicated segment is")
print(repeated_seg)
print("number of rows with one value:")
print( (len(df_deduplicated) - repeated_seg))
print("number of rows that will have more than value:")
print(unique_values_in_A)
print(df_deduplicated.shape)

Rows duplicated across A and B: 0
Rows unique across A and B: 391
Rows duplicated in A but with different values in B: 84
number of rows just with duplicated segment is
84
number of rows with one value:
307
number of rows that will have more than value:
41
(391, 5)


In [2]:
#find loss of model
from transformers import DistilBertForSequenceClassification

model = DistilBertForSequenceClassification.from_pretrained('distilbert-base-uncased', num_labels=5)

# print(model.config)

The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


config.json:   0%|          | 0.00/483 [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/268M [00:00<?, ?B/s]

Some weights of DistilBertForSequenceClassification were not initialized from the model checkpoint at distilbert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight', 'pre_classifier.bias', 'pre_classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [5]:
print(model.loss)

AttributeError: 'DistilBertForSequenceClassification' object has no attribute 'loss'

In [1]:
!pip install --upgrade sympy
!pip install --upgrade transformers

Collecting transformers
  Using cached transformers-4.48.2-py3-none-any.whl.metadata (44 kB)
Using cached transformers-4.48.2-py3-none-any.whl (9.7 MB)
Installing collected packages: transformers
  Attempting uninstall: transformers
    Found existing installation: transformers 4.47.1
    Uninstalling transformers-4.47.1:
      Successfully uninstalled transformers-4.47.1
Successfully installed transformers-4.48.2
