In [1]:
import pandas as pd
fbs_df = pd.read_csv('processed_data/fbs_sms_df.csv')
fbs_df_phish = fbs_df[fbs_df['LABEL'] == 2]

chinese_df = pd.read_csv('processed_data/chinese_text_classification_df.csv')
chinese_df_scam = chinese_df[chinese_df['LABEL'] == 1]

# Load data
mendeley_df = pd.read_csv('processed_data/mendeley_df.csv')
mendeley_df_phish = mendeley_df[mendeley_df['LABEL'] == 2]


In [2]:
import torch
from transformers import AutoModelForSequenceClassification, AutoTokenizer, ZeroShotClassificationPipeline
from typing import Union, List
import shap

weights = "facebook/bart-large-mnli"

# Check if GPU is available and set the device accordingly
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
print(f"Using device: {device}")

example_texts = ["U R entitled to Update to the latest colour mobiles with camera for Free! Call The Mobile Update Co FREE on 08002986030",
                 "BankOfAmerica Alert 137943. Please follow http://bit.do/cgjK-and re-activate",
                 "UR awarded a City Break and could WIN a Â£200 Summer Shopping spree every WK. Txt STORE to 88039 . SkilGme. TsCs087147403233",
                 "Apple ID: [BUXCX7GBVwWCcOD Final Notification Your Apple 1D is due to expire today. Prevent this by confirming your Apple ID atï¿½http://verifyapple.uk Apple Inc",
                 "<Forwarded from 448712404000>Please CALL 08712404000 immediately as there is an urgent message waiting for you.",
                 "Free Stuff! Come get it! at 111000",
                 "You need to come right now!",
                 "Warning! Your account is frozen!",
                 "Congrats!"
                 ]
#example_texts = ["I am happy to see you.","That was disappointing.","I would be lying if I said that movie was good."]
hypothesis_template="This phishing indicator of this text is {}."
# Define manipulation categories
categories = ["urgent", "trustworthy", "scary", "authoritative", "rewarding"]

model = AutoModelForSequenceClassification.from_pretrained(weights).to(device)
tokenizer = AutoTokenizer.from_pretrained(weights)

class MyZeroShotClassificationPipeline(ZeroShotClassificationPipeline):
    def __init__(self, model, tokenizer):
        super().__init__(model=model, tokenizer=tokenizer, device=device.index if device.type == 'cuda' else -1)  # Pass device to the superclass
        self.workaround_labels = None
        self.workaround_hypothesis_template = None

    def __call__(self, *args, **kwargs):
        o = super().__call__(args[0], self.workaround_labels, hypothesis_template=self.workaround_hypothesis_template)[0]
        return [[{"label": x[0], "score": x[1]} for x in zip(o["labels"], o["scores"])]]

    def set_labels_workaround(self, labels: Union[str, List[str]]):
        self.workaround_labels = labels

    def set_hypothesis_template_workaround(self, hypothesis_template: str):
        self.workaround_hypothesis_template = hypothesis_template

pipe = MyZeroShotClassificationPipeline(model, tokenizer)

# Set labels and hypothesis template
pipe.set_labels_workaround(categories)
pipe.set_hypothesis_template_workaround(hypothesis_template)
# In the following, we address issue 2.
model.config.label2id.update({v:k for k,v in enumerate(categories)})
model.config.id2label.update({k:v for k,v in enumerate(categories)})


def score_and_visualize(input_texts):
    
    for text in input_texts:

        print(text)
        result = pipe([input_texts])
        print(result)

    #explainer = Explainer(pipe)
    #shap_values = explainer(input_texts)
    #print(shap_values)

    #plots.text(shap_values)

score_and_visualize(example_texts)

Using device: cuda
U R entitled to Update to the latest colour mobiles with camera for Free! Call The Mobile Update Co FREE on 08002986030
[[{'label': 'authoritative', 'score': 0.332027792930603}, {'label': 'scary', 'score': 0.26767945289611816}, {'label': 'urgent', 'score': 0.20151887834072113}, {'label': 'rewarding', 'score': 0.13600435853004456}, {'label': 'trustworthy', 'score': 0.06276952475309372}]]
BankOfAmerica Alert 137943. Please follow http://bit.do/cgjK-and re-activate
[[{'label': 'authoritative', 'score': 0.332027792930603}, {'label': 'scary', 'score': 0.26767945289611816}, {'label': 'urgent', 'score': 0.20151887834072113}, {'label': 'rewarding', 'score': 0.13600435853004456}, {'label': 'trustworthy', 'score': 0.06276952475309372}]]
UR awarded a City Break and could WIN a Â£200 Summer Shopping spree every WK. Txt STORE to 88039 . SkilGme. TsCs087147403233
[[{'label': 'authoritative', 'score': 0.332027792930603}, {'label': 'scary', 'score': 0.26767945289611816}, {'label': '

In [2]:
!pip install stormtrooper[torch]

Collecting stormtrooper[torch]
  Using cached stormtrooper-0.4.1-py3-none-any.whl.metadata (6.2 kB)
Collecting aiohttp<4.0.0,>=3.8.0 (from stormtrooper[torch])
  Downloading aiohttp-3.9.5-cp312-cp312-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (7.5 kB)
Collecting thefuzz<0.19.0,>=0.18.0 (from stormtrooper[torch])
  Using cached thefuzz-0.18.0.tar.gz (29 kB)
  Preparing metadata (setup.py) ... [?25ldone
Collecting aiosignal>=1.1.2 (from aiohttp<4.0.0,>=3.8.0->stormtrooper[torch])
  Using cached aiosignal-1.3.1-py3-none-any.whl.metadata (4.0 kB)
Collecting frozenlist>=1.1.1 (from aiohttp<4.0.0,>=3.8.0->stormtrooper[torch])
  Using cached frozenlist-1.4.1-cp312-cp312-manylinux_2_5_x86_64.manylinux1_x86_64.manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (12 kB)
Collecting multidict<7.0,>=4.5 (from aiohttp<4.0.0,>=3.8.0->stormtrooper[torch])
  Using cached multidict-6.0.5-cp312-cp312-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (4.2 kB)
Collecting yarl<2.0,>=

In [2]:
import torch
import pandas as pd
from stormtrooper import ZeroShotClassifier

print(torch.cuda.is_available())

# Load the dataset
mendeley_df = pd.read_csv('processed_data/mendeley_df.csv')

# Ensure all texts in the 'CLEANED_TEXT' column are strings
mendeley_df['CLEANED_TEXT'] = mendeley_df['CLEANED_TEXT'].astype(str)

# Initialize and move the model to the GPU
manipulation_model = ZeroShotClassifier("facebook/bart-large-mnli", device="cuda:0")
sentiment_model = ZeroShotClassifier("facebook/bart-large-mnli", device="cuda:0")
topic_model = ZeroShotClassifier("facebook/bart-large-mnli", device="cuda:0")

# Fit the model (fit method may not use data, so no need to move data to GPU here)
manipulation_model.fit(None, ["urgency", "invoking fear", "authority", "incentives", "impersonation"])
sentiment_model.fit(None, ["positive", "negative", "neutral"])
topic_model.fit(None, ["finance", "government", "retail", "shipping", "impersonation of someone they know"])

# Extract texts from the DataFrame
texts = mendeley_df['CLEANED_TEXT'].tolist()

# Predict manipulation
mendeley_df['manipulation'] = manipulation_model.predict(texts)

# Predict sentiment
mendeley_df['sentiment'] = sentiment_model.predict(texts)

# Predict topic
mendeley_df['topic'] = topic_model.predict(texts)

# Visualize the DataFrame as a table
print(mendeley_df[['CLEANED_TEXT', 'manipulation', 'sentiment', 'topic']])

# Save the DataFrame to a CSV file
mendeley_df.to_csv('predictions_mendeley.csv', index=False)


True


100%|██████████| 5971/5971 [06:04<00:00, 16.40it/s]
100%|██████████| 5971/5971 [03:38<00:00, 27.28it/s]
100%|██████████| 5971/5971 [06:07<00:00, 16.23it/s]


                                           CLEANED_TEXT   manipulation  \
0     your opinion about me 1 over 2 jada 3 kusruthi...  impersonation   
1     whats up do you want me to come online if you ...        urgency   
2                           so u workin overtime nigpun        urgency   
3     also sir i sent you an email about how to log ...      authority   
4     please stay at home to encourage the notion of...     incentives   
...                                                 ...            ...   
5966                                  but your not here  impersonation   
5967  becoz its  ltgt  jan whn al the post ofice is ...  invoking fear   
5968  its a valentine game   send dis msg to all ur ...     incentives   
5969                               we r outside already        urgency   
5970  the xmas story is peace the xmas msg is love t...      authority   

     sentiment                               topic  
0     negative                            shipping  
1    

In [16]:
# Load the predictions dataset
df = pd.read_csv('predictions.csv')

# Calculate the percentage of each type of prediction
prediction_counts = df['prediction'].value_counts(normalize=True) * 100

# Convert the result to a DataFrame for better visualization
prediction_percentages = prediction_counts.reset_index()
prediction_percentages.columns = ['Prediction', 'Percentage']

# Display the result
print(prediction_percentages)

      Prediction  Percentage
0     incentives   41.720779
1        urgency   34.577922
2      authority   14.935065
3          trust    6.331169
4  impersonation    1.948052
5           fear    0.487013


In [1]:
import torch
import pandas as pd
from transformers import pipeline, AutoTokenizer, AutoModelForSequenceClassification
from datasets import Dataset
import time

def classify_batch(texts, classifier, candidate_labels):
    return [classifier(text, candidate_labels=candidate_labels) for text in texts]

def classify_texts(dataset, text_column, candidate_labels, model_name, batch_size=16):
    # Check if GPU is available and set device
    device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

    # Initialize the tokenizer and model
    tokenizer = AutoTokenizer.from_pretrained(model_name)
    model = AutoModelForSequenceClassification.from_pretrained(model_name)
    model.to(device)
    
    # Initialize the zero-shot classification pipeline
    classifier = pipeline(
        "zero-shot-classification",
        model=model,
        tokenizer=tokenizer,
        device=0 if torch.cuda.is_available() else -1
    )

    def preprocess_function(examples):
        return tokenizer(examples[text_column], padding='max_length', truncation=True, max_length=512)

    # Ensure that the text column is correctly extracted
    tokenized_dataset = dataset.map(lambda examples: preprocess_function(examples), batched=True)

    # Split dataset into batches
    batch_predictions = []
    for i in range(0, len(tokenized_dataset), batch_size):
        batch = tokenized_dataset.select(range(i, min(i + batch_size, len(tokenized_dataset))))
        texts = [example[text_column] for example in batch[text_column]]
        batch_preds = classify_batch(texts, classifier, candidate_labels)
        batch_predictions.extend(batch_preds)

    # Extract the highest score label for each text
    highest_score_labels = [pred['labels'][pred['scores'].index(max(pred['scores']))] for pred in batch_predictions]

    return highest_score_labels

def classify_task(df, text_column, task_type):
    # Define candidate labels based on the task type and DataFrame language
    if df.equals(fbs_df) or df.equals(chinese_df):
        model_name = "joeddav/xlm-roberta-large-xnli"
        if task_type == "manipulation":
            candidate_labels = ["紧急", "威胁", "权威机构", "奖励", "冒充熟人"]
        elif task_type == "sentiment":
            candidate_labels = ["正面", "负面", "中性"]
        elif task_type == "topic":
            candidate_labels = ["金融机构", "政府", "交易", "快递", "冒充熟人"]
        else:
            raise ValueError("Invalid task type. Choose from 'manipulation', 'sentiment', or 'topic'.")
    else:
        model_name = "facebook/bart-large-mnli"
        if task_type == "manipulation":
            candidate_labels = ["urgency", "invoking fear", "authority", "incentives", "impersonation"]
        elif task_type == "sentiment":
            candidate_labels = ["positive", "negative", "neutral"]
        elif task_type == "topic":
            candidate_labels = ["finance", "government", "retail", "shipping", "impersonation of someone they know"]
        else:
            raise ValueError("Invalid task type. Choose from 'manipulation', 'sentiment', or 'topic'.")

    # Convert DataFrame to Hugging Face Dataset
    dataset = Dataset.from_pandas(df)

    # Make predictions and add results to the DataFrame
    df[f'{task_type}_label'] = classify_texts(dataset, text_column, candidate_labels, model_name=model_name)
    return df

# Load datasets
print("Load datasets")
fbs_df = pd.read_csv('processed_data/fbs_sms_df.csv')
chinese_df = pd.read_csv('processed_data/chinese_text_classification_df.csv')
mendeley_df = pd.read_csv('processed_data/mendeley_df.csv')
uci_df = pd.read_csv('processed_data/uci_df.csv')

# Classify each task and keep results in the original DataFrame
datasets = [("fbs_df", fbs_df), ("chinese_df", chinese_df), ("mendeley_df", mendeley_df), ("uci_df", uci_df)]
tasks = ['manipulation', 'sentiment', 'topic']

for name, df in datasets:
    for task in tasks:
        start_time = time.time()
        print(f"Processing {name} for {task}")
        df = classify_task(df, 'CLEANED_TEXT', task)
        duration = time.time() - start_time
        print(f"Completed {name} for {task} in {duration:.2f} seconds")
    df.to_csv(f"results_{name}.csv", index=False)

print("DONE")


Processing mendeley_df for manipulation


You seem to be using the pipelines sequentially on GPU. In order to maximize efficiency please use a dataset


TypeError: object of type 'float' has no len()

## try shap

In [None]:
import torch
from transformers import AutoModelForSequenceClassification, AutoTokenizer, ZeroShotClassificationPipeline
from shap import Explainer, plots
from typing import Union, List

weights = "facebook/bart-large-mnli"

# Check if GPU is available and set the device accordingly
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
print(f"Using device: {device}")

example_texts = ["Please Stay At Home. To encourage the notion of staying at home. All tax-paying citizens are entitled to �305.96 or more emergency refund. smsg.io/fCVbD", 
                 "BankOfAmerica Alert 137943. Please follow http://bit.do/cgjK-and re-activate",
                 "UR awarded a City Break and could WIN a £200 Summer Shopping spree every WK. Txt STORE to 88039 . SkilGme. TsCs087147403233",
                 "Apple ID: [BUXCX7GBVwWCcOD Final Notification Your Apple 1D is due to expire today. Prevent this by confirming your Apple ID at�http://verifyapple.uk Apple Inc"
                 ]
#example_texts = ["I am happy to see you.","That was disappointing.","I would be lying if I said that movie was good."]
labels = ["urgency", "trust", "fear", "authority", "incentives", "impersonation"]
hypothesis_template="This Phishing text envokes {}."

model = AutoModelForSequenceClassification.from_pretrained(weights).to(device)
tokenizer = AutoTokenizer.from_pretrained(weights)

class MyZeroShotClassificationPipeline(ZeroShotClassificationPipeline):
    def __init__(self, model, tokenizer):
        super().__init__(model=model, tokenizer=tokenizer, device=device.index if device.type == 'cuda' else -1)  # Pass device to the superclass
        self.workaround_labels = None
        self.workaround_hypothesis_template = None

    def __call__(self, *args, **kwargs):
        o = super().__call__(args[0], self.workaround_labels, hypothesis_template=self.workaround_hypothesis_template)[0]
        return [[{"label": x[0], "score": x[1]} for x in zip(o["labels"], o["scores"])]]

    def set_labels_workaround(self, labels: Union[str, List[str]]):
        self.workaround_labels = labels

    def set_hypothesis_template_workaround(self, hypothesis_template: str):
        self.workaround_hypothesis_template = hypothesis_template

pipe = MyZeroShotClassificationPipeline(model, tokenizer)

# Set labels and hypothesis template
pipe.set_labels_workaround(labels)
pipe.set_hypothesis_template_workaround(hypothesis_template)
# In the following, we address issue 2.
model.config.label2id.update({v:k for k,v in enumerate(labels)})
model.config.id2label.update({k:v for k,v in enumerate(labels)})


def score_and_visualize(input_texts):
    
    for text in input_texts:
        print(text)
        result = pipe([text])
        print(result)

    explainer = Explainer(pipe)
    shap_values = explainer(input_texts)
    print(shap_values)

    plots.text(shap_values)

score_and_visualize(example_texts)

In [7]:


# Initialize and move the model to the GPU
model_zh = ZeroShotClassifier("joeddav/xlm-roberta-large-xnli",device="cuda:0")

# Fit the model (fit method may not use data, so no need to move data to GPU here)
model.fit(None, ["紧迫", "信任", "恐惧", "权威", "奖励", "冒充"])

# Make predictions on the texts in the DataFrame
texts = fbs_df_phish['TEXT'].tolist()


# Predict
fbs_df_phish['prediction'] = model.predict(texts)

# Set the output format to pandas DataFrame and transform the texts (optional)
model.set_output(transform="pandas")
transformed_output = model.transform(texts)

# Visualize the DataFrame as a table
print(fbs_df_phish[['TEXT', 'prediction']])


Some weights of the model checkpoint at joeddav/xlm-roberta-large-xnli were not used when initializing XLMRobertaForSequenceClassification: ['roberta.pooler.dense.bias', 'roberta.pooler.dense.weight']
- This IS expected if you are initializing XLMRobertaForSequenceClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing XLMRobertaForSequenceClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
  1%|          | 26/3069 [00:02<05:22,  9.44it/s]


KeyboardInterrupt: 

In [17]:
# Calculate the percentage of each type of prediction
prediction_counts = fbs_df_phish['prediction'].value_counts(normalize=True) * 100

# Convert the result to a DataFrame for better visualization
prediction_percentages = prediction_counts.reset_index()
prediction_percentages.columns = ['Prediction', 'Percentage']

# Display the result
print(prediction_percentages)

  Prediction  Percentage
0         奖励   33.496253
1         权威   19.322255
2         信任   17.627892
3         紧迫   15.053763
4         恐惧   13.489736
5         冒充    1.010101


In [4]:
# Initialize and move the model to the GPU
model_zh = ZeroShotClassifier("bert-base-chinese",device="cuda:0")

# Fit the model (fit method may not use data, so no need to move data to GPU here)
model.fit(None, ["紧迫", "信任", "恐惧", "权威", "奖励", "冒充"])
texts = ["尊敬 的 CELLPHONE 用户 中国联通 NAME 提示 您 有 g 专属 流量","你的账号已被盗，拨打电话","恭喜你获得大奖","公安局提醒你","马上行动"]

# Set the output format to pandas DataFrame and transform the texts (optional)
model.set_output(transform="pandas")
transformed_output = model.transform(texts)
print(transformed_output)

config.json:   0%|          | 0.00/624 [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/412M [00:00<?, ?B/s]

Some weights of the model checkpoint at bert-base-chinese were not used when initializing BertForMaskedLM: ['bert.pooler.dense.bias', 'bert.pooler.dense.weight', 'cls.seq_relationship.bias', 'cls.seq_relationship.weight']
- This IS expected if you are initializing BertForMaskedLM from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertForMaskedLM from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).


tokenizer_config.json:   0%|          | 0.00/49.0 [00:00<?, ?B/s]

vocab.txt:   0%|          | 0.00/110k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/269k [00:00<?, ?B/s]

100%|██████████| 5/5 [00:00<00:00, 13.34it/s]

         奖励        权威        冒充        信任        紧迫        恐惧
0  0.201076  0.185511  0.132428  0.141178  0.181802  0.158007
1  0.203453  0.159781  0.134546  0.138986  0.205607  0.157627
2  0.250413  0.173183  0.136768  0.154591  0.107620  0.177425
3  0.109924  0.096179  0.471475  0.104606  0.048869  0.168948
4  0.164172  0.188040  0.158268  0.169198  0.132301  0.188020





In [13]:
import torch
import pandas as pd
from transformers import pipeline

# Check if GPU is available and set device
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

# Initialize the zero-shot classification pipeline
classifier = pipeline("zero-shot-classification", model="shenzhi-wang/Llama3-8B-Chinese-Chat", device=0 if torch.cuda.is_available() else -1)

# Define candidate labels in Chinese
candidate_labels = ["紧迫感", "恐吓", "权威", "奖励", "冒充"]

# Texts for classification
texts = ["尊敬 的 CELLPHONE 用户 中国联通 NAME 提示 您 有 g 专属 流量", "你的账号已被盗，拨打电话", "恭喜你获得大奖，免费领取", "公安局提醒你，你很危险", "马上行动，现在！"]

# Make predictions on the texts
predictions = classifier(texts, candidate_labels=candidate_labels)

# Convert predictions to a pandas DataFrame
predictions_df = pd.DataFrame(predictions)


# Extract the highest score label for each text
highest_score_labels = []
for pred in predictions:
    max_score_idx = pred['scores'].index(max(pred['scores']))
    highest_score_labels.append(pred['labels'][max_score_idx])

# Create a DataFrame to visualize the results
results_df = pd.DataFrame({'text': texts, 'highest_score_label': highest_score_labels})
# Extract and format the results
transformed_output = predictions_df[['sequence', 'labels', 'scores']]

# Print the results
print(transformed_output)
# Print the results
print(results_df)


config.json:   0%|          | 0.00/649 [00:00<?, ?B/s]

model.safetensors.index.json:   0%|          | 0.00/23.9k [00:00<?, ?B/s]

Downloading shards:   0%|          | 0/4 [00:00<?, ?it/s]

model-00001-of-00004.safetensors:   0%|          | 0.00/4.98G [00:00<?, ?B/s]

model-00002-of-00004.safetensors:   0%|          | 0.00/5.00G [00:00<?, ?B/s]

model-00003-of-00004.safetensors:   0%|          | 0.00/4.92G [00:00<?, ?B/s]

model-00004-of-00004.safetensors:   0%|          | 0.00/1.17G [00:00<?, ?B/s]

Loading checkpoint shards:   0%|          | 0/4 [00:00<?, ?it/s]

Some weights of LlamaForSequenceClassification were not initialized from the model checkpoint at shenzhi-wang/Llama3-8B-Chinese-Chat and are newly initialized: ['score.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


tokenizer_config.json:   0%|          | 0.00/51.3k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/9.08M [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/97.0 [00:00<?, ?B/s]

Special tokens have been added in the vocabulary, make sure the associated word embeddings are fine-tuned or trained.


OutOfMemoryError: CUDA out of memory. Tried to allocate 224.00 MiB. GPU 

In [3]:
import torch
import pandas as pd
from transformers import pipeline, AutoTokenizer, AutoModelForSequenceClassification

def classify_texts(df, text_column, candidate_labels, model_name="joeddav/xlm-roberta-large-xnli", n_rows=100):
    # Check if GPU is available and set device
    device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

    # Initialize the tokenizer and model
    tokenizer = AutoTokenizer.from_pretrained(model_name)
    model = AutoModelForSequenceClassification.from_pretrained(model_name)
    model.to(device)
    
    # Initialize the zero-shot classification pipeline
    classifier = pipeline(
        "zero-shot-classification",
        model=model,
        tokenizer=tokenizer,
        device=0 if torch.cuda.is_available() else -1
    )

    # Extract the first n_rows from the DataFrame
    texts = df[text_column].head(n_rows).tolist()

    # Make predictions on the texts
    predictions = [classifier(text, candidate_labels=candidate_labels) for text in texts]

    # Extract the highest score label for each text
    highest_score_labels = []
    for pred in predictions:
        max_score_idx = pred['scores'].index(max(pred['scores']))
        highest_score_labels.append(pred['labels'][max_score_idx])

    # Create a DataFrame to visualize the results
    results_df = pd.DataFrame({'text': texts, 'highest_score_label': highest_score_labels})

    # Print the results
    print(results_df)
    return results_df


chinese_df = pd.DataFrame({
    'TEXT': [
        "尊敬 的 CELLPHONE 用户 中国联通 NAME 提示 您 有 g 专属 流量",
        "警告，你的账号已被盗，马上拨打电话才能找回",
        "恭喜你！获得大奖，免费领取，中奖了",
        "公安局提醒你，请上传身份证",
        "妈妈，我是小明，点开这个相册"
    ]  # Repeat the texts to ensure we have more than 100 rows
})




# Define candidate labels in Chinese
candidate_labels_zh = ["紧迫", "恐吓", "冒充权威机构", "奖励", "冒充熟人"]
candidate_labels_en = ["urgency", "trust", "fear", "authority", "incentives", "impersonation"]
# Call the function on the DataFrame's 'TEXT' column
results_df = classify_texts(mendeley_df_phish, 'TEXT', candidate_labels_en,model_name="facebook/bart-large-mnli" ,n_rows=100)


You seem to be using the pipelines sequentially on GPU. In order to maximize efficiency please use a dataset


                                                 text highest_score_label
0   Please Stay At Home. To encourage the notion o...          incentives
1   BankOfAmerica Alert 137943. Please follow http...             urgency
2   UR awarded a City Break and could WIN a £200 S...          incentives
3   \tYOU HAVE WON! As a valued Vodafone customer ...          incentives
4   Phony £350 award - Todays Voda numbers ending ...       impersonation
..                                                ...                 ...
95  URGENT! We are trying to contact U Todays draw...             urgency
96  You have WON a guaranteed £1000 cash or a £200...           authority
97  Nationwide has noticed your debit card was rec...             urgency
98  09066362231 URGENT! Your mobile No 07xxxxxxxxx...             urgency
99  PRIVATE! Your 2003 Account Statement for 07753...               trust

[100 rows x 2 columns]
