# Cyber Threat Detection Project

<img src="https://www.tierpoint.com/wp-content/uploads/2023/06/Navigating-the-Cyber-Threat-Landscape-in-2023_blog-1.png.webp">

In [116]:
# pip install torch transformers datasets scikit-learn matplotlib evaluate

### Import Libraries

In [117]:
import pandas as pd
from transformers import DistilBertTokenizerFast, DistilBertForTokenClassification, Trainer, TrainingArguments
from transformers import DataCollatorForTokenClassification
import torch
from sklearn.model_selection import train_test_split


### Import Dataset

In [None]:
import kagglehub

path = kagglehub.dataset_download("ramoliyafenil/text-based-cyber-threat-detection")

print("Path to dataset files:", path)

Path to dataset files: /Users/zafer/.cache/kagglehub/datasets/ramoliyafenil/text-based-cyber-threat-detection/versions/1


### Dataset Set to Model

In [None]:
df = pd.read_csv(path + "/cyber-threat-intelligence_all.csv")
df_processed= pd.read_csv(path + "/Cyber-Threat-Intelligence-Custom-Data_new_processed.csv")

In [None]:
# only text - label_1 and diagnonis
df_processed = df_processed[['text', 'label_1', 'diagnosis']]
#label_1 rename to label
df_processed.rename(columns={'label_1': 'label'}, inplace=True)

In [None]:
selected_labels = ['malware', 'attack-pattern', 'threat-actor']
data_processed = df_processed[df_processed['label'].isin(selected_labels)]
data_processed.shape

(253, 3)

In [119]:
df = df.dropna(subset=['label'])

selected_labels = ['malware', 'attack-pattern', 'threat-actor']
data = df[df['label'].isin(selected_labels)]
data = data[['text','label']]
data.shape

(4007, 2)

In [None]:

data_without_diagnosis = data[~data['text'].isin(data_processed['text'])]
data_with_diagnosis = data_processed.copy()

def auto_generate_diagnosis(label):
    if "malware" in label:
        return "Malicious Software Detected"
    elif "attack-pattern" in label:
        return "Suspicious Attack Pattern Identified"
    elif "threat-actor" in label:
        return "Potential Threat Actor Detected"
    else:
        return "Unknown Activity Detected"


data_without_diagnosis['diagnosis'] = data_without_diagnosis['label'].apply(auto_generate_diagnosis)

combined_data = pd.concat([data_with_diagnosis, data_without_diagnosis], ignore_index=True)
combined_data.head()

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  data_without_diagnosis['diagnosis'] = data_without_diagnosis['label'].apply(auto_generate_diagnosis)


Unnamed: 0,text,label,diagnosis
0,A cybersquatting domain save-russia[.]today is...,attack-pattern,The diagnosis is a cyber attack that involves ...
1,While analyzing the technical details of this ...,threat-actor,Diagnosis: APT37/Reaper/Group 123 is responsib...
2,We named these malicious apps AnubisSpy (ANDRO...,malware,"The diagnosis is a case of malware infection, ..."
3,Base64 encoded ANEL downloaded by Koadic ANEL ...,malware,The entity has been infected by the ANEL malwa...
4,BIOPASS RAT Loader Backdoor.Win64.BIOPASS.A ...,malware,The diagnosis is a cybersecurity threat caused...


In [99]:
combined_data.shape

(3569, 3)

In [100]:
combined_data['combined_label'] = combined_data.apply(lambda row: f"{row['label']}_{row['diagnosis']}", axis=1)
combined_data.drop(columns=['label', 'diagnosis'], inplace=True)

In [108]:
combined_data.sample(10)

Unnamed: 0,text,combined_label
976,Masquerading: Match Legitimate Name or Locatio...,attack-pattern_Suspicious Attack Pattern Ident...
2751,Figure 6. C/C++ version of RATANKBA Figure...,attack-pattern_Suspicious Attack Pattern Ident...
351,And we expect to continue to see Hancitor be a...,malware_Malicious Software Detected
929,Figure 1. Stages of Earth Centaur’s intr...,threat-actor_Potential Threat Actor Detected
721,These kinds of attacks are called “Distributed...,attack-pattern_Suspicious Attack Pattern Ident...
670,"This practice is known as triple extortion, a ...",threat-actor_Potential Threat Actor Detected
781,test4.exe BIOPASS RAT Loader,malware_Malicious Software Detected
70,"On Feb. 23, a new variant of wiper malware nam...",malware_HermeticWiper is a variant of wiper ma...
1234,A third SharpChisel instance that connects to ...,attack-pattern_Suspicious Attack Pattern Ident...
1210,Only three AV vendors correctly identified it ...,malware_Malicious Software Detected


### Model Settings and Training

In [None]:
# Gerekli kütüphaneleri içe aktarma
from transformers import (
    DistilBertTokenizerFast,
    DistilBertForTokenClassification,
    TrainingArguments,
    Trainer,
    DataCollatorForTokenClassification
)
from datasets import Dataset
from sklearn.model_selection import train_test_split
import pandas as pd
import numpy as np
import torch


def convert_to_dataset(df):
    return Dataset.from_pandas(df)

train_df, test_df = train_test_split(combined_data, test_size=0.2, random_state=42)


train_dataset = convert_to_dataset(train_df)
test_dataset = convert_to_dataset(test_df)

tokenizer = DistilBertTokenizerFast.from_pretrained('distilbert-base-uncased')

label_list = sorted(list(set(train_df['combined_label'].tolist())))
label_to_id = {label: i for i, label in enumerate(label_list)}
id_to_label = {i: label for i, label in enumerate(label_list)}

def tokenize_and_prepare(examples):
    tokenized_inputs = tokenizer(
        examples["text"],
        truncation=True,
        padding="max_length",
        max_length=128,
    )

    labels = []
    for i, combined_label in enumerate(examples["combined_label"]):
        word_ids = tokenized_inputs.word_ids(batch_index=i)
        label_id = label_to_id.get(combined_label, -100)
        label_ids = [-100 if word_id is None else label_id for word_id in word_ids]
        labels.append(label_ids)

    tokenized_inputs["labels"] = labels
    return tokenized_inputs

train_dataset = train_dataset.map(
    tokenize_and_prepare,
    batched=True,
    remove_columns=train_dataset.column_names  
)

test_dataset = test_dataset.map(
    tokenize_and_prepare,
    batched=True,
    remove_columns=test_dataset.column_names  
)


model = DistilBertForTokenClassification.from_pretrained(
    'distilbert-base-uncased',
    num_labels=len(label_list)
)

training_args = TrainingArguments(
    output_dir='./results',
    eval_steps=500,  
    logging_steps=500,
    per_device_train_batch_size=16,
    per_device_eval_batch_size=16,
    num_train_epochs=3,
    weight_decay=0.01,
    logging_dir='./logs',
)


trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=train_dataset,
    eval_dataset=test_dataset,
    data_collator=DataCollatorForTokenClassification(tokenizer=tokenizer)
)


trainer.train()


evaluation_results = trainer.evaluate()
print(f"Evaluation results: {evaluation_results}")


model.save_pretrained('./saved_model')
tokenizer.save_pretrained('./saved_model')



Map:   0%|          | 0/2855 [00:00<?, ? examples/s]

Map:   0%|          | 0/714 [00:00<?, ? examples/s]

Some weights of DistilBertForTokenClassification were not initialized from the model checkpoint at distilbert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Step,Training Loss
500,1.134


Evaluation results: {'eval_loss': 0.2582714259624481, 'eval_runtime': 6.2477, 'eval_samples_per_second': 114.281, 'eval_steps_per_second': 7.203, 'epoch': 3.0}


('./saved_model/tokenizer_config.json',
 './saved_model/special_tokens_map.json',
 './saved_model/vocab.txt',
 './saved_model/added_tokens.json',
 './saved_model/tokenizer.json')

### Load Saved Model

In [None]:
model = DistilBertForTokenClassification.from_pretrained('./saved_model')
tokenizer = DistilBertTokenizerFast.from_pretrained('./saved_model')

model.eval()

device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
model.to(device)

DistilBertForTokenClassification(
  (distilbert): DistilBertModel(
    (embeddings): Embeddings(
      (word_embeddings): Embedding(30522, 768, padding_idx=0)
      (position_embeddings): Embedding(512, 768)
      (LayerNorm): LayerNorm((768,), eps=1e-12, elementwise_affine=True)
      (dropout): Dropout(p=0.1, inplace=False)
    )
    (transformer): Transformer(
      (layer): ModuleList(
        (0-5): 6 x TransformerBlock(
          (attention): DistilBertSdpaAttention(
            (dropout): Dropout(p=0.1, inplace=False)
            (q_lin): Linear(in_features=768, out_features=768, bias=True)
            (k_lin): Linear(in_features=768, out_features=768, bias=True)
            (v_lin): Linear(in_features=768, out_features=768, bias=True)
            (out_lin): Linear(in_features=768, out_features=768, bias=True)
          )
          (sa_layer_norm): LayerNorm((768,), eps=1e-12, elementwise_affine=True)
          (ffn): FFN(
            (dropout): Dropout(p=0.1, inplace=False)
   

### Prediction

In [None]:
def predict_example(text):
    inputs = tokenizer(text, return_tensors="pt", truncation=True, padding=True)
    inputs = {key: value.to(device) for key, value in inputs.items()}

    with torch.no_grad():
        outputs = model(**inputs)

    logits = outputs.logits
    predictions = logits.argmax(dim=-1).squeeze().tolist()

    if isinstance(predictions[0], list):
        predictions = predictions[0]

    predicted_labels = [id_to_label.get(pred, "O") for pred in predictions]

    return predicted_labels

In [115]:
sample_text = "At the time, we found ChessMaster targeting different sectors from the academe to media and government agencies in Japan.."
predicted_labels = predict_example(sample_text)
print(f"Sample text: {sample_text}")
print(f"Predicted label: {predicted_labels[0]}")

Sample text: At the time, we found ChessMaster targeting different sectors from the academe to media and government agencies in Japan..
Predicted label: threat-actor_Potential Threat Actor Detected
