In [1]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here's several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
for dirname, _, filenames in os.walk('/kaggle/input'):
    for filename in filenames:
        print(os.path.join(dirname, filename))

# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

/kaggle/input/cyberguard/test1.csv
/kaggle/input/cyberguard/train1.csv


In [2]:
from transformers import AutoTokenizer, AutoModelForSequenceClassification, Trainer, TrainingArguments
import torch
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder
import pandas as pd
from datasets import Dataset

In [3]:
# Load the data
file_path = '/kaggle/input/cyberguard/train1.csv'
data = pd.read_csv(file_path)

# Drop rows with missing values in 'crimeaditionalinfo' or 'category'
data = data.dropna(subset=['crimeaditionalinfo', 'category'])

In [4]:
label_encoder = LabelEncoder()
data['category_encoded'] = label_encoder.fit_transform(data['category'])

In [5]:
train_texts, val_texts, train_labels, val_labels = train_test_split(
    data['crimeaditionalinfo'].tolist(), data['category_encoded'].tolist(), test_size=0.2, random_state=42
)

In [6]:
tokenizer = AutoTokenizer.from_pretrained('google-bert/bert-base-uncased')
model = AutoModelForSequenceClassification.from_pretrained(
    'google-bert/bert-base-uncased', 
    num_labels=len(label_encoder.classes_)
)

tokenizer_config.json:   0%|          | 0.00/48.0 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/570 [00:00<?, ?B/s]

vocab.txt:   0%|          | 0.00/232k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/466k [00:00<?, ?B/s]



model.safetensors:   0%|          | 0.00/440M [00:00<?, ?B/s]

Some weights of BertForSequenceClassification were not initialized from the model checkpoint at google-bert/bert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [7]:
train_encodings = tokenizer(train_texts, truncation=True, padding=True, max_length=512)
val_encodings = tokenizer(val_texts, truncation=True, padding=True, max_length=512)

In [8]:
class CrimeDataset(torch.utils.data.Dataset):
    def __init__(self, encodings, labels):
        self.encodings = encodings
        self.labels = labels
    
    def __getitem__(self, idx):
        item = {key: torch.tensor(val[idx]) for key, val in self.encodings.items()}
        item['labels'] = torch.tensor(self.labels[idx])
        return item
    
    def __len__(self):
        return len(self.labels)

train_dataset = CrimeDataset(train_encodings, train_labels)
val_dataset = CrimeDataset(val_encodings, val_labels)

In [9]:
training_args = TrainingArguments(
    output_dir='./results',
    evaluation_strategy="epoch",
    learning_rate=2e-5,
    per_device_train_batch_size=32,
    per_device_eval_batch_size=32,
    num_train_epochs=2,
    weight_decay=0.01,
    report_to=[]
)

# Initialize Trainer
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=train_dataset,
    eval_dataset=val_dataset,
)

# Train the model
trainer.train()



Epoch,Training Loss,Validation Loss
1,0.6747,0.650005
2,0.6207,0.6392


TrainOutput(global_step=4684, training_loss=0.6848232666719275, metrics={'train_runtime': 8444.9362, 'train_samples_per_second': 17.746, 'train_steps_per_second': 0.555, 'total_flos': 3.943547764140442e+16, 'train_loss': 0.6848232666719275, 'epoch': 2.0})

In [10]:
from sklearn.metrics import classification_report

# Get predictions on the validation set
predictions = trainer.predict(val_dataset)

# Extract predicted class labels
pred_labels = predictions.predictions.argmax(-1)

# Extract true labels from val_dataset
# true_labels = [label for label in val_dataset['labels']]
unique_labels_in_val = list(set(val_labels))

# Generate and print the classification report
print(classification_report(val_labels, pred_labels,labels=unique_labels_in_val, target_names=label_encoder.classes_))

                                                      precision    recall  f1-score   support

                               Any Other Cyber Crime       0.51      0.32      0.39      2142
Child Pornography CPChild Sexual Abuse Material CSAM       0.55      0.21      0.31        84
                                Cryptocurrency Crime       0.58      0.47      0.52        92
                      Cyber Attack/ Dependent Crimes       1.00      1.00      1.00       762
                                     Cyber Terrorism       0.00      0.00      0.00        38
      Hacking  Damage to computercomputer system etc       0.40      0.35      0.38       337
                            Online Cyber Trafficking       0.00      0.00      0.00        33
                              Online Financial Fraud       0.83      0.95      0.89     11470
                            Online Gambling  Betting       0.00      0.00      0.00        91
               Online and Social Media Related Crime       

  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))


In [11]:
import pandas as pd
from sklearn.metrics import classification_report

# Load the dataset
df = pd.read_csv('/kaggle/input/cyberguard/test1.csv')
df = df.dropna(subset=['crimeaditionalinfo', 'category'])

# Filter out specific categories
df = df[~df['category'].isin(['Crime Against Women & Children'])]
# Encode the labels
df['label'] = label_encoder.transform(df['category'])  # Ensure label_encoder is defined and fitted

# Prepare the text and tokenize
text = df['crimeaditionalinfo']  # Check for correct column name
test_encoding = tokenizer(text.tolist(), truncation=True, padding=True, max_length=512)  # Convert to list if necessary

# Extract labels
labels = df['label'].tolist()  # Ensure this is a list

# Create the dataset
test = CrimeDataset(test_encoding, labels)

# Get predictions on the test set
predictions = trainer.predict(test)

# Extract predicted class labels
pred_labels = predictions.predictions.argmax(-1)

# Generate and print the classification report
unique_labels_in_val = list(set(labels))  # Ensure unique labels for report
print(classification_report(labels, pred_labels, labels=unique_labels_in_val, target_names=label_encoder.classes_))

                                                      precision    recall  f1-score   support

                               Any Other Cyber Crime       0.49      0.28      0.35      3670
Child Pornography CPChild Sexual Abuse Material CSAM       0.55      0.28      0.37       123
                                Cryptocurrency Crime       0.58      0.50      0.54       166
                      Cyber Attack/ Dependent Crimes       1.00      1.00      1.00      1261
                                     Cyber Terrorism       0.00      0.00      0.00        52
      Hacking  Damage to computercomputer system etc       0.37      0.35      0.36       592
                            Online Cyber Trafficking       0.00      0.00      0.00        61
                              Online Financial Fraud       0.83      0.95      0.88     18890
                            Online Gambling  Betting       0.50      0.01      0.01       134
               Online and Social Media Related Crime       

  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))


In [12]:
df.describe()

Unnamed: 0,label
count,31218.0
mean,6.560382
std,3.041273
min,0.0
25%,7.0
50%,7.0
75%,7.0
max,14.0
