In [1]:
import pandas as pd

# Load the uploaded CSV file to understand its structure and content.
file_path = '/kaggle/input/cyberguard/train1.csv'
data = pd.read_csv(file_path)

file_path = '/kaggle/input/cyberguard/test1.csv'
data_test = pd.read_csv(file_path)

# Display the first few rows of the data for inspection.
data.head()

Unnamed: 0,category,sub_category,crimeaditionalinfo
0,Online and Social Media Related Crime,Cyber Bullying Stalking Sexting,I had continue received random calls and abusi...
1,Online Financial Fraud,Fraud CallVishing,The above fraudster is continuously messaging ...
2,Online Gambling Betting,Online Gambling Betting,He is acting like a police and demanding for m...
3,Online and Social Media Related Crime,Online Job Fraud,In apna Job I have applied for job interview f...
4,Online Financial Fraud,Fraud CallVishing,I received a call from lady stating that she w...


In [2]:
data.value_counts('category')

category
Online Financial Fraud                                  57434
Online and Social Media Related Crime                   12140
Any Other Cyber Crime                                   10878
Cyber Attack/ Dependent Crimes                           3608
RapeGang Rape RGRSexually Abusive Content                2822
Sexually Obscene material                                1838
Hacking  Damage to computercomputer system etc           1710
Sexually Explicit Act                                    1552
Cryptocurrency Crime                                      480
Online Gambling  Betting                                  444
Child Pornography CPChild Sexual Abuse Material CSAM      379
Online Cyber Trafficking                                  183
Cyber Terrorism                                           161
Ransomware                                                 56
Report Unlawful Content                                     1
Name: count, dtype: int64

In [3]:
data = data.dropna(subset=['crimeaditionalinfo'])
data_test = data_test.dropna(subset=['crimeaditionalinfo'])

In [4]:
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier
from sklearn.svm import SVC
from sklearn.naive_bayes import MultinomialNB
from sklearn.metrics import classification_report
from sklearn.pipeline import make_pipeline

# Extract the relevant columns and split the data
X = data['crimeaditionalinfo']  # Textual descriptions as input
y = data['category']            # Target variable

X_test = data_test['crimeaditionalinfo']
y_test = data_test['category']
# Split the data into training and test sets


# Initialize models
models = {
    'Logistic Regression': make_pipeline(TfidfVectorizer(), LogisticRegression(max_iter=1000, random_state=42)),
    'Random Forest': make_pipeline(TfidfVectorizer(), RandomForestClassifier(random_state=42)),
    'Support Vector Machine': make_pipeline(TfidfVectorizer(), SVC(kernel='linear', random_state=42)),
    'Naive Bayes': make_pipeline(TfidfVectorizer(), MultinomialNB())
}

In [5]:
performance_reports = {}

for model_name, model in models.items():
    model.fit(X, y)  # Train the model
    y_pred = model.predict(X_test)  # Make predictions
    report = classification_report(y_test, y_pred, output_dict=True)  # Evaluate predictions
    performance_reports[model_name] = report
    print(report)

# Output a summary of classification reports for each model
performance_reports

  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))


{'Any Other Cyber Crime': {'precision': 0.4525993883792049, 'recall': 0.24196185286103541, 'f1-score': 0.31534090909090906, 'support': 3670}, 'Child Pornography CPChild Sexual Abuse Material CSAM': {'precision': 0.7073170731707317, 'recall': 0.23577235772357724, 'f1-score': 0.35365853658536583, 'support': 123}, 'Crime Against Women & Children': {'precision': 0.0, 'recall': 0.0, 'f1-score': 0.0, 'support': 4}, 'Cryptocurrency Crime': {'precision': 0.6666666666666666, 'recall': 0.40963855421686746, 'f1-score': 0.5074626865671641, 'support': 166}, 'Cyber Attack/ Dependent Crimes': {'precision': 0.9968379446640316, 'recall': 1.0, 'f1-score': 0.9984164687252574, 'support': 1261}, 'Cyber Terrorism': {'precision': 0.0, 'recall': 0.0, 'f1-score': 0.0, 'support': 52}, 'Hacking  Damage to computercomputer system etc': {'precision': 0.4337748344370861, 'recall': 0.22128378378378377, 'f1-score': 0.2930648769574944, 'support': 592}, 'Online Cyber Trafficking': {'precision': 0.0, 'recall': 0.0, 'f1-

  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))


{'Any Other Cyber Crime': {'precision': 0.8089430894308943, 'recall': 0.054223433242506815, 'f1-score': 0.10163432073544433, 'support': 3670}, 'Child Pornography CPChild Sexual Abuse Material CSAM': {'precision': 0.8, 'recall': 0.22764227642276422, 'f1-score': 0.3544303797468355, 'support': 123}, 'Crime Against Women & Children': {'precision': 0.0, 'recall': 0.0, 'f1-score': 0.0, 'support': 4}, 'Cryptocurrency Crime': {'precision': 1.0, 'recall': 0.1144578313253012, 'f1-score': 0.2054054054054054, 'support': 166}, 'Cyber Attack/ Dependent Crimes': {'precision': 0.9968379446640316, 'recall': 1.0, 'f1-score': 0.9984164687252574, 'support': 1261}, 'Cyber Terrorism': {'precision': 0.0, 'recall': 0.0, 'f1-score': 0.0, 'support': 52}, 'Hacking  Damage to computercomputer system etc': {'precision': 0.875, 'recall': 0.03547297297297297, 'f1-score': 0.06818181818181818, 'support': 592}, 'Online Cyber Trafficking': {'precision': 0.0, 'recall': 0.0, 'f1-score': 0.0, 'support': 61}, 'Online Financ

  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))


{'Any Other Cyber Crime': {'precision': 0.4627039627039627, 'recall': 0.21634877384196186, 'f1-score': 0.2948384701076866, 'support': 3670}, 'Child Pornography CPChild Sexual Abuse Material CSAM': {'precision': 0.7916666666666666, 'recall': 0.3089430894308943, 'f1-score': 0.4444444444444445, 'support': 123}, 'Crime Against Women & Children': {'precision': 0.0, 'recall': 0.0, 'f1-score': 0.0, 'support': 4}, 'Cryptocurrency Crime': {'precision': 0.6904761904761905, 'recall': 0.5240963855421686, 'f1-score': 0.595890410958904, 'support': 166}, 'Cyber Attack/ Dependent Crimes': {'precision': 0.9968379446640316, 'recall': 1.0, 'f1-score': 0.9984164687252574, 'support': 1261}, 'Cyber Terrorism': {'precision': 0.0, 'recall': 0.0, 'f1-score': 0.0, 'support': 52}, 'Hacking  Damage to computercomputer system etc': {'precision': 0.4362017804154303, 'recall': 0.2483108108108108, 'f1-score': 0.31646932185145316, 'support': 592}, 'Online Cyber Trafficking': {'precision': 0.0, 'recall': 0.0, 'f1-score

  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))


{'Any Other Cyber Crime': {'precision': 0.74, 'recall': 0.010081743869209809, 'f1-score': 0.01989247311827957, 'support': 3670}, 'Child Pornography CPChild Sexual Abuse Material CSAM': {'precision': 0.0, 'recall': 0.0, 'f1-score': 0.0, 'support': 123}, 'Crime Against Women & Children': {'precision': 0.0, 'recall': 0.0, 'f1-score': 0.0, 'support': 4}, 'Cryptocurrency Crime': {'precision': 0.0, 'recall': 0.0, 'f1-score': 0.0, 'support': 166}, 'Cyber Attack/ Dependent Crimes': {'precision': 0.9968379446640316, 'recall': 1.0, 'f1-score': 0.9984164687252574, 'support': 1261}, 'Cyber Terrorism': {'precision': 0.0, 'recall': 0.0, 'f1-score': 0.0, 'support': 52}, 'Hacking  Damage to computercomputer system etc': {'precision': 0.0, 'recall': 0.0, 'f1-score': 0.0, 'support': 592}, 'Online Cyber Trafficking': {'precision': 0.0, 'recall': 0.0, 'f1-score': 0.0, 'support': 61}, 'Online Financial Fraud': {'precision': 0.684621247867591, 'recall': 0.9985177342509264, 'f1-score': 0.8122994767554531, 's

  _warn_prf(average, modifier, msg_start, len(result))


{'Logistic Regression': {'Any Other Cyber Crime': {'precision': 0.4525993883792049,
   'recall': 0.24196185286103541,
   'f1-score': 0.31534090909090906,
   'support': 3670},
  'Child Pornography CPChild Sexual Abuse Material CSAM': {'precision': 0.7073170731707317,
   'recall': 0.23577235772357724,
   'f1-score': 0.35365853658536583,
   'support': 123},
  'Crime Against Women & Children': {'precision': 0.0,
   'recall': 0.0,
   'f1-score': 0.0,
   'support': 4},
  'Cryptocurrency Crime': {'precision': 0.6666666666666666,
   'recall': 0.40963855421686746,
   'f1-score': 0.5074626865671641,
   'support': 166},
  'Cyber Attack/ Dependent Crimes': {'precision': 0.9968379446640316,
   'recall': 1.0,
   'f1-score': 0.9984164687252574,
   'support': 1261},
  'Cyber Terrorism': {'precision': 0.0,
   'recall': 0.0,
   'f1-score': 0.0,
   'support': 52},
  'Hacking  Damage to computercomputer system etc': {'precision': 0.4337748344370861,
   'recall': 0.22128378378378377,
   'f1-score': 0.29306