In [7]:
# Import necessary libraries
import pandas as pd
import numpy as np
import nltk
import string
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.naive_bayes import MultinomialNB
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score, classification_report
from nltk.corpus import stopwords
from nltk.stem import PorterStemmer

# Download NLTK resources
nltk.download('stopwords')

# Load the datasets
train_data = pd.read_csv('/content/train.csv')
test_data = pd.read_csv('/content/test.csv')

# Preview the data
print("Training Data Preview:")
print(train_data.head(), "\n")
print("Test Data Preview:")
print(test_data.head(), "\n")

# Preprocessing function for cleaning and tokenizing text
def preprocess_text(text):
    if isinstance(text, str):  # Ensure the input is a string
        # Remove punctuation and digits
        text = ''.join([char for char in text if char not in string.punctuation and not char.isdigit()])

        # Convert text to lowercase
        text = text.lower()

        # Remove stopwords
        stop_words = set(stopwords.words('english'))
        text = ' '.join([word for word in text.split() if word not in stop_words])

        # Apply stemming
        stemmer = PorterStemmer()
        text = ' '.join([stemmer.stem(word) for word in text.split()])

        return text
    else:
        # If the value is not a string (NaN or other), return an empty string
        return ""

# Apply preprocessing on both train and test datasets
train_data['crimeaditionalinfo'] = train_data['crimeaditionalinfo'].apply(preprocess_text)
test_data['crimeaditionalinfo'] = test_data['crimeaditionalinfo'].apply(preprocess_text)

# Check if the data is cleaned properly
print("Preprocessed Training Data Preview:")
print(train_data.head(), "\n")
print("Preprocessed Test Data Preview:")
print(test_data.head(), "\n")

# Feature Extraction using TF-IDF
tfidf_vectorizer = TfidfVectorizer(max_features=5000)
X_train = tfidf_vectorizer.fit_transform(train_data['crimeaditionalinfo'])
X_test = tfidf_vectorizer.transform(test_data['crimeaditionalinfo'])

# Labels (target variables)
y_train = train_data['category']
y_test = test_data['category']

# Splitting the training data into train and validation sets
X_train_split, X_val_split, y_train_split, y_val_split = train_test_split(X_train, y_train, test_size=0.2, random_state=42)

# Train a Multinomial Naive Bayes model
model = MultinomialNB()
model.fit(X_train_split, y_train_split)

# Predict on the validation set
y_val_pred = model.predict(X_val_split)

# Evaluate the model on the validation set
accuracy = accuracy_score(y_val_split, y_val_pred)
precision = precision_score(y_val_split, y_val_pred, average='weighted', zero_division=1)
recall = recall_score(y_val_split, y_val_pred, average='weighted', zero_division=1)
f1 = f1_score(y_val_split, y_val_pred, average='weighted', zero_division=1)

print("Model Evaluation on Validation Set:")
print(f"Accuracy: {accuracy:.4f}")
print(f"Precision: {precision:.4f}")
print(f"Recall: {recall:.4f}")
print(f"F1-Score: {f1:.4f}")
print("\nClassification Report on Validation Set:")
print(classification_report(y_val_split, y_val_pred))

# Now test the model on the test data
y_test_pred = model.predict(X_test)

# Evaluate the model on the test set
test_accuracy = accuracy_score(y_test, y_test_pred)
test_precision = precision_score(y_test, y_test_pred, average='weighted', zero_division=1)
test_recall = recall_score(y_test, y_test_pred, average='weighted', zero_division=1)
test_f1 = f1_score(y_test, y_test_pred, average='weighted', zero_division=1)

print("Model Evaluation on Test Set:")
print(f"Accuracy: {test_accuracy:.4f}")
print(f"Precision: {test_precision:.4f}")
print(f"Recall: {test_recall:.4f}")
print(f"F1-Score: {test_f1:.4f}")
print("\nClassification Report on Test Set:")
print(classification_report(y_test, y_test_pred))

# You can also save the model and vectorizer if you need to deploy or use them later
import joblib

joblib.dump(model, 'text_classification_model.pkl')
joblib.dump(tfidf_vectorizer, 'tfidf_vectorizer.pkl')

# Save the evaluation metrics and results
metrics = {
    'accuracy': test_accuracy,
    'precision': test_precision,
    'recall': test_recall,
    'f1_score': test_f1
}

# Save metrics to a CSV file
metrics_df = pd.DataFrame([metrics])
metrics_df.to_csv('evaluation_metrics.csv', index=False)
print("\nMetrics saved to evaluation_metrics.csv")



[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


Training Data Preview:
                                category                       sub_category  \
0  Online and Social Media Related Crime  Cyber Bullying  Stalking  Sexting   
1                 Online Financial Fraud                  Fraud CallVishing   
2               Online Gambling  Betting           Online Gambling  Betting   
3  Online and Social Media Related Crime                   Online Job Fraud   
4                 Online Financial Fraud                  Fraud CallVishing   

                                  crimeaditionalinfo  
0  I had continue received random calls and abusi...  
1  The above fraudster is continuously messaging ...  
2  He is acting like a police and demanding for m...  
3  In apna Job I have applied for job interview f...  
4  I received a call from lady stating that she w...   

Test Data Preview:
                                    category  \
0  RapeGang Rape RGRSexually Abusive Content   
1                     Online Financial Fraud   
2      

  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))


                                                      precision    recall  f1-score   support

                               Any Other Cyber Crime       0.36      0.28      0.32      2091
Child Pornography CPChild Sexual Abuse Material CSAM       0.93      0.19      0.31        69
                                Cryptocurrency Crime       1.00      0.03      0.06        96
                      Cyber Attack/ Dependent Crimes       1.00      1.00      1.00       765
                                     Cyber Terrorism       0.00      0.00      0.00        31
      Hacking  Damage to computercomputer system etc       0.37      0.11      0.17       341
                            Online Cyber Trafficking       0.00      0.00      0.00        34
                              Online Financial Fraud       0.83      0.91      0.87     11471
                            Online Gambling  Betting       0.00      0.00      0.00        97
               Online and Social Media Related Crime       

  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))


                                                      precision    recall  f1-score   support

                               Any Other Cyber Crime       0.36      0.27      0.30      3670
Child Pornography CPChild Sexual Abuse Material CSAM       0.72      0.11      0.18       123
                      Crime Against Women & Children       0.00      0.00      0.00         4
                                Cryptocurrency Crime       0.62      0.03      0.06       166
                      Cyber Attack/ Dependent Crimes       1.00      1.00      1.00      1261
                                     Cyber Terrorism       0.00      0.00      0.00        52
      Hacking  Damage to computercomputer system etc       0.37      0.10      0.16       592
                            Online Cyber Trafficking       0.00      0.00      0.00        61
                              Online Financial Fraud       0.82      0.91      0.87     18896
                            Online Gambling  Betting       