In [1]:
import pandas as pd
import re

# Load datasets
train_data = pd.read_csv('train.csv')
test_data = pd.read_csv('test.csv')

In [2]:
train_data.head()

Unnamed: 0,category,sub_category,crimeaditionalinfo
0,Online and Social Media Related Crime,Cyber Bullying Stalking Sexting,I had continue received random calls and abusi...
1,Online Financial Fraud,Fraud CallVishing,The above fraudster is continuously messaging ...
2,Online Gambling Betting,Online Gambling Betting,He is acting like a police and demanding for m...
3,Online and Social Media Related Crime,Online Job Fraud,In apna Job I have applied for job interview f...
4,Online Financial Fraud,Fraud CallVishing,I received a call from lady stating that she w...


In [3]:
train_data['crimeaditionalinfo'].isnull().sum()

21

In [4]:
train_data=train_data.dropna()

In [5]:
train_data.isnull().sum()

category              0
sub_category          0
crimeaditionalinfo    0
dtype: int64

In [6]:
def preprocess_text(text):
    if pd.isna(text):  # Check if the text is NaN
        return []  # Return an empty list for missing values
    text = str(text).lower()  # Convert to string and lowercase
    text = re.sub(r'[^a-zA-Z\s]', '', text)  # Remove special characters
    tokens = text.split()  # Tokenize by whitespace
    return tokens

# Apply preprocessing
train_data['tokens'] = train_data['crimeaditionalinfo'].apply(preprocess_text)
test_data['tokens'] = test_data['crimeaditionalinfo'].apply(preprocess_text)

In [7]:
#pip install gensim

In [8]:
from gensim.models import Word2Vec

# Train Word2Vec model
sentences = train_data['tokens'].tolist()
word2vec_model = Word2Vec(
    sentences=sentences,
    vector_size=300,   # Embedding dimension
    window=5,          # Context window size
    min_count=2,       # Ignore words with frequency lower than 2
    workers=4,         # Number of CPU cores
    sg=1               # Use skip-gram
)

# Save the trained model
word2vec_model.save("custom_word2vec.model")


In [9]:
import numpy as np

def get_sentence_embedding(sentence, model):
    embeddings = [model.wv[word] for word in sentence if word in model.wv]
    if embeddings:
        return np.mean(embeddings, axis=0)  # Averaging word embeddings
    else:
        return np.zeros(model.vector_size)  # For empty or OOV-only sentences

# Generate sentence embeddings for each sentence in the training data
train_data['sentence_embedding'] = train_data['tokens'].apply(lambda x: get_sentence_embedding(x, word2vec_model))


In [10]:
train_data['sentence_embedding'][0]

array([-7.88021535e-02,  2.49270484e-01,  6.92878217e-02,  2.11779643e-02,
        6.41876012e-02, -1.03904888e-01,  9.61883068e-02,  4.18564916e-01,
       -5.49675012e-03,  5.46150748e-03,  7.18926117e-02, -1.28895611e-01,
        1.33500502e-01,  5.03949774e-03, -3.21658961e-02, -1.97155431e-01,
        1.93899781e-01, -1.23419344e-01,  2.60990448e-02, -1.03132665e-01,
        1.82077866e-02,  1.68189220e-02,  9.87077057e-02,  8.73014107e-02,
        5.64403795e-02,  5.48228174e-02, -1.78125292e-01,  1.37511373e-01,
       -6.45054132e-02, -1.21994115e-01, -2.04318389e-02, -1.41354678e-02,
        2.88357865e-02,  4.69467118e-02, -6.93118349e-02,  1.41741782e-01,
       -1.44890044e-02, -2.37292171e-01,  5.64138405e-02,  2.21521966e-02,
       -7.07385615e-02,  2.75654551e-02, -9.08959135e-02, -1.41900793e-01,
       -2.44009253e-02,  1.93675771e-01, -5.17208539e-02, -8.14843997e-02,
       -5.58990538e-02,  8.26207474e-02,  1.52379677e-01,  2.46693548e-02,
       -2.42827684e-02,  

In [11]:
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score

# Prepare data for model training
X_train = np.vstack(train_data['sentence_embedding'].values)
y_train = train_data['category']  # Assuming category is the label column

# Train a classifier
classifier = LogisticRegression()
classifier.fit(X_train, y_train)

# Predict on training data to evaluate
y_pred = classifier.predict(X_train)
print("Training Accuracy:", accuracy_score(y_train, y_pred))

from sklearn.metrics import recall_score
print("recall score ",recall_score(y_train, y_pred,average='weighted'))


STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(


Training Accuracy: 0.7825298022371776
recall score  0.7825298022371776


In [12]:
test_data['sentence_embedding'] = test_data['tokens'].apply(lambda x: get_sentence_embedding(x, word2vec_model))
X_test = np.vstack(test_data['sentence_embedding'].values)


In [13]:
# Predict on test data
y_test = test_data['category']  # Replace with the actual label column in your test data
y_test_pred = classifier.predict(X_test)

# Calculate test accuracy
test_accuracy = accuracy_score(y_test, y_test_pred)
print("Test Accuracy:", test_accuracy)
print("recall score ",recall_score(y_test, y_test_pred,average='weighted'))

Test Accuracy: 0.7223093919113645
recall score  0.7223093919113645
