# **CNN Model**

In [1]:
import pandas as pd
import numpy as np
import ast
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.model_selection import train_test_split
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Dense, Dropout, LSTM, Conv1D, MaxPooling1D, Flatten
from tensorflow.keras.optimizers import Adam
from tensorflow.keras.callbacks import EarlyStopping
from sklearn.preprocessing import LabelEncoder
from tensorflow.keras.utils import to_categorical
from imblearn.over_sampling import SMOTE
from tensorflow.keras.layers import Input, Flatten, SpatialDropout1D, Bidirectional, Reshape
from sklearn.metrics import accuracy_score, classification_report

# Load the dataset
df = pd.read_csv('/content/cleaned_dataset_updated.csv')

# Display the first few rows to verify
print("Original DataFrame:")
print(df.head())
 # Use the 'tweet' column for text data
X = df['tweet']
# Use the 'class' column for labels
y = df['class']


X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42, stratify=y)
# Handle missing values in 'tweet' column
df['tweet'] = df['tweet'].fillna('')  # Replace NaN with empty strings



X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42, stratify=y)
# Use TfidfVectorizer for the 'tweet_tokens' column
tfidf = TfidfVectorizer()
X_tfidf = tfidf.fit_transform(df['tweet'])

# Encode target labels
label_encoder = LabelEncoder()
y_encoded = label_encoder.fit_transform(y)

# Handle class imbalance by converting labels to one-hot encoding
y_categorical = to_categorical(y_encoded)

Original DataFrame:
   Unnamed: 0  count  hate_speech  offensive_language  neither  class  \
0           0      3            0                   0        3      1   
1           1      3            0                   3        0      0   
2           2      3            0                   3        0      0   
3           3      3            0                   2        1      0   
4           4      6            0                   6        0      0   

                                               tweet  \
0  retweet as a woman you should not complain abo...   
1  retweet boy dats coldtyga down bad for cuffin ...   
2  retweet dawg retweet you ever fuck a bitch and...   
3                     retweet she look like a tranny   
4  retweet the shit you hear about me might be tr...   

                                        tweet_tokens  
0  ['retweet', 'woman', 'complain', 'cleaning', '...  
1  ['retweet', 'boy', 'dat', 'coldtyga', 'bad', '...  
2  ['retweet', 'dawg', 'retweet', 'ever

In [4]:
# Define CNN model
model = Sequential()
model.add(Dense(512, activation='relu', input_shape=(X_tfidf.shape[1],)))
model.add(Reshape((512, 1)))
model.add(Conv1D(filters=64, kernel_size=5, activation='relu'))
model.add(MaxPooling1D(pool_size=2))
model.add(Conv1D(filters=32,kernel_size=5,activation='relu'))
model.add(MaxPooling1D(pool_size=2))
model.add(Flatten()) # Flatten the output from MaxPooling1D
model.add(Dense(64, activation='relu'))

model.add(Dense(2, activation='softmax'))
# Compile the model
model.compile(loss='categorical_crossentropy', optimizer=Adam(lr=0.001), metrics=['accuracy'])

# Define early stopping callback
early_stopping = EarlyStopping(monitor='val_accuracy', patience=5, min_delta=0.001)

# Sort the indices of the sparse matrix
X_tfidf.sort_indices()

# Split data into training and validation sets
X_train, X_val, y_train, y_val = train_test_split(X_tfidf, y_categorical, test_size=0.2, random_state=42)

# Train the model
history = model.fit(X_train, y_train, epochs=10, batch_size=32,
                    validation_data=(X_val, y_val),
                    callbacks=[early_stopping],
                    verbose=1)

# Evaluate the model on the validation set
loss, accuracy = model.evaluate(X_val, y_val)
print(f'Validation accuracy: {accuracy:.3f}')




Epoch 1/10
Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
Validation accuracy: 0.926


In [5]:
# Predict probabilities on the validation set
y_pred_probs = model.predict(X_val)

# Get predicted labels (class with highest probability)
y_pred = np.argmax(y_pred_probs, axis=1)

# Inverse transform encoded true labels to original form
y_val_original = np.argmax(y_val, axis=1)



In [6]:
# Extracting metrics separately
print("Accuracy:", accuracy_score(y_val_original, y_pred))
print("Classification Report:\n", classification_report(y_val_original, y_pred))

accuracy = accuracy_score(y_val_original, y_pred)

classification_report_dict = classification_report(y_val_original, y_pred, output_dict=True)
precision = classification_report_dict['weighted avg']['precision']
recall = classification_report_dict['weighted avg']['recall']
f1_score = classification_report_dict['weighted avg']['f1-score']

Accuracy: 0.9259632842445027
Classification Report:
               precision    recall  f1-score   support

           0       0.95      0.96      0.96      4122
           1       0.79      0.76      0.78       835

    accuracy                           0.93      4957
   macro avg       0.87      0.86      0.87      4957
weighted avg       0.93      0.93      0.93      4957

