In [5]:
import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
# Walk the Kaggle input mount and echo the full path of every file found.
input_root = '/kaggle/input'
for current_dir, _subdirs, file_names in os.walk(input_root):
    for file_name in file_names:
        print(os.path.join(current_dir, file_name))

# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

ValueError: numpy.dtype size changed, may indicate binary incompatibility. Expected 96 from C header, got 88 from PyObject

In [None]:
# Report the TensorFlow release the kernel resolved, before any Keras code runs.
import tensorflow as tf

tf_version = tf.__version__
print(tf_version)

In [None]:
!conda install tensorflow -y
import tensorflow as tf
print(tf.__version__)


In [None]:
import re
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Embedding, Conv1D, LSTM, Dense, Dropout, SpatialDropout1D, GlobalMaxPooling1D, BatchNormalization
from tensorflow.keras.regularizers import l2
from tensorflow.keras.optimizers import Adam
from tensorflow.keras.callbacks import ReduceLROnPlateau
# Load dataset
# The CSV location is named once so the read call stays short and the path is easy to spot.
DATASET_PATH = '/kaggle/input/sarcasm-dataset/dataset_sarcasm.csv'
df = pd.read_csv(DATASET_PATH)
# Text Cleaning function
def clean_text(text):
    """Normalise one piece of text for tokenisation.

    Steps, in order: lowercase, drop URLs (tokens starting with ``http`` or
    ``www``), drop every character that is not an ASCII letter or whitespace,
    then collapse whitespace runs to a single space and trim the ends.

    Args:
        text: The raw text. Anything that is not a ``str`` (for example
            ``None`` or a float NaN from a missing CSV cell) is treated as
            missing.

    Returns:
        The cleaned string; ``''`` when ``text`` is not a string.
    """
    # Missing cells arrive as None/NaN rather than str; calling .lower() on
    # them would raise AttributeError, so map them to an empty document.
    if not isinstance(text, str):
        return ''

    text = text.lower()
    text = re.sub(r'http\S+|www\S+', '', text)  # Remove URLs
    text = re.sub(r'[^a-zA-Z\s]', '', text)  # Remove special characters (digits and punctuation included)
    text = re.sub(r'\s+', ' ', text).strip()  # Remove extra spaces
    return text

from sklearn.model_selection import train_test_split

# Clean every headline and pull the binary target out as a numpy array.
df['clean_text'] = df['headline'].apply(clean_text)
labels = np.array(df['is_sarcastic'])

# Tokenization & Padding
MAX_VOCAB_SIZE = 10000
MAX_SEQUENCE_LENGTH = 50

# Train-Test Split
# Split row positions *before* fitting the tokenizer so the vocabulary is
# learned from training rows only; fitting on the full corpus would let
# test-set words influence the word index (data leakage).
row_positions = np.arange(len(df))
train_idx, test_idx = train_test_split(row_positions, test_size=0.2, random_state=42)

tokenizer = Tokenizer(num_words=MAX_VOCAB_SIZE, oov_token='<OOV>')
tokenizer.fit_on_texts(df['clean_text'].iloc[train_idx])

# Encode the whole corpus with the train-fitted vocabulary; words not kept in
# the vocabulary are mapped to the '<OOV>' token.
sequences = tokenizer.texts_to_sequences(df['clean_text'])
padded_sequences = pad_sequences(sequences, maxlen=MAX_SEQUENCE_LENGTH, padding='post')

X_train, X_test = padded_sequences[train_idx], padded_sequences[test_idx]
y_train, y_test = labels[train_idx], labels[test_idx]

# Features and labels must stay aligned row for row.
assert len(X_train) == len(y_train) and len(X_test) == len(y_test)


In [None]:
# Optional synthetic smoke-test data.
# These random arrays are only useful for checking that the model code runs
# end to end. Assigning them unconditionally would overwrite the real
# X_train/X_test/y_train/y_test split built from the dataset, so the model
# would be trained and scored on noise. Keep the flag False for real
# experiments; set it to True only for a quick pipeline check.
USE_SYNTHETIC_DATA = False

if USE_SYNTHETIC_DATA:
    X_train = np.random.randint(1, MAX_VOCAB_SIZE, (28619, MAX_SEQUENCE_LENGTH))
    X_test = np.random.randint(1, MAX_VOCAB_SIZE, (7170, MAX_SEQUENCE_LENGTH))
    y_train = np.random.randint(0, 2, (28619,))
    y_test = np.random.randint(0, 2, (7170,))

In [None]:
# Class balance: count of each label value in the training split, then the test split.
train_label_counts = np.bincount(y_train)
test_label_counts = np.bincount(y_test)
print(train_label_counts, test_label_counts, sep='\n')


In [None]:
# Shape check: both feature matrices should be 2D, (num_samples, 50), not 3D.
print(X_train.shape, X_test.shape, sep='\n')

# Model Architecture: the same layers as before, added one at a time.
model = Sequential()

# Token ids -> 128-dimensional vectors, with whole-channel dropout on the embeddings.
model.add(Embedding(input_dim=MAX_VOCAB_SIZE, output_dim=128))
model.add(SpatialDropout1D(0.2))

# Local n-gram features (window of 3), L2-regularised and batch-normalised.
model.add(Conv1D(filters=64, kernel_size=3, activation='relu', kernel_regularizer=l2(0.001)))
model.add(BatchNormalization())

# The Conv1D output is still 3D, so it feeds straight into the LSTM.
model.add(LSTM(32, dropout=0.3, recurrent_dropout=0.3, kernel_regularizer=l2(0.001)))

# Dense head: one regularised hidden layer, dropout, then a sigmoid unit for the binary target.
model.add(Dense(32, activation='relu', kernel_regularizer=l2(0.001)))
model.add(Dropout(0.4))
model.add(Dense(1, activation='sigmoid'))


# Compile and Train
model.compile(optimizer='adam', loss='binary_crossentropy', metrics=['accuracy'])
model.fit(
    X_train,
    y_train,
    epochs=10,
    batch_size=32,
    validation_data=(X_test, y_test),
)


In [None]:
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.metrics import confusion_matrix, classification_report

# Predict on Test Set
y_pred = model.predict(X_test)
# Threshold the predicted probabilities at 0.5 to obtain hard 0/1 labels.
y_pred_labels = (y_pred > 0.5).astype(int)

# Compute Confusion Matrix
cm = confusion_matrix(y_test, y_pred_labels)

# Plot Confusion Matrix on an explicit figure/axes pair.
class_names = ["Negative", "Positive"]
fig, ax = plt.subplots(figsize=(6, 5))
sns.heatmap(
    cm,
    annot=True,
    fmt="d",
    cmap="Blues",
    xticklabels=class_names,
    yticklabels=class_names,
    ax=ax,
)
ax.set_xlabel("Predicted Label")
ax.set_ylabel("True Label")
ax.set_title("Confusion Matrix")
plt.show()

# Print Classification Report
report = classification_report(y_test, y_pred_labels)
print(report)


In [None]:
# Save Model
# The file name is held in one place so the save and the load cannot drift apart.
MODEL_PATH = 'sarcasm_cnn_lstm.h5'
model.save(MODEL_PATH)

# Load and Test Model
# Reload from disk and score the test features again with the same 0.5 threshold.
loaded_model = tf.keras.models.load_model(MODEL_PATH)
reloaded_probabilities = loaded_model.predict(X_test)
predictions = (reloaded_probabilities > 0.5).astype("int32")