In [None]:
import numpy as np
import pandas as pd
import tensorflow as tf
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences
from tensorflow.keras.models import Model
from tensorflow.keras.layers import (Input, Embedding, Conv1D, GlobalMaxPooling1D,
                                     Concatenate, Dense, Dropout, SpatialDropout1D)
from tensorflow.keras.callbacks import EarlyStopping

# -----------------------------
# 1. Custom Utility Functions
# -----------------------------
def custom_train_test_split(X, y, test_size=0.2, random_state=42):
    """Shuffle paired samples/labels and split them into two subsets.

    The permutation is drawn from a private ``np.random.RandomState`` seeded
    with ``random_state``. This yields the same split as seeding the global
    NumPy generator with that value, but leaves the global random state
    untouched, so calling this function does not silently change the random
    numbers the rest of the program draws afterwards.

    Args:
        X: Samples; must support ``len()`` and integer-array indexing
            (e.g. a NumPy array).
        y: Labels aligned with ``X``; must have the same length as ``X``.
        test_size: Fraction of samples, within ``[0, 1]``, placed in the
            held-out subset. The sample count is truncated with ``int()``.
        random_state: Seed for the shuffle.

    Returns:
        Tuple ``(X_train, X_test, y_train, y_test)``.

    Raises:
        ValueError: If ``X`` and ``y`` differ in length, or ``test_size`` is
            outside ``[0, 1]``.
    """
    n_samples = len(X)
    if len(y) != n_samples:
        raise ValueError(
            "X and y must contain the same number of samples "
            "(got {} and {}).".format(n_samples, len(y))
        )
    if not 0.0 <= test_size <= 1.0:
        raise ValueError(
            "test_size must be within [0, 1], got {!r}.".format(test_size)
        )

    # A local generator keeps the shuffle reproducible without reseeding
    # NumPy's global RNG.
    rng = np.random.RandomState(random_state)
    indices = np.arange(n_samples)
    rng.shuffle(indices)

    # The first `split` shuffled indices form the held-out subset.
    split = int(n_samples * test_size)
    test_idx = indices[:split]
    train_idx = indices[split:]
    return X[train_idx], X[test_idx], y[train_idx], y[test_idx]

def custom_f1_score(y_true, y_pred):
    """Compute the F1 score of the positive class for binary labels.

    Inputs are flattened before comparison, so a column vector of shape
    ``(n, 1)`` can be scored against a flat vector of shape ``(n,)``.
    Elements equal to 1 count as positive and elements equal to 0 as
    negative.

    The score is computed exactly as ``2*tp / (2*tp + fp + fn)`` rather than
    with epsilon-smoothed precision/recall, so perfect predictions score
    exactly 1.0.

    Args:
        y_true: Array-like of ground-truth labels (0 or 1).
        y_pred: Array-like of predicted labels (0 or 1) with the same number
            of elements as ``y_true``.

    Returns:
        The F1 score as a float in ``[0, 1]``; 0.0 when there are no true
        positives, false positives or false negatives at all.

    Raises:
        ValueError: If the inputs do not hold the same number of elements.
    """
    # np.asarray lets plain lists work as well as NumPy arrays.
    y_true = np.asarray(y_true).ravel()
    y_pred = np.asarray(y_pred).ravel()
    if y_true.shape != y_pred.shape:
        raise ValueError(
            "y_true and y_pred must have the same number of elements "
            "(got {} and {}).".format(y_true.size, y_pred.size)
        )

    tp = np.sum((y_true == 1) & (y_pred == 1))
    fp = np.sum((y_true == 0) & (y_pred == 1))
    fn = np.sum((y_true == 1) & (y_pred == 0))

    denominator = 2 * tp + fp + fn
    if denominator == 0:
        # No positives in either the labels or the predictions.
        return 0.0
    return float(2 * tp / denominator)

# -----------------------------
# 2. Load the Data
# -----------------------------
# Read both CSV files: the training file (with labels) first, then the
# test file (without labels).
train_df = pd.read_csv('/content/train.csv')
test_df = pd.read_csv('/content/test.csv')

# Text columns of both frames, cast to str and exposed as raw value arrays.
texts_train = train_df['text'].astype(str).values
texts_test = test_df['text'].astype(str).values

# Labels come from the 'target' column of the training frame.
labels_train = train_df['target'].values

# -----------------------------
# 3. Preprocess the Data (Tokenization & Padding)
# -----------------------------
max_words = 20000    # vocabulary cap passed to the tokenizer as num_words
max_length = 100     # every sequence is padded to this many tokens
embedding_dim = 128  # embedding vector size used when the model is built

# The vocabulary is fitted on the training texts only.
tokenizer = Tokenizer(num_words=max_words)
tokenizer.fit_on_texts(texts_train)

# Training split: texts -> integer sequences -> fixed-length matrix.
train_sequences = tokenizer.texts_to_sequences(texts_train)
X_train_full = pad_sequences(train_sequences, maxlen=max_length)

# Test split: encoded with the same fitted tokenizer, padded the same way.
test_sequences = tokenizer.texts_to_sequences(texts_test)
X_test = pad_sequences(test_sequences, maxlen=max_length)

# -----------------------------
# 4. Split Training Data into Training and Validation Sets
# -----------------------------
# Hold out 20% of the padded training matrix (and its labels) for validation,
# using a fixed seed so the split is repeatable.
X_train, X_val, y_train, y_val = custom_train_test_split(
    X_train_full,
    labels_train,
    test_size=0.2,
    random_state=42,
)

# -----------------------------
# 5. Build the Improved Multi-Kernel CNN Model
# -----------------------------
# One integer token id per position; every sequence has max_length positions.
input_layer = Input(shape=(max_length,), name='input')

# Trainable Embedding layer with random initialization.
# `input_length` is deliberately not passed: Keras 3 deprecates that argument
# (passing it only triggers a warning), and the sequence length is already
# fixed by the Input shape above.
embedding_layer = Embedding(input_dim=max_words,
                            output_dim=embedding_dim,
                            name='embedding')(input_layer)

# Apply SpatialDropout1D to regularize embeddings
x = SpatialDropout1D(0.2)(embedding_layer)

# Create parallel convolutional branches, one per kernel size; each branch is
# reduced to a single feature vector by global max pooling.
filter_sizes = [3, 4, 5]
conv_layers = []
for size in filter_sizes:
    conv = Conv1D(filters=128, kernel_size=size, activation='relu')(x)
    pool = GlobalMaxPooling1D()(conv)
    conv_layers.append(pool)

# Concatenate the pooled features from all branches
concatenated = Concatenate()(conv_layers)

# Add dropout for further regularization
x = Dropout(0.5)(concatenated)

# Fully connected layer, followed by a second dropout
x = Dense(128, activation='relu')(x)
x = Dropout(0.5)(x)

# Final output layer with sigmoid activation for binary classification
output_layer = Dense(1, activation='sigmoid')(x)

model = Model(inputs=input_layer, outputs=output_layer)
model.compile(loss='binary_crossentropy', optimizer='adam', metrics=['accuracy'])
model.summary()

# -----------------------------
# 6. Train the Model with EarlyStopping
# -----------------------------
# Stop once validation loss has not improved for 3 epochs, and restore the
# weights from the best epoch.
early_stopping = EarlyStopping(
    monitor='val_loss',
    patience=3,
    restore_best_weights=True,
)

batch_size = 32
epochs = 20  # upper bound; early stopping may end training sooner

history = model.fit(
    x=X_train,
    y=y_train,
    validation_data=(X_val, y_val),
    epochs=epochs,
    batch_size=batch_size,
    callbacks=[early_stopping],
)

# -----------------------------
# 7. Evaluate the Model on the Validation Set
# -----------------------------
# Predicted probabilities for the validation rows.
y_val_pred_prob = model.predict(X_val)

# Probabilities strictly above 0.5 become class 1, everything else class 0.
y_val_pred = np.greater(y_val_pred_prob, 0.5).astype(int)

# Score the thresholded predictions against the validation labels.
f1 = custom_f1_score(
    np.array(y_val),
    y_val_pred,
)
print("Validation F1 Score: {:.4f}".format(f1))

# -----------------------------
# 8. Generate Predictions on the Test Set and Create a Submission File
# -----------------------------
# Predict probabilities for the test rows, threshold at 0.5, and flatten to a
# 1-D vector of 0/1 labels.
y_test_pred_prob = model.predict(X_test)
y_test_pred = (y_test_pred_prob > 0.5).astype(int).flatten()

# Always emit the 'target' column; put 'id' in front of it when the test
# file provides one.
submission_columns = {'target': y_test_pred}
if 'id' in test_df.columns:
    submission_columns = {'id': test_df['id'], 'target': y_test_pred}
submission = pd.DataFrame(submission_columns)

submission.to_csv('submission.csv', index=False)
print("Submission file saved as 'submission.csv'.")




Epoch 1/20
191/191 ━━━━━━━━━━━━━━━━━━━━ 23s 104ms/step - accuracy: 0.5942 - loss: 0.6599 - val_accuracy: 0.7878 - val_loss: 0.4723
Epoch 2/20
191/191 ━━━━━━━━━━━━━━━━━━━━ 21s 108ms/step - accuracy: 0.8489 - loss: 0.3737 - val_accuracy: 0.8022 - val_loss: 0.4530
Epoch 3/20
191/191 ━━━━━━━━━━━━━━━━━━━━ 20s 103ms/step - accuracy: 0.9347 - loss: 0.1953 - val_accuracy: 0.7845 - val_loss: 0.5182
Epoch 4/20
191/191 ━━━━━━━━━━━━━━━━━━━━ 21s 108ms/step - accuracy: 0.9722 - loss: 0.1015 - val_accuracy: 0.7773 - val_loss: 0.6701
Epoch 5/20
191/191 ━━━━━━━━━━━━━━━━━━━━ 40s 105ms/step - accuracy: 0.9856 - loss: 0.0498 - val_accuracy: 0.7674 - val_loss: 0.7164
48/48 ━━━━━━━━━━━━━━━━━━━━ 2s 40ms/step
Validation F1 Score: 0.7578
102/102 ━━━━━━━━━━━━━━━━━━━━ 2s 21ms/st