In [1]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
from nltk import RegexpTokenizer
from sklearn.model_selection import train_test_split
from sklearn.metrics import classification_report, confusion_matrix, accuracy_score, roc_curve, auc
import tensorflow as tf
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences
from tensorflow.keras.models import Sequential, Model, save_model
from tensorflow.keras.layers import Dense, Embedding, LSTM, SpatialDropout1D, Conv1D, MaxPooling1D
from tensorflow.keras.layers import Input, GlobalMaxPooling1D, BatchNormalization, Dropout, Flatten
from tensorflow.keras.callbacks import EarlyStopping
from tensorflow.keras.optimizers import Adam
import nltk
from gensim.models import Word2Vec
from nltk.stem import WordNetLemmatizer
from nltk.tokenize import word_tokenize

# Download necessary NLTK resources (organizes linguistic data)
try:
    nltk.word_tokenize("example") # This implicitly requires 'punkt'
    WordNetLemmatizer().lemmatize("running") # This implicitly requires 'wordnet'
except LookupError as e:
    print(f"NLTK Resource not found: {e}")
    print("Downloading necessary NLTK resources...")
    nltk.download('punkt_tab')
    nltk.download('averaged_perceptron_tagger')
    nltk.download('wordnet')
    print("NLTK resources downloaded successfully.")

def load_farm_ads_data(text_file: str, vector_file: str):
    # Dictionary to store index:value pairs
    text_data = []
    # Reads the data and separates the label and text
    with open(text_file, 'r', encoding='utf-8') as f:
        for line in f:
            parts = line.strip().split()
            if not parts:  # skip empty lines
                continue
            label = int(parts[0])
            text = ' '.join(parts[1:])
            text_data.append({'label': label, 'text': text})
    
    # Create pandas DataFrame where each item is a dictionary (key & value)
    text_df = pd.DataFrame(text_data)
    labels = text_df['label'].values
    texts = text_df['text'].values
    
    vector_data = []
    with open(vector_file, 'r', encoding='utf-8') as f:
        for line in f:
            parts = line.strip().split()
            if not parts:  # skip empty lines
                continue
            label = int(parts[0])
            features = {}
            for item in parts[1:]:
                idx, val = item.split(':')
                # '3:1' to idx = '3', val = '1'
                features[int(idx)] = float(val)
            vector_data.append(features)
    
    # Convert dictionaries into pandas DataFrame
    vector_df = pd.DataFrame(vector_data).fillna(0)
    
    return texts, labels, vector_df

def preprocess_text(text_series):
    """      
        This reduces the dimensionality of text data and performance of downstram tasks like text classificaiton or information retrieval
    """   
    cleaned_texts = []
    tokenized_texts = []
    
    for text in text_series:
        tokens = word_tokenize(text)
        # Store the tokenized version for Word2Vec
        tokenized_texts.append(tokens)
        
        # Join tokens back into a string
        cleaned_text = ' '.join(tokens)
        cleaned_texts.append(cleaned_text)
    
    # cleaned_texts for text classification and feature extration techiques like TF-IDF
    # tokenized_text for token list used lated on word embeddings (Word2Vec) or RNN
    return cleaned_texts, tokenized_texts

def visualize_data(labels, texts, cleaned_texts):
    """
        Visualize data distributions and characteristics
    """
    # Distribution of classes with -1 and 1
    plt.figure(figsize=(8, 6)) #(width, height)
    sns.countplot(x=labels)
    plt.title('Distribution of Ad Classes')
    plt.xlabel('Class (-1: Not Accepted, 1: Accepted)')
    plt.ylabel('Count')
    plt.savefig('visualization/distributions.png')  
    plt.close()
    
    all_words = [word for text in cleaned_texts for word in text.split()]
    word_freq = pd.Series(all_words).value_counts()
    
    plt.figure(figsize=(12, 6))
    word_freq[:20].plot(kind='bar')
    plt.title('Top 20 Most Common Words')
    plt.xlabel('Words')
    plt.ylabel('Frequency')
    plt.xticks(rotation=45, ha='right')
    plt.tight_layout()
    plt.savefig('visualization/word_frequency.png')
    plt.close()
    
    # Compare word frequencies between classes
    accepted_words = [word for i, text in enumerate(cleaned_texts)
                      for word in text.split() if labels[i] == 1]
    rejected_words = [word for i, text in enumerate(cleaned_texts)
                      for word in text.split() if labels[i] == -1]
    
    accepted_freq = pd.Series(accepted_words).value_counts()[:15]
    rejected_freq = pd.Series(rejected_words).value_counts()[:15]
    
    plt.figure(figsize=(16, 6))

    plt.subplot(1, 2, 1)
    accepted_freq.plot(kind='bar')
    plt.title('Top Words in Accepted Ads')
    plt.xlabel('Words')
    plt.ylabel('Frequency')
    plt.xticks(rotation=45, ha='right')
    
    plt.subplot(1, 2, 2)
    rejected_freq.plot(kind='bar')
    plt.title('Top Words in Rejected Ads')
    plt.xlabel('Words')
    plt.ylabel('Frequency')
    plt.xticks(rotation=45, ha='right')
    
    plt.tight_layout()
    plt.savefig('visualization/class_word_frequency.png')
    plt.close()

# Build a deep CNN model with at least 10 layers
def build_cnn_model(vocab_size, embedding_dim, embedding_matrix=None):
    """
    It compiles the model with binary cross-entropy loss (How wrong are the predictions "Yes, No" Decisions) and the Adam optimizer (gradually improved by minimizing the errors).
    
    Learn patterns from text data to decide between two classes (positive vs negative).
    Word embedding to turn words into numbers ->
    Goes to several layers to detect important features ->
    use normalization and dropout to learn better + avoid overfitting
    """
    model = Sequential()
    
    # Layer 1: Embedding layer
    if embedding_matrix is not None:
        model.add(Embedding(
            vocab_size, 
            embedding_dim, 
            weights=[embedding_matrix], 
            trainable=False
        ))
    else:
        model.add(Embedding(vocab_size, embedding_dim))
    
    # Layer 2: Spatial Dropout
    model.add(SpatialDropout1D(0.2))
    
    # Layers 3-4: Conv1D + BatchNorm
    model.add(Conv1D(64, 5, activation='relu'))
    model.add(BatchNormalization())
    
    # Layer 5: MaxPooling
    model.add(MaxPooling1D(3))
    
    # Layers 6-7: Conv1D + BatchNorm
    model.add(Conv1D(128, 5, activation='relu'))
    model.add(BatchNormalization())
    
    # Layer 8: MaxPooling
    model.add(MaxPooling1D(3))
    
    # Layers 9-10: Conv1D + BatchNorm
    model.add(Conv1D(256, 3, activation='relu'))
    model.add(BatchNormalization())
    
    # Layer 11: Global Max Pooling
    model.add(GlobalMaxPooling1D())
    
    # Layer 12: Dense + BatchNorm
    model.add(Dense(128, activation='relu'))
    model.add(BatchNormalization())
    
    # Layer 13: Dropout
    model.add(Dropout(0.3))
    
    # Layer 14: Dense output
    model.add(Dense(1, activation='sigmoid'))
    
    # Compile model
    model.compile(loss='binary_crossentropy',
                  optimizer=Adam(learning_rate=0.001),
                  metrics=['accuracy'])
    
    return model

# Build a LSTM model with at least 10 layers
def build_lstm_model(vocab_size, embedding_dim, embedding_matrix=None):
    """    
    This method creates a deep LSTM (Long Short-Term Memory) model that can learn patterns in sequences of words (like sentences), 
    especially useful for text classification. 
    It uses layers like LSTM (to capture sequence context), dropout (to prevent overfitting), 
    batch normalization (to stabilize learning), 
    and dense layers to make the final prediction between two classes (e.g., positive or negative).
    """
    model = Sequential()
    
    # Layer 1: Embedding layer
    if embedding_matrix is not None:
        model.add(Embedding(
            vocab_size, 
            embedding_dim, 
            weights=[embedding_matrix], 
            trainable=False
        ))
    else:
        model.add(Embedding(vocab_size, embedding_dim))
    
    # Layer 2: Spatial Dropout
    model.add(SpatialDropout1D(0.2))
    
    # Layer 3: LSTM
    model.add(LSTM(128, return_sequences=True))
    
    # Layer 4: Batch Normalization
    model.add(BatchNormalization())
    
    # Layer 5: Dropout
    model.add(Dropout(0.3))
    
    # Layer 6: LSTM
    model.add(LSTM(64, return_sequences=False))
    
    # Layer 7: Batch Normalization
    model.add(BatchNormalization())
    
    # Layer 8: Dropout
    model.add(Dropout(0.3))
    
    # Layer 9: Dense + Batch Normalization
    model.add(Dense(64, activation='relu'))
    model.add(BatchNormalization())
    
    # Layer 10: Dropout
    model.add(Dropout(0.3))
    
    # Layer 11: Dense output
    model.add(Dense(1, activation='sigmoid'))
    
    # Compile model
    model.compile(loss='binary_crossentropy',
                  optimizer=Adam(learning_rate=0.001),
                  metrics=['accuracy'])
    
    return model

def train_evaluate_model(model, X_train, y_train, X_test, y_test, batch_size=32, epochs=10, model_name="model"):
    """
    This method trains a neural network model using training data, validates it during training to avoid overfitting (with early stopping), 
    and then evaluates its performance on test data using accuracy, classification report, confusion matrix, and ROC curve. 
    It also saves training history and evaluation plots (accuracy/loss, confusion matrix, and ROC curve) to files for later review.
    """
    # Early stopping
    early_stopping = EarlyStopping(monitor='val_loss', patience=3, restore_best_weights=True)
    
    # Train model
    history = model.fit(
        X_train, y_train,
        batch_size=batch_size,
        epochs=epochs,
        validation_split=0.2,
        callbacks=[early_stopping],
        verbose=1
    )
    
    # Evaluate model
    y_pred_proba = model.predict(X_test)
    y_pred = (y_pred_proba > 0.5).astype(int)
    
    # Convert -1/1 back to 0/1 for evaluation if needed
    if -1 in y_test:
        y_test_01 = (y_test == 1).astype(int)
    else:
        y_test_01 = y_test
    
    # Metrics
    accuracy = accuracy_score(y_test_01, y_pred)
    report = classification_report(y_test_01, y_pred)
    conf_matrix = confusion_matrix(y_test_01, y_pred)
    
    # Plot training history
    plt.figure(figsize=(12, 5))
    
    plt.subplot(1, 2, 1)
    plt.plot(history.history['accuracy'])
    plt.plot(history.history['val_accuracy'])
    plt.title(f'{model_name} - Accuracy')
    plt.ylabel('Accuracy')
    plt.xlabel('Epoch')
    plt.legend(['Train', 'Validation'], loc='upper left')
    
    plt.subplot(1, 2, 2)
    plt.plot(history.history['loss'])
    plt.plot(history.history['val_loss'])
    plt.title(f'{model_name} - Loss')
    plt.ylabel('Loss')
    plt.xlabel('Epoch')
    plt.legend(['Train', 'Validation'], loc='upper left')
    
    plt.tight_layout()
    plt.savefig(f'visualization/{model_name}_training_history.png')
    plt.close()
    
    # Plot confusion matrix
    plt.figure(figsize=(8, 6))
    sns.heatmap(conf_matrix, annot=True, fmt='d', cmap='Blues',
                xticklabels=['Not Accepted', 'Accepted'],
                yticklabels=['Not Accepted', 'Accepted'])
    plt.title(f'{model_name} - Confusion Matrix')
    plt.xlabel('Predicted')
    plt.ylabel('Actual')
    plt.tight_layout()
    plt.savefig(f'visualization/{model_name}_confusion_matrix.png')
    plt.close()
    
    # Plot ROC curve
    fpr, tpr, _ = roc_curve(y_test_01, y_pred_proba)
    roc_auc = auc(fpr, tpr)
    
    plt.figure(figsize=(8, 6))
    plt.plot(fpr, tpr, color='darkorange', lw=2, label=f'ROC curve (area = {roc_auc:.2f})')
    plt.plot([0, 1], [0, 1], color='navy', lw=2, linestyle='--')
    plt.xlim([0.0, 1.0])
    plt.ylim([0.0, 1.05])
    plt.xlabel('False Positive Rate')
    plt.ylabel('True Positive Rate')
    plt.title(f'{model_name} - ROC Curve')
    plt.legend(loc="lower right")
    plt.savefig(f'visualization/{model_name}_roc_curve.png')
    plt.close()
    
    results = {
        'accuracy': accuracy,
        'report': report,
        'conf_matrix': conf_matrix,
        'roc_auc': roc_auc,
        'history': history
    }
    
    return results

if __name__ == "__main__":
    # File paths
    text_file = "farm-ads"
    vector_file = "farm-ads-vect"
    
    # Load data
    texts, labels, vector_df = load_farm_ads_data(text_file, vector_file)
    
    print(f"Loaded {len(texts)} text samples with labels: {np.unique(labels)}")
    
    # Preprocess text data
    cleaned_texts, tokenized_texts = preprocess_text(texts)
    
    # # Print the cleaned texts
    # print("\nCleaned Texts:")
    # for i, cleaned_text in enumerate(cleaned_texts[:5]):  # Print the first 5 
    #     print(f"Sample {i+1}: {cleaned_text}")
    # 
    # # Print the tokenized texts
    # print("\nTokenized Texts:")
    # for i, tokens in enumerate(tokenized_texts[:5]):  # Print the first 5 
    #     print(f"Sample {i+1}: {tokens}")
        
    visualize_data(labels, texts, cleaned_texts)
    
    # Convert labels from -1/1 to 0/1 for binary classification
    labels_01 = (labels == 1).astype(int)
    
    # Split data into train and test sets
    X_train, X_test, y_train, y_test = train_test_split(
        cleaned_texts, labels_01, test_size=0.2, random_state=42, stratify=labels_01
    )
    
    # Keep original labels for model evaluation
    _, _, y_train_orig, y_test_orig = train_test_split(
        cleaned_texts, labels, test_size=0.2, random_state=42, stratify=labels
    )
    
    print(f"Training set size: {len(X_train)}, Test set size: {len(X_test)}")
    
    # Prepare data for models
    # Tokenize text for CNN and LSTM models
    tokenizer = Tokenizer()
    tokenizer.fit_on_texts(X_train)
    vocab_size = len(tokenizer.word_index) + 1 # +1 to ensure index 0 for padding in sequence models
    print(f"Vocabulary size: {vocab_size}")
    
    # Convert text to sequences
    X_train_seq = tokenizer.texts_to_sequences(X_train)
    X_test_seq = tokenizer.texts_to_sequences(X_test)
    
    # Pad sequences (makes sequences uniform in length for neural networks by adding 0 (padding) or cut off excess (truncation)
    max_length = 100  # Can adjust text length analysis
    X_train_pad = pad_sequences(X_train_seq, maxlen=max_length)
    X_test_pad = pad_sequences(X_test_seq, maxlen=max_length)
    
    # Create Word2Vec embeddings
    print("Training Word2Vec model...")
    w2v_model = Word2Vec(sentences=tokenized_texts, vector_size=100, window=5, min_count=1, workers=4)
    
    # Create embedding matrix for pre-trained embeddings
    embedding_dim = 100
    embedding_matrix = np.zeros((vocab_size, embedding_dim))
    for word, idx in tokenizer.word_index.items():
        if word in w2v_model.wv:
            embedding_matrix[idx] = w2v_model.wv[word]
    
    # Model 1: CNN with Word2Vec embeddings
    print("\nTraining CNN model with Word2Vec embeddings...")
    cnn_model = build_cnn_model(vocab_size, embedding_dim, embedding_matrix)
    cnn_results = train_evaluate_model(
        cnn_model, X_train_pad, y_train, X_test_pad, y_test_orig, 
        batch_size=32, epochs=15, model_name="CNN_Word2Vec"
    )
    
    # Model 2: LSTM with Word2Vec embeddings
    print("\nTraining LSTM model with Word2Vec embeddings...")
    lstm_model = build_lstm_model(vocab_size, embedding_dim, embedding_matrix)
    lstm_results = train_evaluate_model(
        lstm_model, X_train_pad, y_train, X_test_pad, y_test_orig, 
        batch_size=32, epochs=15, model_name="LSTM_Word2Vec"
    )
    
    # Compare all models
    all_model_results = {
        'CNN_Word2Vec': cnn_results,
        'LSTM_Word2Vec': lstm_results
    }
        
    # Saving Models
    # 1. Save Word2Vec (vectors and vocabulary)
    w2v_model.save("models/word2vec.model")
    np.save("models/embedding_matrix.npy", embedding_matrix)
    # 2. Save CNN (architecture 'layers', weights, optimizer state)
    cnn_model.save("models/cnn_word2vec.keras")
    # 3. Save LSTM 
    lstm_model.save("models/lstm_word2vec.keras")
    

Loaded 4143 text samples with labels: [-1  1]
Training set size: 3314, Test set size: 829
Vocabulary size: 39293
Training Word2Vec model...

Training CNN model with Word2Vec embeddings...
Epoch 1/15
[1m83/83[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m2s[0m 11ms/step - accuracy: 0.7021 - loss: 0.6471 - val_accuracy: 0.6109 - val_loss: 0.6651
Epoch 2/15
[1m83/83[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 10ms/step - accuracy: 0.8116 - loss: 0.4014 - val_accuracy: 0.8808 - val_loss: 0.3111
Epoch 3/15
[1m83/83[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 10ms/step - accuracy: 0.8576 - loss: 0.3411 - val_accuracy: 0.8839 - val_loss: 0.2706
Epoch 4/15
[1m83/83[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 10ms/step - accuracy: 0.8560 - loss: 0.3226 - val_accuracy: 0.8839 - val_loss: 0.2771
Epoch 5/15
[1m83/83[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 10ms/step - accuracy: 0.8820 - loss: 0.2884 - val_accuracy: 0.9050 - val_loss: 0.2483
Epoch 6