In [1]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from gensim.models import Word2Vec
from transformers import BertTokenizer, TFBertModel
import tensorflow as tf
from keras.preprocessing.sequence import pad_sequences

# Loading the data
def load_farm_ads_data(text_file: str, vector_file: str):
    """Load and parse the farm ads text file and its sparse vector file.

    Each non-blank line of ``text_file`` is ``<label> <token> <token> ...``.
    Each non-blank line of ``vector_file`` is ``<label> <idx>:<val> ...``.
    Blank lines are skipped in both files.

    Args:
        text_file: Path to the whitespace-separated text file.
        vector_file: Path to the sparse ``idx:val`` feature file.

    Returns:
        A tuple ``(texts, labels, vector_df)`` where ``texts`` and ``labels``
        are the ``.values`` arrays of the parsed text file and ``vector_df``
        is a dense DataFrame with one column per feature index seen in
        ``vector_file`` (in order of first appearance), missing entries
        filled with 0.

    Raises:
        ValueError: If a label, feature index or feature value cannot be
            parsed, or a feature item is not of the form ``idx:val``.
    """
    # One {'label', 'text'} record per non-blank line of the text file
    text_rows = []
    with open(text_file, 'r', encoding='utf-8') as f:
        for line in f:
            parts = line.split()
            if not parts:  # skip empty lines
                continue
            # First token is the label, the remainder is the ad text
            text_rows.append({'label': int(parts[0]), 'text': ' '.join(parts[1:])})

    # Naming the columns explicitly keeps the lookups below valid even when
    # the file contains no data lines (otherwise the frame has no columns
    # and text_df['label'] raises KeyError).
    text_df = pd.DataFrame(text_rows, columns=['label', 'text'])
    labels = text_df['label'].values
    texts = text_df['text'].values

    # One {feature_index: value} dictionary per non-blank line
    vector_data = []
    with open(vector_file, 'r', encoding='utf-8') as f:
        for line in f:
            parts = line.split()
            if not parts:  # skip empty lines
                continue
            # Parsed only to reject malformed lines; the returned labels
            # come from the text file.
            int(parts[0])
            features = {}
            for item in parts[1:]:
                # '3:1' -> idx = '3', val = '1'
                idx, val = item.split(':')
                features[int(idx)] = float(val)
            vector_data.append(features)

    # Features absent from a row become NaN in the frame; replace them with 0
    vector_df = pd.DataFrame(vector_data).fillna(0)

    return texts, labels, vector_df

def preprocess_data(texts, labels, vector_features, test_size=0.2, random_state=123, w2v_size=100, w2v_window=5, w2v_min_count=1, seq_maxlen=128, bert_max_length=512):
    """
    Preprocess data with:
        - Word2Vec embeddings
        - BERT tokenization
        - Vector feature normalization

    Args:
        texts: Iterable of whitespace-tokenizable strings, one per sample.
        labels: Sequence of labels aligned with ``texts``.
        vector_features: 2-D array-like of numeric features aligned with
            ``texts``.
        test_size: Passed to ``train_test_split``.
        random_state: Passed to ``train_test_split``.
        w2v_size: Word2Vec ``vector_size``.
        w2v_window: Word2Vec ``window``.
        w2v_min_count: Word2Vec ``min_count``.
        seq_maxlen: Length every word-index sequence is padded/truncated to.
        bert_max_length: Maximum token length passed to the BERT tokenizer.
            Defaults to 512, the position limit of ``bert-base-cased``.

    Returns:
        A dict with keys:
            'word_embeddings': (X_train_seq, X_test_seq, X_train_vec,
                X_test_vec, y_train, y_test)
            'bert': (bert_train, bert_test, X_train_vec, X_test_vec,
                y_train, y_test)
            'w2v_model': the trained Word2Vec model
            'embedding_matrix': array of shape (vocab + 1, w2v_size); row 0
                is all zeros and reserved for padding
            'word_index': word -> row of 'embedding_matrix' (1-based)
            'bert_tokenizer': the loaded BertTokenizer
            'scaler': StandardScaler fitted on the training rows only
    """
    # Labels as a column vector; values are kept as given
    y = np.array(labels).reshape(-1, 1)

    # Plain Python strings work for both str.split and the BERT tokenizer,
    # regardless of whether `texts` is a list or a numpy array.
    text_list = [str(text) for text in texts]

    # Word2Vec Embeddings - tokenize once and reuse for training and indexing
    tokenized_texts = [text.split() for text in text_list]

    # Train Word2Vec model
    # NOTE(review): the model is trained on all texts, including those that
    # end up in the test split.
    w2v_model = Word2Vec(
        sentences=tokenized_texts,
        vector_size=w2v_size,
        window=w2v_window,
        min_count=w2v_min_count,
        workers=4
    )

    # pad_sequences pads with 0, so shift every word index up by one to keep
    # row 0 of the embedding matrix as the (all-zero) padding vector. Without
    # the shift the most frequent word shares index 0 with the padding.
    word_index = {word: idx + 1 for word, idx in w2v_model.wv.key_to_index.items()}

    # Create embedding matrix (row 0 stays zero for padding)
    embedding_matrix = np.zeros((len(word_index) + 1, w2v_size))
    for word, row in word_index.items():
        embedding_matrix[row] = w2v_model.wv[word]

    # Convert texts to sequences of word indices; out-of-vocabulary tokens
    # (possible when w2v_min_count > 1) are dropped
    sequences = [
        [word_index[token] for token in tokens if token in word_index]
        for tokens in tokenized_texts
    ]
    X_text_seq = pad_sequences(sequences, maxlen=seq_maxlen)

    # Bert Tokenization
    bert_tokenizer = BertTokenizer.from_pretrained('bert-base-cased')
    bert_inputs = bert_tokenizer(
        text_list,
        padding=True,
        truncation=True,
        max_length=bert_max_length,
        return_tensors="tf"
    )

    # Create consistent split indices shared by every representation
    indices = np.arange(len(y))
    train_idx, test_idx = train_test_split(
        indices,
        test_size=test_size,
        random_state=random_state,
        stratify=y
    )

    # Vector Feature Normalization - fit on the training rows only so that
    # test-set statistics do not leak into the scaling
    features = np.asarray(vector_features)
    scaler = StandardScaler()
    X_train_vec = scaler.fit_transform(features[train_idx])
    X_test_vec = scaler.transform(features[test_idx])

    # Splits to the remaining representations
    X_train_seq, X_test_seq = X_text_seq[train_idx], X_text_seq[test_idx]
    y_train, y_test = y[train_idx], y[test_idx]

    # Split Bert inputs
    bert_train = {
        'input_ids': tf.gather(bert_inputs['input_ids'], train_idx),
        'attention_mask': tf.gather(bert_inputs['attention_mask'], train_idx)
    }
    bert_test = {
        'input_ids': tf.gather(bert_inputs['input_ids'], test_idx),
        'attention_mask': tf.gather(bert_inputs['attention_mask'], test_idx)
    }

    return {
        'word_embeddings': (X_train_seq, X_test_seq, X_train_vec, X_test_vec, y_train, y_test),
        'bert': (bert_train, bert_test, X_train_vec, X_test_vec, y_train, y_test),
        'w2v_model': w2v_model,
        'embedding_matrix': embedding_matrix,
        'word_index': word_index,
        'bert_tokenizer': bert_tokenizer,
        'scaler': scaler
    }

if __name__ == '__main__':
    # Input locations: raw ad text and the matching sparse feature vectors
    text_file, vector_file = "farm-ads", "farm-ads-vect"

    # Parse both files
    texts, labels, vector_data = load_farm_ads_data(text_file, vector_file)

    # Preview of the raw text samples
    print("\nFirst 5 text samples with labels:")
    for row_no in range(5):
        sample_label, sample_text = labels[row_no], texts[row_no]
        print(f"Label: {sample_label}\tText: {sample_text[:50]}...")

    print("\nVector data shape:", vector_data.shape)
    print("First 5 rows of vector data (non-zero features only):")
    # Per row, keep only the non-zero columns and show them as a dictionary
    head_rows = vector_data.iloc[:5]
    print(head_rows.apply(lambda row: row[row != 0].to_dict(), axis=1))

    # ===================
    # Preprocess with the labels exactly as loaded
    w2v_settings = {
        'w2v_size': 100,     # Embedding dimension
        'w2v_window': 5,     # Context window size
        'w2v_min_count': 1,  # Minimum word frequency
    }
    processed = preprocess_data(texts, labels, vector_data.values, **w2v_settings)

    # Local aliases for the pieces reported below
    seq_train, seq_test, vec_train, vec_test, y_train, y_test = processed['word_embeddings']
    w2v_model = processed['w2v_model']
    w2v_vectors = w2v_model.wv
    embedding_matrix = processed['embedding_matrix']

    # 1. Basic shapes and info
    print("\n=== Data Shapes ===")
    print(f"Word2Vec Train Sequences: {seq_train.shape}")
    print(f"Word2Vec Test Sequences: {seq_test.shape}")
    print(f"Vector Features Train: {vec_train.shape}")
    print(f"Vector Features Test: {vec_test.shape}")
    print(f"Labels Train: {y_train.shape}")
    print(f"Labels Test: {y_test.shape}")

    # 2. Word2Vec model info
    print("\n=== Word2Vec Model Info ===")
    print(f"Vocabulary size: {len(w2v_vectors)}")
    print(f"Embedding dimension: {w2v_model.vector_size}")
    print("Sample words and their vectors:")
    for word in w2v_vectors.index_to_key[:3]:  # First 3 words
        print(f"'{word}': {w2v_vectors[word][:5]}...")  # First 5 dimensions

    # 3. Sample sequences
    print("\n=== Sample Text Sequences ===")
    print("First training sample (word indices):")
    print(seq_train[0])
    print("\nCorresponding original text:")
    # NOTE(review): texts[0] is the first row of the input file, while the
    # training arrays are selected by split indices inside preprocess_data,
    # so this may not be the same sample as seq_train[0] - confirm.
    print(texts[0][:100] + "...")  # First 100 chars

    # 4. Vector features
    print("\n=== Vector Feature Samples ===")
    print("First training sample (non-zero features only):")
    first_sample = vec_train[0]  # First training row of the vector features
    non_zero = {pos: value for pos, value in enumerate(first_sample) if value != 0}
    print(f"{len(non_zero)} non-zero features out of {len(first_sample)}")
    print(list(non_zero.items())[:5])  # First 5 non-zero features

    # 5. Label distribution
    print("\n=== Label Distribution ===")
    print("Training set:")
    train_classes, train_counts = np.unique(y_train, return_counts=True)
    print(dict(zip(train_classes, train_counts)))
    print("\nTest set:")
    test_classes, test_counts = np.unique(y_test, return_counts=True)
    print(dict(zip(test_classes, test_counts)))

    # 6. Embedding matrix info
    print("\n=== Embedding Matrix ===")
    print(f"Shape: {embedding_matrix.shape}")
    print("Sample embedding (first word):")
    print(embedding_matrix[1][:10])  # First 10 dimensions of the row at index 1



First 5 text samples with labels:
Label: 1	Text: ad-jerry ad-bruckheimer ad-chase ad-premier ad-sep...
Label: -1	Text: ad-rheumatoid ad-arthritis ad-expert ad-tip ad-inf...
Label: -1	Text: ad-rheumatologist ad-anju ad-varghese ad-yonker ad...
Label: -1	Text: ad-siemen ad-water ad-remediation ad-water ad-scar...
Label: -1	Text: ad-symptom ad-muscle ad-weakness ad-genetic ad-dis...

Vector data shape: (4143, 54877)
First 5 rows of vector data (non-zero features only):
0    {1: 1.0, 2: 1.0, 3: 1.0, 4: 1.0, 5: 1.0, 6: 1....
1    {10: 1.0, 11: 1.0, 12: 1.0, 13: 1.0, 14: 1.0, ...
2    {29: 1.0, 31: 1.0, 35: 1.0, 101: 1.0, 131: 1.0...
3    {34: 1.0, 35: 1.0, 36: 1.0, 44: 1.0, 54: 1.0, ...
4    {8: 1.0, 9: 1.0, 429: 1.0, 430: 1.0, 431: 1.0,...
dtype: object

=== Data Shapes ===
Word2Vec Train Sequences: (3314, 128)
Word2Vec Test Sequences: (829, 128)
Vector Features Train: (3314, 54877)
Vector Features Test: (829, 54877)
Labels Train: (3314, 1)
Labels Test: (829, 1)

=== Word2Vec Model Info =