# **1. Mathematical Functions**

In [1]:
import random
import math
import csv
import re
import sys

# **2. Data Loading and Preprocessing Functions**

# *2.1. Load Dataset*

In [13]:
def load_dataset(file_path):
    import csv
    import sys
    # Increase the field size limit
    max_int = sys.maxsize
    decrement = True
    while decrement:
        decrement = False
        try:
            csv.field_size_limit(max_int)
        except OverflowError:
            max_int = int(max_int / 10)
            decrement = True

    emails = []
    labels = []
    with open(file_path, 'r', encoding='utf-8') as file:
        reader = csv.reader(file)
        header = next(reader, None)
        for row in reader:
            if len(row) < 2:
                continue
            email = ','.join(row[:-1])
            label = row[-1]
            emails.append(email)
            labels.append(int(label))
    return emails, labels

# *2.2 Preprocess Text*

In [14]:
def preprocess_text(text):
    # Convert text to lowercase and remove non-alphabetic characters
    text = re.sub(r'[^a-zA-Z\s]', ' ', text.lower())
    words = text.split()
    # Remove stopwords
    stopwords = set([
        'the', 'and', 'is', 'in', 'to', 'of', 'it', 'that', 'this', 'a', 'an',
        'for', 'on', 'with', 'as', 'at', 'by', 'from', 'or', 'but', 'if', 'then'
    ])
    words = [word for word in words if word not in stopwords]
    # Simple stemming
    suffixes = ('ing', 'ed', 's', 'es')
    stemmed_words = []
    for word in words:
        for suffix in suffixes:
            if word.endswith(suffix) and len(word) > len(suffix) + 2:
                word = word[:-len(suffix)]
                break
        stemmed_words.append(word)
    return stemmed_words


# *2.3 Build Vocabulary*

In [15]:
def build_vocabulary(tokenized_texts):
    vocab = {}
    index = 0
    for words in tokenized_texts:
        for word in words:
            if word not in vocab:
                vocab[word] = index
                index += 1
    return vocab

# *2.4 Balance the Dataset*

In [16]:
def balance_dataset(emails, labels):
    spam_emails = [(email, label) for email, label in zip(emails, labels) if label == 1]
    not_spam_emails = [(email, label) for email, label in zip(emails, labels) if label == 0]
    min_count = min(len(spam_emails), len(not_spam_emails))
    spam_emails_sampled = spam_emails[:min_count]
    not_spam_emails_sampled = not_spam_emails[:min_count]
    balanced_data = spam_emails_sampled + not_spam_emails_sampled
    random.shuffle(balanced_data)
    balanced_emails = [email for email, label in balanced_data]
    balanced_labels = [label for email, label in balanced_data]
    return balanced_emails, balanced_labels

# *2.5 Main Preprocessing Function*

In [17]:
def preprocess_data(file_path):
    emails, labels = load_dataset(file_path)
    emails, labels = balance_dataset(emails, labels)
    tokenized_texts = [preprocess_text(email) for email in emails]
    vocab = build_vocabulary(tokenized_texts)
    return tokenized_texts, labels, vocab

# **3. Training Functions**

# *3.1 Initialize Embeddings*

In [18]:
def initialize_embeddings(vocab_size, embedding_dim):
    embeddings = {}
    for i in range(vocab_size):
        embeddings[i] = [random.uniform(-0.5, 0.5) for _ in range(embedding_dim)]
    return embeddings

# *3.2 Generate Training Data*

In [19]:
def generate_training_data(tokenized_texts, vocab, window_size):
    training_data = []
    for words in tokenized_texts:
        for idx, word in enumerate(words):
            target_word_idx = vocab[word]
            context_indices = []
            for neighbor in range(max(idx - window_size, 0), min(idx + window_size + 1, len(words))):
                if neighbor != idx:
                    context_word = words[neighbor]
                    context_indices.append(vocab[context_word])
            training_data.append((target_word_idx, context_indices))
    return training_data

# *3.3 Negative Sampling*

In [20]:
def generate_negative_samples(vocab_size, count):
    negative_samples = [random.randint(0, vocab_size - 1) for _ in range(count)]
    return negative_samples

# *3.4  Train Word2Vec Model*

In [21]:
def train_word2vec(training_data, vocab_size, embedding_dim, epochs, learning_rate, negative_sample_size):
    # Initialize embeddings
    W = initialize_embeddings(vocab_size, embedding_dim)
    W_prime = initialize_embeddings(vocab_size, embedding_dim)

    for epoch in range(epochs):
        total_loss = 0.0
        random.shuffle(training_data)

        for target_word_idx, context_indices in training_data:
            # Positive samples
            for context_word_idx in context_indices:
                label = 1
                score = sum(W[target_word_idx][k] * W_prime[context_word_idx][k] for k in range(embedding_dim))
                pred = 1 / (1 + math.exp(-score))
                pred = max(min(pred, 1 - 1e-7), 1e-7)
                loss = -math.log(pred)
                total_loss += loss
                grad = learning_rate * (label - pred)
                for k in range(embedding_dim):
                    w_temp = W[target_word_idx][k]
                    w_prime_temp = W_prime[context_word_idx][k]
                    W[target_word_idx][k] += grad * w_prime_temp
                    W_prime[context_word_idx][k] += grad * w_temp

            # Negative samples
            negative_samples = generate_negative_samples(vocab_size, negative_sample_size)
            for negative_word_idx in negative_samples:
                label = 0
                score = sum(W[target_word_idx][k] * W_prime[negative_word_idx][k] for k in range(embedding_dim))
                pred = 1 / (1 + math.exp(-score))
                pred = max(min(pred, 1 - 1e-7), 1e-7)
                loss = -math.log(1 - pred)
                total_loss += loss
                grad = learning_rate * (label - pred)
                for k in range(embedding_dim):
                    w_temp = W[target_word_idx][k]
                    w_prime_temp = W_prime[negative_word_idx][k]
                    W[target_word_idx][k] += grad * w_prime_temp
                    W_prime[negative_word_idx][k] += grad * w_temp

        print('Epoch', epoch + 1, 'Loss:', total_loss)
    return W

# **4. Saving Data**

In [22]:
def save_embeddings(embeddings, file_name):
    with open(file_name, 'w', encoding='utf-8') as f:
        for idx in embeddings:
            vector = embeddings[idx]
            vector_str = ' '.join([str(v) for v in vector])
            f.write(f'{idx} {vector_str}\n')

def save_vocab(vocab, file_name):
    with open(file_name, 'w', encoding='utf-8') as f:
        for word in vocab:
            f.write(f'{word} {vocab[word]}\n')

def save_list(data_list, file_name):
    with open(file_name, 'w', encoding='utf-8') as f:
        for item in data_list:
            if isinstance(item, list):
                f.write(' '.join(item) + '\n')
            else:
                f.write(str(item) + '\n')

# **Running the Word2Vec Training**

In [25]:
# Parameters
embedding_dim = 10
window_size = 2
epochs = 30
learning_rate = 0.005
negative_sample_size = 5

# Example usage
file_path = 'spam_or_not_spam.csv'
tokenized_texts, labels, vocab = preprocess_data(file_path)
training_data = generate_training_data(tokenized_texts, vocab, window_size)
vocab_size = len(vocab)
word_embeddings = train_word2vec(training_data, vocab_size, embedding_dim, epochs, learning_rate, negative_sample_size)

# Save the data
save_embeddings(word_embeddings, 'word_embeddings.txt')
save_vocab(vocab, 'vocab.txt')
save_list(tokenized_texts, 'tokenized_texts.txt')
save_list(labels, 'labels.txt')

Epoch 1 Loss: 1371294.5840724749
Epoch 2 Loss: 1211830.3250119959
Epoch 3 Loss: 1088256.054288666
Epoch 4 Loss: 984327.2279765035
Epoch 5 Loss: 905617.3793770841
Epoch 6 Loss: 848571.5099918712
Epoch 7 Loss: 811367.4135964148
Epoch 8 Loss: 783492.7025554841
Epoch 9 Loss: 764442.8687006214
Epoch 10 Loss: 748135.8840426878
Epoch 11 Loss: 734880.5954946207
Epoch 12 Loss: 723378.5425682429
Epoch 13 Loss: 714362.0175240496
Epoch 14 Loss: 705572.8465825928
Epoch 15 Loss: 697286.5573918801
Epoch 16 Loss: 690388.2566687807
Epoch 17 Loss: 683649.1868091067
Epoch 18 Loss: 677116.4908259144
Epoch 19 Loss: 672615.7986845918
Epoch 20 Loss: 666530.6553002902
Epoch 21 Loss: 660850.7973566239
Epoch 22 Loss: 655172.8342332322
Epoch 23 Loss: 651235.086359787
Epoch 24 Loss: 645524.0220935084
Epoch 25 Loss: 642038.4870288984
Epoch 26 Loss: 637715.6863427298
Epoch 27 Loss: 633420.054010779
Epoch 28 Loss: 629903.9099397415
Epoch 29 Loss: 625406.539674177
Epoch 30 Loss: 620970.8727412712
