In [2]:
import os

def extract_emails(directory, output_file):
    with open(output_file, 'w', encoding='utf-8') as out_file:
        for root, _, files in os.walk(directory):
            for file in files:
                # Process text files, eml files, or files without an extension
                if file.endswith('.txt') or file.endswith('.eml') or not os.path.splitext(file)[1]:
                    file_path = os.path.join(root, file)
                    try:
                        with open(file_path, 'r', encoding='utf-8', errors='ignore') as f:
                            content = f.read()
                            out_file.write(content + '\n\n')  # Separate emails by two newlines
                    except Exception as e:
                        print(f"Error reading {file_path}: {e}")

# Paths to directories
ham_dir = 'enron/ham'
spam_dir = 'enron/spam'

# Output files
ham_output_file = 'enron/ham.txt'
spam_output_file = 'enron/spam.txt'

# Extract emails
extract_emails(ham_dir, ham_output_file)
extract_emails(spam_dir, spam_output_file)

print("Extraction complete.")


The history saving thread hit an unexpected error (OperationalError('attempt to write a readonly database')).History will not be written to the database.
Extraction complete.


## #1 Logistic Regression


In [10]:
import numpy as np
from collections import Counter
import random
import sys
import chardet

np.random.seed(12345)

print("Importing dataset from disk...")

def detect_encoding(file_path):
    with open(file_path, 'rb') as f:
        raw_data = f.read()
        result = chardet.detect(raw_data)
        return result['encoding']

# Detect encoding for spam.txt
encoding_spam = detect_encoding('spam.txt')
with open('spam.txt', 'r', encoding=encoding_spam, errors='replace') as f:
    raw = f.readlines()

spam = [row[:-2].split(" ") for row in raw]

# Detect encoding for ham.txt
encoding_ham = detect_encoding('ham.txt')
with open('ham.txt', 'r', encoding=encoding_ham, errors='replace') as f:
    raw = f.readlines()

ham = [row[:-2].split(" ") for row in raw]

class LogisticRegression(object):
    
    def __init__(self, positives, negatives, iterations=10, alpha=0.01, regularization_strength=0.01):
        
        self.maxweight = 10
        self.alpha = alpha
        self.regularization_strength = regularization_strength
        
        # Create vocabulary
        cnts = Counter()
        for email in (positives + negatives):
            for word in email:
                cnts[word] += 1
        
        vocab = list(cnts.keys())
        self.word2index = {word: i for i, word in enumerate(vocab)}
    
        # Initialize weights
        self.weights = (np.random.rand(len(vocab)) - 0.5) * 0.1
        
        # Train model
        self.train(positives, negatives, iterations=iterations)
    
    def train(self, positives, negatives, iterations=10):
        
        for iter in range(iterations):
            error = 0
            n = 0
            for i in range(max(len(positives), len(negatives))):
                error += np.abs(self.learn(positives[i % len(positives)], 1))
                error += np.abs(self.learn(negatives[i % len(negatives)], 0))
                n += 2

            print("Iter:" + str(iter) + " Loss:" + str(error / float(n)))
    
    @staticmethod
    def softmax(x):
        x = np.clip(x, -500, 500)  # Clip values to avoid overflow
        return 1 / (1 + np.exp(-x))
    
    def predict(self, email):
        return self.unencrypted_predict(email)
    
    def unencrypted_predict(self, email):
        pred = 0
        for word in email:
            pred += self.weights[self.word2index[word]]
        pred = self.softmax(pred)
        return pred

    def learn(self, email, target):
        pred = self.predict(email)
        delta = (pred - target)
        for word in email:
            self.weights[self.word2index[word]] -= (delta * self.alpha + self.regularization_strength * self.weights[self.word2index[word]])
        return delta

model = LogisticRegression(spam[0:-1000], ham[0:-1000], iterations=20, alpha=0.01, regularization_strength=0.005)

# Evaluate the model
fp = 0
tn = 0
tp = 0
fn = 0

for i, h in enumerate(ham[-1000:]):
    pred = model.predict(h)
    if pred < 0.5:
        tn += 1
    else:
        fp += 1

    if i % 10 == 0:
        sys.stdout.write('\r I:' + str(tn + tp + fn + fp) + " % Correct:" + str(100 * tn / float(tn + fp))[0:6])

for i, h in enumerate(spam[-1000:]):
    pred = model.predict(h)
    if pred > 0.5:
        tp += 1
    else:
        fn += 1

    if i % 10 == 0:
        sys.stdout.write('\r I:' + str(tn + tp + fn + fp) + " % Correct:" + str(100 * (tn + tp) / float(tn + tp + fn + fp))[0:6])

sys.stdout.write('\r I:' + str(tn + tp + fn + fp) + " % Correct:" + str(100 * (tn + tp) / float(tn + tp + fn + fp))[0:6])

print("\n Accuracy: %" + str(100 * (tn + tp) / float(tn + tp + fn + fp))[0:6])
print("False Positives: %" + str(100 * fp / float(tp + fp))[0:4] + "    <- privacy violation level")
print("False Negatives: %" + str(100 * fn / float(tn + fn))[0:4] + "   <- security risk level")


Importing dataset from disk...
Iter:0 Loss:0.1418565838862508
Iter:1 Loss:0.1159124479776207
Iter:2 Loss:0.11007672910511375
Iter:3 Loss:0.10665691746145255
Iter:4 Loss:0.10412171061464175
Iter:5 Loss:0.10202175002694532
Iter:6 Loss:0.10078361353158154
Iter:7 Loss:0.09973163654990577
Iter:8 Loss:0.09914064915691775
Iter:9 Loss:0.09824559484017459
Iter:10 Loss:0.09769536162716529
Iter:11 Loss:0.09697567747573058
Iter:12 Loss:0.09684459533264063
Iter:13 Loss:0.09674045756105
Iter:14 Loss:0.09605884690921715
Iter:15 Loss:0.09590132617151204
Iter:16 Loss:0.09578045455963016
Iter:17 Loss:0.09500135219551235
Iter:18 Loss:0.09517674220316232
Iter:19 Loss:0.09461852895063669
 I:2000 % Correct:99.497
 Accuracy: %99.4
False Positives: %0.89    <- privacy violation level
False Negatives: %0.30   <- security risk level


In [None]:
## from flask import Flask, request, jsonify
import re

# Initialize Flask app
app = Flask(__name__)

# Assuming `model` is your unencrypted model from previous steps
# Initialize the model here if necessary

# Define the preprocessing function to handle case and punctuation
def preprocess_email(email):
    # Convert to lowercase
    email = email.lower()
    
    # Remove punctuation using regex
    email = re.sub(r'[^\w\s]', '', email)
    
    # Tokenize by splitting on spaces
    email_tokens = email.split()
    
    return email_tokens

@app.route('/predict', methods=['POST'])
def predict():
    data = request.json
    email = data.get('email', '')
    
    if not email:
        return jsonify({'error': 'No email provided'}), 400
    
    # Preprocess the email input
    email_tokens = preprocess_email(email)
    
    # Run through the model
    try:
        pred = model.predict(email_tokens)
        
        # Determine whether it's spam or not
        if pred > 0.5:
            result = "spam"
        else:
            result = "not spam"
        
        # Return the result as JSON
        return jsonify({'prediction': result})
    
    except KeyError as e:
        # Return an error message if a word is not found in the vocabulary
        return jsonify({'error': f"Word '{e.args[0]}' not in vocabulary"}), 400

# Run the Flask server
if __name__ == '__main__':
    app.run(port=12345)  # Default port is 5000


## Pallier Homomorphic Encryption

In [12]:
import math
import libnum

# Function to compute modular inverse
def mod_inverse(x, n):
    return pow(x, -1, n)

# L function for decryption
def L(x, n):
    return (x - 1) // n

# Paillier key generation using libnum for prime generation
def generate_keys(bit_length=512):
    # Generate two large prime numbers p and q using libnum
    p = libnum.generate_prime(bit_length // 2)
    q = libnum.generate_prime(bit_length // 2)

    n = p * q  # n = p * q
    n_sq = n * n  # n^2 for the modulus in encryption

    # λ (lambda) = lcm(p-1, q-1)
    lambda_param = (p - 1) * (q - 1) // math.gcd(p - 1, q - 1)

    # g can be any number (usually n+1)
    g = n + 1

    # µ = (L(g^λ mod n^2))^(-1) mod n
    mu = mod_inverse(L(pow(g, lambda_param, n_sq), n), n)

    public_key = (n, g)
    private_key = (lambda_param, mu)

    return public_key, private_key

# Custom EncryptedNumber class to mimic object-like string representation
class EncryptedNumber:
    def __init__(self, ciphertext):
        self.ciphertext = ciphertext

    def __str__(self):
        # This returns a string similar to an object representation
        return f"<EncryptedNumber object at {hex(id(self))}>"

# Paillier encryption
def encrypt(public_key, plaintext):
    n, g = public_key
    n_sq = n * n

    # Choose random r where 1 <= r < n
    r = libnum.randint_bits(n.bit_length() - 1) % n

    # Encryption formula: c = g^m * r^n mod n^2
    ciphertext = (pow(g, plaintext, n_sq) * pow(r, n, n_sq)) % n_sq
    return EncryptedNumber(ciphertext)  # Wrap ciphertext in EncryptedNumber class

# Paillier decryption
def decrypt(private_key, public_key, encrypted_number):
    n, g = public_key
    lambda_param, mu = private_key
    n_sq = n * n

    # Extract the actual ciphertext from the EncryptedNumber object
    ciphertext = encrypted_number.ciphertext

    # Decryption formula: m = L(c^λ mod n^2) * µ mod n
    x = pow(ciphertext, lambda_param, n_sq)
    plaintext = (L(x, n) * mu) % n
    return plaintext

# Homomorphic addition of two ciphertexts
def homomorphic_add(public_key, c1, c2):
    n, _ = public_key
    n_sq = n * n

    # Homomorphic addition: c3 = (c1 * c2) mod n^2
    return EncryptedNumber((c1.ciphertext * c2.ciphertext) % n_sq)

# Example usage:
if __name__ == "__main__":
    # Key generation (512-bit security)
    public_key, private_key = generate_keys(bit_length=512)

    # Encrypt two messages
    m1 = 42
    m2 = 23
    print(f"Original messages: {m1}, {m2}")

    c1 = encrypt(public_key, m1)
    c2 = encrypt(public_key, m2)
    print(f"Encrypted messages: {c1}, {c2}")

    # Homomorphic addition of encrypted messages
    c3 = homomorphic_add(public_key, c1, c2)
    print(f"Encrypted sum: {c3}")

    # Decrypt the sum
    decrypted_sum = decrypt(private_key, public_key, c3)
    print(f"Decrypted sum: {decrypted_sum}")


Original messages: 42, 23
Encrypted messages: <EncryptedNumber object at 0x10f0df980>, <EncryptedNumber object at 0x10f261b50>
Encrypted sum: <EncryptedNumber object at 0x10aefb620>
Decrypted sum: 65


In [5]:
import numpy as np
from collections import Counter
import random
import sys
import chardet
import math
import libnum

np.random.seed(12345)

# Function to detect file encoding
def detect_encoding(file_path):
    with open(file_path, 'rb') as f:
        raw_data = f.read()
        result = chardet.detect(raw_data)
        return result['encoding']

# Load datasets
def load_data(file_path):
    encoding = detect_encoding(file_path)
    with open(file_path, 'r', encoding=encoding, errors='replace') as f:
        raw = f.readlines()
    return [row[:-2].split(" ") for row in raw]

spam = load_data('spam.txt')
ham = load_data('ham.txt')

# Paillier Encryption Functions
def mod_inverse(x, n):
    return pow(x, -1, n)

def L(x, n):
    return (x - 1) // n

def generate_keys(bit_length=512):
    p = libnum.generate_prime(bit_length // 2)
    q = libnum.generate_prime(bit_length // 2)
    n = p * q
    n_sq = n * n
    lambda_param = (p - 1) * (q - 1) // math.gcd(p - 1, q - 1)
    g = n + 1
    mu = mod_inverse(L(pow(g, lambda_param, n_sq), n), n)
    public_key = (n, g)
    private_key = (lambda_param, mu)
    return public_key, private_key

class EncryptedNumber:
    def __init__(self, ciphertext):
        self.ciphertext = ciphertext

    def __str__(self):
        return f"<EncryptedNumber object at {hex(id(self))}>"

def encrypt(public_key, plaintext):
    n, g = public_key
    n_sq = n * n
    r = libnum.randint_bits(n.bit_length() - 1) % n
    ciphertext = (pow(g, plaintext, n_sq) * pow(r, n, n_sq)) % n_sq
    return EncryptedNumber(ciphertext)

def decrypt(private_key, public_key, encrypted_number):
    n, g = public_key
    lambda_param, mu = private_key
    n_sq = n * n
    ciphertext = encrypted_number.ciphertext
    x = pow(ciphertext, lambda_param, n_sq)
    plaintext = (L(x, n) * mu) % n
    return plaintext

def homomorphic_add(public_key, c1, c2):
    n, _ = public_key
    n_sq = n * n
    return EncryptedNumber((c1.ciphertext * c2.ciphertext) % n_sq)

# Logistic Regression Class
class LogisticRegression:
    def __init__(self, positives, negatives, public_key, iterations=10, alpha=0.01, regularization_strength=0.01):
        self.maxweight = 10
        self.alpha = alpha
        self.regularization_strength = regularization_strength
        self.pubkey = public_key
        
        # Create vocabulary
        cnts = Counter()
        for email in (positives + negatives):
            for word in email:
                cnts[word] += 1

        vocab = list(cnts.keys())
        self.word2index = {word: i for i, word in enumerate(vocab)}

        # Initialize weights
        self.weights = (np.random.rand(len(vocab)) - 0.5) * 0.1
        
        # Scale and convert weights to native Python int
        self.weights = np.round(self.weights * 100).astype(int).tolist()

        # Encrypt the weights
        self.encrypted_weights = [encrypt(public_key, weight) for weight in self.weights]

        # Train model
        self.train(positives, negatives, iterations=iterations)

    def train(self, positives, negatives, iterations=10):
        for iter in range(iterations):
            error = 0
            n = 0
            for i in range(max(len(positives), len(negatives))):
                error += np.abs(self.learn(positives[i % len(positives)], 1))
                error += np.abs(self.learn(negatives[i % len(negatives)], 0))
                n += 2
            print("Iter:" + str(iter) + " Loss:" + str(error / float(n)))

    @staticmethod
    def softmax(x):
        x = np.clip(x, -500, 500)
        return 1 / (1 + np.exp(-x))

    def predict(self, email):
        return self.encrypted_predict(email)

    def encrypted_predict(self, email):
        pred = encrypt(self.pubkey, 0)  # Start with encrypted zero
        for word in email:
            weight = self.encrypted_weights[self.word2index[word]]
            pred = homomorphic_add(self.pubkey, pred, weight)
        return pred

    def learn(self, email, target):
        pred = self.decrypt_predict(email)
        delta = (pred - target)
        for word in email:
            index = self.word2index[word]
            self.weights[index] -= (delta * self.alpha + self.regularization_strength * self.weights[index])
            self.encrypted_weights[index] = encrypt(self.pubkey, int(self.weights[index]))  # Ensure conversion
        return delta

    def decrypt_predict(self, email):
        pred = 0
        for word in email:
            pred += self.weights[self.word2index[word]]
        pred = self.softmax(pred)
        return pred


# Example Usage
if __name__ == "__main__":
    # Generate keys
    public_key, private_key = generate_keys(bit_length=512)
    
    # Create and train Logistic Regression model
    model = LogisticRegression(spam[0:-1000], ham[0:-1000], public_key, iterations=20, alpha=0.01, regularization_strength=0.005)

    # Evaluate the model
    fp, tn, tp, fn = 0, 0, 0, 0

    for i, h in enumerate(ham[-1000:]):
        encrypted_pred = model.predict(h)
        pred = decrypt(private_key, public_key, encrypted_pred)
        if pred < 0.5:
            tn += 1
        else:
            fp += 1

    for i, h in enumerate(spam[-1000:]):
        encrypted_pred = model.predict(h)
        pred = decrypt(private_key, public_key, encrypted_pred)
        if pred > 0.5:
            tp += 1
        else:
            fn += 1

    total = tn + tp + fp + fn
    accuracy = 100 * (tn + tp) / total if total > 0 else 0
    print("\n Accuracy: %" + str(accuracy)[0:6])
    print("False Positives: %" + str(100 * fp / float(tp + fp))[0:4] + "    <- privacy violation level")
    print("False Negatives: %" + str(100 * fn / float(tn + fn))[0:4] + "   <- security risk level")



KeyboardInterrupt



In [34]:
import numpy as np
from collections import Counter
import chardet
import math
import re
import libnum

np.random.seed(12345)

# Function to detect file encoding
def detect_encoding(file_path):
    with open(file_path, 'rb') as f:
        raw_data = f.read()
        result = chardet.detect(raw_data)
        return result['encoding']


def clean_text(text):
    # Remove HTML tags and entities like &nbsp;
    text = re.sub(r'<[^>]+>', '', text)
    text = re.sub(r'&\w+;', '', text)
    # Remove email headers like 'Subject:'
    text = re.sub(r'Subject:', '', text)
    return text
    
def load_data(file_path):
    print("Starting to load data...")
    encoding = detect_encoding(file_path)
    print(f"Detected encoding: {encoding}")
    with open(file_path, 'r', encoding=encoding, errors='replace') as f:
        raw = f.readlines()
    
    print("Dataset imported.")
    
    # Clean each email and split into words
    return [[word for word in clean_text(row).split() if word.strip()] for row in raw]


# Importing dataset
spam = load_data('spam.txt')
ham = load_data('ham.txt')

# Paillier Encryption Functions
def mod_inverse(x, n):
    return pow(x, -1, n)

def L(x, n):
    return (x - 1) // n

def generate_keys(bit_length=256):
    p = libnum.generate_prime(bit_length // 2)
    q = libnum.generate_prime(bit_length // 2)
    n = p * q
    n_sq = n * n
    lambda_param = (p - 1) * (q - 1) // math.gcd(p - 1, q - 1)
    g = n + 1
    mu = mod_inverse(L(pow(g, lambda_param, n_sq), n), n)
    public_key = (n, g)
    private_key = (lambda_param, mu)
    return public_key, private_key

class EncryptedNumber:
    def __init__(self, ciphertext):
        self.ciphertext = ciphertext

def encrypt(public_key, plaintext, scaling_factor=10000):
    n, g = public_key
    n_sq = n * n
    r = libnum.randint_bits(n.bit_length() - 1) % n
    ciphertext = (pow(g, int(plaintext * scaling_factor), n_sq) * pow(r, n, n_sq)) % n_sq
    return EncryptedNumber(ciphertext)

def decrypt(private_key, public_key, encrypted_number, scaling_factor=10000):
    n, g = public_key
    lambda_param, mu = private_key
    n_sq = n * n
    ciphertext = encrypted_number.ciphertext
    x = pow(ciphertext, lambda_param, n_sq)
    plaintext = (L(x, n) * mu) % n
    return plaintext / scaling_factor

def homomorphic_add(public_key, c1, c2):
    n, _ = public_key
    n_sq = n * n
    return EncryptedNumber((c1.ciphertext * c2.ciphertext) % n_sq)

# Logistic Regression Class
class LogisticRegression:
    def __init__(self, positives, negatives, public_key, iterations=20, alpha=0.01, regularization_strength=0.005, weight_scale=1e5):
        self.alpha = alpha
        self.regularization_strength = regularization_strength
        self.pubkey = public_key
        self.weight_scale = weight_scale
        
        # Create vocabulary
        cnts = Counter()
        for email in (positives + negatives):
            cnts.update(email)

        vocab = list(cnts.keys())
        self.word2index = {word: i for i, word in enumerate(vocab)}

        # Initialize weights with lower variance
        self.weights = (np.random.rand(len(vocab)) - 0.5) * 0.1

        # Train model
        self.train(positives, negatives, iterations=iterations)

    def train(self, positives, negatives, iterations=20):
        for iter in range(iterations):
            if iter % 5 == 0:  # Encrypt weights every 5 epochs
                self.encrypted_weights = [encrypt(self.pubkey, int(weight)) for weight in self.weights]
                print(f"Epoch {iter + 1} started, weights encrypted.")

            error = 0
            n = 0
            for i in range(max(len(positives), len(negatives))):
                error += np.abs(self.learn(positives[i % len(positives)], 1))
                error += np.abs(self.learn(negatives[i % len(negatives)], 0))
                n += 2

            max_weight = max(abs(w) for w in self.weights)
            print(f"Iteration: {iter + 1}, Loss: {error / float(n):.4f}, Max Weight: {max_weight}")

    @staticmethod
    def softmax(x):
        return 1 / (1 + np.exp(-x))

    # Modify the predict function to match unencrypted prediction first for comparison
    def predict(self, email):
        pred = 0
        for word in email:
        if word in self.word2index:
            pred += self.weights[self.word2index[word]]
        pred = self.softmax(pred)
        return pred


    def encrypted_predict(self, email):
        pred = encrypt(self.pubkey, 0)  # Start with encrypted zero
        for word in email:
            if word in self.word2index:
                weight = self.encrypted_weights[self.word2index[word]]
                pred = homomorphic_add(self.pubkey, pred, weight)
        return pred

    # Modify the learn function to use simple delta update as in target code
    def learn(self, email, target):
        pred = self.decrypt_predict(email)
        delta = pred - target
    
        for word in email:
            if word in self.word2index:
                self.weights[self.word2index[word]] -= delta * self.alpha
        return delta


    def decrypt_predict(self, email):
        pred = sum(self.weights[self.word2index[word]] for word in email if word in self.word2index)
        return self.softmax(pred)

# Example Usage
if __name__ == "__main__":
    # Generate keys
    public_key, private_key = generate_keys(bit_length=64)  # Increased key size
    
    # Create and train Logistic Regression model
    model = LogisticRegression(spam[0:-1000], ham[0:-1000], public_key, iterations=10, alpha=0.003)

    # Evaluate the model
    fp, tn, tp, fn = 0, 0, 0, 0

    for h in ham[-1000:]:
        encrypted_pred = model.predict(h)
        pred = decrypt(private_key, public_key, encrypted_pred)
        if pred < 0.4:  # Increase threshold for TN
            tn += 1
        else:
            fp += 1

    for s in spam[-1000:]:
        encrypted_pred = model.predict(s)
        pred = decrypt(private_key, public_key, encrypted_pred)
        if pred > 0.3:  # Increase threshold for TP
            tp += 1
        else:
            fn += 1


    total = tn + tp + fp + fn
    accuracy = 100 * (tn + tp) / total if total > 0 else 0
    print(f"\nAccuracy: {accuracy:.2f}%")
    print(f"True Positives: {tp}, True Negatives: {tn}, False Positives: {fp}, False Negatives: {fn}")


Starting to load data...
Detected encoding: Windows-1252
Dataset imported.
Starting to load data...
Detected encoding: ascii
Dataset imported.
Epoch 1 started, weights encrypted.
Iteration: 1, Loss: 0.0943, Max Weight: 0.8499133334729208
Iteration: 2, Loss: 0.0512, Max Weight: 0.7720277889291725
Iteration: 3, Loss: 0.0422, Max Weight: 0.7746412814448762
Iteration: 4, Loss: 0.0372, Max Weight: 0.7745550268801938
Iteration: 5, Loss: 0.0339, Max Weight: 0.7904771956373626
Epoch 6 started, weights encrypted.
Iteration: 6, Loss: 0.0315, Max Weight: 0.8219411712428307
Iteration: 7, Loss: 0.0297, Max Weight: 0.87692066922867
Iteration: 8, Loss: 0.0282, Max Weight: 0.9262058073962733
Iteration: 9, Loss: 0.0270, Max Weight: 0.9707931348521376
Iteration: 10, Loss: 0.0259, Max Weight: 1.0114150831017963

Accuracy: 50.00%
True Positives: 0, True Negatives: 1000, False Positives: 0, False Negatives: 1000


In [32]:
top_weights = sorted(zip(model.word2index.keys(), model.weights), key=lambda x: abs(x[1]), reverse=True)
print(f"Top words by weight: {top_weights[:10]}")

Top words by weight: [('removed', np.float64(1.1705920727373356)), ('daren', np.float64(-1.1084337535007625)), ('thanks', np.float64(-1.0931768499059655)), ('attached', np.float64(-1.0820453731595625)), ('nbsp', np.float64(1.0766684057495002)), ('doc', np.float64(-1.0511789620975704)), ('neon', np.float64(-1.028657068918028)), ('money', np.float64(1.0245858763823161)), ('here', np.float64(0.9820507067122095)), ('2004', np.float64(0.9470461135994449))]


In [57]:
import numpy as np
from collections import Counter
import chardet
import math
import re
import libnum

np.random.seed(12345)

def detect_encoding(file_path):
    with open(file_path, 'rb') as f:
        raw_data = f.read()
        result = chardet.detect(raw_data)
        return result['encoding']

def clean_text(text):
    text = re.sub(r'<[^>]+>', '', text)
    text = re.sub(r'&\w+;', '', text)
    text = re.sub(r'Subject:', '', text)
    return text
    
def load_data(file_path):
    encoding = detect_encoding(file_path)
    with open(file_path, 'r', encoding=encoding, errors='replace') as f:
        raw = f.readlines()
    return [[word for word in clean_text(row).split() if word.strip()] for row in raw]

spam = load_data('spam.txt')
ham = load_data('ham.txt')

def mod_inverse(x, n):
    return pow(x, -1, n)

def L(x, n):
    return (x - 1) // n

def generate_keys(bit_length=256):
    p = libnum.generate_prime(bit_length // 2)
    q = libnum.generate_prime(bit_length // 2)
    n = p * q
    n_sq = n * n
    lambda_param = (p - 1) * (q - 1) // math.gcd(p - 1, q - 1)
    g = n + 1
    mu = mod_inverse(L(pow(g, lambda_param, n_sq), n), n)
    public_key = (n, g)
    private_key = (lambda_param, mu)
    return public_key, private_key

class EncryptedNumber:
    def __init__(self, ciphertext):
        self.ciphertext = ciphertext

def encrypt(public_key, plaintext, scaling_factor=10000):
    n, g = public_key
    n_sq = n * n
    r = libnum.randint_bits(n.bit_length() - 1) % n
    ciphertext = (pow(g, int(plaintext * scaling_factor), n_sq) * pow(r, n, n_sq)) % n_sq
    return EncryptedNumber(ciphertext)

def decrypt(private_key, public_key, encrypted_number, scaling_factor=10000):
    n, g = public_key
    lambda_param, mu = private_key
    n_sq = n * n
    ciphertext = encrypted_number.ciphertext
    x = pow(ciphertext, lambda_param, n_sq)
    plaintext = (L(x, n) * mu) % n
    return plaintext / scaling_factor

def homomorphic_add(public_key, c1, c2):
    n, _ = public_key
    n_sq = n * n
    return EncryptedNumber((c1.ciphertext * c2.ciphertext) % n_sq)

class LogisticRegression:
    def __init__(self, positives, negatives, public_key, iterations=20, alpha=0.01, regularization_strength=0.001, weight_scale=1e5):
        self.alpha = alpha
        self.regularization_strength = regularization_strength
        self.pubkey = public_key
        self.weight_scale = weight_scale

        cnts = Counter()
        for email in (positives + negatives):
            cnts.update(email)

        vocab = list(cnts.keys())
        self.word2index = {word: i for i, word in enumerate(vocab)}

        # Initialize weights as in target
        self.weights = (np.random.rand(len(vocab)) - 0.5) * 0.1

        self.train(positives, negatives, iterations=iterations)

        # Encrypt weights after training
        self.encrypted_weights = [encrypt(self.pubkey, int(weight)) for weight in self.weights]
        print("Weights encrypted after training.")

    def train(self, positives, negatives, iterations=10):
        for iteration in range(iterations):
            error = 0
            n = 0
            for i in range(max(len(positives), len(negatives))):
                error += np.abs(self.learn(positives[i % len(positives)], 1))
                error += np.abs(self.learn(negatives[i % len(negatives)], 0))
                n += 2
        
        print(f"Iteration: {iteration + 1}, Loss: {error / n:.4f}")
    
        # Encrypt weights after training
        self.encrypted_weights = [encrypt(self.pubkey, int(weight)) for weight in self.weights]
        print("Weights encrypted after training.")

    @staticmethod
    def softmax(x):
        return 1 / (1 + np.exp(-x))

    def predict(self, email, encrypt_output=False):
        pred = 0
        for word in email:
            if word in self.word2index:
                pred += self.weights[self.word2index[word]]
    
        pred = self.softmax(pred)
    
        if encrypt_output:
            encrypted_pred = encrypt(self.pubkey, pred)
            return encrypted_pred
    
        # Return classification (1 for spam, 0 for ham) based on a threshold
        return 1 if pred >= 0.5 else 0  # Threshold is 0.5


    def learn(self, email, target):
        pred = self.predict(email)  # Unencrypted prediction
        delta = pred - target  # This operation works because `pred` is unencrypted
    
        for word in email:
            if word in self.word2index:
                self.weights[self.word2index[word]] -= self.alpha * delta

        return delta


# Example Usage
if __name__ == "__main__":
    public_key, private_key = generate_keys(bit_length=64)
    model = LogisticRegression(spam[0:-1000], ham[0:-1000], public_key, iterations=10, alpha=0.01)

    fp, tn, tp, fn = 0, 0, 0, 0

    for h in ham[-1000:]:
        encrypted_pred = model.predict(h, encrypt_output=True)  # Encrypt the output during prediction
        pred = decrypt(private_key, public_key, encrypted_pred)
        if pred <  0.5:
            tn += 1
        else:
            fp += 1

    for s in spam[-1000:]:
        encrypted_pred = model.predict(s, encrypt_output=True)  # Encrypt the output during prediction
        pred = decrypt(private_key, public_key, encrypted_pred)
        if pred > 0.5:
            tp += 1
        else:
            fn += 1

print(f"False Positives: {fp}, True Negatives: {tn}, True Positives: {tp}, False Negatives: {fn}")
total = tn + tp + fp + fn
accuracy = 100 * (tn + tp) / total if total > 0 else 0
print(f"\nAccuracy: {accuracy:.2f}%")


  return 1 / (1 + np.exp(-x))


Iteration: 10, Loss: 0.0000
Weights encrypted after training.
Weights encrypted after training.
False Positives: 0, True Negatives: 1000, True Positives: 989, False Negatives: 11

Accuracy: 99.45%


In [64]:
model.predict(spam[57])

1

In [None]:
model.encrypted_weights

In [100]:
import numpy as np
from collections import Counter
import chardet
import math
import re
import libnum

np.random.seed(12345)

def detect_encoding(file_path):
    with open(file_path, 'rb') as f:
        raw_data = f.read()
        result = chardet.detect(raw_data)
        return result['encoding']

def clean_text(text):
    text = re.sub(r'<[^>]+>', '', text)
    text = re.sub(r'&\w+;', '', text)
    text = re.sub(r'Subject:', '', text)
    return text
    
def load_data(file_path):
    encoding = detect_encoding(file_path)
    with open(file_path, 'r', encoding=encoding, errors='replace') as f:
        raw = f.readlines()
    return [[word for word in clean_text(row).split() if word.strip()] for row in raw]

spam = load_data('spam.txt')
ham = load_data('ham.txt')

def mod_inverse(x, n):
    return pow(x, -1, n)

def L(x, n):
    return (x - 1) // n

def generate_keys(bit_length=256):
    p = libnum.generate_prime(bit_length // 2)
    q = libnum.generate_prime(bit_length // 2)
    n = p * q
    n_sq = n * n
    lambda_param = (p - 1) * (q - 1) // math.gcd(p - 1, q - 1)
    g = n + 1
    mu = mod_inverse(L(pow(g, lambda_param, n_sq), n), n)
    public_key = (n, g)
    private_key = (lambda_param, mu)
    return public_key, private_key

class EncryptedNumber:
    def __init__(self, ciphertext):
        self.ciphertext = ciphertext

def encrypt(public_key, plaintext, scaling_factor=10000):
    n, g = public_key
    n_sq = n * n
    r = libnum.randint_bits(n.bit_length() - 1) % n
    ciphertext = (pow(g, int(plaintext * scaling_factor), n_sq) * pow(r, n, n_sq)) % n_sq
    return EncryptedNumber(ciphertext)

def decrypt(private_key, public_key, encrypted_number, scaling_factor=10000):
    n, g = public_key
    lambda_param, mu = private_key
    n_sq = n * n
    ciphertext = encrypted_number.ciphertext
    x = pow(ciphertext, lambda_param, n_sq)
    plaintext = (L(x, n) * mu) % n
    return plaintext / scaling_factor

def homomorphic_add(public_key, c1, c2):
    n, _ = public_key
    n_sq = n * n
    return EncryptedNumber((c1.ciphertext * c2.ciphertext) % n_sq)

class LogisticRegression:
    def __init__(self, positives, negatives, public_key, iterations=20, alpha=0.01, regularization_strength=0.001, weight_scale=1e5):
        self.alpha = alpha
        self.regularization_strength = regularization_strength
        self.pubkey = public_key
        self.weight_scale = weight_scale

        cnts = Counter()
        for email in (positives + negatives):
            cnts.update(email)

        vocab = list(cnts.keys())
        self.word2index = {word: i for i, word in enumerate(vocab)}

        # Initialize weights as in target
        self.weights = (np.random.rand(len(vocab)) - 0.5) * 0.1

        self.train(positives, negatives, iterations=iterations)

        # Encrypt weights after training
        self.encrypted_weights = [encrypt(self.pubkey, int(weight)) for weight in self.weights]
        print("Weights encrypted after training.")

    def train(self, positives, negatives, iterations=10):
        for iteration in range(iterations):
            error = 0
            n = 0
            for i in range(max(len(positives), len(negatives))):
                error += np.abs(self.learn(positives[i % len(positives)], 1))
                error += np.abs(self.learn(negatives[i % len(negatives)], 0))
                n += 2
        
        print(f"Iteration: {iteration + 1}, Loss: {error / n:.4f}")
    
        # Encrypt weights after training
        self.encrypted_weights = [encrypt(self.pubkey, int(weight)) for weight in self.weights]
        print("Weights encrypted after training.")

    @staticmethod
    def softmax(x):
        return 1 / (1 + np.exp(-x))

    def predict(self, email, encrypt_output=False):
        pred = 0
        for word in email:
            if word in self.word2index:
                pred += self.weights[self.word2index[word]]
    
        pred = self.softmax(pred)
    
        if encrypt_output:
            encrypted_pred = encrypt(self.pubkey, pred)
            return encrypted_pred
    
        # Return classification (1 for spam, 0 for ham) based on a threshold
        return 1 if pred >= 0.5 else 0  # Threshold is 0.5


    def learn(self, email, target):
        pred = self.predict(email)  # Unencrypted prediction
        delta = pred - target  # This operation works because `pred` is unencrypted
    
        for word in email:
            if word in self.word2index:
                self.weights[self.word2index[word]] -= self.alpha * delta

        return delta


# Example Usage
if __name__ == "__main__":
    public_key, private_key = generate_keys(bit_length=64)
    model = LogisticRegression(spam[0:-1000], ham[0:-1000], public_key, iterations=10, alpha=0.01)

    fp, tn, tp, fn = 0, 0, 0, 0

    for h in ham[-1000:]:
        encrypted_pred = model.predict(h, encrypt_output=True)  # Encrypt the output during prediction
        pred = encrypt(private_key, public_key, encrypted_pred)
        if pred <  0.5:
            tn += 1
        else:
            fp += 1

    for s in spam[-1000:]:
        encrypted_pred = model.predict(s, encrypt_output=True)  # Encrypt the output during prediction
        pred = encrypt(private_key, public_key, encrypted_pred)
        if pred > 0.5:
            tp += 1
        else:
            fn += 1

print(f"False Positives: {fp}, True Negatives: {tn}, True Positives: {tp}, False Negatives: {fn}")
total = tn + tp + fp + fn
accuracy = 100 * (tn + tp) / total if total > 0 else 0
print(f"\nAccuracy: {accuracy:.2f}%")


  return 1 / (1 + np.exp(-x))


Iteration: 10, Loss: 0.0000
Weights encrypted after training.
Weights encrypted after training.


TypeError: can't multiply sequence by non-int of type 'EncryptedNumber'

In [110]:
import numpy as np
from collections import Counter
import chardet
import math
import re
import libnum

np.random.seed(12345)

def detect_encoding(file_path):
    with open(file_path, 'rb') as f:
        raw_data = f.read()
        result = chardet.detect(raw_data)
        return result['encoding']

def clean_text(text):
    text = re.sub(r'<[^>]+>', '', text)
    text = re.sub(r'&\w+;', '', text)
    text = re.sub(r'Subject:', '', text)
    return text
    
def load_data(file_path):
    print(f"Detecting Encoding for {file_path}...")
    encoding = detect_encoding(file_path)
    print(f"Encoding detected: {encoding}\n")

    print(file_path, " : ", encoding) 
    with open(file_path, 'r', encoding=encoding, errors='replace') as f:
        raw = f.readlines()
    print(f"Dataset '{file_path}' loaded successfully with {len(raw)} records.\n")
    return [[word for word in clean_text(row).split() if word.strip()] for row in raw]

spam = load_data('spam.txt')
ham = load_data('ham.txt')

def mod_inverse(x, n):
    return pow(x, -1, n)

def L(x, n):
    return (x - 1) // n

def generate_keys(bit_length=256):
    print("🔑 Generating keys, please wait...")

    p = libnum.generate_prime(bit_length // 2)
    q = libnum.generate_prime(bit_length // 2)
    n = p * q
    n_sq = n * n
    lambda_param = (p - 1) * (q - 1) // math.gcd(p - 1, q - 1)
    g = n + 1
    mu = mod_inverse(L(pow(g, lambda_param, n_sq), n), n)
    print(f"✓ Keys generated successfully! (bit length: {bit_length})\n")

    public_key = (n, g)
    private_key = (lambda_param, mu)
    return public_key, private_key

class EncryptedNumber:
    def __init__(self, ciphertext):
        self.ciphertext = ciphertext

def encrypt(public_key, plaintext, scaling_factor=10000):
    n, g = public_key
    n_sq = n * n
    r = libnum.randint_bits(n.bit_length() - 1) % n
    ciphertext = (pow(g, int(plaintext * scaling_factor), n_sq) * pow(r, n, n_sq)) % n_sq
    return EncryptedNumber(ciphertext)

def decrypt(private_key, public_key, encrypted_number, scaling_factor=10000):
    n, g = public_key
    lambda_param, mu = private_key
    n_sq = n * n
    ciphertext = encrypted_number.ciphertext
    x = pow(ciphertext, lambda_param, n_sq)
    plaintext = (L(x, n) * mu) % n
    return plaintext / scaling_factor

def homomorphic_add(public_key, c1, c2):
    n, _ = public_key
    n_sq = n * n
    return EncryptedNumber((c1.ciphertext * c2.ciphertext) % n_sq)

class LogisticRegression:
    def __init__(self, positives, negatives, public_key, iterations=20, alpha=0.01, regularization_strength=0.001, weight_scale=1e5):
        self.alpha = alpha
        self.regularization_strength = regularization_strength
        self.pubkey = public_key
        self.weight_scale = weight_scale

        print(f"Initializing Logistic Regression model...")
        cnts = Counter()
        for email in (positives + negatives):
            cnts.update(email)

        vocab = list(cnts.keys())
        self.word2index = {word: i for i, word in enumerate(vocab)}

        # Initialize weights as in target
        self.weights = (np.random.rand(len(vocab)) - 0.5) * 0.1

        self.train(positives, negatives, iterations=iterations)

        # Encrypt weights after training
        self.encrypted_weights = [encrypt(self.pubkey, int(weight)) for weight in self.weights]

    def train(self, positives, negatives, iterations=10):
        for iteration in range(iterations):
            error = 0
            n = 0
            for i in range(max(len(positives), len(negatives))):
                error += np.abs(self.learn(positives[i % len(positives)], 1))
                error += np.abs(self.learn(negatives[i % len(negatives)], 0))
                n += 2
        
            print(f"Iteration: {iteration + 1}, Loss: {error / n:.4f}")
    
        # Encrypt weights after training
        self.encrypted_weights = [encrypt(self.pubkey, int(weight)) for weight in self.weights]
        print("Weights encrypted after training.")

    @staticmethod
    def softmax(x):
        x = np.clip(x, -500, 500)
        return 1 / (1 + np.exp(-x))

    def predict(self, email, encrypt_output=True):
        pred = 0
        for word in email:
            if word in self.word2index:
                pred += self.weights[self.word2index[word]]
        
        pred = self.softmax(pred)
        
        if encrypt_output:
            encrypted_pred = encrypt(self.pubkey, pred)
            return encrypted_pred
        
        # Return classification (1 for spam, 0 for ham) based on a threshold
        return 1 if pred >= 0.5 else 0  # Threshold is 0.5



    def learn(self, email, target):
        pred = self.predict(email, encrypt_output=False)  # Unencrypted prediction
        delta = pred - target  # This operation works because `pred` is unencrypted
    
        for word in email:
            if word in self.word2index:
                self.weights[self.word2index[word]] -= self.alpha * delta

        return delta


# Example Usage
if __name__ == "__main__":
    public_key, private_key = generate_keys(bit_length=64)
    model = LogisticRegression(spam[0:-1000], ham[0:-1000], public_key, iterations=10, alpha=0.01)

    fp, tn, tp, fn = 0, 0, 0, 0

    for h in ham[-1000:]:
        pred = model.predict(h, encrypt_output=False)  # Encrypt the output during prediction
        #pred = decrypt(private_key, public_key, encrypted_pred)
        if pred <  0.5:
            tn += 1
        else:
            fp += 1

    for s in spam[-1000:]:
        pred = model.predict(s, encrypt_output=False)  # Encrypt the output during prediction
        #pred = decrypt(private_key, public_key, encrypted_pred)
        if pred > 0.5:
            tp += 1
        else:
            fn += 1

print(f"False Positives: {fp}, True Negatives: {tn}, True Positives: {tp}, False Negatives: {fn}")
total = tn + tp + fp + fn
accuracy = 100 * (tn + tp) / total if total > 0 else 0
print(f"\nAccuracy: {accuracy:.2f}%")


Detecting Encoding for spam.txt...
Encoding detected: Windows-1252

spam.txt  :  Windows-1252
Dataset 'spam.txt' loaded successfully with 9000 records.

Detecting Encoding for ham.txt...
Encoding detected: ascii

ham.txt  :  ascii
Dataset 'ham.txt' loaded successfully with 22032 records.

🔑 Generating keys, please wait...
✓ Keys generated successfully! (bit length: 64)

Initializing Logistic Regression model...
Iteration: 1, Loss: 0.0517
Iteration: 2, Loss: 0.0102
Iteration: 3, Loss: 0.0033
Iteration: 4, Loss: 0.0006
Iteration: 5, Loss: 0.0005
Iteration: 6, Loss: 0.0000
Iteration: 7, Loss: 0.0000
Iteration: 8, Loss: 0.0000
Iteration: 9, Loss: 0.0000
Iteration: 10, Loss: 0.0000
Weights encrypted after training.
False Positives: 0, True Negatives: 1000, True Positives: 1000, False Negatives: 0

Accuracy: 100.00%


In [112]:
model.predict(spam[0])

<__main__.EncryptedNumber at 0x141aa5d00>