In [2]:
import os

def extract_emails(directory, output_file):
    """Concatenate every email-like file under ``directory`` into ``output_file``.

    The directory tree is walked recursively. A file is included when its name
    ends with ``.txt`` or ``.eml``, or when it has no extension at all. Each
    file's text is written to the output followed by two newline characters.

    Sub-directories and file names are visited in sorted order so repeated runs
    produce the same output, and ``output_file`` itself is skipped when it lives
    inside ``directory`` (otherwise the half-written output would be read back
    in as if it were an email).

    Args:
        directory: Root folder to search.
        output_file: Path of the combined text file; it is overwritten.

    Files that cannot be opened or read are reported with ``print`` and skipped.
    """
    output_real = os.path.realpath(output_file)
    with open(output_file, 'w', encoding='utf-8') as out_file:
        for root, dirs, files in os.walk(directory):
            # Sorting in place steers os.walk's traversal of sub-directories.
            dirs.sort()
            for file in sorted(files):
                # Process text files, eml files, or files without an extension
                wanted = file.endswith(('.txt', '.eml')) or not os.path.splitext(file)[1]
                if not wanted:
                    continue
                file_path = os.path.join(root, file)
                if os.path.realpath(file_path) == output_real:
                    continue  # never feed the output file back into itself
                try:
                    # errors='ignore' drops undecodable bytes instead of raising
                    with open(file_path, 'r', encoding='utf-8', errors='ignore') as f:
                        content = f.read()
                except OSError as e:
                    print(f"Error reading {file_path}: {e}")
                    continue
                out_file.write(content + '\n\n')  # Separate emails by two newlines

# Input folders (one per class) and the combined text file written for each.
ham_dir, spam_dir = 'enron/ham', 'enron/spam'
ham_output_file, spam_output_file = 'enron/ham.txt', 'enron/spam.txt'

# Ham is extracted first, then spam.
for source_dir, target_file in ((ham_dir, ham_output_file), (spam_dir, spam_output_file)):
    extract_emails(source_dir, target_file)

print("Extraction complete.")


Extraction complete.


## Model


In [7]:
import numpy as np
from collections import Counter
import random
import sys
import chardet

# Seed NumPy's global RNG so the random weight initialisation is repeatable.
SEED = 12345
np.random.seed(SEED)

print("Importing dataset from disk...")

def detect_encoding(file_path):
    """Guess the text encoding of ``file_path`` using chardet.

    The whole file is read in binary mode and passed to ``chardet.detect``.

    Args:
        file_path: Path of the file to inspect.

    Returns:
        The encoding name reported by chardet, or ``'utf-8'`` when chardet
        reports no encoding, so that callers passing the result to ``open()``
        do not silently fall back to the platform default.
    """
    with open(file_path, 'rb') as f:
        raw_data = f.read()
    result = chardet.detect(raw_data)
    return result['encoding'] or 'utf-8'

def _read_token_rows(path):
    """Read ``path`` line by line and tokenise each line on single spaces.

    Returns a ``(encoding, lines, tokens)`` tuple: the detected encoding, the
    raw lines, and one token list per line.
    """
    detected = detect_encoding(path)
    # errors='replace' substitutes undecodable bytes rather than raising
    with open(path, 'r', encoding=detected, errors='replace') as handle:
        lines = handle.readlines()
    # The last two characters of every line are discarded before splitting
    # (presumably a trailing space plus the newline in this dataset — TODO confirm).
    tokens = [line[:-2].split(" ") for line in lines]
    return detected, lines, tokens


# Spam first, then ham; `raw` ends up holding the ham lines.
encoding_spam, raw, spam = _read_token_rows('spam.txt')
encoding_ham, raw, ham = _read_token_rows('ham.txt')

class LogisticRegression(object):
    """Bag-of-words logistic regression trained one example at a time.

    Every distinct token seen in the training emails gets one weight. An email
    is scored by summing the weights of its tokens (repeats count each time)
    and squashing the sum with the logistic function. Training runs inside
    ``__init__``.

    Args:
        positives: Sequence of emails (each a sequence of tokens) with target 1.
        negatives: Sequence of emails with target 0.
        iterations: Number of passes over the data.
        alpha: Learning rate applied to the prediction error.
        regularization_strength: Fraction of a weight subtracted from it on
            every update that touches it (not scaled by ``alpha``).

    Raises:
        ValueError: If training is requested (``iterations > 0``) while either
            class has no examples.
    """

    def __init__(self, positives, negatives, iterations=10, alpha=0.01, regularization_strength=0.01):
        self.maxweight = 10  # retained attribute; nothing in this class reads it
        self.alpha = alpha
        self.regularization_strength = regularization_strength

        # Create vocabulary: tokens in first-seen order, positives before negatives
        cnts = Counter()
        for email in positives:
            cnts.update(email)
        for email in negatives:
            cnts.update(email)

        vocab = list(cnts.keys())
        self.word2index = {word: i for i, word in enumerate(vocab)}

        # Initialize weights with small random values in [-0.05, 0.05)
        self.weights = (np.random.rand(len(vocab)) - 0.5) * 0.1

        # Train model
        self.train(positives, negatives, iterations=iterations)

    def train(self, positives, negatives, iterations=10):
        """Run ``iterations`` passes, alternating one positive and one negative.

        Each pass takes ``max(len(positives), len(negatives))`` steps; the
        shorter list wraps around so both classes get the same number of
        updates. The mean absolute error of the pass is printed after it.
        """
        n_pos, n_neg = len(positives), len(negatives)
        if iterations > 0 and (n_pos == 0 or n_neg == 0):
            # Without this check the wrap-around index below fails with an
            # unhelpful "modulo by zero" error.
            raise ValueError("train() needs at least one positive and one negative example")

        steps = max(n_pos, n_neg)
        for epoch in range(iterations):
            error = 0
            for i in range(steps):
                error += np.abs(self.learn(positives[i % n_pos], 1))
                error += np.abs(self.learn(negatives[i % n_neg], 0))

            # Two updates (one per class) happen in every step
            print("Iter:" + str(epoch) + " Loss:" + str(error / float(2 * steps)))

    @staticmethod
    def softmax(x):
        """Logistic sigmoid ``1 / (1 + exp(-x))`` (the name is historical)."""
        x = np.clip(x, -500, 500)  # Clip values to avoid overflow
        return 1 / (1 + np.exp(-x))

    def predict(self, email):
        """Return the model's score in (0, 1) for a tokenised email."""
        return self.unencrypted_predict(email)

    def unencrypted_predict(self, email):
        """Sum the weights of the email's tokens and apply the sigmoid.

        Raises:
            KeyError: If a token is not in the training vocabulary.
        """
        pred = 0
        for word in email:
            pred += self.weights[self.word2index[word]]
        pred = self.softmax(pred)
        return pred

    def learn(self, email, target):
        """Apply one update for ``email`` and return the signed error.

        Every token occurrence moves its weight by
        ``delta * alpha + regularization_strength * weight``.
        """
        pred = self.predict(email)
        delta = (pred - target)
        for word in email:
            idx = self.word2index[word]
            self.weights[idx] -= (delta * self.alpha + self.regularization_strength * self.weights[idx])
        return delta

# Train on everything except the last 5000 emails of each class.
model = LogisticRegression(spam[0:-5000], ham[0:-5000], iterations=10, alpha=0.01, regularization_strength=0.01)


def _percent(part, whole, width):
    """Return ``100 * part / whole`` as text cut to ``width`` characters.

    Gives ``"n/a"`` when ``whole`` is zero instead of raising ZeroDivisionError.
    """
    if whole == 0:
        return "n/a"
    return str(100 * part / float(whole))[0:width]


# Evaluate the model on the last 1000 emails of each class
fp = 0
tn = 0
tp = 0
fn = 0

for i, h in enumerate(ham[-1000:]):
    pred = model.predict(h)
    # Same decision rule as the spam loop below: only pred > 0.5 means "spam".
    if pred > 0.5:
        fp += 1
    else:
        tn += 1

    if i % 10 == 0:
        sys.stdout.write('\r I:' + str(tn + tp + fn + fp) + " % Correct:" + _percent(tn, tn + fp, 6))

for i, h in enumerate(spam[-1000:]):
    pred = model.predict(h)
    if pred > 0.5:
        tp += 1
    else:
        fn += 1

    if i % 10 == 0:
        sys.stdout.write('\r I:' + str(tn + tp + fn + fp) + " % Correct:" + _percent(tn + tp, tn + tp + fn + fp, 6))

sys.stdout.write('\r I:' + str(tn + tp + fn + fp) + " % Correct:" + _percent(tn + tp, tn + tp + fn + fp, 6))

print("\n Accuracy: %" + _percent(tn + tp, tn + tp + fn + fp, 6))
# fp / (tp + fp): share of "spam" verdicts that were actually ham.
print("False Positives: %" + _percent(fp, tp + fp, 4) + "    <- privacy violation level")
# fn / (tn + fn): share of "not spam" verdicts that were actually spam.
print("False Negatives: %" + _percent(fn, tn + fn, 4) + "   <- security risk level")


Importing dataset from disk...
Iter:0 Loss:0.17723691243965922
Iter:1 Loss:0.1518159145780536
Iter:2 Loss:0.14444303680231094
Iter:3 Loss:0.14093632480936893
Iter:4 Loss:0.1384698558292573
Iter:5 Loss:0.13689429473413126
Iter:6 Loss:0.1357383294796565
Iter:7 Loss:0.1347106069611384
Iter:8 Loss:0.13406221429403853
Iter:9 Loss:0.13371350537215843
 I:2000 % Correct:98.955
 Accuracy: %98.95
False Positives: %1.77    <- privacy violation level
False Negatives: %0.30   <- security risk level


In [None]:
import re

from flask import Flask, request, jsonify

# Initialize Flask app (module-level so the route decorator below can register on it)
app = Flask(__name__)

# The /predict handler reads the global `model` created by the training cell,
# so that cell must have been run first (or build the model here instead).

# Define the preprocessing function to handle case and punctuation
def preprocess_email(email):
    """Turn raw email text into a list of lowercase word tokens.

    The text is lowercased, every character that is neither a word character
    nor whitespace is deleted, and the remainder is split on runs of
    whitespace.
    """
    lowered = email.lower()
    without_punctuation = re.sub(r'[^\w\s]', '', lowered)
    return without_punctuation.split()

@app.route('/predict', methods=['POST'])
def predict():
    """Classify the email text posted as JSON ``{"email": "<text>"}``.

    Responses:
        200 ``{"prediction": "spam" | "not spam"}`` on success.
        400 ``{"error": ...}`` when the body is not a JSON object, when
            ``email`` is missing, empty or not a string, or when the text
            contains a word the model has no weight for.
    """
    # silent=True returns None for a missing or malformed JSON body instead of
    # aborting, so every bad request gets the same JSON error shape.
    data = request.get_json(silent=True)
    if not isinstance(data, dict):
        return jsonify({'error': 'Request body must be a JSON object'}), 400

    email = data.get('email', '')
    if not isinstance(email, str) or not email:
        return jsonify({'error': 'No email provided'}), 400

    # Preprocess the email input
    email_tokens = preprocess_email(email)

    # Run through the model
    try:
        pred = model.predict(email_tokens)
    except KeyError as e:
        # Return an error message if a word is not found in the vocabulary
        return jsonify({'error': f"Word '{e.args[0]}' not in vocabulary"}), 400

    # Scores above 0.5 are reported as spam
    result = "spam" if pred > 0.5 else "not spam"
    return jsonify({'prediction': result})

# Run the Flask development server when this code executes as the main module.
# NOTE(review): app.run() is expected to block until the server is stopped, so
# later cells will not run while it is serving — confirm this is intended.
if __name__ == '__main__':
    app.run(port=12345)  # Overrides Flask's default port (5000)


 * Serving Flask app '__main__'
 * Debug mode: off


 * Running on http://127.0.0.1:12345
[33mPress CTRL+C to quit[0m
127.0.0.1 - - [16/Sep/2024 10:53:56] "[31m[1mPOST /predict HTTP/1.1[0m" 400 -
127.0.0.1 - - [16/Sep/2024 10:53:56] "[31m[1mPOST /predict HTTP/1.1[0m" 400 -
127.0.0.1 - - [16/Sep/2024 10:54:08] "POST /predict HTTP/1.1" 200 -
127.0.0.1 - - [16/Sep/2024 10:54:08] "POST /predict HTTP/1.1" 200 -
127.0.0.1 - - [16/Sep/2024 10:54:11] "POST /predict HTTP/1.1" 200 -
127.0.0.1 - - [16/Sep/2024 10:54:11] "POST /predict HTTP/1.1" 200 -
127.0.0.1 - - [16/Sep/2024 10:54:11] "POST /predict HTTP/1.1" 200 -
127.0.0.1 - - [16/Sep/2024 10:54:11] "POST /predict HTTP/1.1" 200 -
127.0.0.1 - - [16/Sep/2024 10:54:12] "POST /predict HTTP/1.1" 200 -
127.0.0.1 - - [16/Sep/2024 10:54:12] "POST /predict HTTP/1.1" 200 -
127.0.0.1 - - [16/Sep/2024 10:54:12] "POST /predict HTTP/1.1" 200 -
127.0.0.1 - - [16/Sep/2024 10:54:12] "POST /predict HTTP/1.1" 200 -
127.0.0.1 - - [16/Sep/2024 10:54:57] "[31m[1mPOST /predict HTTP/1.1[0m" 400 -
127.0.0.1 

In [3]:
import phe as paillier
print("Generating paillier keypair")
# n_length is the key size in bits; 64 keeps this demo fast but is far too
# small to offer any real protection.
pubkey, prikey = paillier.generate_paillier_keypair(n_length=64)

Generating paillier keypair


In [4]:
# Encrypt one positive and one negative integer with the public key.
a = pubkey.encrypt(123)
b = pubkey.encrypt(-1)

In [5]:
# Round trip: decrypting `a` with the private key gives back the plaintext.
prikey.decrypt(a)

123

In [6]:
# Printing an encrypted number shows only the object's repr, not its value.
print(a)
print(b)

<phe.paillier.EncryptedNumber object at 0x10f275760>
<phe.paillier.EncryptedNumber object at 0x10f0df290>


In [7]:
# Adding two encrypted numbers yields an encryption of the sum of the plaintexts.
prikey.decrypt(a+a)

246

In [8]:
# Keep the encrypted sum so it can be inspected and decrypted in later cells.
c = a + a

In [9]:
# The sum is itself an encrypted number object.
print(c)

<phe.paillier.EncryptedNumber object at 0x10f307e60>


In [19]:
# Decrypting the stored sum gives the plaintext total.
prikey.decrypt(c)

246

In [12]:
import math
import libnum

# Function to compute modular inverse
def mod_inverse(x, n):
    """Return the multiplicative inverse of ``x`` modulo ``n``.

    Uses three-argument ``pow`` with exponent ``-1``, which raises
    ``ValueError`` when ``x`` has no inverse modulo ``n``.
    """
    inverse = pow(x, -1, n)
    return inverse

# L function for decryption
def L(x, n):
    """Paillier's L function: the floor quotient of ``x - 1`` by ``n``."""
    quotient, _remainder = divmod(x - 1, n)
    return quotient

# Paillier key generation using libnum for prime generation
def generate_keys(bit_length=512):
    """Generate a Paillier key pair.

    Two primes of ``bit_length // 2`` bits each are requested from libnum and
    multiplied to form the modulus ``n``.

    Args:
        bit_length: Target size of ``n`` in bits.

    Returns:
        ``(public_key, private_key)`` where ``public_key == (n, g)`` with
        ``g = n + 1`` and ``private_key == (lambda_param, mu)``.

    Raises:
        ValueError: If two distinct primes could not be obtained, or if the
            inverse needed for ``mu`` does not exist (raised by ``mod_inverse``).
    """
    # NOTE(review): confirm libnum's prime generator is suitable for real
    # cryptographic keys before using this outside a demo.
    prime_bits = bit_length // 2
    p = libnum.generate_prime(prime_bits)
    q = libnum.generate_prime(prime_bits)

    # The scheme needs two distinct primes: with p == q, n would be a perfect
    # square and could be factored with a single square root. Redraw q on a
    # collision, but give up eventually so tiny sizes cannot loop forever.
    redraws = 0
    while q == p:
        if redraws == 100:
            raise ValueError(
                f"could not find two distinct {prime_bits}-bit primes; increase bit_length"
            )
        q = libnum.generate_prime(prime_bits)
        redraws += 1

    n = p * q  # n = p * q
    n_sq = n * n  # n^2 for the modulus in encryption

    # λ (lambda) = lcm(p-1, q-1)
    lambda_param = (p - 1) * (q - 1) // math.gcd(p - 1, q - 1)

    # g = n + 1 is the generator used throughout this notebook
    g = n + 1

    # µ = (L(g^λ mod n^2))^(-1) mod n
    mu = mod_inverse(L(pow(g, lambda_param, n_sq), n), n)

    public_key = (n, g)
    private_key = (lambda_param, mu)

    return public_key, private_key

# Custom EncryptedNumber class to mimic object-like string representation
class EncryptedNumber:
    """Thin wrapper holding a Paillier ciphertext in ``self.ciphertext``."""

    def __init__(self, ciphertext):
        self.ciphertext = ciphertext

    def __str__(self):
        # Show only the object's address so printing looks like a default
        # object representation and does not dump the ciphertext integer.
        address = hex(id(self))
        return f"<EncryptedNumber object at {address}>"

# Paillier encryption
def encrypt(public_key, plaintext):
    """Encrypt an integer under a Paillier public key.

    Args:
        public_key: ``(n, g)`` tuple.
        plaintext: Integer message; it is used as an exponent modulo ``n^2``.

    Returns:
        An ``EncryptedNumber`` wrapping ``g^m * r^n mod n^2``.
    """
    import secrets  # local import: source of cryptographic randomness for r

    n, g = public_key
    n_sq = n * n

    # Choose random r with 1 <= r < n that is coprime to n. The check makes
    # sure r is invertible modulo n, which the scheme requires.
    while True:
        r = secrets.randbelow(n - 1) + 1
        if math.gcd(r, n) == 1:
            break

    # Encryption formula: c = g^m * r^n mod n^2
    ciphertext = (pow(g, plaintext, n_sq) * pow(r, n, n_sq)) % n_sq
    return EncryptedNumber(ciphertext)  # Wrap ciphertext in EncryptedNumber class

# Paillier decryption
def decrypt(private_key, public_key, encrypted_number):
    """Recover the plaintext integer from an encrypted number.

    Args:
        private_key: ``(lambda_param, mu)`` tuple.
        public_key: ``(n, g)`` tuple; only ``n`` is used.
        encrypted_number: Object exposing the ciphertext as ``.ciphertext``.

    Returns:
        ``L(c^lambda mod n^2) * mu mod n``.
    """
    n, _g = public_key
    lambda_param, mu = private_key

    # Raise the ciphertext to lambda modulo n^2, then map it back through L.
    powered = pow(encrypted_number.ciphertext, lambda_param, n * n)
    return (L(powered, n) * mu) % n

# Homomorphic addition of two ciphertexts
def homomorphic_add(public_key, c1, c2):
    """Combine two encrypted numbers into an encryption of their sum.

    The two ciphertexts are multiplied modulo ``n^2`` and the product is
    wrapped in a new ``EncryptedNumber``.
    """
    n, _ = public_key
    product = c1.ciphertext * c2.ciphertext
    return EncryptedNumber(product % (n * n))

# Example usage:
if __name__ == "__main__":
    # Fresh key pair built from a 512-bit modulus (demo size).
    public_key, private_key = generate_keys(bit_length=512)

    # Two small plaintexts to add under encryption.
    m1, m2 = 42, 23
    print(f"Original messages: {m1}, {m2}")

    # Encrypt m1 first, then m2.
    c1, c2 = (encrypt(public_key, message) for message in (m1, m2))
    print(f"Encrypted messages: {c1}, {c2}")

    # Add the two values without decrypting either of them.
    c3 = homomorphic_add(public_key, c1, c2)
    print(f"Encrypted sum: {c3}")

    # Decrypting the combined ciphertext gives m1 + m2.
    decrypted_sum = decrypt(private_key, public_key, c3)
    print(f"Decrypted sum: {decrypted_sum}")


Original messages: 42, 23
Encrypted messages: <EncryptedNumber object at 0x10f0df980>, <EncryptedNumber object at 0x10f261b50>
Encrypted sum: <EncryptedNumber object at 0x10aefb620>
Decrypted sum: 65


In [11]:
pip install libnum

Collecting libnum
  Downloading libnum-1.7.1-py3-none-any.whl.metadata (4.6 kB)
Downloading libnum-1.7.1-py3-none-any.whl (14 kB)
Installing collected packages: libnum
Successfully installed libnum-1.7.1
Note: you may need to restart the kernel to use updated packages.
