In [13]:
import numpy as np
from collections import Counter
import random
import sys
import chardet

# Seed NumPy's global RNG so the random initial weights drawn with
# np.random.rand in SVM.__init__ are the same on every run of this cell.
np.random.seed(12345)

# Progress message printed before the data files are read.
print("Importing dataset from disk...")

def detect_encoding(file_path):
    """Return the text encoding chardet reports for the file at ``file_path``.

    The file is read in full in binary mode and the bytes are passed to
    ``chardet.detect``; the value stored under the ``'encoding'`` key of its
    result is returned unchanged.
    """
    with open(file_path, 'rb') as handle:
        payload = handle.read()
    # The bytes are already in memory, so detection can run after the file is closed.
    return chardet.detect(payload)['encoding']

# --- spam.txt: detect its encoding, then read every line as text ---
# Bytes that cannot be decoded are substituted (errors='replace') instead of raising.
encoding_spam = detect_encoding('spam.txt')
with open('spam.txt', 'r', encoding=encoding_spam, errors='replace') as f:
    raw = list(f)

# One email per line: drop the last two characters of the line, then split on single spaces.
spam = [line[:-2].split(" ") for line in raw]

# --- ham.txt: same procedure ---
encoding_ham = detect_encoding('ham.txt')
with open('ham.txt', 'r', encoding=encoding_ham, errors='replace') as f:
    raw = list(f)

ham = [line[:-2].split(" ") for line in raw]

class SVM:
    """Linear bag-of-words classifier trained online with a class-weighted hinge loss.

    An email is a list of word tokens. Its score is the sum of the weights of
    its known words (one term per occurrence) plus a bias, and the predicted
    label is the sign of that score: positive for the positive class, negative
    for the negative class. Training happens inside the constructor.
    """

    def __init__(self, positives, negatives, iterations=10, alpha=0.005, regularization_strength=0.05):
        """Build the vocabulary, initialise the parameters and train the model.

        Args:
            positives: Non-empty sequence of emails labelled +1.
            negatives: Non-empty sequence of emails labelled -1.
            iterations: Number of passes performed by ``train``.
            alpha: Learning rate used for every update.
            regularization_strength: Factor of the weight-decay term.

        Raises:
            ValueError: If either class has no examples.
        """
        self._require_both_classes(positives, negatives)

        self.alpha = alpha
        self.regularization_strength = regularization_strength

        # Create vocabulary: each distinct word gets the next free index, in
        # order of first appearance (positives first, then negatives).
        self.word2index = {}
        for corpus in (positives, negatives):
            for email in corpus:
                for word in email:
                    if word not in self.word2index:
                        self.word2index[word] = len(self.word2index)

        # Initialize weights with small random values in [-0.05, 0.05)
        self.weights = (np.random.rand(len(self.word2index)) - 0.5) * 0.1
        self.bias = 0

        # Calculate class weights: total / (2 * class size), so the rarer
        # class receives proportionally larger updates.
        total = len(positives) + len(negatives)
        self.class_weight_spam = total / (2.0 * len(positives))  # For spam (positive class)
        self.class_weight_ham = total / (2.0 * len(negatives))  # For ham (negative class)

        # Train model
        self.train(positives, negatives, iterations=iterations)

    @staticmethod
    def _require_both_classes(positives, negatives):
        """Raise ValueError unless both classes contain at least one example."""
        if len(positives) == 0 or len(negatives) == 0:
            raise ValueError("SVM needs at least one positive and one negative example")

    def train(self, positives, negatives, iterations=10):
        """Run ``iterations`` passes of online updates and print the average loss of each.

        Every step learns from one positive and one negative example. The
        shorter list is cycled with a modulo index so that each pass performs
        ``max(len(positives), len(negatives))`` steps.

        Raises:
            ValueError: If either class has no examples.
        """
        self._require_both_classes(positives, negatives)
        n_pos, n_neg = len(positives), len(negatives)
        steps = max(n_pos, n_neg)

        for epoch in range(iterations):
            error = 0
            for i in range(steps):
                # Train on positive and negative examples
                error += self.learn(positives[i % n_pos], 1, self.class_weight_spam)
                error += self.learn(negatives[i % n_neg], -1, self.class_weight_ham)

            # Two examples are seen per step, hence 2 * steps in the average.
            print(f"Iter: {epoch} Avg Hinge Loss: {error / float(2 * steps)}")

    def predict(self, email):
        """Return the sign of the raw score (0 when the score is exactly 0)."""
        pred = self.unencrypted_predict(email)
        return np.sign(pred)

    def unencrypted_predict(self, email):
        """Return the raw score: sum of known-word weights plus the bias.

        Words missing from the vocabulary are ignored; a repeated word
        contributes once per occurrence.
        """
        lookup = self.word2index.get
        pred = 0
        for word in email:
            idx = lookup(word)
            if idx is not None:
                pred += self.weights[idx]
        pred += self.bias
        return pred

    def learn(self, email, target, class_weight):
        """
        Learn from one example using hinge loss with class weighting.

        Args:
            email: List of word tokens.
            target: +1 or -1.
            class_weight: Multiplier applied to the update and to the returned loss.

        Returns:
            The class-weighted hinge loss measured before the update, or 0
            when the example already satisfies the margin.
        """
        pred = self.unencrypted_predict(email)
        lookup = self.word2index.get
        weights = self.weights

        if target * pred < 1:  # Misclassified or within margin
            # Update weights and bias. Updates are applied one occurrence at a
            # time, so a repeated word sees its already-updated weight.
            step = self.alpha * class_weight
            for word in email:
                idx = lookup(word)
                if idx is not None:
                    weights[idx] += step * (target - self.regularization_strength * weights[idx])
            self.bias += step * target
            return class_weight * max(0, 1 - target * pred)  # Apply class weight to hinge loss

        # Regularization update (when the example is correctly classified):
        # only the weights of words present in this email are decayed.
        decay = self.alpha * self.regularization_strength
        for word in email:
            idx = lookup(word)
            if idx is not None:
                weights[idx] -= decay * weights[idx]
        return 0

# Train the SVM model using class-weighted learning.
# Everything except the last 1000 emails of each class is used for training.
model = SVM(spam[0:-1000], ham[0:-1000], iterations=50, alpha=0.008, regularization_strength=0.1)


def _percent(part, whole):
    """Return 100 * part / whole, or 0.0 when ``whole`` is zero.

    The zero guard keeps the report from crashing when the model predicts a
    single class for every test email (e.g. tp + fp == 0).
    """
    return 100 * part / float(whole) if whole else 0.0


# Evaluate the model on the last 1000 emails of each class
fp = 0
tn = 0
tp = 0
fn = 0

for i, h in enumerate(ham[-1000:]):
    pred = model.predict(h)
    if pred < 0:  # Negative class, so ham
        tn += 1
    else:  # False positive, classified as spam (a score of exactly 0 also lands here)
        fp += 1

    if i % 10 == 0:
        sys.stdout.write('\r I:' + str(tn + tp + fn + fp) + " % Correct:" + str(_percent(tn, tn + fp))[0:6])

for i, h in enumerate(spam[-1000:]):
    pred = model.predict(h)
    if pred > 0:  # Positive class, so spam
        tp += 1
    else:  # False negative, classified as ham (a score of exactly 0 also lands here)
        fn += 1

    if i % 10 == 0:
        sys.stdout.write('\r I:' + str(tn + tp + fn + fp) + " % Correct:" + str(_percent(tn + tp, tn + tp + fn + fp))[0:6])

sys.stdout.write('\r I:' + str(tn + tp + fn + fp) + " % Correct:" + str(_percent(tn + tp, tn + tp + fn + fp))[0:6])

print("\nAccuracy: %" + str(_percent(tn + tp, tn + tp + fn + fp))[0:6])
# fp / (tp + fp): share of emails flagged as spam that were actually ham.
print("False Positives: %" + str(_percent(fp, tp + fp))[0:4] + "    <- privacy violation level")
# fn / (tn + fn): share of emails passed as ham that were actually spam.
print("False Negatives: %" + str(_percent(fn, tn + fn))[0:4] + "   <- security risk level")


Importing dataset from disk...
Iter: 0 Avg Hinge Loss: 0.3479097259987188
Iter: 1 Avg Hinge Loss: 0.12691136246267476
Iter: 2 Avg Hinge Loss: 0.19241222364332944
Iter: 3 Avg Hinge Loss: 0.06246362373513119
Iter: 4 Avg Hinge Loss: 0.19246261714101956
Iter: 5 Avg Hinge Loss: 0.11835125270856997
Iter: 6 Avg Hinge Loss: 0.06225003776756309
Iter: 7 Avg Hinge Loss: 0.20192517295962317
Iter: 8 Avg Hinge Loss: 0.07811829100448543
Iter: 9 Avg Hinge Loss: 0.05050357864976421
Iter: 10 Avg Hinge Loss: 0.21081160252246084
Iter: 11 Avg Hinge Loss: 0.11562362719178465
Iter: 12 Avg Hinge Loss: 0.08927616203611047
Iter: 13 Avg Hinge Loss: 0.11158828003645546
Iter: 14 Avg Hinge Loss: 0.09564059025254384
Iter: 15 Avg Hinge Loss: 0.09245413117220073
Iter: 16 Avg Hinge Loss: 0.11069605182331393
Iter: 17 Avg Hinge Loss: 0.07642610747409352
Iter: 18 Avg Hinge Loss: 0.06494183014203712
Iter: 19 Avg Hinge Loss: 0.0882819096484429
Iter: 20 Avg Hinge Loss: 0.06319971491670619
Iter: 21 Avg Hinge Loss: 0.069029559

In [14]:
import numpy as np
from collections import Counter
import random
import sys
import chardet

# Seed NumPy's global RNG so the random initial weights drawn with
# np.random.rand in SVM.__init__ are the same on every run of this cell.
np.random.seed(12345)

# Progress message printed before the data files are read.
print("Importing dataset from disk...")

def detect_encoding(file_path):
    """Return the text encoding chardet reports for the file at ``file_path``.

    The file is read in full in binary mode and the bytes are passed to
    ``chardet.detect``; the value stored under the ``'encoding'`` key of its
    result is returned unchanged.
    """
    with open(file_path, 'rb') as handle:
        payload = handle.read()
    # The bytes are already in memory, so detection can run after the file is closed.
    return chardet.detect(payload)['encoding']

# --- spam.txt: detect its encoding, then read every line as text ---
# Bytes that cannot be decoded are substituted (errors='replace') instead of raising.
encoding_spam = detect_encoding('spam.txt')
with open('spam.txt', 'r', encoding=encoding_spam, errors='replace') as f:
    raw = list(f)

# One email per line: drop the last two characters of the line, then split on single spaces.
spam = [line[:-2].split(" ") for line in raw]

# --- ham.txt: same procedure ---
encoding_ham = detect_encoding('ham.txt')
with open('ham.txt', 'r', encoding=encoding_ham, errors='replace') as f:
    raw = list(f)

ham = [line[:-2].split(" ") for line in raw]

class SVM:
    """Linear bag-of-words classifier trained online with a class-weighted hinge loss.

    An email is a list of word tokens. Its score is the sum of the weights of
    its known words (one term per occurrence) plus a bias, and the predicted
    label is the sign of that score: positive for the positive class, negative
    for the negative class. Training happens inside the constructor.
    """

    def __init__(self, positives, negatives, iterations=10, alpha=0.005, regularization_strength=0.05):
        """Build the vocabulary, initialise the parameters and train the model.

        Args:
            positives: Non-empty sequence of emails labelled +1.
            negatives: Non-empty sequence of emails labelled -1.
            iterations: Number of passes performed by ``train``.
            alpha: Learning rate used for every update.
            regularization_strength: Factor of the weight-decay term.

        Raises:
            ValueError: If either class has no examples.
        """
        self._require_both_classes(positives, negatives)

        self.alpha = alpha
        self.regularization_strength = regularization_strength

        # Create vocabulary: each distinct word gets the next free index, in
        # order of first appearance (positives first, then negatives).
        self.word2index = {}
        for corpus in (positives, negatives):
            for email in corpus:
                for word in email:
                    if word not in self.word2index:
                        self.word2index[word] = len(self.word2index)

        # Initialize weights with small random values in [-0.05, 0.05)
        self.weights = (np.random.rand(len(self.word2index)) - 0.5) * 0.1
        self.bias = 0

        # Calculate class weights: total / (2 * class size), so the rarer
        # class receives proportionally larger updates.
        total = len(positives) + len(negatives)
        self.class_weight_spam = total / (2.0 * len(positives))  # For spam (positive class)
        self.class_weight_ham = total / (2.0 * len(negatives))  # For ham (negative class)

        # Train model
        self.train(positives, negatives, iterations=iterations)

    @staticmethod
    def _require_both_classes(positives, negatives):
        """Raise ValueError unless both classes contain at least one example."""
        if len(positives) == 0 or len(negatives) == 0:
            raise ValueError("SVM needs at least one positive and one negative example")

    def train(self, positives, negatives, iterations=10):
        """Run ``iterations`` passes of online updates and print the average loss of each.

        Every step learns from one positive and one negative example. The
        shorter list is cycled with a modulo index so that each pass performs
        ``max(len(positives), len(negatives))`` steps.

        Raises:
            ValueError: If either class has no examples.
        """
        self._require_both_classes(positives, negatives)
        n_pos, n_neg = len(positives), len(negatives)
        steps = max(n_pos, n_neg)

        for epoch in range(iterations):
            error = 0
            for i in range(steps):
                # Train on positive and negative examples
                error += self.learn(positives[i % n_pos], 1, self.class_weight_spam)
                error += self.learn(negatives[i % n_neg], -1, self.class_weight_ham)

            # Two examples are seen per step, hence 2 * steps in the average.
            print(f"Iter: {epoch} Avg Hinge Loss: {error / float(2 * steps)}")

    def predict(self, email):
        """Return the sign of the raw score (0 when the score is exactly 0)."""
        pred = self.unencrypted_predict(email)
        return np.sign(pred)

    def unencrypted_predict(self, email):
        """Return the raw score: sum of known-word weights plus the bias.

        Words missing from the vocabulary are ignored; a repeated word
        contributes once per occurrence.
        """
        lookup = self.word2index.get
        pred = 0
        for word in email:
            idx = lookup(word)
            if idx is not None:
                pred += self.weights[idx]
        pred += self.bias
        return pred

    def learn(self, email, target, class_weight):
        """
        Learn from one example using hinge loss with class weighting.

        Args:
            email: List of word tokens.
            target: +1 or -1.
            class_weight: Multiplier applied to the update and to the returned loss.

        Returns:
            The class-weighted hinge loss measured before the update, or 0
            when the example already satisfies the margin.
        """
        pred = self.unencrypted_predict(email)
        lookup = self.word2index.get
        weights = self.weights

        if target * pred < 1:  # Misclassified or within margin
            # Update weights and bias. Updates are applied one occurrence at a
            # time, so a repeated word sees its already-updated weight.
            step = self.alpha * class_weight
            for word in email:
                idx = lookup(word)
                if idx is not None:
                    weights[idx] += step * (target - self.regularization_strength * weights[idx])
            self.bias += step * target
            return class_weight * max(0, 1 - target * pred)  # Apply class weight to hinge loss

        # Regularization update (when the example is correctly classified):
        # only the weights of words present in this email are decayed.
        decay = self.alpha * self.regularization_strength
        for word in email:
            idx = lookup(word)
            if idx is not None:
                weights[idx] -= decay * weights[idx]
        return 0

# Train the SVM model using class-weighted learning.
# Everything except the last 1000 emails of each class is used for training.
model = SVM(spam[0:-1000], ham[0:-1000], iterations=50, alpha=0.005, regularization_strength=0.15)


def _percent(part, whole):
    """Return 100 * part / whole, or 0.0 when ``whole`` is zero.

    The zero guard keeps the report from crashing when the model predicts a
    single class for every test email (e.g. tp + fp == 0).
    """
    return 100 * part / float(whole) if whole else 0.0


# Evaluate the model on the last 1000 emails of each class
fp = 0
tn = 0
tp = 0
fn = 0

for i, h in enumerate(ham[-1000:]):
    pred = model.predict(h)
    if pred < 0:  # Negative class, so ham
        tn += 1
    else:  # False positive, classified as spam (a score of exactly 0 also lands here)
        fp += 1

    if i % 10 == 0:
        sys.stdout.write('\r I:' + str(tn + tp + fn + fp) + " % Correct:" + str(_percent(tn, tn + fp))[0:6])

for i, h in enumerate(spam[-1000:]):
    pred = model.predict(h)
    if pred > 0:  # Positive class, so spam
        tp += 1
    else:  # False negative, classified as ham (a score of exactly 0 also lands here)
        fn += 1

    if i % 10 == 0:
        sys.stdout.write('\r I:' + str(tn + tp + fn + fp) + " % Correct:" + str(_percent(tn + tp, tn + tp + fn + fp))[0:6])

sys.stdout.write('\r I:' + str(tn + tp + fn + fp) + " % Correct:" + str(_percent(tn + tp, tn + tp + fn + fp))[0:6])

print("\nAccuracy: %" + str(_percent(tn + tp, tn + tp + fn + fp))[0:6])
# fp / (tp + fp): share of emails flagged as spam that were actually ham.
print("False Positives: %" + str(_percent(fp, tp + fp))[0:4] + "    <- privacy violation level")
# fn / (tn + fn): share of emails passed as ham that were actually spam.
print("False Negatives: %" + str(_percent(fn, tn + fn))[0:4] + "   <- security risk level")


Importing dataset from disk...
Iter: 0 Avg Hinge Loss: 0.29077879472627793
Iter: 1 Avg Hinge Loss: 0.09454807004125755
Iter: 2 Avg Hinge Loss: 0.12995744746318927
Iter: 3 Avg Hinge Loss: 0.0977860489877116
Iter: 4 Avg Hinge Loss: 0.07241684504606379
Iter: 5 Avg Hinge Loss: 0.0846983369206138
Iter: 6 Avg Hinge Loss: 0.05854650476911265
Iter: 7 Avg Hinge Loss: 0.07045641092528987
Iter: 8 Avg Hinge Loss: 0.05221536481273749
Iter: 9 Avg Hinge Loss: 0.05662334796952123
Iter: 10 Avg Hinge Loss: 0.08114024841094307
Iter: 11 Avg Hinge Loss: 0.07633703904837863
Iter: 12 Avg Hinge Loss: 0.13610273456558522
Iter: 13 Avg Hinge Loss: 0.0750766280094622
Iter: 14 Avg Hinge Loss: 0.06099816194294786
Iter: 15 Avg Hinge Loss: 0.043882465337187204
Iter: 16 Avg Hinge Loss: 0.07622035380367476
Iter: 17 Avg Hinge Loss: 0.05819049872486148
Iter: 18 Avg Hinge Loss: 0.04510003060558581
Iter: 19 Avg Hinge Loss: 0.06795549309800365
Iter: 20 Avg Hinge Loss: 0.08861657317238193
Iter: 21 Avg Hinge Loss: 0.041051498

In [15]:
import numpy as np
from collections import Counter
import random
import sys
import chardet

# Seed NumPy's global RNG so the random initial weights drawn with
# np.random.rand in SVM.__init__ are the same on every run of this cell.
np.random.seed(12345)

# Progress message printed before the data files are read.
print("Importing dataset from disk...")

def detect_encoding(file_path):
    """Return the text encoding chardet reports for the file at ``file_path``.

    The file is read in full in binary mode and the bytes are passed to
    ``chardet.detect``; the value stored under the ``'encoding'`` key of its
    result is returned unchanged.
    """
    with open(file_path, 'rb') as handle:
        payload = handle.read()
    # The bytes are already in memory, so detection can run after the file is closed.
    return chardet.detect(payload)['encoding']

# --- spam.txt: detect its encoding, then read every line as text ---
# Bytes that cannot be decoded are substituted (errors='replace') instead of raising.
encoding_spam = detect_encoding('spam.txt')
with open('spam.txt', 'r', encoding=encoding_spam, errors='replace') as f:
    raw = list(f)

# One email per line: drop the last two characters of the line, then split on single spaces.
spam = [line[:-2].split(" ") for line in raw]

# --- ham.txt: same procedure ---
encoding_ham = detect_encoding('ham.txt')
with open('ham.txt', 'r', encoding=encoding_ham, errors='replace') as f:
    raw = list(f)

ham = [line[:-2].split(" ") for line in raw]

class SVM:
    """Linear bag-of-words classifier trained online with a class-weighted hinge loss.

    An email is a list of word tokens. Its score is the sum of the weights of
    its known words (one term per occurrence) plus a bias, and the predicted
    label is the sign of that score: positive for the positive class, negative
    for the negative class. Training happens inside the constructor.
    """

    def __init__(self, positives, negatives, iterations=10, alpha=0.005, regularization_strength=0.05):
        """Build the vocabulary, initialise the parameters and train the model.

        Args:
            positives: Non-empty sequence of emails labelled +1.
            negatives: Non-empty sequence of emails labelled -1.
            iterations: Number of passes performed by ``train``.
            alpha: Learning rate used for every update.
            regularization_strength: Factor of the weight-decay term.

        Raises:
            ValueError: If either class has no examples.
        """
        self._require_both_classes(positives, negatives)

        self.alpha = alpha
        self.regularization_strength = regularization_strength

        # Create vocabulary: each distinct word gets the next free index, in
        # order of first appearance (positives first, then negatives).
        self.word2index = {}
        for corpus in (positives, negatives):
            for email in corpus:
                for word in email:
                    if word not in self.word2index:
                        self.word2index[word] = len(self.word2index)

        # Initialize weights with small random values in [-0.05, 0.05)
        self.weights = (np.random.rand(len(self.word2index)) - 0.5) * 0.1
        self.bias = 0

        # Calculate class weights: total / (2 * class size), so the rarer
        # class receives proportionally larger updates.
        total = len(positives) + len(negatives)
        self.class_weight_spam = total / (2.0 * len(positives))  # For spam (positive class)
        self.class_weight_ham = total / (2.0 * len(negatives))  # For ham (negative class)

        # Train model
        self.train(positives, negatives, iterations=iterations)

    @staticmethod
    def _require_both_classes(positives, negatives):
        """Raise ValueError unless both classes contain at least one example."""
        if len(positives) == 0 or len(negatives) == 0:
            raise ValueError("SVM needs at least one positive and one negative example")

    def train(self, positives, negatives, iterations=10):
        """Run ``iterations`` passes of online updates and print the average loss of each.

        Every step learns from one positive and one negative example. The
        shorter list is cycled with a modulo index so that each pass performs
        ``max(len(positives), len(negatives))`` steps.

        Raises:
            ValueError: If either class has no examples.
        """
        self._require_both_classes(positives, negatives)
        n_pos, n_neg = len(positives), len(negatives)
        steps = max(n_pos, n_neg)

        for epoch in range(iterations):
            error = 0
            for i in range(steps):
                # Train on positive and negative examples
                error += self.learn(positives[i % n_pos], 1, self.class_weight_spam)
                error += self.learn(negatives[i % n_neg], -1, self.class_weight_ham)

            # Two examples are seen per step, hence 2 * steps in the average.
            print(f"Iter: {epoch} Avg Hinge Loss: {error / float(2 * steps)}")

    def predict(self, email):
        """Return the sign of the raw score (0 when the score is exactly 0)."""
        pred = self.unencrypted_predict(email)
        return np.sign(pred)

    def unencrypted_predict(self, email):
        """Return the raw score: sum of known-word weights plus the bias.

        Words missing from the vocabulary are ignored; a repeated word
        contributes once per occurrence.
        """
        lookup = self.word2index.get
        pred = 0
        for word in email:
            idx = lookup(word)
            if idx is not None:
                pred += self.weights[idx]
        pred += self.bias
        return pred

    def learn(self, email, target, class_weight):
        """
        Learn from one example using hinge loss with class weighting.

        Args:
            email: List of word tokens.
            target: +1 or -1.
            class_weight: Multiplier applied to the update and to the returned loss.

        Returns:
            The class-weighted hinge loss measured before the update, or 0
            when the example already satisfies the margin.
        """
        pred = self.unencrypted_predict(email)
        lookup = self.word2index.get
        weights = self.weights

        if target * pred < 1:  # Misclassified or within margin
            # Update weights and bias. Updates are applied one occurrence at a
            # time, so a repeated word sees its already-updated weight.
            step = self.alpha * class_weight
            for word in email:
                idx = lookup(word)
                if idx is not None:
                    weights[idx] += step * (target - self.regularization_strength * weights[idx])
            self.bias += step * target
            return class_weight * max(0, 1 - target * pred)  # Apply class weight to hinge loss

        # Regularization update (when the example is correctly classified):
        # only the weights of words present in this email are decayed.
        decay = self.alpha * self.regularization_strength
        for word in email:
            idx = lookup(word)
            if idx is not None:
                weights[idx] -= decay * weights[idx]
        return 0

# Train the SVM model using class-weighted learning.
# Everything except the last 1000 emails of each class is used for training.
model = SVM(spam[0:-1000], ham[0:-1000], iterations=50, alpha=0.002, regularization_strength=0.15)


def _percent(part, whole):
    """Return 100 * part / whole, or 0.0 when ``whole`` is zero.

    The zero guard keeps the report from crashing when the model predicts a
    single class for every test email (e.g. tp + fp == 0).
    """
    return 100 * part / float(whole) if whole else 0.0


# Evaluate the model on the last 1000 emails of each class
fp = 0
tn = 0
tp = 0
fn = 0

for i, h in enumerate(ham[-1000:]):
    pred = model.predict(h)
    if pred < 0:  # Negative class, so ham
        tn += 1
    else:  # False positive, classified as spam (a score of exactly 0 also lands here)
        fp += 1

    if i % 10 == 0:
        sys.stdout.write('\r I:' + str(tn + tp + fn + fp) + " % Correct:" + str(_percent(tn, tn + fp))[0:6])

for i, h in enumerate(spam[-1000:]):
    pred = model.predict(h)
    if pred > 0:  # Positive class, so spam
        tp += 1
    else:  # False negative, classified as ham (a score of exactly 0 also lands here)
        fn += 1

    if i % 10 == 0:
        sys.stdout.write('\r I:' + str(tn + tp + fn + fp) + " % Correct:" + str(_percent(tn + tp, tn + tp + fn + fp))[0:6])

sys.stdout.write('\r I:' + str(tn + tp + fn + fp) + " % Correct:" + str(_percent(tn + tp, tn + tp + fn + fp))[0:6])

print("\nAccuracy: %" + str(_percent(tn + tp, tn + tp + fn + fp))[0:6])
# fp / (tp + fp): share of emails flagged as spam that were actually ham.
print("False Positives: %" + str(_percent(fp, tp + fp))[0:4] + "    <- privacy violation level")
# fn / (tn + fn): share of emails passed as ham that were actually spam.
print("False Negatives: %" + str(_percent(fn, tn + fn))[0:4] + "   <- security risk level")


Importing dataset from disk...
Iter: 0 Avg Hinge Loss: 0.1548281635146259
Iter: 1 Avg Hinge Loss: 0.0755481961485878
Iter: 2 Avg Hinge Loss: 0.04436736824616863
Iter: 3 Avg Hinge Loss: 0.0948758072927415
Iter: 4 Avg Hinge Loss: 0.03886713362116495
Iter: 5 Avg Hinge Loss: 0.031178211702105316
Iter: 6 Avg Hinge Loss: 0.03233633997401704
Iter: 7 Avg Hinge Loss: 0.02984675843973002
Iter: 8 Avg Hinge Loss: 0.025423250449728912
Iter: 9 Avg Hinge Loss: 0.028039331237342906
Iter: 10 Avg Hinge Loss: 0.033847160887986026
Iter: 11 Avg Hinge Loss: 0.04172742374806986
Iter: 12 Avg Hinge Loss: 0.021596998052713014
Iter: 13 Avg Hinge Loss: 0.04960531684035482
Iter: 14 Avg Hinge Loss: 0.03365676055525974
Iter: 15 Avg Hinge Loss: 0.022658128821037217
Iter: 16 Avg Hinge Loss: 0.025096127589202738
Iter: 17 Avg Hinge Loss: 0.03522362032440173
Iter: 18 Avg Hinge Loss: 0.02891721378600951
Iter: 19 Avg Hinge Loss: 0.021342859125341795
Iter: 20 Avg Hinge Loss: 0.022508965329422172
Iter: 21 Avg Hinge Loss: 0.0