Place this notebook in the same directory as `classifier.py`, the file that contains your implementation of the `Classifier` class.

In [1]:
from classifier import Classifier
from time import time
import csv
from io import StringIO



In [2]:
# Instantiate the classifier under test (imported from the local classifier.py).
# This single instance is trained through update() in the Training cell and
# then queried through predict() in the Prediction cell.
classifier = Classifier()

## Training

In [3]:
# Training budget.
# TRAIN_SECONDS      wall-clock time the classifier is allowed to train for.
# TRAIN_BATCH_SIZE   number of comments handed to classifier.update() per call.
# MAX_TRAIN_BATCHES  development aid: leave as None for a real run, or set a
#                    small integer to smoke-test the notebook quickly.
TRAIN_SECONDS = 300
TRAIN_BATCH_SIZE = 10
MAX_TRAIN_BATCHES = None

with open("/data/reddit9.csv", newline="") as f:
    reader = csv.reader(f)
    next(reader)  # skip header row

    # Allow the classifier to train for TRAIN_SECONDS (5 minutes by default).
    # The clock is only checked between batches, so a slow update() call can
    # overrun the budget by the duration of one call.
    tic = time()
    batches_done = 0
    while time() - tic < TRAIN_SECONDS:
        if MAX_TRAIN_BATCHES is not None and batches_done >= MAX_TRAIN_BATCHES:
            break

        # Read the next TRAIN_BATCH_SIZE comments. When the file runs out
        # mid-batch the incomplete batch is dropped and training stops.
        try:
            rows = []
            for _ in range(TRAIN_BATCH_SIZE):
                rows.append(next(reader))
        except StopIteration:
            break

        # Convert the list of rows back into a CSV string, which is the
        # format classifier.update() is given here.
        output = StringIO()
        writer = csv.writer(output)
        writer.writerows(rows)
        data = output.getvalue()

        # Update the classifier with this batch.
        classifier.update(data)
        batches_done += 1

## Prediction

In [4]:
def get_metrics(score):
    """Turn confusion-matrix counts into percentage metrics.

    Args:
        score: Mapping with the keys "TP", "TN", "FP" and "FN" holding the
            corresponding counts.

    Returns:
        A tuple ``(accuracy, precision, recall, f1)``. Accuracy, precision
        and recall are percentages (0-100); f1 is the harmonic mean of
        precision and recall on the same scale. Any metric whose denominator
        would be zero is reported as 0, and f1 is 0 whenever precision or
        recall is 0.
    """
    tp, tn, fp, fn = (score[key] for key in ("TP", "TN", "FP", "FN"))

    n_samples = tp + tn + fp + fn
    predicted_positive = tp + fp
    actual_positive = tp + fn

    accuracy = 100 * (tp + tn) / n_samples if n_samples != 0 else 0
    precision = 100 * tp / predicted_positive if predicted_positive != 0 else 0
    recall = 100 * tp / actual_positive if actual_positive != 0 else 0

    # Harmonic mean; guarded because 1 / 0 is undefined.
    if precision == 0 or recall == 0:
        f1 = 0
    else:
        f1 = 2 / (1 / precision + 1 / recall)

    return accuracy, precision, recall, f1

In [5]:
# Running confusion-matrix counts, updated after every batch.
score = {
    "TP": 0,
    "FP": 0,
    "TN": 0,
    "FN": 0
}

# Number of posts to evaluate.
total = 100000

PREDICT_BATCH_SIZE = 100
# Column holding the ground-truth class (read before the row is stripped).
LABEL_COLUMN = 20
# Columns blanked out before the rows are handed to the classifier. The label
# column is among them; the others are presumably fields that would leak the
# label or are unavailable at prediction time -- TODO confirm against the
# dataset schema.
HIDDEN_COLUMNS = [1, 5, 10, 11, 12, 13, 15, 16, 18, 19, 20]
METRICS_FORMAT = "Accuracy: %.2f, Precision: %.2f, Recall: %.2f, F1: %.2f"

with open("/data/reddit10.csv", newline="") as f:
    reader = csv.reader(f)
    next(reader)  # skip header row

    # Predict the class for `total` posts, PREDICT_BATCH_SIZE at a time.
    batch = 0
    while batch < total / PREDICT_BATCH_SIZE:

        # Read the next batch of comments. When the file runs out mid-batch
        # the incomplete batch is dropped and evaluation stops.
        try:
            rows = []
            for _ in range(PREDICT_BATCH_SIZE):
                rows.append(next(reader))
        except StopIteration:
            break

        # Record the correct answers, then strip the hidden columns in place.
        correct = []
        for row in rows:
            correct.append(row[LABEL_COLUMN])
            for j in HIDDEN_COLUMNS:
                row[j] = ""

        # Convert the list of rows to a CSV string.
        output = StringIO()
        writer = csv.writer(output)
        writer.writerows(rows)
        data = output.getvalue()

        # Ask the classifier for one prediction per row.
        predictions = list(classifier.predict(data))

        # zip() would silently truncate on a length mismatch and skew the
        # metrics, so fail loudly instead.
        if len(predictions) != len(correct):
            raise Exception(
                "expected %d predictions, got %d" % (len(correct), len(predictions))
            )

        # Evaluate predictions against ground truth.
        for p, c in zip(predictions, correct):
            predicted = int(p)
            if predicted not in (0, 1):
                raise Exception("invalid prediction")
            actual_positive = int(c) == 1
            if predicted == 1:
                score["TP" if actual_positive else "FP"] += 1
            else:
                score["FN" if actual_positive else "TN"] += 1

        batch += 1
        # Progress report every 100 batches.
        if batch % 100 == 0:
            print(METRICS_FORMAT % get_metrics(score))

    # Final metrics over everything evaluated.
    print(METRICS_FORMAT % get_metrics(score))
        

KeyboardInterrupt: 