In [1]:
from sklearn import svm
import numpy as np
from tqdm import tqdm
import os, random

random.seed(42069666)

class WikiDataset:
    def __init__(self, setting):
        self.FOLDERS = ['History', 'Geography', 'Arts', 'Philosophy_and_religion','Everyday_life',\
                  'Society_and_social_sciences','Biological_and_health_sciences', 'Physical_sciences',\
              'Technology', 'Mathematics']
        self.MAP = {}
        self.CORPUS_FILE = "corpus.txt"
        self.SPLIT = np.array([0, 0.7,0.85,1])
        self.DIM = 0
        self.fileList = []
        self.setting = setting # one of 0,1,2 for train, test val
        for i in range(len(self.FOLDERS)):
            self.MAP[self.FOLDERS[i]] = i
        
        for folder in self.FOLDERS:
            for file in os.listdir(folder):
                self.fileList.append((self.MAP[folder], os.path.join(folder, file)))
        random.shuffle(self.fileList)
        self.SPLIT = [int(c) for c in self.SPLIT * len(self.fileList)]
        
        k = open(self.CORPUS_FILE,"r")
        while True:
            line = k.readline()
            if len(line.split()) == 2:
                self.DIM += 1
            else:
                break
    def __len__(self):
        return self.SPLIT[self.setting+1] - self.SPLIT[self.setting]

    def bag_transform(self, bag):
        return bag/sum(bag)
    def __getitem__(self, idx):
        # we use log bag of words due to zipfian nature
        idx += self.SPLIT[self.setting]
        k = open(self.fileList[idx][1], "r")
        bag = np.zeros(self.DIM)

        while True:
            l = k.readline().split()
            if len(l) <= 1:
                break
            bag[int(l[0])] = float(l[1])
        return self.bag_transform(bag), self.fileList[idx][0]

TrainSet = WikiDataset(0)
ValSet = WikiDataset(1)


In [3]:
Xtrain = []
ytrain = []
Xval = []
yval = []
for i in tqdm(range(TrainSet.__len__())):
    #if i % 50 == 0:
    #    print(f"Loaded {i} Train Examples")
    a,b = TrainSet.__getitem__(i)
    Xtrain.append(a)
    ytrain.append(b)

for i in tqdm(range(ValSet.__len__())):
    #if i % 50 == 0:
    #    print(f"Loaded {i} Test Examples")
    a,b = ValSet.__getitem__(i)
    Xval.append(a)
    yval.append(b)


100%|██████████████████████████████████████████████████████████████████████████████| 5567/5567 [02:43<00:00, 34.03it/s]
100%|██████████████████████████████████████████████████████████████████████████████| 1193/1193 [00:31<00:00, 37.47it/s]


0.7820620284995808

In [6]:
from benchmark_base import CategoriserBase

class SVM_tester(CategoriserBase):
    def __init__(self):
        super(SVM_tester, self).__init__()
    
    def precomp(self):
        # override
        self.model = svm.LinearSVC(C=25)
        self.model.fit(Xtrain, ytrain)
        print('Finished training')
    def predict(self, bag):
        return self.model.decision_function(bag)
    
SVM_instance = SVM_tester()
SVM_instance.eval()

Finished training


  return bag/bag.sum() # normalise
  return bag/bag.sum() # normalise
100%|██████████████████████████████████████████████████████████████████████████████| 3342/3342 [01:21<00:00, 40.92it/s]

0/0 correct





In [10]:
SVM_instance.model.decision_function([Xval.__getitem__(1)])

array([[-1.03273518,  0.48975202, -1.22076547, -1.15381653, -0.94391725,
        -0.57195578, -1.02600139, -0.98493871, -1.14714099, -1.11082216]])

In [13]:
class SVM_tester2(CategoriserBase):
    def __init__(self):
        super(SVM_tester2, self).__init__()
    
    def precomp(self):
        # override
        self.model = SVM_instance.model
        print('Finished training')
    def predict(self, bag):
        return self.model.decision_function([bag])[0]
    
SVM_instance2 = SVM_tester2()
SVM_instance2.eval()

Finished training


  return bag/bag.sum() # normalise
  return bag/bag.sum() # normalise
100%|██████████████████████████████████████████████████████████████████████████████| 3342/3342 [01:23<00:00, 40.19it/s]

826/1820 correct





In [16]:
class SVM_tester3(CategoriserBase):
    def __init__(self):
        super(SVM_tester3, self).__init__()
    
    def precomp(self):
        # override
        self.model = svm.LinearSVC(C=5)
        self.model.fit(Xtrain, ytrain)
        print('Finished training')
    def predict(self, bag):
        return self.model.decision_function([bag])[0]
    
SVM_instance3 = SVM_tester3()
SVM_instance3.eval()

Finished training


  return bag/bag.sum() # normalise
  return bag/bag.sum() # normalise
100%|██████████████████████████████████████████████████████████████████████████████| 3342/3342 [01:27<00:00, 38.11it/s]

425/1820 correct





In [20]:
# Naive bayes
# Note that this doesn't provide benchmark for wholly rejecting an article as irrelevant to every category
# This is just for reference
from sklearn.naive_bayes import MultinomialNB

class NaiveBayes(CategoriserBase):
    def __init__(self):
        super(NaiveBayes, self).__init__()
    
    def precomp(self):
        # override
        # #assume each category is equally likely so dont fit prior probability (Pr(Class))
        self.model = MultinomialNB(fit_prior=False) 
        self.model.fit(Xtrain, ytrain)
        print('Finished training')
        print(self.model.score(Xval, yval))
    def predict(self, bag):
        return self.model.predict_proba([bag])[0]
    
NaiveBayes_instance = NaiveBayes()
NaiveBayes_instance.eval()


Finished training
0.5054484492875104


  return bag/bag.sum() # normalise
  return bag/bag.sum() # normalise
100%|██████████████████████████████████████████████████████████████████████████████| 3342/3342 [01:34<00:00, 35.32it/s]

740/1820 correct





In [25]:
SVM_instance2.eval()

  return bag/bag.sum() # normalise
  return bag/bag.sum() # normalise
100%|██████████████████████████████████████████████████████████████████████████████| 3342/3342 [02:24<00:00, 23.19it/s]

826/1820 correct





In [28]:
# Try sklearn linearreg
from sklearn.linear_model import LogisticRegression
class LinearReg(CategoriserBase):
    def __init__(self):
        super(LinearReg, self).__init__()
    
    def precomp(self):
        # override
        self.model = LogisticRegression(C=100)
        self.model.fit(Xtrain, ytrain)
        print('Finished training')
        print(self.model.score(Xval, yval))
    def predict(self, bag):
        return self.model.decision_function([bag])[0]
    
LinearReg2 = LinearReg()
LinearReg2.eval()

STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(


Finished training
0.740989103101425


FileNotFoundError: [Errno 2] No such file or directory: 'linearRegResults.txt'

In [29]:
from joblib import dump, load
dump(SVM_instance.model, 'SVM.joblib') 

['SVM.joblib']

In [31]:
svm_model = load('SVM.joblib') 
svm_model.intercept_

array([-0.89472305, -1.05592616, -0.85802678, -0.620536  , -0.8431903 ,
       -0.85703863, -0.34930353, -0.71789778, -0.95431237, -1.09232792])

In [32]:
NaiveBayes_instance.model.feature_log_prob_

array([[-11.15577666, -11.25522953, -11.6361594 , ..., -11.64587294,
        -11.64587294, -11.64587294],
       [-10.95581943, -11.10780392, -11.64839949, ..., -11.6491769 ,
        -11.6491769 , -11.6491769 ],
       [-11.3801378 , -11.55225774, -11.64505718, ..., -11.64563652,
        -11.64563652, -11.64563652],
       ...,
       [-11.50586578, -11.53583265, -11.60427452, ..., -11.64817288,
        -11.64817288, -11.64817288],
       [-11.5350833 , -11.52855692, -11.61913248, ..., -11.64621433,
        -11.64621433, -11.64621433],
       [-11.63197237, -11.64201956, -11.64164867, ..., -11.6434185 ,
        -11.64044673, -11.64069404]])

In [85]:
# Naive Averaging Linear regression

import torch
from torch import nn
from nltk import PorterStemmer
class Embeddings:
    
    def __init__(self):
        self.WEIGHTS_FILE = "glove50BinWeights.dat"
        self.WORDS_FILE = "glove50Words.txt"
        self.CUSTOM_WORDLIST = "corpus.txt"
        self.SIZE = 50
        self.NUM_WORDS = 400000
        k = open(self.CUSTOM_WORDLIST,"r")
        self.DIM = 0
        self.words = []
        while True:
            line = k.readline().split()
            if len(line) == 2:
                self.words.append([line[0], int(line[1])])
                self.DIM += 1
            else:
                break
        
        self.Embeddings = np.fromfile(self.WEIGHTS_FILE, dtype=np.float32).reshape((self.NUM_WORDS,self.SIZE))
        self.EmbeddingWords = open(self.WORDS_FILE, encoding="utf8").read().split('\n')
        self.embedIds = {} #unknown chars replaced by '?'
        self.stemmedIds = {}
        
        self.custom_to_embedId = [-1]*self.DIM
        stemmer = PorterStemmer()
        for i in range(len(self.EmbeddingWords)):
            if i%10000 == 0:
                print(self.EmbeddingWords[i])
            self.embedIds[self.EmbeddingWords[i]] = i
            self.stemmedIds[stemmer.stem(self.EmbeddingWords[i])] = i
        
        for i in range(self.DIM):
            self.custom_to_embedId[i] = self.stemmedIds.get(self.words[i][0],-1)
            
    def get_id_embedding(self, bag_id):
        #print(self.words[bag_id][0])
        wordId = self.custom_to_embedId[bag_id]
        if wordId == -1:
            return np.zeros(self.SIZE)
        else:
            return self.Embeddings[wordId]
    
    def get_word_embedding(self, word):
        wordId = self.embedIds.get(word,-1)
        if wordId == -1:
            return np.zeros(self.SIZE)
        else:
            return self.Embeddings[wordId]

glove50 = Embeddings()
glove50.get_id_embedding(69)

the
persecution
baths
mortally
1667
bec
baek
b/w
klinghoffer
azarov
capron
perpetua
biratnagar
12.74
yaffa
cryogenics
ef1
franchetti
blintzes
birthstones
naadam
concertation
lesticus
containerboard
boydston
afterellen.com
acuff-rose
close-fitting
packbot
comptel
tanke
saraju
rouiba
discomfit
numurkah
hla-a
90125
zipkin
lombarde
1.137



array([ 0.27773 , -1.8892  ,  0.25371 , -0.60628 ,  1.0393  ,  0.14009 ,
       -0.2968  , -0.9551  ,  0.55312 ,  0.30531 ,  0.50993 , -0.26159 ,
       -0.033428, -0.41328 ,  0.062837,  0.60586 ,  0.25449 , -0.025989,
       -0.43171 , -0.62811 , -1.1064  , -0.10107 , -0.52848 , -0.25303 ,
        0.70581 ,  1.0866  ,  0.35819 ,  0.8242  ,  0.75054 ,  1.0691  ,
       -0.94066 , -0.86361 ,  1.0516  , -0.42908 , -0.082611, -0.61878 ,
       -0.43782 ,  0.16866 , -0.23367 ,  0.35183 , -0.46558 , -0.11573 ,
       -0.31309 ,  1.1492  , -1.1831  ,  0.025008,  0.27425 , -0.096663,
       -0.012129, -0.22712 ], dtype=float32)

In [116]:
from torch.utils.data import Dataset, DataLoader
class NeuralNet(nn.Module):
    
    def __init__(self):
        super(NeuralNet, self).__init__()
        self.model = nn.Sequential(
            nn.Linear(50, 20),
            nn.ReLU(),
            nn.Linear(20, 10),
            nn.Sigmoid() #as opposed to nn.Softmax()
        )
    
    def forward(self, x):
        return self.model(x)


class EmbedDataset:
    def __init__(self, setting, embedding):
        self.FOLDERS = ['History', 'Geography', 'Arts', 'Philosophy_and_religion','Everyday_life',\
                  'Society_and_social_sciences','Biological_and_health_sciences', 'Physical_sciences',\
              'Technology', 'Mathematics']
        self.MAP = {}
        self.CORPUS_FILE = "corpus.txt"
        self.SPLIT = np.array([0, 0.7,0.85,1])
        self.DIM = 0
        self.fileList = []
        self.setting = setting # one of 0,1,2 for train, test val
        for i in range(len(self.FOLDERS)):
            self.MAP[self.FOLDERS[i]] = i
        
        for folder in self.FOLDERS:
            for file in os.listdir(folder):
                self.fileList.append((self.MAP[folder], os.path.join(folder, file)))
        random.shuffle(self.fileList)
        self.SPLIT = [int(c) for c in self.SPLIT * len(self.fileList)]
        
        k = open(self.CORPUS_FILE,"r")
        while True:
            line = k.readline()
            if len(line.split()) == 2:
                self.DIM += 1
            else:
                break

        #embedding code
        self.embedding = embedding
    def __len__(self):
        return self.SPLIT[self.setting+1] - self.SPLIT[self.setting]
    
    def __getitem__(self, idx):
        # we use log bag of words due to zipfian nature
        # reads: bag of words file
        idx += self.SPLIT[self.setting]
        k = open(self.fileList[idx][1], "r")
        bag = np.zeros(self.embedding.SIZE)
        wordcount = 0
        while True:
            l = k.readline().split()
            if len(l) <= 1:
                break
            l = [int(c) for c in l]
            res = self.embedding.get_id_embedding(l[0])
            if res.max() != 0 or res.min() != 0:
                wordcount += l[1]
                bag += l[1]*res
        
        return torch.tensor(bag/wordcount).float(), self.fileList[idx][0]
    
trainEmbedder = EmbedDataset(0, glove50)
valEmbedder = EmbedDataset(1, glove50)

                

In [117]:
trainLoader = DataLoader(trainEmbedder, batch_size=32, shuffle=True)
valLoader = DataLoader(valEmbedder, batch_size=32, shuffle=True)
trainEmbedder.__getitem__(1)

(tensor([ 0.0356, -0.2390, -0.1668, -0.2053, -0.2626, -0.1244,  0.2584,  0.1276,
         -0.0043,  0.1877,  0.0490, -0.0356,  0.1766,  0.1438, -0.1229, -0.0610,
          0.0473, -0.0268,  0.2997,  0.0906, -0.0253, -0.1005,  0.0223,  0.2159,
         -0.0274,  0.4289, -0.0921,  0.0765, -0.0081,  0.0358, -0.3047,  0.0068,
          0.1269,  0.0418, -0.0098,  0.0924,  0.0031,  0.0230, -0.2308,  0.0877,
          0.0703, -0.3158,  0.1103,  0.2234, -0.0860, -0.0963,  0.1159,  0.1683,
         -0.0062, -0.1815]),
 1)

In [118]:
device = 'cpu'
def trainloop(dataloader, model, loss_fn, optimizer):
    current = 0
    total_loss = 0
    for batch, (X,y) in enumerate(dataloader):
        X=X.to(device)
        y=y.to(device)
        pred = model(X)
        #print(pred)
        loss = loss_fn(pred, y)
        current += len(X)
        total_loss += loss.item()
        optimizer.zero_grad()
        loss.backward()
        optimizer.step()
        
        if batch % 30 == 0:
            print(f"Processed {current} examples, cumulative batch loss={total_loss}")
    print(f"Finished epoch, cumulative loss = {total_loss}")

def testloop(dataloader, model, loss_fn):
    size = len(dataloader.dataset)
    num_batches = len(dataloader)
    test_loss, correct = 0,0
    with torch.no_grad():
        for X,y in dataloader:
            X=X.to(device)
            y=y.to(device)
            pred = model(X)
            test_loss += loss_fn(pred,y).item()
            correct += (pred.argmax(1) == y).type(torch.float).sum().item()
    print(f"Testset: Correct: {correct}/{size}. Cumulative Loss={test_loss}")
        
# Actual Training
epochs = 10 # all you need
lr = 0.001
model = NeuralNet().to(device) #yolo
loss = nn.CrossEntropyLoss()
optim = torch.optim.AdamW(model.parameters(), lr = lr, weight_decay = 0.01)


print(model)
for i in range(epochs):
    print(f"Epoch {i}")
    trainloop(trainLoader, model, loss, optim)
    testloop(valLoader, model, loss)





NeuralNet(
  (model): Sequential(
    (0): Linear(in_features=50, out_features=20, bias=True)
    (1): ReLU()
    (2): Linear(in_features=20, out_features=10, bias=True)
    (3): Sigmoid()
  )
)
Epoch 0
Processed 32 examples, cumulative batch loss=2.3088481426239014
Processed 992 examples, cumulative batch loss=71.2272698879242
Processed 1952 examples, cumulative batch loss=139.6463007926941
Processed 2912 examples, cumulative batch loss=207.41230964660645
Processed 3872 examples, cumulative batch loss=274.3126983642578
Processed 4832 examples, cumulative batch loss=340.77984404563904
Finished epoch, cumulative loss = 391.4669780731201
Testset: Correct: 251.0/1193. Cumulative Loss=83.23696422576904
Epoch 1
Processed 32 examples, cumulative batch loss=2.210111141204834
Processed 992 examples, cumulative batch loss=67.65085959434509
Processed 1952 examples, cumulative batch loss=132.92929363250732
Processed 2912 examples, cumulative batch loss=197.7184443473816
Processed 3872 examples, c

In [120]:
epochs = 5 # all you need
lr = 0.01
loss = nn.CrossEntropyLoss()
optim = torch.optim.AdamW(model.parameters(), lr = lr, weight_decay = 0.01)


print(model)
for i in range(epochs):
    print(f"Epoch {i}")
    trainloop(trainLoader, model, loss, optim)
    testloop(valLoader, model, loss)


NeuralNet(
  (model): Sequential(
    (0): Linear(in_features=50, out_features=20, bias=True)
    (1): ReLU()
    (2): Linear(in_features=20, out_features=10, bias=True)
    (3): Sigmoid()
  )
)
Epoch 0
Processed 32 examples, cumulative batch loss=1.5643229484558105
Processed 992 examples, cumulative batch loss=50.34146976470947
Processed 1952 examples, cumulative batch loss=98.86135220527649
Processed 2912 examples, cumulative batch loss=147.35610222816467
Processed 3872 examples, cumulative batch loss=196.09459364414215
Processed 4832 examples, cumulative batch loss=244.6925390958786
Finished epoch, cumulative loss = 282.1976933479309
Testset: Correct: 1002.0/1193. Cumulative Loss=61.0539653301239
Epoch 1
Processed 32 examples, cumulative batch loss=1.5865298509597778
Processed 992 examples, cumulative batch loss=50.668949127197266
Processed 1952 examples, cumulative batch loss=99.00928056240082
Processed 2912 examples, cumulative batch loss=147.39774656295776
Processed 3872 examples

In [341]:
def clean(txt):
    txt = txt.lower()
    ALLOWED = "abcdefghijklmnopqrstuvwxyz "
    cleaned = ""
    for i in range(len(txt)):
        if txt[i] in ALLOWED:
            cleaned += txt[i]
        else:
            cleaned += " "
    return cleaned
STOP_WORDS = ["during", "out", "very", "having", "with", "they", "own", "an",\
              "be", "some", "for", "do", "its", "yours", "such", "into", "of",\
              "most", "itself", "other", "off", "is", "s", "am", "or", "who", "as",\
              "from", "him", "each", "the", "themselves", "until", "below", "are", "we",\
              "these", "your", "his", "through", "don", "nor", "me", "were", "her", "more",\
              "himself", "this", "down", "should", "our", "their", "while", "above", "both",\
              "up", "to", "ours", "had", "she", "all", "no", "when", "at", "any", "before", "them",\
              "same", "and", "been", "have", "in", "will", "on", "does", "yourselves", "then", "that",\
              "because", "what", "over", "why", "so", "can", "did", "not", "now", "under", "he", "you",\
              "herself", "has", "just", "where", "too", "only", "myself", "which", "those", "i", "after",\
              "few", "whom", "t", "being", "if", "theirs", "my", "against", "a", "by", "doing", "it", "how",\
              "further", "was", "here", "than"]



def test(model, embedding, txt):
    x = clean(txt).split()
    averaged_embedding = np.zeros(embedding.SIZE)
    word_count = 0
    for word in x:
        if word.lower() in STOP_WORDS:
            continue
        res = embedding.get_word_embedding(word.lower())
        if res.max() != 0 or res.min() != 0:
            averaged_embedding += res
            word_count += 1
    averaged_embedding /= word_count
    return model(torch.tensor(averaged_embedding).float().unsqueeze(0))[0]

def eval_embeddings(model, embedding, testfunc, outfile="benchmark_results.txt", top=2, idle=False, size=-1):
        file = open("benchmark.txt","r").read().split('\n')
        BENCHMARK_BINARIES = "benchmark_binaries"
        CLASSES = ['History', 'Geography', 'Arts', 'Philosophy_and_religion','Everyday_life',\
                  'Society_and_social_sciences','Biological_and_health_sciences', 'Physical_sciences',\
              'Technology', 'Mathematics']
        try: checkpoint = open(outfile,"r").read()
        except: checkpoint = ""
        cases = 0
        correct = 0
        
        if size == -1:
            size = len(file)
        it = range(size)
        if not idle:
            it = tqdm(it)
        for testId in it:
            try: [ans, url] = file[testId].split()
            except: continue

            txt_file_name = "benchmark/test"+str(testId)+".txt"
            outfile_string = "Test "+str(testId)+": "
            try:
                raw_txt = open(txt_file_name, "r").read()
                if len(raw_txt) <= 10:
                    continue
                res = testfunc(model, embedding, raw_txt)
            except FileNotFoundError:
                continue
            cases += 1
            #print(ans, url)
            res = [float(res[x]) for x in range(10)]
            if cases % 50 == 0 and idle:
                 print(f"Processed {cases} Cases")
            
            correct_label = CLASSES.index(ans)
            # Metrics:
            # We consider a result correct if it exceeds 0 and falls in the top 2 confidences
            # Optional: CHeck for seperation but meh

            # Check exact category score
            sorted_res = sorted(res)
            
            if res[correct_label] >= sorted_res[-top]:
                correct += 1

            checkpoint += outfile_string + " ".join([str(c) for c in res])+"\n"
        print(f"{correct}/{cases} correct")
        output_file = open(outfile,"w")
        output_file.write(checkpoint)
        output_file.close()


In [None]:
eval_embeddings(model, glove50,test, outfile="benchmark_naive_embedding.txt")

In [148]:
from torch.utils.data import Dataset, DataLoader
class LogisticRegression(nn.Module):
    
    def __init__(self):
        super(LogisticRegression, self).__init__()
        self.model = nn.Sequential(
            nn.Linear(50, 10),
            nn.Sigmoid() #as opposed to nn.Softmax()
        )
    
    def forward(self, x):
        return self.model(x)

#simpler model
epochs = 15 
lr = 0.1
modelSimple = LogisticRegression()
loss = nn.CrossEntropyLoss()
optim = torch.optim.AdamW(modelSimple.parameters(), lr = lr, weight_decay = 0.01)


print(modelSimple)
for i in range(epochs):
    print(f"Epoch {i}")
    trainloop(trainLoader, modelSimple, loss, optim)
    testloop(valLoader, modelSimple, loss)

LogisticRegression(
  (model): Sequential(
    (0): Linear(in_features=50, out_features=10, bias=True)
    (1): Sigmoid()
  )
)
Epoch 0
Processed 32 examples, cumulative batch loss=2.309086799621582
Processed 992 examples, cumulative batch loss=64.53204190731049
Processed 1952 examples, cumulative batch loss=120.62792348861694
Processed 2912 examples, cumulative batch loss=174.5756415128708
Processed 3872 examples, cumulative batch loss=227.83879125118256
Processed 4832 examples, cumulative batch loss=280.4299737215042
Finished epoch, cumulative loss = 320.0798736810684
Testset: Correct: 881.0/1193. Cumulative Loss=65.96121454238892
Epoch 1
Processed 32 examples, cumulative batch loss=1.7390564680099487
Processed 992 examples, cumulative batch loss=54.06024503707886
Processed 1952 examples, cumulative batch loss=105.8529691696167
Processed 2912 examples, cumulative batch loss=157.49269759655
Processed 3872 examples, cumulative batch loss=208.76527154445648
Processed 4832 examples, cumu

KeyboardInterrupt: 

In [None]:
# 3 layer network?
from torch.utils.data import Dataset, DataLoader
class DeeperNeuralNet(nn.Module):
    
    def __init__(self):
        super(DeeperNeuralNet, self).__init__()
        self.model = nn.Sequential(
            nn.Linear(50, 20),
            nn.ReLU(),
            nn.Linear(20, 20),
            nn.ReLU(),
            nn.Linear(20, 10),
            nn.Sigmoid() #as opposed to nn.Softmax()
        )
    
    def forward(self, x):
        return self.model(x)

    #simpler model
epochs = 15 
lr = 0.01
modelComplex = DeeperNeuralNet()
loss = nn.CrossEntropyLoss()
optim = torch.optim.AdamW(modelComplex.parameters(), lr = lr, weight_decay = 0.01)


print(modelComplex)
for i in range(epochs):
    print(f"Epoch {i}")
    trainloop(trainLoader, modelComplex, loss, optim)
    testloop(valLoader, modelComplex, loss)

In [342]:
eval_embeddings(modelSimple, glove50, test, outfile="benchmark_naive_embedding_regression2.txt")

  averaged_embedding /= word_count
100%|█████████████████████████████████████████████████████████████████████████████| 3342/3342 [00:32<00:00, 101.84it/s]

1272/1822 correct





In [367]:
def test_sentence_average(model, embedding, txt):
    txt = txt.split(' ') 
    res = torch.zeros(10)
    WORDS_PER_SENTENCE = 40
    pt = 0
    while pt < len(txt):
        sentence = ' '.join(txt[pt:pt+WORDS_PER_SENTENCE])
        if len(sentence) >= 40: 
            tmp = test(model, embedding, sentence)
            if tmp.max() > 0.3:
                order = tmp.argsort()
                res[order[-1]] += 1
        pt += 30
    #print(res)
    return res

eval_embeddings(modelSimple, glove50, test_sentence_average, outfile="benchmark_naive_sentenceaveraged_embedding_regression2.txt")

  averaged_embedding /= word_count
100%|██████████████████████████████████████████████████████████████████████████████| 3342/3342 [01:02<00:00, 53.44it/s]

1365/1822 correct





In [303]:
class NaiveBayesWeighting:
    
    def __init__(self):
        self.FOLDERS = ['History', 'Geography', 'Arts', 'Philosophy_and_religion','Everyday_life',\
                  'Society_and_social_sciences','Biological_and_health_sciences', 'Physical_sciences',\
              'Technology', 'Mathematics']
        self.MAP = {}
        self.CORPUS_FILE = "corpus.txt"
        self.DIM = 0
        self.fileList = []
        for i in range(len(self.FOLDERS)):
            self.MAP[self.FOLDERS[i]] = i
        
        for folder in self.FOLDERS:
            for file in os.listdir(folder):
                self.fileList.append((self.MAP[folder], os.path.join(folder, file)))
        random.shuffle(self.fileList)

        
        k = open(self.CORPUS_FILE,"r")
        
        self.freq = []
        self.word_list = {} #word to id
        while True:
            line = k.readline().split()
            if len(line) == 2:
                self.freq.append(int(line[1]))
                self.word_list[line[0]]=self.DIM
                self.DIM += 1
            else:
                break
        
        self.total_length = 0
        self.length_counts = [0]*len(self.FOLDERS)
        self.positive = [[0]*len(self.FOLDERS) for i in range(self.DIM)]
        self.stemmer = PorterStemmer()
        for file in tqdm(self.fileList):
            k = open(file[1], "r").read().split('\n')
            for line in k:
                if len(line) <= 2:
                    break
                [wordId, num] = [int(c) for c in line.split()]
                self.length_counts[file[0]] += num
                self.positive[wordId][file[0]] += num
                self.total_length += num
        
    def id_to_weight(self, wid, classId):
        #laplacian smoothing
        negative_len = self.total_length - self.length_counts[classId] + self.DIM
        positive_len = self.length_counts[classId] + self.DIM
        positive_freq = self.positive[wid][classId]
        negative_freq = self.freq[wid]-positive_freq
        positive_freq +=1 
        negative_freq +=1 #laplace smooth
        return np.log(positive_freq/positive_len)-np.log(negative_freq/negative_len)
    
    def word_to_weight(self, word, classId):
        stemmed = self.stemmer.stem(word.lower())
        id = self.word_list.get(stemmed,-1)
        if id != -1:
            return self.id_to_weight(id, classId)
        else:
            negative_len = self.total_length - self.length_counts[classId] + self.DIM
            positive_len = self.length_counts[classId] + self.DIM
            return np.log(negative_len)-np.log(positive_len) # both freqs are 0
weightScheme = NaiveBayesWeighting()

100%|█████████████████████████████████████████████████████████████████████████████| 7953/7953 [00:24<00:00, 324.76it/s]


In [268]:
print(weightScheme.DIM, weightScheme.total_length, weightScheme.length_counts)
weightScheme.word_to_weight("QaQ",1)
weightScheme.id_to_weight(15197,0)
weightScheme.positive[15197][0]


113735 41989465 [5634288, 8313612, 3441379, 2729219, 2249242, 4962694, 6063669, 4226604, 3357972, 1010786]


11

In [282]:
class WeightedEmbedDataset:
    def __init__(self, setting, embedding, class_id, SAMPLE_FACTOR=3):
        self.FOLDERS = ['History', 'Geography', 'Arts', 'Philosophy_and_religion','Everyday_life',\
                  'Society_and_social_sciences','Biological_and_health_sciences', 'Physical_sciences',\
              'Technology', 'Mathematics']
        self.MAP = {}
        self.CORPUS_FILE = "corpus.txt"
        self.SPLIT = np.array([0, 0.85, 1])
        self.DIM = 0
        self.fileList = []
        self.setting = setting # one of 0,1,2 for train, test val
        self.class_id = class_id
        for i in range(len(self.FOLDERS)):
            self.MAP[self.FOLDERS[i]] = i
        
        #"""
        num_cases = SAMPLE_FACTOR*len(os.listdir(self.FOLDERS[class_id]))
        #negative examples
        for folder in self.FOLDERS:
            if self.MAP[folder] != class_id:
                for file in os.listdir(folder):
                    self.fileList.append((self.MAP[folder], os.path.join(folder, file)))
        
        random.shuffle(self.fileList)
        self.fileList = self.fileList[:num_cases]
        
        #positive examples
        for file in os.listdir(self.FOLDERS[class_id]):
            for i in range(SAMPLE_FACTOR):
                self.fileList.append((class_id, os.path.join(self.FOLDERS[class_id], file)))
            
        random.shuffle(self.fileList)
        #""" 
        
        # load all 
        #for folder in self.FOLDERS:
        #    for file in os.listdir(folder):
        #        self.fileList.append((self.MAP[folder], os.path.join(folder, file)))
        #        
        self.SPLIT = [int(c) for c in self.SPLIT * len(self.fileList)]
        
        k = open(self.CORPUS_FILE,"r")
        while True:
            line = k.readline()
            if len(line.split()) == 2:
                self.DIM += 1
            else:
                break

        #embedding code
        self.embedding = embedding
    def __len__(self):
        return self.SPLIT[self.setting+1] - self.SPLIT[self.setting]
    
    def __getitem__(self, idx):
        # we use log bag of words due to zipfian nature
        # reads: bag of words file
        idx += self.SPLIT[self.setting]
        k = open(self.fileList[idx][1], "r")
        bag = np.zeros(self.embedding.SIZE)
        wordcount = 0
        #print(self.fileList[idx][1])
        while True:
            l = k.readline().split()
            if len(l) <= 1:
                break
            l = [int(c) for c in l]
            res = self.embedding.get_id_embedding(l[0])
            if res.max() != 0 or res.min() != 0:
                wt = weightScheme.id_to_weight(l[0], self.class_id)
                wordcount += l[1]
                bag += l[1]*res*wt
        #print(bag, wordcount)
        return torch.tensor(bag/wordcount).float(), int(self.fileList[idx][0] == self.class_id)
WEset = WeightedEmbedDataset(0, glove50, i)
WEset.__getitem__(1)
WEset.__len__()

3468

In [279]:
class BinaryClassifier(nn.Module):
    
    def __init__(self):
        super(BinaryClassifier, self).__init__()
        self.model = nn.Sequential(
            nn.Linear(50, 2),
            nn.Sigmoid()
        )
        
    def forward(self, x):
        return self.model(x)
    
nbmodels = [BinaryClassifier() for i in range(10)]
# Actual Training
for i in range(1,10):
    epochs = 10 # all you need
    lr = 0.003 #can probs do 0.1
    loss = nn.CrossEntropyLoss()
    nbmodels[i].to(device)
    optim = torch.optim.AdamW(nbmodels[i].parameters(), lr = lr, weight_decay = 0.01)
    WEtrainLoader = DataLoader(WeightedEmbedDataset(0, glove50, i), batch_size=32, shuffle=True)
    WEtestLoader = DataLoader(WeightedEmbedDataset(1, glove50, i), batch_size=32, shuffle=True)
    
    for j in range(epochs):
        print(f"Epoch {j}")
        trainloop(WEtrainLoader, nbmodels[i], loss, optim)
        testloop(WEtestLoader, nbmodels[i], loss)

    

Epoch 0
Processed 32 examples, cumulative batch loss=0.6812331080436707
Processed 992 examples, cumulative batch loss=20.000382125377655
Processed 1952 examples, cumulative batch loss=37.59262067079544
Finished epoch, cumulative loss = 53.48525208234787
Testset: Correct: 551.0/612. Cumulative Loss=10.699474334716797
Epoch 0
Processed 32 examples, cumulative batch loss=0.5099274516105652
Processed 992 examples, cumulative batch loss=16.0888774394989
Processed 1952 examples, cumulative batch loss=31.079753816127777
Finished epoch, cumulative loss = 45.06666058301926
Testset: Correct: 551.0/612. Cumulative Loss=9.498610854148865
Epoch 0
Processed 32 examples, cumulative batch loss=0.46167173981666565
Processed 992 examples, cumulative batch loss=14.680695444345474
Processed 1952 examples, cumulative batch loss=28.56091609597206
Finished epoch, cumulative loss = 41.69656214118004
Testset: Correct: 553.0/612. Cumulative Loss=9.10963624715805
Epoch 0
Processed 32 examples, cumulative batch l

KeyboardInterrupt: 

In [285]:
nbmodels2 = [BinaryClassifier() for i in range(10)]
# Actual Training
for i in range(10):
    print('Training classifier', i)
    epochs = 10 # all you need
    lr = 0.01 #can probs do 0.1
    loss = nn.CrossEntropyLoss()
    nbmodels2[i].to(device)
    optim = torch.optim.AdamW(nbmodels2[i].parameters(), lr = lr, weight_decay = 0.01)
    WEtrainLoader = DataLoader(WeightedEmbedDataset(0, glove50, i, 1), batch_size=32, shuffle=True)
    WEtestLoader = DataLoader(WeightedEmbedDataset(1, glove50, i, 1), batch_size=32, shuffle=True)
    
    for j in range(epochs):
        print(f"Epoch {j}")
        trainloop(WEtrainLoader, nbmodels2[i], loss, optim)
        testloop(WEtestLoader, nbmodels2[i], loss)

Training classifier 0
Epoch 0
Processed 32 examples, cumulative batch loss=0.688480019569397
Processed 992 examples, cumulative batch loss=18.32465636730194
Finished epoch, cumulative loss = 21.358395010232925
Testset: Correct: 187.0/204. Cumulative Loss=3.4554097950458527
Epoch 1
Processed 32 examples, cumulative batch loss=0.49270662665367126
Processed 992 examples, cumulative batch loss=14.83844980597496
Finished epoch, cumulative loss = 17.593986302614212
Testset: Correct: 187.0/204. Cumulative Loss=3.0799742937088013
Epoch 2
Processed 32 examples, cumulative batch loss=0.4957418441772461
Processed 992 examples, cumulative batch loss=13.888905942440033
Finished epoch, cumulative loss = 16.494646579027176
Testset: Correct: 187.0/204. Cumulative Loss=3.0056861639022827
Epoch 3
Processed 32 examples, cumulative batch loss=0.3954540193080902
Processed 992 examples, cumulative batch loss=13.270605951547623
Finished epoch, cumulative loss = 15.983509451150894
Testset: Correct: 186.0/204.

In [286]:
# store weights

for i in range(10):
    torch.save(nbmodels2[i], 'linear_model'+str(i)+"weights.pth")
    

In [None]:
def test_nb_average(models, embedding, txt):
    txt = txt.split() 
    res = np.zeros(10)
    num_words = 0
    for i in range(10):
        embed = np.zeros(embedding.SIZE)
        for word in txt:
            this_embedding = embedding.get_word_embedding(word.lower())
            if this_embedding.max() != 0 or this_embedding.min() != 0:
                embed += weightScheme.word_to_weight(word.lower(), i)*this_embedding
                num_words += 1
        embed /= num_words
        tmp = models[i](torch.tensor(embed).unsqueeze(0).float())[0]
        print(tmp)
        res[i] = models[i](torch.tensor(embed).unsqueeze(0).float())[0][1].item()

    #print(res)
    return res

eval_embeddings(nbmodels2, glove50, test_nb_average, outfile="benchmark_nb_averaged_embedding_regression.txt", size=200)

In [313]:
torch.save(modelSimple, 'naiveEmbeddingregression.pth')

In [332]:
a,b = WeightedEmbedDataset(1, glove50, i, 1).__getitem__(12)
print(nbmodels[1](a.unsqueeze(0)),b)

tensor([[0.0056, 0.9937]], grad_fn=<SigmoidBackward>) 1


In [337]:
# Finalise Data
# In the interests of time SVMs will not be tested on embeddings. Based on the same amount of training data
# Linear regression performed worse than SVMs thus we conclude that embeddings are not as good 
# It was hoped that the massive corpus used to train embeddings would counteract its limited
# Dimensionality. This proved to be not the case
WikiTestset = WikiDataset(2)
Xall = Xtrain[:]
yall = ytrain[:]
Xall.extend(Xval)
yall.extend(yval)
for i in tqdm(range(WikiTestset.__len__())):
    #if i % 50 == 0:
    #    print(f"Loaded {i} Test Examples")
    a,b = ValSet.__getitem__(i)
    Xall.append(a)
    yall.append(b)

100%|██████████████████████████████████████████████████████████████████████████████| 1193/1193 [00:32<00:00, 36.33it/s]


In [336]:
#reset xtrain
Xtrain = Xtrain[:5567]
ytrain = ytrain[:5567]

In [339]:
class SVM_tester4(CategoriserBase):
    def __init__(self):
        super(SVM_tester4, self).__init__()
    
    def precomp(self):
        # override
        print('training!')
        self.model = svm.LinearSVC(C=30, max_iter=2000)
        self.model.fit(Xall, yall)
        print('Finished training')
    def predict(self, bag):
        return self.model.decision_function([bag])[0]
    
SVM_instance4 = SVM_tester4()
SVM_instance4.eval()

training!
Finished training


  return bag/bag.sum() # normalise
  return bag/bag.sum() # normalise
100%|██████████████████████████████████████████████████████████████████████████████| 3342/3342 [01:23<00:00, 39.91it/s]

902/1820 correct





In [340]:
dump(SVM_instance4.model, 'SVMimproved.joblib') 

['SVMimproved.joblib']

In [353]:
for i in range(10):
     print(weightScheme.word_to_weight('wikipedia', i),weightScheme.word_to_weight('a', i),weightScheme.word_to_weight('an', i))

-1.1997361311767847 1.847604202035786 1.847604202035786
0.04260568520389185 1.388670694584281 1.388670694584281
-0.4690109366774884 2.3864654453680583 2.3864654453680583
-0.9113165433487858 2.6282615649993293 2.6282615649993293
-0.22982753297656444 2.825299457995655 2.825299457995655
1.6869260211468369 1.9901001183093374 1.9901001183093374
-0.8357298906760633 1.7637183008441486 1.7637183008441486
-0.6072786313466043 2.1663809458946712 2.1663809458946712
-0.4496622068992302 2.412361195992677 2.412361195992677
-0.4921296856695889 3.5984663630475193 3.5984663630475193


In [None]:
def test_nb_filtering(models, embedding, txt):
    txt = txt.split() 
    res = np.zeros(10)
    num_words = 0
    for i in range(10):
        embed = np.zeros(embedding.SIZE)
        for word in txt:
            if word in STOP_WORDS:
                continue
            this_embedding = embedding.get_word_embedding(word.lower())
            
            if this_embedding.max() != 0 or this_embedding.min() != 0:
                embed += weightScheme.word_to_weight(word.lower(), i)*this_embedding
                num_words += 1
        embed /= num_words
        tmp = models[i](torch.tensor(embed).unsqueeze(0).float())[0]
        #print(tmp)
        res[i] = models[i](torch.tensor(embed).unsqueeze(0).float())[0][1].item()

    #print(res)
    return res

eval_embeddings(nbmodels2, glove50, test_nb_average, outfile="benchmark_nb_averaged_embedding_regression.txt", size=200)

In [366]:
def test_sentence_svm(model, embedding, txt):
    txt = txt.split(' ') 
    res = torch.zeros(10)
    WORDS_PER_SENTENCE = 40
    pt = 0
    while pt < len(txt):
        sentence = ' '.join(txt[pt:pt+WORDS_PER_SENTENCE])
        if len(sentence) >= 40: 
            #print(sentence)
            bag = SVM_instance4.preprocess(sentence)
            s = bag.sum()
            if bag.sum() != 0:
                tmp = SVM_instance4.predict(SVM_instance4.preprocess(sentence)/s)
                if tmp.max() > -0.75:
                    res[tmp.argmax()] += 1
        pt += int(WORDS_PER_SENTENCE*0.75)
    #print(res)
    return res

eval_embeddings(SVM_instance4.model, glove50, test_sentence_svm, outfile="benchmark_smv_sentence.txt")

100%|██████████████████████████████████████████████████████████████████████████████| 3342/3342 [50:46<00:00,  1.10it/s]

1378/1822 correct



