In [None]:
#This probably won't run in Colab, so don't call this function there.
#It was run on Windows 11 with a CUDA-capable GPU.

def BERT_gpu_train():
  """Fine-tune a BERT sentiment classifier on IMDB, evaluate it, then open a shell.

  Steps, in order:
    1. load the "imdb" dataset through ``datasets`` and tokenise it with the
       ``bert-base-uncased`` word-piece tokenizer (sequences padded/truncated
       to 512 ids);
    2. fine-tune ``bert-base-uncased`` plus one sigmoid output unit for
       ``EPOCHS`` epochs with Adam (lr=3e-6), BCE loss and gradient clipping;
    3. print a scikit-learn classification report for the test split;
    4. start an interactive console whose namespace holds every global and
       local name of this function (model, tokenizer, tensors, predictions).

  The model and every batch are moved to the "cuda" device, so a CUDA-capable
  GPU is required. Returns None.
  """
  #This code is modified from the tutorial included in the miniproject-3 handout to fetch attention matrices from the trained model

  #ON WINDOWS 11 must set LongPathsEnabled=1:
  #New-ItemProperty -Path "HKLM:\SYSTEM\CurrentControlSet\Control\FileSystem" -Name "LongPathsEnabled" -Value 1 -PropertyType DWORD -Force

  #this helps reduce memory pressure on the GPU
  import os
  os.environ["PYTORCH_CUDA_ALLOC_CONF"]="max_split_size_mb:512"

  #import stuff for the pretrained BERT model
  #(unused modules are kept on purpose: the interactive shell at the end exposes all locals)
  import sys
  import numpy as np
  import random
  import pandas as pd
  import torch
  #cannot use pytorch_pretrained_bert to output attention for some reason
  from transformers import BertModel
  from torch import nn
  import datasets
  from pytorch_pretrained_bert import BertTokenizer
  from keras.utils import pad_sequences
  from torch.utils.data import TensorDataset, DataLoader, RandomSampler, SequentialSampler
  from torch.optim import Adam
  from torch.nn.utils import clip_grad_norm_
  #function to report precision,accuracy,recall...
  from sklearn.metrics import classification_report


  print("defining model...")

  #The pretrained BERT model with one fully connected layer appended to do classification
  #extend the nn.Module class so we can add the sigmoid FC layer
  class BertBinaryClassifier(nn.Module):
      def __init__(self, dropout=0.1):
          super(BertBinaryClassifier, self).__init__()
          self.bert = BertModel.from_pretrained('bert-base-uncased')
          self.dropout=nn.Dropout(dropout)
          self.linear = nn.Linear(768,1)
          self.sigmoid=nn.Sigmoid()

      def forward(self,tokens,masks=None):
          """Return the sigmoid probability computed from BERT's pooled output."""
          out = self.bert(tokens, attention_mask=masks)
          pooled_output = out.pooler_output
          dropout_output=self.dropout(pooled_output)
          linear_output=self.linear(dropout_output)
          prob = self.sigmoid(linear_output)
          return prob

      #forward but output attention instead of the probability
      def forward_atten(self,tokens):
          """Return the attentions reported by BERT (no attention mask is passed)."""
          out = self.bert(tokens, attention_mask=None,output_attentions=True)
          attentions = out.attentions
          return attentions

  print("setup gpu and prepare model...")

  #use the GPU
  device = torch.device("cuda")

  #init the model and send it to the GPU
  bert_clf = BertBinaryClassifier()
  bert_clf = bert_clf.to(device)

  print("loading data...")

  #load the IMDB movie reviews dataset
  imdb = datasets.load_dataset("imdb")

  #process the imdb data according to the needs of the bert model
  tokenizer = BertTokenizer.from_pretrained('bert-base-uncased',do_lower_case=True)

  def to_bert_tokens(texts):
      #[CLS] + at most 510 word pieces + [SEP], so a sequence never exceeds the maxlen of 512 used below
      return [['[CLS]']+tokenizer.tokenize(t)[:510]+['[SEP]'] for t in texts]

  def to_padded_ids(token_lists):
      #convert word pieces to vocabulary ids and pad/truncate at the end to exactly 512 ids
      ids = [tokenizer.convert_tokens_to_ids(ts) for ts in token_lists]
      return pad_sequences(ids,maxlen=512,truncating="post",padding="post",dtype="int")

  train_tokens = to_bert_tokens(imdb["train"]["text"])
  test_tokens = to_bert_tokens(imdb["test"]["text"])
  train_token_ids = to_padded_ids(train_tokens)
  test_token_ids = to_padded_ids(test_tokens)
  train_y = np.array(imdb["train"]["label"])
  test_y = np.array(imdb["test"]["label"])
  test_orig_x = np.array(imdb["test"]["text"])

  #attention masks: 1.0 where the id is > 0, 0.0 elsewhere (0 is the padding value),
  #so BERT does not attend to the padding positions.
  #computed as one vectorised comparison instead of a Python loop over every id
  train_masks = (train_token_ids > 0).astype("float32")
  test_masks = (test_token_ids > 0).astype("float32")

  #hyperparamters
  BATCH_SIZE = 3
  EPOCHS = 2

  print("preparing dataset...")

  #make our data into tensors (labels become float column vectors to match the model output)
  train_tokens_tensor = torch.tensor(train_token_ids)
  train_y_tensor = torch.tensor(train_y.reshape(-1,1)).float()
  test_tokens_tensor = torch.tensor(test_token_ids)
  test_y_tensor = torch.tensor(test_y.reshape(-1,1)).float()
  train_masks_tensor = torch.tensor(train_masks)
  test_masks_tensor = torch.tensor(test_masks)

  #wrap datasets with dataloaders (training is shuffled, testing keeps dataset order
  #so the predictions line up with test_y)
  train_dataset = TensorDataset(train_tokens_tensor, train_masks_tensor, train_y_tensor)
  train_sampler = RandomSampler(train_dataset)
  train_dataloader = DataLoader(train_dataset, sampler=train_sampler, batch_size=BATCH_SIZE)
  test_dataset = TensorDataset(test_tokens_tensor, test_masks_tensor, test_y_tensor)
  test_sampler = SequentialSampler(test_dataset)
  test_dataloader = DataLoader(test_dataset, sampler=test_sampler, batch_size=BATCH_SIZE)

  #init an optimizer over ALL parameters of the classifier (Adam w/ alpha=3e-06)
  optimizer = Adam(bert_clf.parameters(), lr=3e-6)

  #binary cross entropy on the sigmoid output; stateless, so one instance serves every batch
  loss_func = nn.BCELoss()

  #empty the GPU cache
  torch.cuda.empty_cache()

  #keep track of memory usage on GPU
  print(str(torch.cuda.memory_allocated(device)/1000000)+"M")

  print("train")
  #train the model
  for epoch_num in range(EPOCHS):
      #put model into training mode
      bert_clf.train()
      train_loss = 0
      for step_num, batch_data in enumerate(train_dataloader):
          token_ids, masks, labels = tuple(t.to(device) for t in batch_data)
          print(str(torch.cuda.memory_allocated(device)/1000000)+"M")
          logits = bert_clf(token_ids, masks)
          batch_loss = loss_func(logits, labels)
          train_loss += batch_loss.item()
          bert_clf.zero_grad()
          batch_loss.backward()
          clip_grad_norm_(parameters=bert_clf.parameters(),max_norm=1.0)
          optimizer.step()
          print("Epoch: ",epoch_num+1)
          #running mean of the loss over the batches seen so far in this epoch
          print("\r"+"{0}/{1} loss: {2}".format(step_num, len(train_dataset) // BATCH_SIZE, train_loss / (step_num + 1)))

  print("done training.")

  #evaluate the model
  bert_clf.eval()
  bert_predicted = []
  all_logits = []
  with torch.no_grad():
      for step_num, batch_data in enumerate(test_dataloader):
          token_ids, masks, labels = tuple(t.to(device) for t in batch_data)
          logits = bert_clf(token_ids, masks)
          numpy_logits = logits.cpu().detach().numpy()
          #the model output is a probability: predict the positive class above 0.5
          bert_predicted += list(numpy_logits[:,0] > 0.5)
          all_logits += list(numpy_logits[:,0])
          #progress is measured against the TEST set size
          print("eval batch",step_num, "/",len(test_dataset)//BATCH_SIZE)
      print(classification_report(test_y,bert_predicted))



  #get attention example (paste into the shell below)
  #instance_loader = DataLoader(test_dataset,sampler=test_sampler,batch_size=1)
  #D = [i for i in enumerate(instance_loader)]
  #token_ids, masks, labels = tuple(t.to(device) for t in D[0][1])
  #attn = bert_clf.forward_atten(token_ids)
  #text = imdb["test"]["text"][0]
  #tokens = test_tokens[0]

  #jump into interactive shell to play with attention matrices and trained model
  import code
  v = globals().copy()
  v.update(locals())
  shell = code.InteractiveConsole(v)
  shell.interact()

  #               precision    recall  f1-score   support

  #            0       0.96      0.90      0.93     12500
  #            1       0.90      0.96      0.93     12500

  #     accuracy                           0.93     25000
  #    macro avg       0.93      0.93      0.93     25000
  # weighted avg       0.93      0.93      0.93     25000

  #interesting:
  #1289 false positives
  #494 false negatives

  #0 true negative logit=0.0009 label=1
  #12518 false negative logit = 0.0010 label=1

  #longer training run:
  #Output of run
  #              precision    recall  f1-score   support
  #
  #            0       0.95      0.93      0.94     12500
  #            1       0.93      0.95      0.94     12500
  #
  #     accuracy                           0.94     25000
  #    macro avg       0.94      0.94      0.94     25000
  # weighted avg       0.94      0.94      0.94     25000


In [None]:
#NAIVE BAYES MODEL

In [None]:
!unzip train.zip -d "train/" #run this to unzip train

In [None]:
!unzip test.zip -d "test/" #run this to unzip test

In [None]:
import math
from os import listdir

class IMDBProcessor():
    """Tokenise the IMDB review folders and count n-gram frequencies.

    On construction the processor reads ``train/neg``, ``train/pos``,
    ``test/neg`` and ``test/pos`` (relative to the working directory), turns
    every ``.txt`` review into a list of n-gram tokens and builds frequency
    tables from the TRAINING reviews only.

    Parameters:
        verbose: 0 silences output, 1 prints stage messages, >1 also prints
            a per-file progress line.
        prefix: if set (and ``suffix`` is not), keep only the first ``prefix``
            space-separated words of each review.
        suffix: if set, keep only the last ``suffix`` words; takes precedence
            over ``prefix``.
        word_freq_lower_limit: tokens seen fewer times than this in the
            training data are removed from all vocabularies.
        word_len_lower_limit: raw words shorter than this are dropped. The
            length is checked BEFORE ``word_filter`` strips characters, so a
            kept word can still shrink (even to the empty string).
        word_filter: per-character predicate; characters it rejects are
            removed from each word.
        ngrams: iterable of n-gram sizes to emit; defaults to ``[1]``.

    Attributes:
        vocab / pos_vocab / neg_vocab: token -> float count (overall, in
            positive training reviews, in negative training reviews). All
            three share the same key set.
        pos_review_tokens / neg_review_tokens: token lists of the training
            reviews.
        pos_review_tokens_test / neg_review_tokens_test: token lists of the
            test reviews.
        pos_review_strings / neg_review_strings: raw text of the TEST
            reviews, in the same order as the ``*_tokens_test`` lists.
    """

    def __init__(self,verbose=1,prefix=None,suffix=None,word_freq_lower_limit=5,word_len_lower_limit=4,word_filter=str.isalpha,ngrams=None):
        self.suffix=suffix
        self.prefix=prefix
        self.verbose = verbose
        if self.verbose > 0:
            print("processing...")
        # None sentinel instead of a shared mutable default; copy so later
        # changes to the caller's list cannot alter this processor.
        self.ngrams = [1] if ngrams is None else list(ngrams)
        self.word_filter = word_filter
        self.word_len_lower_limit = word_len_lower_limit
        neg_directory = 'train/neg'
        pos_directory = 'train/pos'
        neg_directory_test = 'test/neg'
        pos_directory_test = 'test/pos'
        self.vocab = dict()
        self.pos_vocab = dict()
        self.neg_vocab = dict()
        self.neg_review_tokens = self._process_docs(neg_directory)
        self.pos_review_tokens = self._process_docs(pos_directory)
        self.neg_review_tokens_test = self._process_docs(neg_directory_test)
        self.pos_review_tokens_test = self._process_docs(pos_directory_test)
        self.pos_review_strings = self._imdb_strings(pos_directory_test)
        self.neg_review_strings = self._imdb_strings(neg_directory_test)
        if self.verbose > 0:
            print("building vocab...")
        # Count every token per class and overall (float counts).
        per_class = ((self.neg_vocab, self.neg_review_tokens),
                     (self.pos_vocab, self.pos_review_tokens))
        for class_vocab, reviews in per_class:
            for review in reviews:
                for token in review:
                    class_vocab[token] = class_vocab.get(token, 0.0) + 1.0
                    self.vocab[token] = self.vocab.get(token, 0.0) + 1.0
        # Every known token gets an entry in both class tables (0.0 if unseen
        # in that class) so the three dicts always share one key set.
        for token in self.vocab:
            self.pos_vocab.setdefault(token, 0.0)
            self.neg_vocab.setdefault(token, 0.0)
        # Drop rare tokens; collect first because a dict cannot shrink while
        # it is being iterated.
        rare = [token for token, count in self.vocab.items() if count < word_freq_lower_limit]
        for token in rare:
            del self.vocab[token]
            del self.pos_vocab[token]
            del self.neg_vocab[token]

    def _ngrams(self, n, tokens):
        """Return the space-joined n-grams of ``tokens`` (empty if n > len(tokens))."""
        # zip stops at the shortest of the n shifted views, which yields
        # exactly the len(tokens)-n+1 sliding windows.
        return [' '.join(window) for window in zip(*(tokens[shift:] for shift in range(n)))]

    def _load_doc(self,filename):
        """Return the text of ``filename``.

        The file is decoded as UTF-8 regardless of the platform default;
        undecodable bytes become U+FFFD instead of raising.
        """
        with open(filename, 'r', encoding='utf-8', errors='replace') as handle:
            return handle.read()

    def _review_files(self, directory):
        """Return the ``.txt`` file names of ``directory`` in sorted order.

        ``listdir`` order is unspecified; sorting keeps the token lists and
        raw strings of one directory aligned index-by-index, also across
        several processor instances.
        """
        return [name for name in sorted(listdir(directory)) if name.endswith(".txt")]

    def _tokenize(self, text):
        """Turn raw review text into this processor's n-gram token list."""
        words = text.split(" ")
        if self.suffix is not None:
            words = words[-self.suffix:]
        elif self.prefix is not None:
            words = words[:self.prefix]
        cleaned = [''.join(filter(self.word_filter, word)).lower()
                   for word in words if len(word)>=self.word_len_lower_limit]
        ngram_tokens = []
        for n in self.ngrams:
            ngram_tokens.extend(self._ngrams(n, cleaned))
        return ngram_tokens

    def _process_docs(self,directory):
        """Return one token list per ``.txt`` review in ``directory``."""
        files = self._review_files(directory)
        total = len(files)
        tokens = []
        for position, filename in enumerate(files, start=1):
            if self.verbose > 1:
                print("file",position, "/",total)
            tokens.append(self._tokenize(self._load_doc(directory + '/' + filename)))
        return tokens

    def _imdb_strings(self, directory):
        """Return the raw text of every ``.txt`` review in ``directory``."""
        return [self._load_doc(directory + '/' + filename)
                for filename in self._review_files(directory)]

    def _process_string(self,string):
        """Tokenise one review string exactly like the reviews read from disk.

        Uses the configured ``word_filter`` and prefix/suffix window, so the
        tokens match the vocabulary this processor was built with.
        """
        return self._tokenize(string)
                
class NaiveBayesBagged():
    """Ensemble of naive-Bayes style scorers, one per processor.

    Every processor supplies its own tokenisation of the same reviews and its
    own vocabularies. For each processor a pair of per-token weights is kept
    in ``self.models[i]["theta_pos"]`` / ``["theta_neg"]``:
    class count / total count of the token, or (class count + 1) /
    (total count + 2) when ``laplace`` is true. A review's class scores are
    prior * product of the weights of its in-vocabulary tokens; they are
    normalised per processor and then averaged over the processors.

    Parameters:
        processors: non-empty sequence of processor objects exposing
            ``vocab``, ``pos_vocab``, ``neg_vocab``, ``pos_review_tokens``,
            ``neg_review_tokens``, ``pos_review_tokens_test`` and
            ``neg_review_tokens_test``. Test reviews are matched across
            processors by list index.
        verbose: values > 0 print progress messages.
        laplace: enable the add-one smoothing described above.

    The class priors are taken from the training review counts of
    ``processors[0]``.
    """

    def __init__(self, processors, verbose=1,laplace=False):
        self.verbose = verbose
        self.processors = processors
        n_pos = len(processors[0].pos_review_tokens)
        n_neg = len(processors[0].neg_review_tokens)
        self.prior_pos = n_pos/(n_neg+n_pos)
        self.prior_neg = 1-self.prior_pos
        self.models = dict()
        for ind,processor in enumerate(processors):
            if verbose > 0:
                print("building model... "+str(ind))
            self.models[ind] = {
                "theta_pos": self._estimate(processor.vocab, processor.pos_vocab, laplace),
                "theta_neg": self._estimate(processor.vocab, processor.neg_vocab, laplace),
            }

    @staticmethod
    def _estimate(vocab, class_counts, laplace):
        """Return token -> weight for one class of one processor."""
        theta = dict.fromkeys(vocab, 0.0)
        theta.update(class_counts)
        for word, total in vocab.items():
            if laplace:
                theta[word] = (theta[word] + 1) / (total + 2)
            else:
                theta[word] = theta[word] / total
        return theta

    @staticmethod
    def _log(p):
        """Natural log that maps 0 to -inf instead of raising."""
        return math.log(p) if p > 0 else float("-inf")

    def _member_posterior(self, ind, processor, tokens):
        """Return the normalised (pos, neg) scores of one processor for one review.

        The product of weights is accumulated as a sum of logs: multiplying
        hundreds of values below 1 underflows to 0.0 for long reviews, which
        made both classes tie at zero. Returns (0.0, 0.0) only when both raw
        scores are exactly zero.
        """
        theta_pos = self.models[ind]["theta_pos"]
        theta_neg = self.models[ind]["theta_neg"]
        vocab = processor.vocab
        log_pos = self._log(self.prior_pos)
        log_neg = self._log(self.prior_neg)
        for t in tokens:
            if t in vocab:
                log_pos += self._log(theta_pos[t])
                log_neg += self._log(theta_neg[t])
        peak = max(log_pos, log_neg)
        if peak == float("-inf"):
            return 0.0, 0.0
        # Shift by the larger log score so the larger weight is exp(0) == 1.
        weight_pos = math.exp(log_pos - peak)
        weight_neg = math.exp(log_neg - peak)
        norm = weight_pos + weight_neg
        return weight_pos / norm, weight_neg / norm

    def _count_correct(self, positive):
        """Return (number classified correctly, number of reviews) for one test class.

        Ties between the averaged scores count as correct for either class,
        as in the original implementation.
        """
        split = "pos_review_tokens_test" if positive else "neg_review_tokens_test"
        n_reviews = len(getattr(self.processors[0], split))
        n_models = len(self.processors)
        correct = 0.0
        for review_ind in range(n_reviews):
            # Fresh accumulators for every review.
            av_prob_pos = 0.0
            av_prob_neg = 0.0
            for ind, processor in enumerate(self.processors):
                tokens = getattr(processor, split)[review_ind]
                prob_pos, prob_neg = self._member_posterior(ind, processor, tokens)
                av_prob_pos += prob_pos
                av_prob_neg += prob_neg
            av_prob_pos /= n_models
            av_prob_neg /= n_models
            if positive:
                hit = av_prob_pos >= av_prob_neg
            else:
                hit = av_prob_pos <= av_prob_neg
            if hit:
                correct += 1
        return correct, n_reviews

    @staticmethod
    def _ratio(correct, total):
        """Return correct/total, or 0.0 when there are no reviews."""
        return correct / total if total else 0.0

    def test_acc_pos(self):
        """Return the fraction of positive TEST reviews classified as positive."""
        if self.verbose > 0:
            print("testing...")
        return self._ratio(*self._count_correct(True))

    def test_acc_neg(self):
        """Return the fraction of negative TEST reviews classified as negative."""
        if self.verbose > 0:
            print("testing...")
        return self._ratio(*self._count_correct(False))

    def test_acc(self):
        """Return the accuracy over all positive and negative TEST reviews."""
        if self.verbose > 0:
            print("testing...")
        pos_correct, pos_total = self._count_correct(True)
        neg_correct, neg_total = self._count_correct(False)
        return self._ratio(pos_correct + neg_correct, pos_total + neg_total)

class NaiveBayes():
    """Naive-Bayes style sentiment scorer built on one processor's vocabularies.

    ``theta_pos[word]`` / ``theta_neg[word]`` hold class count / total count
    of the word, or (class count + 1) / (total count + 2) when ``laplace`` is
    true. A review's class scores are prior * product of the weights of its
    in-vocabulary tokens.

    Parameters:
        processor: object exposing ``vocab``, ``pos_vocab``, ``neg_vocab``,
            ``pos_review_tokens``, ``neg_review_tokens``, the ``*_tokens_test``
            and ``*_review_strings`` lists and ``_process_string``.
        verbose: values > 0 print progress messages.
        laplace: enable the add-one smoothing described above.
    """

    def __init__(self, processor, verbose=1,laplace=False):
        self.verbose = verbose
        self.processor = processor
        n_pos = len(processor.pos_review_tokens)
        n_neg = len(processor.neg_review_tokens)
        self.prior_pos = n_pos/(n_neg+n_pos)
        self.prior_neg = 1-self.prior_pos
        if verbose > 0:
            print("building model...")
        self.theta_pos = self._estimate(processor.vocab, processor.pos_vocab, laplace)
        self.theta_neg = self._estimate(processor.vocab, processor.neg_vocab, laplace)

    @staticmethod
    def _estimate(vocab, class_counts, laplace):
        """Return word -> weight for one class."""
        theta = dict.fromkeys(vocab, 0.0)
        theta.update(class_counts)
        for word, total in vocab.items():
            if laplace:
                theta[word] = (theta[word] + 1) / (total + 2)
            else:
                theta[word] = theta[word] / total
        return theta

    @staticmethod
    def _log(p):
        """Natural log that maps 0 to -inf instead of raising."""
        return math.log(p) if p > 0 else float("-inf")

    def _log_scores(self, tokens):
        """Return (log positive score, log negative score) for a token list.

        Sums of logs replace the product of weights, which underflows to 0.0
        for long reviews and made both classes indistinguishable.
        """
        vocab = self.processor.vocab
        log_pos = self._log(self.prior_pos)
        log_neg = self._log(self.prior_neg)
        for t in tokens:
            if t in vocab:
                log_pos += self._log(self.theta_pos[t])
                log_neg += self._log(self.theta_neg[t])
        return log_pos, log_neg

    def _posterior(self, tokens):
        """Return the (pos, neg) scores normalised to sum to 1.

        Returns (0.0, 0.0) when both raw scores are exactly zero.
        """
        log_pos, log_neg = self._log_scores(tokens)
        peak = max(log_pos, log_neg)
        if peak == float("-inf"):
            return 0.0, 0.0
        # Shift by the larger log score so the larger weight is exp(0) == 1.
        weight_pos = math.exp(log_pos - peak)
        weight_neg = math.exp(log_neg - peak)
        norm = weight_pos + weight_neg
        return weight_pos / norm, weight_neg / norm

    def predict_string(self, review):
        """Return 1 if ``review`` scores strictly higher as positive, else 0."""
        log_pos, log_neg = self._log_scores(self.processor._process_string(review))
        return 1 if log_pos > log_neg else 0

    def test_acc(self):
        """Evaluate on the processor's test reviews.

        Returns:
            (accuracy, incorrects) where ``incorrects`` lists one
            ``[review text, true label, prob_pos, prob_neg]`` entry per
            misclassified review (label 1 = positive, 0 = negative). A tie
            counts as a misclassification for either class. Accuracy is 0.0
            when there are no test reviews.
        """
        if self.verbose > 0:
            print("testing...")
        correct = 0.0
        incorrects = []
        labelled_sets = (
            (1, self.processor.pos_review_tokens_test, self.processor.pos_review_strings),
            (0, self.processor.neg_review_tokens_test, self.processor.neg_review_strings),
        )
        for label, token_lists, reviews in labelled_sets:
            for tokens, review in zip(token_lists, reviews):
                prob_pos, prob_neg = self._posterior(tokens)
                if label == 1:
                    hit = prob_pos > prob_neg
                else:
                    hit = prob_pos < prob_neg
                if hit:
                    correct += 1
                else:
                    incorrects.append([review, label, prob_pos, prob_neg])
        total = len(self.processor.pos_review_tokens_test) + len(self.processor.neg_review_tokens_test)
        accuracy = correct / total if total else 0.0
        return accuracy, incorrects



In [None]:

#models
#nb_best = NaiveBayes(IMDBProcessor(word_freq_lower_limit=1,word_len_lower_limit=1,ngrams=[2,4]), laplace=True)  #acc=0.883
#nb = NaiveBayes(IMDBProcessor(verbose=0,word_freq_lower_limit=1,word_len_lower_limit=1,ngrams=[1]), verbose=0,laplace=True) 
#acc1,inc1 = nb.test_acc()
#nb2 = NaiveBayes(IMDBProcessor(verbose=0,word_freq_lower_limit=1,word_len_lower_limit=1,ngrams=[2,4]), verbose=0,laplace=True)
#acc2,inc2 = nb2.test_acc()
#print(acc1)
#print(acc2)
#nb_bag_best = NaiveBayesBagged([IMDBProcessor(verbose=1,word_freq_lower_limit=1,word_len_lower_limit=1,ngrams=[1]),
#                           IMDBProcessor(verbose=1,word_freq_lower_limit=1,word_len_lower_limit=1,ngrams=[2,4])],
#                           laplace=True)
#nb_bag_best = NaiveBayesBagged([IMDBProcessor(verbose=1,word_freq_lower_limit=2,word_len_lower_limit=4,ngrams=[1]),
#                           IMDBProcessor(verbose=1,word_freq_lower_limit=1,word_len_lower_limit=1,ngrams=[3]),
#                           IMDBProcessor(verbose=1,word_freq_lower_limit=1,word_len_lower_limit=1,ngrams=[2,4])],
#                           laplace=True)

#best bagged model I could find
#one keyword set per ensemble member; the processors are built in this order
best_settings = [
    dict(suffix=25,word_freq_lower_limit=2,word_len_lower_limit=5,ngrams=[1]),
    dict(word_freq_lower_limit=2,word_len_lower_limit=1,ngrams=[3]),
    dict(suffix=50,word_freq_lower_limit=2,word_len_lower_limit=1,ngrams=[1,3]),
    dict(word_freq_lower_limit=2,word_len_lower_limit=1,ngrams=[2,4]),
]
best_members = [IMDBProcessor(verbose=1,**settings) for settings in best_settings]
nb = NaiveBayesBagged(best_members,laplace=True)
#report the overall test accuracy of the ensemble
print(nb.test_acc())


processing...
building vocab...
processing...
building vocab...
processing...
building vocab...
processing...
building vocab...
building model... 0
building model... 1
building model... 2
building model... 3
testing...
0.90236


In [None]:
#number of positive test reviews the bagged model gets wrong
#(assumes the positive test set holds 12500 reviews - TODO confirm for other data)
12500-(nb.test_acc_pos()*12500)

testing...


1477.0

In [None]:
#number of negative test reviews the bagged model gets wrong
#(assumes the negative test set holds 12500 reviews - TODO confirm for other data)
12500-(nb.test_acc_neg()*12500)

testing...


1274.0

In [None]:
#combined vocabulary size of the four processors in the bagged model
len(nb.processors[0].vocab)+len(nb.processors[1].vocab)+len(nb.processors[2].vocab)+len(nb.processors[3].vocab)

1305704

In [None]:
#hill climb to find a good combination

#best so far
#8,16,17,4

#candidate processors; the hill climb below refers to them by their index in this list
processors =  [IMDBProcessor(verbose=1,word_freq_lower_limit=2,word_len_lower_limit=1,ngrams=[1]),
               IMDBProcessor(verbose=1,word_freq_lower_limit=5,word_len_lower_limit=3,ngrams=[1]),
               IMDBProcessor(verbose=1,word_freq_lower_limit=10,word_len_lower_limit=5,ngrams=[1]),
               IMDBProcessor(verbose=1,prefix=25,word_freq_lower_limit=2,word_len_lower_limit=5,ngrams=[1]),
               IMDBProcessor(verbose=1,suffix=25,word_freq_lower_limit=2,word_len_lower_limit=5,ngrams=[1]),
               IMDBProcessor(verbose=1,suffix=10,word_freq_lower_limit=2,word_len_lower_limit=5,ngrams=[1]),
               IMDBProcessor(verbose=1,word_freq_lower_limit=2,word_len_lower_limit=1,ngrams=[2]),
               IMDBProcessor(verbose=1,word_freq_lower_limit=5,word_len_lower_limit=1,ngrams=[2]),
               IMDBProcessor(verbose=1,word_freq_lower_limit=2,word_len_lower_limit=1,ngrams=[3]),
               IMDBProcessor(verbose=1,prefix=25,word_freq_lower_limit=2,word_len_lower_limit=1,ngrams=[3]),
               IMDBProcessor(verbose=1,suffix=25,word_freq_lower_limit=2,word_len_lower_limit=1,ngrams=[3]),
               IMDBProcessor(verbose=1,word_freq_lower_limit=5,word_len_lower_limit=1,ngrams=[3]),
               IMDBProcessor(verbose=1,word_freq_lower_limit=10,word_len_lower_limit=1,ngrams=[3]),
               IMDBProcessor(verbose=1,word_freq_lower_limit=2,word_len_lower_limit=1,ngrams=[2,3]),
               IMDBProcessor(verbose=1,word_freq_lower_limit=2,word_len_lower_limit=1,ngrams=[1,3]),
               IMDBProcessor(verbose=1,prefix=50,word_freq_lower_limit=2,word_len_lower_limit=1,ngrams=[1,3]),
               IMDBProcessor(verbose=1,suffix=50,word_freq_lower_limit=2,word_len_lower_limit=1,ngrams=[1,3]),
               IMDBProcessor(verbose=1,word_freq_lower_limit=2,word_len_lower_limit=1,ngrams=[2,4]),
               IMDBProcessor(verbose=1,prefix=25,word_freq_lower_limit=2,word_len_lower_limit=1,ngrams=[2,4]),
               IMDBProcessor(verbose=1,suffix=25,word_freq_lower_limit=2,word_len_lower_limit=1,ngrams=[2,4]),
               IMDBProcessor(verbose=1,word_freq_lower_limit=2,word_len_lower_limit=1,ngrams=[1,2]),
               IMDBProcessor(verbose=1,suffix=50,word_freq_lower_limit=2,word_len_lower_limit=1,ngrams=[1,2]),
               IMDBProcessor(verbose=1,prefix=50,word_freq_lower_limit=2,word_len_lower_limit=1,ngrams=[1,2]),
               IMDBProcessor(verbose=1,word_freq_lower_limit=2,word_len_lower_limit=1,ngrams=[1,4]),
               IMDBProcessor(verbose=1,prefix=50,word_freq_lower_limit=2,word_len_lower_limit=1,ngrams=[1,4]),
               IMDBProcessor(verbose=1,prefix=25,word_freq_lower_limit=2,word_len_lower_limit=1,ngrams=[1,4]),
               IMDBProcessor(verbose=1,prefix=10,word_freq_lower_limit=2,word_len_lower_limit=1,ngrams=[1,4]),
               ]


import random

def rand_ind(seq):
    """Return a uniformly random valid index into ``seq``."""
    return random.randrange(len(seq))

#hill climb search: 100 rounds, each starting from the best combination found so far
proc_inds = [4,8,16,17]
#each entry is [accuracy, processor indices]; the index lists are stored as copies
saved_results = [[0,proc_inds[:]]]
for _ in range(100):
    #work on a COPY of the best combination: mutating the stored list in place
    #would leave that record with indices that no longer match its accuracy
    proc_inds = max(saved_results,key=lambda r:r[0])[1][:]
    for _step in range(5):#5 cumulative random swaps away from the current max
        proc_inds[rand_ind(proc_inds)] = rand_ind(processors)#swap one
        procs = [processors[i] for i in proc_inds]
        nb = NaiveBayesBagged(procs,laplace=True)
        acc = nb.test_acc()
        print(acc)
        saved_results.append([acc,proc_inds[:]])
#best [accuracy, processor indices] pair seen during the search
print(max(saved_results,key=lambda r:r[0]))

        
# 12x12 -> 512x512


#no laplace
#               precision    recall  f1-score   support
# 
#            0       0.76      0.82      0.79     12500
#            1       0.81      0.74      0.77     12500
# 
#     accuracy                           0.78     25000
#    macro avg       0.78      0.78      0.78     25000
# weighted avg       0.78      0.78      0.78     25000

#laplace 
#               precision    recall  f1-score   support
# 
#            0       0.82      0.84      0.83     12500
#            1       0.84      0.82      0.83     12500
# 
#     accuracy                           0.83     25000
#    macro avg       0.83      0.83      0.83     25000
# weighted avg       0.83      0.83      0.83     25000


#freq 1 len 1 ngram [2]
#acc: 0.87208
#vocab size: 1452767
#freq 1 len 1 ngram [3]
#acc: 0.8764
#vocab size: 3657592
#freq 1 len 1 ngram [2, 4]
#acc: 0.8834
#vocab size: 6485060
#freq 1 len 1 ngram [2, 3]
#acc: 0.87556
#vocab size: 5110359
#freq 1 len 1 ngram [3, 4]
#acc: 0.87704
#vocab size: 8689885
#freq 1 len 2 ngram [2]
#acc: 0.87192
#vocab size: 1506216
#freq 1 len 2 ngram [3]
#acc: 0.87356
#vocab size: 3691440
#freq 1 len 2 ngram [2, 4]
#acc: 0.88088
#vocab size: 6426719
#freq 1 len 2 ngram [2, 3]
#acc: 0.87812
#vocab size: 5197656
#freq 1 len 2 ngram [3, 4]
#acc: 0.87096
#vocab size: 8611943
#freq 1 len 3 ngram [2]
#acc: 0.87156
#vocab size: 1637887
#freq 1 len 3 ngram [3]
#acc: 0.84924
#vocab size: 3595510
#freq 1 len 3 ngram [2, 4]
#acc: 0.8768
#vocab size: 6022243
#freq 1 len 3 ngram [2, 3]
#acc: 0.8792
#vocab size: 5233397
#freq 1 len 3 ngram [3, 4]
#acc: 0.8456
#vocab size: 7979866
#freq 5 len 1 ngram [2]
#acc: 0.87284
#vocab size: 130106
#freq 5 len 1 ngram [3]
#acc: 0.86532
#vocab size: 108883
#freq 5 len 1 ngram [2, 4]
#acc: 0.8792
#vocab size: 170239
#freq 5 len 1 ngram [2, 3]
#acc: 0.88036
#vocab size: 238989
#freq 5 len 1 ngram [3, 4]
#acc: 0.862
#vocab size: 149016
#freq 5 len 2 ngram [2]
#acc: 0.87276
#vocab size: 129550
#freq 5 len 2 ngram [3]
#acc: 0.85888
#vocab size: 95728
#freq 5 len 2 ngram [2, 4]
#acc: 0.8776
#vocab size: 161177
#freq 5 len 2 ngram [2, 3]
#acc: 0.8796
#vocab size: 225278
#freq 5 len 2 ngram [3, 4]
#acc: 0.8544
#vocab size: 127355
#freq 5 len 3 ngram [2]
#acc: 0.87176
#vocab size: 114038
#freq 5 len 3 ngram [3]
#acc: 0.82244
#vocab size: 57964
#freq 5 len 3 ngram [2, 4]
#acc: 0.8738
#vocab size: 126285
#freq 5 len 3 ngram [2, 3]
#acc: 0.87548
#vocab size: 172002
#freq 5 len 3 ngram [3, 4]
#acc: 0.81732
#vocab size: 70211
#freq 10 len 1 ngram [2]
#acc: 0.86812
#vocab size: 63733
#freq 10 len 1 ngram [3]
#acc: 0.85304
#vocab size: 42351
#freq 10 len 1 ngram [2, 4]
#acc: 0.87276
#vocab size: 75934
#freq 10 len 1 ngram [2, 3]
#acc: 0.87596
#vocab size: 106084
#freq 10 len 1 ngram [3, 4]
#acc: 0.8474
#vocab size: 54552
#freq 10 len 2 ngram [2]
#acc: 0.86956
#vocab size: 62444
#freq 10 len 2 ngram [3]
#acc: 0.84268
#vocab size: 36216
#freq 10 len 2 ngram [2, 4]
#acc: 0.8728
#vocab size: 71759
#freq 10 len 2 ngram [2, 3]
#acc: 0.87504
#vocab size: 98660
#freq 10 len 2 ngram [3, 4]
#acc: 0.836
#vocab size: 45531
#freq 10 len 3 ngram [2]
#acc: 0.8664
#vocab size: 52468
#freq 10 len 3 ngram [3]
#acc: 0.79216
#vocab size: 19977
#freq 10 len 3 ngram [2, 4]
#acc: 0.8672
#vocab size: 55591
#freq 10 len 3 ngram [2, 3]
#acc: 0.86792
#vocab size: 72445
#freq 10 len 3 ngram [3, 4]
#acc: 0.78812
#vocab size: 23100
