# Evaluating Cross-Validation Trees
Files created in the notebook **create_CV_CategoryTrres.ipynb** were previously uploaded to Google Cloud Storage and made public

# Choose Fold to evaluate


In [0]:
run_num = 5 # Any integer from 1 to 5

# Fail fast on an invalid fold: the file names used by the following cells are
# built from this number, so a typo would otherwise only show up as a failed
# download.
if not isinstance(run_num, int) or run_num not in range(1, 6):
    raise ValueError('run_num must be an integer from 1 to 5, got {!r}'.format(run_num))

# Loading files from Google Cloud Storage
Download the training and validation trees for the selected fold, plus a JSON file containing the credentials used to upload the scoring results back to GCS.


In [0]:
url = 'https://storage.googleapis.com/capstone_wikipedia/'

train_file = 'tree_train_{}.pkl'.format(run_num)
train_url = url + train_file

val_file = 'tree_val_{}.pkl'.format(run_num)
val_url = url + val_file

!wget $train_url
!wget $val_url
!wget https://storage.googleapis.com/capstone_wikipedia/crack-petal-273320-bdcfa7d69d7a.json

--2020-05-05 22:41:56--  https://storage.googleapis.com/capstone_wikipedia/tree_train_5.pkl
Resolving storage.googleapis.com (storage.googleapis.com)... 172.217.12.208
Connecting to storage.googleapis.com (storage.googleapis.com)|172.217.12.208|:443... connected.
HTTP request sent, awaiting response... 200 OK
Length: 3206056759 (3.0G) [application/octet-stream]
Saving to: ‘tree_train_5.pkl’


2020-05-05 22:49:17 (6.93 MB/s) - ‘tree_train_5.pkl’ saved [3206056759/3206056759]

--2020-05-05 22:49:18--  https://storage.googleapis.com/capstone_wikipedia/tree_val_5.pkl
Resolving storage.googleapis.com (storage.googleapis.com)... 172.217.10.240
Connecting to storage.googleapis.com (storage.googleapis.com)|172.217.10.240|:443... connected.
HTTP request sent, awaiting response... 200 OK
Length: 751170264 (716M) [application/octet-stream]
Saving to: ‘tree_val_5.pkl’


2020-05-05 22:52:47 (3.43 MB/s) - ‘tree_val_5.pkl’ saved [751170264/751170264]

--2020-05-05 22:52:47--  https://storage.googleap

# Import packages

In [0]:
import os
import json
import pickle
import pandas as pd
from google.cloud import storage
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import cosine_similarity
from nltk.corpus import stopwords
import numpy as np 

import nltk
nltk.download('stopwords')

[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


True

## Setting up GCS credentials

### Helper functions

In [0]:
def get_bucket(client, bucket):
    """Return the bucket named ``bucket`` using the given storage client.

    Args:
        client: object exposing ``get_bucket(name)``; the notebook passes a
            ``google.cloud.storage.Client``.
        bucket: name of the bucket to look up.

    Returns:
        Whatever ``client.get_bucket(bucket)`` returns.
    """
    # Use the client that was passed in; the previous version ignored the
    # argument and silently depended on a global named ``storage_client``.
    return client.get_bucket(bucket)

def download_file(bucket, file):
    """Download the blob named ``file`` from ``bucket`` to a local file of the same name."""
    bucket.blob(file).download_to_filename(file)

def upload_file(bucket, file):
    """Upload the local file ``file`` to ``bucket`` as a blob of the same name."""
    bucket.blob(file).upload_from_filename(file)

### Credentials

In [0]:
# Tell the Google client library which key file to use (the JSON file fetched
# by the download cell), then open the bucket used for uploads.
os.environ.update(GOOGLE_APPLICATION_CREDENTIALS="crack-petal-273320-bdcfa7d69d7a.json")
storage_client = storage.Client()
bucket = get_bucket(client=storage_client, bucket='capstone_wikipedia')

## Ad-hoc version of tree. Optimized for validation speed
To speed up the process, the trees created in **create_CV_CategoryTrres.ipynb** are pickled dictionaries ready to be loaded. The **load** method reads and unpickles them

In [0]:
# Width of the dense placeholder / inferred vectors created below; the notebook
# also passes it as ``max_features`` when it builds the TfidfVectorizer.
MAX_FEATURES = 75000

class CategoryNode:
    """One category of the tree, with its pages, texts and vectors.

    Attributes:
        category: name of the category (also its string representation).
        children: child ``CategoryNode`` objects, in insertion order.
        pages: filled by ``CategoryTree`` from the ``'pages'`` entry.
        texts: filled by ``CategoryTree`` from the ``'texts'`` entry.
        vectors: vectors for this node; ``[]`` until the tree is vectorized.
    """

    def __init__(self, category):
        self.category = category
        self.children = []
        self.pages = []
        self.texts = []
        self.vectors = []

    def add_child(self, node):
        """Append ``node`` to this node's children."""
        self.children.append(node)

    def __str__(self):
        return self.category

    def delete_child(self, cat):
        """Remove the first child whose category equals ``cat``.

        Raises:
            ValueError: if no child has that category. The previous version
                fell through to ``del self.children[None]`` and failed with
                an unrelated ``TypeError``.
        """
        for i, child in enumerate(self.children):
            if child.category == cat:
                del self.children[i]
                return
        raise ValueError("{!r} has no child category {!r}".format(self.category, cat))

    def infer_vector(self):
        """Set ``self.vectors`` from the children's vectors.

        Rows of every accepted child are stacked and averaged into a single
        ``(1, MAX_FEATURES)`` row. When no child is accepted the node gets an
        all-zero row of the same shape.
        """
        vectors = []  # stays a list until the first child is accepted
        for child in self.children:
            try:
                # NOTE(review): this accepts a child only when *every* entry of
                # its vectors is non-zero, and children whose check raises
                # ValueError are skipped. It looks like "any non-zero entry"
                # was intended -- confirm before changing, since the result
                # feeds CategoryTree.search.
                if np.all((child.vectors != 0)):
                    if type(vectors) is list:
                        vectors = child.vectors
                    else:
                        vectors = np.append(vectors, child.vectors, axis = 0)
            except ValueError:
                continue

        if type(vectors) is not list:
            self.vectors = np.mean(vectors, axis = 0).reshape((1, MAX_FEATURES))
        else:
            self.vectors = np.zeros((1, MAX_FEATURES))

class CategoryTree:
    """Tree of ``CategoryNode`` objects that is searched top-down by similarity.

    The tree is built from a nested dict in which every category maps to a
    dict holding the keys ``'pages'`` and ``'texts'`` plus one key per
    sub-category. The vectorizer turns the texts of each node into vectors,
    which ``search`` compares with the query.
    """

    def __init__(self, vectorizer):
        """Create an empty tree (a single node named ``"Root"``)."""
        self.root = CategoryNode("Root")
        self.vectorizer = vectorizer

    def _build_nodes(self, tree, corpus=None):
        """Attach every category of the nested dict ``tree`` below the root.

        Categories are visited breadth-first. When ``corpus`` is a list, the
        texts of every visited node are appended to it.
        """
        # Each entry carries the dict that contains the category, so the path
        # from the top of ``tree`` does not have to be walked again per node.
        pending = [(name, self.root, tree) for name in tree.keys()]
        cursor = 0
        # A cursor over the growing list keeps first-in-first-out order without
        # copying the remaining queue on every step.
        while cursor < len(pending):
            cat, parent_node, section = pending[cursor]
            cursor += 1

            node = CategoryNode(cat)
            parent_node.add_child(node)

            entry = section[cat]
            node.pages = entry['pages']
            node.texts = entry['texts']
            if corpus is not None:
                corpus.extend(node.texts)

            # Every key other than the two payload keys names a sub-category.
            cat_children = set(entry) - set(["pages", "texts"])
            pending += [(child, node, entry) for child in cat_children]

    def load(self, file):
        """Build the tree from a pickled nested dict and fit the vectorizer.

        Args:
            file: path of the pickle file holding the nested category dict.
        """
        # pickle can execute arbitrary code while loading: only use files from
        # a source you trust.
        with open(file, 'rb') as f:
            tree = pickle.load(f)

        corpus = []
        print("Category Tree loading...")
        self._build_nodes(tree, corpus)

        print("Vectorizer is fitting...")
        self.vectorizer.fit(corpus)

        # Drop the references that are no longer needed before transforming.
        del corpus
        del tree

        print("Transforming tree...")
        self.vectorize()

        print("Tree is ready!")

    def load_vectorizer(self, vectorizer_filename, pages_filename):
        """Build the tree from JSON using an already fitted, pickled vectorizer.

        Args:
            vectorizer_filename: path of the pickled vectorizer.
            pages_filename: path of the JSON file with the nested category
                dict. The file may hold the dict directly or a JSON string
                that itself encodes the dict.
        """
        # pickle can execute arbitrary code while loading: only use files from
        # a source you trust.
        with open(vectorizer_filename, 'rb') as f:
            self.vectorizer = pickle.load(f)

        with open(pages_filename, 'r') as f:
            tree = json.load(f)
        if isinstance(tree, str):
            # Double-encoded file: the document is a string containing JSON.
            tree = json.loads(tree)
        print("Category Tree loading...")
        self._build_nodes(tree)

        del tree

        print("Transforming tree...")
        self.vectorize()

        print("Tree is ready!")

    def save_vectorizer(self, vectorizer_filename):
        """Pickle the vectorizer to ``vectorizer_filename``."""
        with open(vectorizer_filename, 'wb') as f:
            pickle.dump(self.vectorizer, f)

    def vectorize(self):
        """Transform the texts of every node below the root into vectors.

        A node whose texts make the vectorizer raise ``ValueError`` first gets
        an all-zero ``(1, MAX_FEATURES)`` row; after the whole tree has been
        visited, ``infer_vector`` is called on each of those nodes.
        """
        bad_nodes = []
        pending = list(self.root.children)
        cursor = 0
        while cursor < len(pending):
            node = pending[cursor]
            cursor += 1
            try:
                node.vectors = self.vectorizer.transform(node.texts)
            except ValueError:
                node.vectors = np.zeros((1, MAX_FEATURES))
                bad_nodes.append(node)

            pending += node.children

        for node in bad_nodes:
            node.infer_vector()

    def _feature_names(self):
        """Return the vectorizer's feature names as a list.

        Uses ``get_feature_names_out`` when the vectorizer has it and falls
        back to the older ``get_feature_names`` otherwise.
        """
        getter = getattr(self.vectorizer, "get_feature_names_out", None)
        if getter is None:
            getter = self.vectorizer.get_feature_names
        return list(getter())

    @staticmethod
    def _as_row(vec):
        """Return dense input as a plain 2-D ndarray; pass anything else through."""
        if isinstance(vec, np.ndarray):
            # np.matrix is an ndarray subclass; asarray turns it into a plain array.
            arr = np.asarray(vec)
            return arr.reshape(1, -1) if arr.ndim == 1 else arr
        return vec

    def search(self, words, similarity_metric, fingerprint=False, depth=2):
        """Walk down the tree, following the most similar category per level.

        Args:
            words: query. By default it is passed to ``vectorizer.transform``
                (the notebook passes a one-element list of text). With
                ``fingerprint=True`` it is a container of feature names and
                the query is a 0/1 row over the vectorizer's vocabulary.
            similarity_metric: callable ``f(a, b)`` whose result is indexed
                with ``[0][0]``, e.g. ``cosine_similarity``.
            fingerprint: select the 0/1 query described above.
            depth: maximum number of levels to descend; at least one level is
                always visited.

        Returns:
            A flat list with two entries per visited level: the category hit
            ``(similarity, node)`` followed by the page hit, which is
            ``(similarity, page)``, ``(0, None)`` when no page scored above
            zero, or ``None`` when the chosen node has no pages. The walk
            stops early, returning what was found so far, when no category
            on a level scores above zero.
        """
        if not fingerprint:
            input_vector = self.vectorizer.transform(words)
        else:
            feature_names = self._feature_names()
            input_vector = np.array(
                [1 if name in words else 0 for name in feature_names]
            ).reshape([1, len(feature_names)])

        result = []
        current_node = self.root
        level = 0
        while len(current_node.children) != 0:
            cat_maximum = 0, None
            for node in current_node.children:
                cat_vec = self._as_row(np.mean(node.vectors, axis=0))
                if np.max(cat_vec) == 0:
                    # All-zero (placeholder) vectors cannot match anything.
                    continue
                test = similarity_metric(cat_vec, input_vector)[0][0]
                if test > cat_maximum[0]:
                    cat_maximum = test, node

            best_node = cat_maximum[1]
            if best_node is None:
                # Nothing on this level matched; there is no node to descend into.
                break

            page_maximum = 0, None
            pages = best_node.pages
            for i, vec in enumerate(best_node.vectors):
                test = similarity_metric(self._as_row(vec), input_vector)[0][0]
                if test > page_maximum[0]:
                    if len(pages) != 0:
                        page_maximum = test, pages[i]
                    else:
                        # No page titles to report for this node.
                        page_maximum = None
                        break

            result.append(cat_maximum)
            result.append(page_maximum)

            current_node = best_node

            level += 1
            if level >= depth:
                break

        return result

## Training tree and backing up vectorizer

### Helper functions

In [0]:
def score_text(txt, category_lst, tree, metric):
  '''
      Scores ONE text and returns a binary list representing True Positives as 1,
      and Otherwise 0

      INPUT:
        txt: string with the text to be evaluated
        category_lst: list of ground truth for Categories and Subcategories
        tree: an instance of CategoryTree previously trained
        metric: metric to evaluate similarity between ground truth and predictions.
                Usually cosine similarity
      OUTPUT:
        Binary list of the type [cat1, cat2, cat3] where 1 means a True Positive and
        0 means an incorrect prediction. Levels for which the tree returned no
        category prediction stay 0.
  '''
  # tree.search mixes category hits (score, node) with page hits: (score, title),
  # (0, None) or None. Keep only the category hits; otherwise a missing page hit
  # would either crash here or be compared as a category named 'None' and shift
  # every later level.
  predicted = [
      str(hit[1])
      for hit in tree.search([txt], metric)
      if hit is not None and hit[1] is not None and not isinstance(hit[1], str)
  ]

  classified = np.zeros(3)
  # zip with the index range so that more than three levels can never index
  # past the end of `classified`.
  for i, truth, guess in zip(range(len(classified)), category_lst, predicted):
    if truth == guess:
      classified[i] = 1
  return list(classified)


def score_texts(texts, category_lst, tree, metric):
  '''
      Scores every text in a list against the same ground truth.

      INPUT:
        texts: list of strings with the texts to be evaluated
        category_lst: list of ground truth for Categories and Subcategories
        tree: an instance of CategoryTree previously trained
        metric: metric to evaluate similarity between ground truth and predictions.
                Usually cosine similarity
      OUTPUT:
        One list per text: the entries of category_lst followed by the binary
        list returned by score_text, where 1 means a True Positive and 0 means
        an incorrect prediction
  '''
  return [category_lst + score_text(txt, category_lst, tree, metric) for txt in texts]

def score_text_skip_exceptions(texts, category_lst, tree, metric):
  '''
      Scores a list of texts like score_texts, but skips the texts that fail.

      This works around situations like evaluating an empty text or evaluating
      a text using a category not contained in the trained model.

      INPUT:
        texts: iterable of strings with the texts to be evaluated
        category_lst: list of ground truth for Categories and Subcategories
        tree: an instance of CategoryTree previously trained
        metric: metric to evaluate similarity between ground truth and predictions
      OUTPUT:
        One list per successfully scored text: the entries of category_lst
        followed by the binary list returned by score_text
  '''
  scores_lst = []
  total = 0
  skipped = 0

  for txt in texts:
    total += 1
    try:
      scores_lst.append(category_lst + score_text(txt, category_lst, tree, metric))
    except Exception:
      # Only ordinary errors are skipped; a bare `except:` would also swallow
      # KeyboardInterrupt/SystemExit and make this long loop impossible to stop.
      skipped += 1

  if skipped:
    # Report the dropped texts so that they do not vanish from the scores unnoticed.
    print('Skipped {} of {} texts that could not be scored'.format(skipped, total))

  return scores_lst

### Train tree and upload vectorizer to GCS

In [0]:
# TF-IDF vectorizer: English stop words removed, vocabulary capped at MAX_FEATURES.
english_stopwords = list(stopwords.words('english'))
tfidf = TfidfVectorizer(stop_words=english_stopwords, max_features=MAX_FEATURES)

# Build the tree from the training fold; load() also fits the vectorizer.
tree = CategoryTree(tfidf)
tree.load(train_file)

# Back up the fitted vectorizer for this fold in the bucket.
vectorizer_file = f'vectorizer_train_{run_num}.pkl'
tree.save_vectorizer(vectorizer_file)
upload_file(bucket, vectorizer_file)
print('Vectorizer upload successful')

Vectorizer upload successful


### Evaluate Validation set (tree)

In [0]:
# pickle can execute arbitrary code while loading: only use trusted files.
with open(val_file, 'rb') as val_handle:
  val = pickle.load(val_handle)
print('Validation tree loaded')

scores_lst = []

for top_cat, top_entry in val.items():
  # Texts stored directly under the top-level category.
  category_lst = [top_cat, None, None]
  print(top_cat)
  scores_lst.extend(
      score_text_skip_exceptions(top_entry['texts'], category_lst, tree, cosine_similarity))

  # Every key other than the two payload keys is a sub-category.
  sub_entries = [(name, entry) for name, entry in top_entry.items()
                 if name not in ('texts', 'pages')]
  for sub_cat, sub_entry in sub_entries:
    category_lst = [top_cat, sub_cat, None]
    print('\t', sub_cat)
    scores_lst.extend(
        score_text_skip_exceptions(sub_entry['texts'], category_lst, tree, cosine_similarity))

### Export results and upload them to GCS

In [0]:
# One CSV per fold: each row holds the ground-truth categories followed by the
# per-level hit flags collected in scores_lst.
output_file = f'scores_cv_{run_num}.csv'
scores_df = pd.DataFrame(scores_lst)
scores_df.to_csv(output_file, index=False)
upload_file(bucket, output_file)