# Create trees for Cross Validation
## Mount Google Drive

In [0]:
# Mount Google Drive inside the Colab VM at /gdrive (prompts for an
# authorization code the first time it runs).
from google.colab import drive
drive.mount('/gdrive')
# Optional: switch the working directory to the project folder on Drive.
# %cd /gdrive/My Drive/Academic/Data_Science/capstone/colab

Go to this URL in a browser: https://accounts.google.com/o/oauth2/auth?client_id=947318989803-6bn6qk8qdgf4n4g3pfee6491hc0brc4i.apps.googleusercontent.com&redirect_uri=urn%3aietf%3awg%3aoauth%3a2.0%3aoob&response_type=code&scope=email%20https%3a%2f%2fwww.googleapis.com%2fauth%2fdocs.test%20https%3a%2f%2fwww.googleapis.com%2fauth%2fdrive%20https%3a%2f%2fwww.googleapis.com%2fauth%2fdrive.photos.readonly%20https%3a%2f%2fwww.googleapis.com%2fauth%2fpeopleapi.readonly

Enter your authorization code:
··········
Mounted at /gdrive


## Copy original tree to Colab

In [0]:
% cp /gdrive/My\ Drive/Academic/Data_Science/capstone/wikipedia/search_tree_with_text_1.json ./
% cp /gdrive/My\ Drive/Academic/Data_Science/capstone/wikipedia/search_tree_with_text.json ./
% cp /gdrive/My\ Drive/Academic/Data_Science/capstone/wikipedia/search_tree_with_text_LARGE.json ./

## Import Packages

In [0]:
import os
import json
import numpy as np 
import pickle as pkl
from sklearn.model_selection import KFold

## Read original tree

In [0]:
import time

start = time.time()
# The file stores a JSON-encoded string that itself contains the JSON tree,
# so it is decoded twice. The `with` block closes the file handle even if
# decoding fails (the bare open() call used before never closed it).
with open('search_tree_with_text_LARGE.json', 'r') as f:
  tree = json.loads(json.load(f))
end = time.time()
# Loading time in minutes (shown as the cell output).
round((end-start)/60, 2)

1.45

## Exploring the tree

In [0]:
def explore_tree(tree, level=0, lst=None, output=True, key=''):
  '''
  Walks a nested category tree and collects one row per node that holds
  'texts' / 'pages': [indented node name, len(texts), len(pages)].

  tree   : dict whose values are sub-dicts (child nodes), plus optionally the
           keys 'texts' and 'pages' holding lists
  level  : current depth; nodes deeper than level 3 are not visited
  lst    : list the rows are appended to (in place); a fresh list is created
           when omitted, so repeated calls do not accumulate rows
  output : if True the collected list is returned, otherwise None is returned
  key    : name of the current node (used for the row label)

  Side effect: every node whose 'texts' and 'pages' differ in length
  increments the module-level counter cnt[0], so cnt must be defined before
  such a node is encountered.
  '''
  # A mutable default ([]) would be shared between calls and keep growing.
  if lst is None:
    lst = []

  if level>3:
    return

  append = True
  for k,v in tree.items():
    if k in ['texts','pages']:
      # 'texts' and 'pages' both land here; only the first one adds the row
      if append:
        if len(tree['texts']) != len(tree['pages']):
          cnt[0] += 1

        lst.append([2*(level-1)*' '+key, len(tree['texts']), len(tree['pages'])])
      append = False
    else:
      # any other key is a child category: descend, sharing the same list
      explore_tree(v, level+1, lst, False, k)

  if output:
    return lst

# cnt[0] is incremented by explore_tree for every node whose 'texts' and
# 'pages' lists have different lengths.
cnt = [0]
lst = explore_tree(tree)
# Preview the first 20 rows: [indented node name, number of texts, number of pages].
lst[:20]

[['Leisure activities', 40, 40],
 ['  Games', 6, 6],
 ['    Games media', 1, 1],
 ['    Games by type', 0, 0],
 ['    Game franchises', 6, 6],
 ['    Lists of games', 46, 46],
 ['    Games by location', 4, 4],
 ['    Game characters', 1, 1],
 ['    Game images', 0, 0],
 ['    Works about games', 1, 1],
 ['    Social deduction games', 4, 4],
 ['    Games by designer', 0, 0],
 ['    History of games', 5, 5],
 ['    Historical games', 8, 8],
 ['    Games by year', 61, 61],
 ['    Games by company', 0, 0],
 ['    Creative Commons-licensed games', 8, 8],
 ['    Games by genre', 2, 2],
 ['    Playing cards', 60, 60],
 ['    Open-source games', 1, 1]]

In [0]:
# Inspect a single node: each page title next to the first 75 characters of its text.
brr = tree['Leisure activities']['Bathing']['Jet ski']
# One '' is appended to texts so zip can cover up to one page more than there are texts.
[(page, text[:75]) for page, text in zip(brr['pages'], brr['texts'] + [''])]


[('Shanmugam Murugesu',
  'Shanmugam "Sam" Murugesu () (c.1967 – 13 May 2005) was the Singaporean Nati'),
 ('Jet Ski',
  'Jet Ski is the brand name of a personal water craft (PWC) manufactured by K')]

## Create 5-Fold cross validation trees

### Helper functions

In [0]:
def fill_with_None(lst, n=5):
  '''
  Returns a copy of lst padded with None at the end so that its length is
  at least n. A list that already has n or more elements is copied unchanged.
  The input list is never modified.
  '''
  padded = lst.copy()
  # Only add the elements that are actually missing (previously n Nones were
  # appended regardless of the current length, over-allocating the list).
  missing = n - len(padded)
  if missing > 0:
    padded.extend([None] * missing)
  return padded

def split_generator(gen, texts, pages):
  '''
  For every (train indices, validation indices) pair produced by gen, yields
  the tuple (train texts, train pages, validation texts, validation pages),
  where texts and pages are lists indexed by those positions.
  '''
  def _take(items, positions):
    # elements of items at the given positions, in the given order
    return [items[pos] for pos in positions]

  for train_positions, val_positions in gen:
    picked_train_txt = _take(texts, train_positions)
    picked_val_txt = _take(texts, val_positions)
    picked_train_pag = _take(pages, train_positions)
    picked_val_pag = _take(pages, val_positions)
    yield picked_train_txt, picked_train_pag, picked_val_txt, picked_val_pag

### Create trees

In [0]:
N_FOLDS = 5     # number of cross-validation folds
MAX_DEPTH = 3   # category levels that are split; anything deeper is not copied
SEED = 42       # seeds the shuffling so the folds are reproducible across runs

def _split_node_into_folds(node, train_nodes, val_nodes, fold_rng, depth=1):
  '''
  Splits the 'texts' and 'pages' of one category node into N_FOLDS
  train/validation parts and stores them in the matching per-fold dicts,
  then does the same for the child categories down to MAX_DEPTH.

  node        : dict with the lists 'texts' and 'pages' plus child categories
  train_nodes : one (empty) dict per fold, receives the training part
  val_nodes   : one (empty) dict per fold, receives the validation part
  fold_rng    : numpy RandomState shared by all nodes, used for shuffling
  depth       : level of node (1 = top-level category)
  '''
  # KFold needs at least N_FOLDS samples, so short lists are padded with None;
  # both lists are padded to the same length n so one index split fits both.
  n = max(N_FOLDS, len(node['texts']), len(node['pages']))
  texts = fill_with_None(node['texts'], n)
  pages = fill_with_None(node['pages'], n)

  # n_splits is given explicitly instead of relying on the library default.
  splitter = KFold(n_splits=N_FOLDS, shuffle=True, random_state=fold_rng)
  folds = split_generator(splitter.split(range(n)), texts, pages)
  for train_node, val_node, fold in zip(train_nodes, val_nodes, folds):
    train_txt, train_pag, val_txt, val_pag = fold
    # drop the None placeholders that were only added for the split
    train_node['texts'] = [txt for txt in train_txt if txt is not None]
    val_node['texts'] = [txt for txt in val_txt if txt is not None]
    train_node['pages'] = [pag for pag in train_pag if pag is not None]
    val_node['pages'] = [pag for pag in val_pag if pag is not None]

  if depth >= MAX_DEPTH:
    return

  for key, child in node.items():
    if key in ('pages', 'texts'):
      continue

    # create the child's entry in every fold tree, then fill it recursively
    child_train, child_val = [], []
    for train_node, val_node in zip(train_nodes, val_nodes):
      train_node[key] = {}
      val_node[key] = {}
      child_train.append(train_node[key])
      child_val.append(val_node[key])
    _split_node_into_folds(child, child_train, child_val, fold_rng, depth + 1)


# One training tree and one validation tree per fold, keyed 1..N_FOLDS.
trees_train = {i: {} for i in range(1, N_FOLDS + 1)}
trees_val = {i: {} for i in range(1, N_FOLDS + 1)}

# A single generator is shared by all nodes: every node gets a different
# shuffle, yet re-running the cell reproduces exactly the same folds.
fold_rng = np.random.RandomState(SEED)

for key1, values1 in tree.items():
  train_nodes, val_nodes = [], []
  for i in range(1, N_FOLDS + 1):
    trees_train[i][key1] = {}
    trees_val[i][key1] = {}
    train_nodes.append(trees_train[i][key1])
    val_nodes.append(trees_val[i][key1])
  _split_node_into_folds(values1, train_nodes, val_nodes, fold_rng)

## Export Trees to pickle files

In [0]:
def pickle_dict(file_name, dct):
  '''Pickles dct into the file file_name (opened in binary write mode).'''
  with open(file_name, 'wb') as out_file:
    pkl.dump(dct, out_file)

# Write the training tree and then the validation tree of every fold to its
# own pickle file, printing each file name as it is written.
for i in range(1, 6):
  for prefix, fold_trees in (('tree_train_', trees_train), ('tree_val_', trees_val)):
    file_name = prefix + str(i) + '.pkl'
    print(file_name)
    pickle_dict(file_name, fold_trees[i])



tree_train_1.pkl
tree_val_1.pkl
tree_train_2.pkl
tree_val_2.pkl
tree_train_3.pkl
tree_val_3.pkl
tree_train_4.pkl
tree_val_4.pkl
tree_train_5.pkl
tree_val_5.pkl


## Copy files back to Google Drive

In [0]:
% cp {tree_train_1.pkl,tree_train_2.pkl,tree_train_3.pkl,tree_train_4.pkl,tree_train_5.pkl,tree_val_1.pkl,tree_val_2.pkl,tree_val_3.pkl,tree_val_4.pkl,tree_val_5.pkl} /gdrive/My\ Drive/Academic/Data_Science/capstone/wikipedia/CV_trees