# Tableau - Extract tables and queries

### 1. Set-up Notebook

In [1]:
import tableauserverclient as TSC
import zipfile
import os
import yaml
import tqdm
import json
from bs4 import BeautifulSoup

import pandas as pd

In [2]:
def tableau_creds(yaml_path):
    """
    Build the objects used to sign in to Tableau Server.

    yaml_path points to a YAML file that must provide the keys
    'user_name', 'password', and 'server_name'.

    Returns a tuple (auth, server): a TSC.TableauAuth created from the
    user name and password, and a TSC.Server created from the server name
    with use_server_version=True.
    """
    with open(yaml_path) as creds_file:
        settings = yaml.safe_load(creds_file)

    #Credentials first, then the server handle (same order callers unpack them)
    auth = TSC.TableauAuth(settings['user_name'], settings['password'])
    server = TSC.Server(settings['server_name'], use_server_version=True)
    return auth, server

In [3]:
def grab_workbook_ids(auth, server):
    """
    Map every workbook name on Tableau Server to its unique id.

    Requires authorization and server objects from tableau_creds().
    Signs in, pages through server.workbooks with TSC.Pager, and returns a
    dictionary of workbook_name:workbook_id pairings.

    Because the dictionary is keyed by name, workbooks that share a name
    overwrite each other and only the last one paged is kept.
    """
    with server.auth.sign_in(auth):
        name_to_id = {wb.name: wb.id for wb in TSC.Pager(server.workbooks)}

    return name_to_id

In [4]:
def download_and_extract(auth, server, workbook_name, workbook_id, desired_filepath='workbooks'):
    """
    Download one workbook from Tableau Server and return the path to its .twb file.

    Requires authorization and server objects from tableau_creds().

    auth, server     -- objects used to sign in and download
    workbook_name    -- only used in the error message; the download is by id
    workbook_id      -- id of the workbook to download
    desired_filepath -- passed to the download call as filepath and used as
                        the extraction directory for zipped downloads

    If the download is a zip archive (.twbx), every member ending in '.twb'
    is extracted into desired_filepath and the archive is deleted; the path
    of the last extracted .twb is returned (only one is expected).
    Otherwise the downloaded path is returned unchanged.

    Raises FileNotFoundError when a zipped download contains no .twb member.
    """
    with server.auth.sign_in(auth):
        download_path = server.workbooks.download(
            workbook_id, filepath=desired_filepath, include_extract=False
        )

    #A plain .twb download needs no unpacking
    if not zipfile.is_zipfile(download_path):
        return download_path

    twb_path = None
    try:
        with zipfile.ZipFile(download_path, 'r') as archive:
            #Extract only the dashboard file(s); expecting one per archive
            for member in archive.namelist():
                if member.endswith('.twb'):
                    twb_path = archive.extract(member=member, path=desired_filepath)
    finally:
        #The archive is removed whether or not extraction succeeded
        os.remove(download_path)

    if twb_path is None:
        #Previously this fell through to an UnboundLocalError on return
        raise FileNotFoundError(
            'No .twb file found in downloaded archive for workbook {!r} ({})'.format(
                workbook_name, download_path
            )
        )

    return twb_path

In [5]:
def extract_datasources(file_path):
    """
    Collect the tables and custom queries referenced by a Tableau workbook.

    Requires file_path to a Tableau workbook (as returned by download_and_extract).
    The file is parsed as XML and every 'relation' tag of type 'text' or
    'table' under a 'datasource' tag is inspected.

    Returns a dictionary with two kinds of entries:
      'table-<name>,<connection>'  -> table name with '[' and ']' removed
      'query-<name>, <connection>' -> first child of the relation tag
    where <connection> is the parent tag's 'class' attribute, or 'no_class'
    when the parent has none. Table relations named 'Extract' are skipped.
    Entries that produce the same key overwrite each other.
    """
    #Read the workbook and parse it with BeautifulSoup
    with open(file_path, 'r', encoding='utf8') as workbook_file:
        soup = BeautifulSoup(workbook_file.read(), 'xml')

    found = {}

    for datasource in soup.find_all('datasource'):
        for relation in datasource.find_all('relation', {'type': ['text', 'table']}):
            attrs = relation.attrs
            kind = attrs['type']
            name = attrs['name']

            #Connection type lives on the tag that encloses the relation
            parent = relation.parent
            connection = parent.attrs['class'] if parent.has_attr('class') else 'no_class'

            if kind == 'text':
                found['query-{}, {}'.format(name, connection)] = relation.contents[0]
            elif kind == 'table' and name != 'Extract':
                table = attrs['table'].replace('[', '').replace(']', '')
                found['table-{},{}'.format(name, connection)] = table

    return found

### 2. Extract queries/tables from all workbooks

In [6]:
def extract_all(yaml_path, skip_workbooks=('Which Tableau Should I Look At',)):
    """
    Extract queries and tables from every workbook on Tableau Server.

    yaml_path      -- path to the credentials YAML file read by tableau_creds()
    skip_workbooks -- workbook names to leave out of the result; a single
                      name may also be passed as a plain string. The default
                      skips 'Which Tableau Should I Look At', which pulls only
                      information from Tableau Server itself.

    Returns a dictionary keyed by workbook name. The value is the dictionary
    returned by extract_datasources() for that workbook, or the string
    'Failed because of <error>' when downloading or parsing it raised.
    A failure on one workbook does not stop the remaining workbooks.
    """
    #A bare string would turn the membership test into a substring test
    if isinstance(skip_workbooks, str):
        skip_workbooks = (skip_workbooks,)

    all_sources = {}

    auth, server = tableau_creds(yaml_path)
    workbook_names_ids = grab_workbook_ids(auth, server)

    for workbook_name, workbook_id in tqdm.tqdm(workbook_names_ids.items()):
        if workbook_name in skip_workbooks:
            continue

        try:
            workbook_path = download_and_extract(
                auth, server, workbook_name, workbook_id, desired_filepath='workbooks'
            )
            try:
                sources = extract_datasources(workbook_path)
            finally:
                #Remove the downloaded workbook even when parsing fails
                os.remove(workbook_path)
        except Exception as e:
            #Record the failure for this workbook and carry on with the rest
            all_sources[workbook_name] = 'Failed because of {}'.format(e)
        else:
            all_sources[workbook_name] = sources

    return all_sources

In [7]:
#Run the extraction for all workbooks using the credentials YAML at this path
all_sources = extract_all('C:/Users/tylers/Documents/credentials/tableau_credentials.yml')

100%|████████████████████████████████████████████████████████████████████████████████| 207/207 [02:16<00:00,  1.52it/s]


In [8]:
#Write the dictionary of workbook names and their queries/tables to a JSON file
#Currently does not have Marketing_Dashboard_Acquisitions workbook (issues downloading)
with open('data/all_sources.json', 'w') as out_file:
    json.dump(all_sources, out_file)