## Python set up

In [3]:
# Make the custom modules in the reusable_code folder importable (sys.path changes only last for the current kernel session, so re-run this after every restart)
import sys
sys.path.append(r'/home/jupyter/reusable_code')
sys.path.append(r'/home/jupyter/GithubClose/bbdig_reporting_networkanalysis')

!pip install bigquery_helper
import pandas as pd
import networkx as nx
import matplotlib.pyplot as plt
import plotly.graph_objects as go

from google.cloud import bigquery
from pandasql import sqldf  

import google_api_functions as gaf
import bigquery_helper as bqh
import network_functions as nf

# Set up Google credentials; the same `creds` object is passed later on to both
# the BigQuery client and the Google Sheets reader.
# NOTE(review): presumably the argument is the folder holding the credential
# files - confirm against google_api_functions.Authenticate_Google.
creds=gaf.Authenticate_Google(r'/home/jupyter/reusable_code/')

--------------------------------------------
--------------------------------------------

## General

In [None]:
# Folder holding the SQL scripts that the cells below run on BigQuery
loc = '/home/jupyter/network_analysis/bbdig_reporting_networkanalysis/'

# Plan for the notebook: pull in the data ...

# ... then add nodes and edges

## table names are used as the nodes
## "from -> to" is the directed edge
## attributes - view / table / report / source

## Views
A look at the views in the various projects we can write to

### pull in the data

In [None]:
# SQL scripts that (re)build the view on BigQuery - this step could really be
# skipped because the target is a view
set_up = ['views.sql', 'views2.sql']
set_up2 = [f'{loc}{file_name}' for file_name in set_up]

# execute the scripts on BigQuery
bq = bigquery.Client(credentials=creds, project='itv-bde-analytics-prd')
bqh.run_files(bq, set_up2)

# download every row of the view into a dataframe
sql = """
select * 
from `itv-bde-analytics-prd.britbox_analytics.network_analysis_views`
order by destination_object
"""
query_job = bq.query(sql, location="EU")
views = query_job.to_dataframe()

## Tables
A look at the tables we have created. This analyses all the queries ever run up to a static point

In [None]:
# SQL scripts that (re)build the table-lineage object on BigQuery - this step
# could really be skipped because the target is a view
set_up = ['tables.sql', 'tables2.sql']
set_up2 = [f'{loc}{file_name}' for file_name in set_up]

# execute the scripts on BigQuery
bq = bigquery.Client(credentials=creds, project='itv-bde-analytics-prd')
bqh.run_files(bq, set_up2)

# download every row into a dataframe
sql = """
select * 
from `itv-bde-analytics-prd.britbox_analytics.network_analysis_tables`
order by destination_object
"""
query_job = bq.query(sql, location="EU")
tables = query_job.to_dataframe()


## Reporting Doc (gsheet)
Pulling in from the GCP Reporting doc that we need
This provides the end nodes (ie the reports)

In [None]:
# Read the 'Live Reports' tab (columns A:T) of the reporting Google Sheet.
# NOTE(review): the helper returns a pair; only the second element (`reports`)
# is used below, where it is indexed like a DataFrame - confirm what `r` holds.
r , reports = gaf.read_google_sheets_as_rows(
    '10YrdDZaAdoVdXrpSEtmUQ8U6A8KW37LgkGLrei1iVZw'
    ,'Live Reports!A:T'
    ,creds
    ,header_row=0
)

# keep just the reports that we own, and only the columns needed downstream
cols = ['Report Name', 'GCP tables used', 'BI Tool']
reports2 = reports[cols].loc[reports['Maintained by'] == 'BritBox Analytics']

# Turn the newline-separated 'GCP tables used' cell into one row per table.
# Method adapted from:
# https://medium.com/@sureshssarda/pandas-splitting-exploding-a-column-into-multiple-rows-b1b1d59ea12e
#   1. split the original column on the newline delimiter
#   2. replace missing values with the empty string (None values made this fall over)
#   3. build a frame indexed by (Report Name, BI Tool) from the split pieces
#   4. stack() the columns into rows, giving a series
new_df = pd.DataFrame(
    reports2['GCP tables used'].str.split("\n").fillna('').tolist()
     , index = [reports2['Report Name'], reports2['BI Tool']]
    ).stack()

# Move the index levels back into columns, which also turns the series into a
# dataframe.
# NOTE(review): the original author observed that this also drops the
# *counter* index level; the `0` in the level list is easy to misread, so
# re-check the resulting columns whenever the pandas version changes.
new_df = new_df.reset_index(['Report Name', 'BI Tool', 0])

# rename the fields
new_df.columns = ['Report Name', 'BI Tool', 'Created From']

# a bit of rejigging so the field names match the ones selected from the
# views/tables frames when everything is combined
reports3 = new_df
reports3 = reports3.rename(columns = {
    'Report Name' : 'destination_object'
    , 'BI Tool' : 'definition'
    , 'Created From' : 'created_from_full'
    }
)

# tag every row as a report and trim stray whitespace from both name columns
reports3['type'] = 'Report'
reports3['destination_object'] = reports3['destination_object'].str.strip()
reports3['created_from_full'] = reports3['created_from_full'].str.strip()
# reports3

## Putting it all together
Combining all the nodes and checking for any inconsistencies. 

Views will take precedence over tables where there are conflicts

In [None]:
# Combine the three sets of nodes into one frame.
# https://pandas.pydata.org/pandas-docs/stable/user_guide/merging.html

# the columns we want; each frame built so far is cut down to exactly these
cols = ['destination_object', 'created_from_full', 'definition', 'type']
the_tables = tables[cols]
the_views = views[cols]
the_reports = reports3[cols]

# stack them on top of each other and renumber the rows from zero
frames = [the_tables, the_views, the_reports]
everything = pd.concat(frames, ignore_index=True)

### Exclusions. 
A few nodes need excluding because they were creating circular references. That is a problem because we need a directed acyclic graph.

In [None]:
# destination objects to leave out of the graph
records_to_exclude = [
    '`itv-bde-analytics-dev.britbox_sandbox.SB_stripe_iTunes_conversions_final`',
    '`itv-bde-analytics-dev.britbox_sandbox.SB_stripe_iTunes_conversions_tables`',
]

# keep every row whose destination is not on the list, then renumber the rows
everything2 = (
    everything[~everything['destination_object'].isin(records_to_exclude)]
    .reset_index(drop=True)
)

### Dealing with the case of conflicting definitions
this is mainly between tables and views (as the views will be only ever the "latest") 
and also tables and tables. 

**-----------Skip for now---------**

In [None]:
# It would be possible for a destination_object to appear in both the tables
# part and the views part, so something is needed that prioritises the views
# (these are current, whereas tables are snapshots).

# # First attempt, using pandas methods rather than SQL-like code - could not
# # get it working, revisit.
# # It just checks whether there are conflicts (ie multiple definitions per destination object)
# everything['hi'] = everything.sort_values(['type', 'definition'], ascending=[False, True]) \
#     .groupby(['destination_object', 'type', 'definition']) \
#     .rank() 
# everything

# Simple in SQL: rank the rows within each destination_object by type.
# NOTE(review): the query reads `everything`, i.e. the frame from before the
# exclusions were applied, and sqldf is called without an explicit environment
# so it presumably looks the frame up by name in the notebook's variables -
# confirm both are intended.
sql = '''
select 
    *, dense_rank() over (partition by destination_object order by type desc) as hihi
from everything 
'''
hi = sqldf(sql)
# hi.loc[hi['hihi']!= 1]
# No conflicts have ever shown up, so moving on for now.

### Create a full graph 
add nodes and edges from the dataframe we have gathered

In [None]:
# Build a directed graph with one edge per row:
# created_from_full -> destination_object, with no edge attributes
T = nx.from_pandas_edgelist(
    everything2,
    source='created_from_full',
    target='destination_object',
    edge_attr=None,
    create_using=nx.DiGraph,
)

# quick look at the result
# nx.draw(T, with_labels = False)

### Some node types defined

In [None]:
# Node classifications - add as many groups as you like.
# Reports come straight from the reporting sheet; the rest are listed by hand.
known_reports = set(reports3['destination_object'])
known_sources = {
    '`itv-bde-svod-dev.svod_entitlements.entitlements`',
    '`itv-bde-svod-prd.svod_entitlements.entitlements`',
}
known_marts = {
    '`itv-bde-analytics-prd.britbox_analytics.entitlements`',
    '`itv-bde-analytics-dev.britbox_analytics.Viewing_clean`',
    '`itv-bde-analytics-dev.britbox_sandbox.Viewing_clean`',
}
known_simon = {
    '`itv-bde-analytics-dev.britbox_sandbox.ss_entitlements`',
}
known_pre_marts = {
    '`itv-bde-analytics-dev.britbox_sandbox.ss_stripe_layer2`',
    '`itv-bde-analytics-dev.britbox_sandbox.ss_itunes_layer2`',
    '`itv-bde-analytics-dev.britbox_sandbox.ss_ee_layer2`',
    '`itv-bde-analytics-prd.britbox_sandbox.ss_bt_layer2`',
}

### Cut down the graph. 
Everything has to ultimately feed a report. 
If it doesn't, it is culled. 
For this reason it is important that the reporting tracker is well maintained.

In [None]:
# Objects that do not contribute to a mart or a report need removing.
# The marts and reports together are the terminal nodes we care about
# (we might want to drop these from the diagram too, to make it simpler).
important_nodes = known_marts | known_reports
# the return value is not used, so the helper presumably prunes T in place
nf.cutdown_unimportant_nodes(T, important_nodes)

# quick look at the result
# nx.draw(T, with_labels = False)


### Save the graph

In [None]:
# Save the graph so the slow build above does not have to be repeated.
# nx.write_gpickle was deprecated in networkx 2.6 and removed in 3.0; it was a
# thin wrapper around pickle.dump, so using pickle directly writes the same
# file and works on every networkx version.
import pickle

with open("test_graph.gpickle", "wb") as graph_file:
    pickle.dump(T, graph_file, protocol=pickle.HIGHEST_PROTOCOL)

--------------------------------------------
--------------------------------------------

### Pick up the graph

In [None]:
# Reload the graph saved earlier.
# nx.read_gpickle was deprecated in networkx 2.6 and removed in 3.0; it was a
# thin wrapper around pickle.load, so using pickle directly reads the same file.
# NOTE: unpickling can execute arbitrary code - only load files that this
# notebook wrote itself.
import pickle

with open("test_graph.gpickle", "rb") as graph_file:
    T = pickle.load(graph_file)

### Set position of the nodes

In [None]:
# x comes from the left-aligned layout, y is then distributed given those x values
x_pos = nf.xpos_left_aligned(T)
y_pos = nf.ypos_simple_distributed(T, x_pos)

# node -> [x, y], the shape nx.draw expects for positions
pos = {node: [x, y_pos[node]] for node, x in x_pos.items()}
# nx.draw(T, pos, with_labels = False)

### Some node types defined

In [None]:
# Node classifications - add as many groups as you like.
known_reports = {
    'Axis Catalogue',
    'BBC PS Reporting',
    'BBC Partnership Board Reporting',
    'Brand Tracker',
    'Connected TV Revenue Share',
    'Contact Centre Data Feed',
    'Content Delta Reporting',
    'Content Reporting: Channel 4',
    'Content Reporting: Channel 5',
    'Content Reporting: Channel BBC',
    'Content Reporting: Channel NBCU',
    'Joiners Survey',
    'Marketing Attribution Dashboard',
    'PES Data Quality Dashboard',
    'PRS Music Reporting',
    'Spitting Image Launch Dashboard',
    'Spitting Image Launch Dashboard (backup)',
    'Trading Dashboard',
    'Viewing Toplines',
}
known_sources = {
    '`itv-bde-svod-dev.svod_entitlements.entitlements`',
    '`itv-bde-svod-prd.svod_entitlements.entitlements`',
}
known_marts = {
    '`itv-bde-analytics-prd.britbox_analytics.entitlements`',
    '`itv-bde-analytics-dev.britbox_analytics.Viewing_clean`',
    '`itv-bde-analytics-dev.britbox_sandbox.Viewing_clean`',
}
known_simon = {
    '`itv-bde-analytics-dev.britbox_sandbox.ss_entitlements`',
}
known_pre_marts = {
    '`itv-bde-analytics-dev.britbox_sandbox.ss_stripe_layer2`',
    '`itv-bde-analytics-dev.britbox_sandbox.ss_itunes_layer2`',
    '`itv-bde-analytics-dev.britbox_sandbox.ss_ee_layer2`',
    '`itv-bde-analytics-prd.britbox_sandbox.ss_bt_layer2`',
}

# Colouring: every node in a group gets the same colour, and each group gets
# a different colour.
colourings = [
    known_reports,
    known_sources,
    known_marts,
    known_simon,
    known_pre_marts,
]

### Coloured version of the graph

In [None]:
# work out a colour per node from the groups, without modifying T
the_colours = nf.node_type(T, colourings, inplace=False)

# draw the whole graph on a 12x12 figure
plt.figure(3, figsize=(12, 12))
nx.draw(
    T,
    pos,
    with_labels=False,
    node_size=60,
    font_size=8,
    node_color=list(the_colours.values()),
)
plt.show()
#### NOTE: there looks to be a node out of place in this picture

### Example: Cut down the graph to an interesting node

In [None]:
## pick the nodes of interest and take the subgraph they induce
subnodes = nf.subnodes(T, ['`itv-bde-analytics-prd.britbox_analytics.entitlements`'])
# alternative starting points:
# subnodes = nf.subnodes(T,['Trading Dashboard'])
# subnodes = nf.predecessor_subnodes(T,['`itv-bde-analytics-dev.britbox_sandbox.ss_new_entitlement_stripe_layer1`'])
# subnodes = nf.predecessor_subnodes(T,['`itv-bde-analytics-dev.britbox_sandbox.ss_stripe_layer1`'])
U = T.subgraph(subnodes)

## recompute the node positions for the smaller graph (optional)
U_x_pos = nf.xpos_left_aligned(U)
U_y_pos = nf.ypos_simple_distributed(U, U_x_pos)
U_pos = {node: [x, U_y_pos[node]] for node, x in U_x_pos.items()}

# work out a colour per node from the groups, without modifying U
the_colours = nf.node_type(U, colourings, inplace=False)

# draw the subgraph on a 12x12 figure
plt.figure(3, figsize=(12, 12))
nx.draw(
    U,
    U_pos,
    with_labels=False,
    node_size=60,
    font_size=8,
    node_color=list(the_colours.values()),
)
plt.show()

### Example: Testing a different position algorithm

In [None]:
# same x positions as before, but y positions from the alternative algorithm
ypos_test = nf.ypos_(U, U_x_pos)
U_pos = {node: [x, ypos_test[node]] for node, x in U_x_pos.items()}

# work out a colour per node from the groups, without modifying U
the_colours = nf.node_type(U, colourings, inplace=False)

# draw the subgraph on a 12x12 figure
plt.figure(3, figsize=(12, 12))
nx.draw(
    U,
    U_pos,
    with_labels=False,
    node_size=60,
    font_size=8,
    node_color=list(the_colours.values()),
)
plt.show()

### Example: collapse the nodes

In [None]:
# collapse the subgraph into a new graph V
V = nf.collapse_nodes(U)

## compute node positions for the collapsed graph (optional)
V_x_pos = nf.xpos_left_aligned(V)
V_y_pos = nf.ypos_(V, V_x_pos)
V_pos = {node: [x, V_y_pos[node]] for node, x in V_x_pos.items()}

# work out a colour per node from the groups, without modifying V
the_colours = nf.node_type(V, colourings, inplace=False)

# draw the collapsed graph on an 11x11 figure
plt.figure(3, figsize=(11, 11))
nx.draw(
    V,
    V_pos,
    with_labels=False,
    node_size=60,
    font_size=8,
    node_color=list(the_colours.values()),
)
plt.show()

### Exploding the graph
* assign a dummy node every time an edge crosses an x position without stopping at a node there. 
* this might help the ordering propagate

In [None]:
# annotate a copy of the subgraph, then explode it using U's x positions
W_ = U.copy()
nf.set_node_text(W_)
nf.node_type(W_, colourings)
W = nf.explode_graph(W_, U_x_pos)

## compute node positions for the exploded graph (optional)
W_x_pos = nf.xpos_left_aligned(W)
W_y_pos = nf.ypos_(W, W_x_pos)
W_pos = {node: [x, W_y_pos[node]] for node, x in W_x_pos.items()}

# store each node's position on the graph as the "pos" attribute
nx.set_node_attributes(W, W_pos, "pos")

# colour the exploded graph too
nf.node_type(W, colourings)

# per-node drawing inputs, read back from the node attributes
W_pos = {node: W.nodes[node]['pos'] for node in W}
node_size = [W.nodes[node]['is_real_node'] * 40 for node in W]
node_colour = [W.nodes[node]['node_colouring'] for node in W]

# draw the exploded graph on an 11x11 figure
plt.figure(3, figsize=(11, 11))
nx.draw(
    W,
    W_pos,
    with_labels=False,
    node_size=node_size,
    font_size=8,
    node_color=node_colour,
)
plt.show()

In [None]:
# Save the exploded graph so the steps above do not have to be repeated.
# nx.write_gpickle was deprecated in networkx 2.6 and removed in 3.0; it was a
# thin wrapper around pickle.dump, so using pickle directly writes the same
# file and works on every networkx version.
import pickle

with open("test_graph2.gpickle", "wb") as graph_file:
    pickle.dump(W, graph_file, protocol=pickle.HIGHEST_PROTOCOL)

---
---

In [None]:
# Reload the exploded graph saved earlier.
# nx.read_gpickle was deprecated in networkx 2.6 and removed in 3.0; it was a
# thin wrapper around pickle.load, so using pickle directly reads the same file.
# NOTE: unpickling can execute arbitrary code - only load files that this
# notebook wrote itself.
import pickle

with open("test_graph2.gpickle", "rb") as graph_file:
    W = pickle.load(graph_file)

# The start
of something magical
* https://plotly.com/python/network-graphs/

In [None]:
# take a copy of the exploded graph so the plotly cells below read from WG
# rather than from W
WG = W.copy()

In [None]:
# Just a once-over of the attributes stored on a single node (the notebook
# displays the attribute dict of the node looked up here).
WG.nodes['`itv-bde-analytics-prd.britbox_analytics.entitlements`']
# WG.nodes['`itv-bde-analytics-dev.britbox_sandbox.ss_account_deleted_layer0b`->`itv-bde-analytics-dev.britbox_sandbox.ss_itunes_layer1`-0']

In [None]:
## Create Edges
## All edges go into a single trace as disconnected line segments.
# The positions already live on the graph as the 'pos' attribute; they are
# copied out into flat x / y lists here for the scatter trace.
edge_x = []
edge_y = []
for start, end in WG.edges():
    x0, y0 = WG.nodes[start]['pos']
    x1, y1 = WG.nodes[end]['pos']
    # the trailing None separates this segment from the next one
    edge_x.extend([x0, x1, None])
    edge_y.extend([y0, y1, None])

edge_trace = go.Scatter(
    x=edge_x,
    y=edge_y,
    mode='lines',
    hoverinfo='none',
    line={'width': 0.5, 'color': '#888'},
)

In [46]:
## Add nodes as a scatter trace
# Pull the node attributes out of the graph into parallel lists.
# Position is always expected to be present; text, colour and size may be
# missing on a node, so each of those falls back to a default.
node_x = []
node_y = []
node_texts = []
node_colours = []
node_sizes = []

for node, attrs in WG.nodes(data=True):

    x, y = attrs['pos']
    node_x.append(x)
    node_y.append(y)

    # NODE TEXT: empty hover text when the node has none
    node_texts.append(attrs.get('node_text', ''))

    # COLOURS: 0 when unset (would need more thought if non-integer colours
    # were ever allowed)
    node_colours.append(attrs.get('node_colouring', 0))

    # NODE SIZE: scaled by the is_real_node flag, 0 when the flag is missing
    node_sizes.append(attrs.get('is_real_node', 0) * 6)

# this bit actually defines the scatter
node_trace = go.Scatter(
    x=node_x, y=node_y,
    mode='markers',
    hoverinfo='text',
    text=node_texts,
    marker=dict(
        showscale=False,
        # colorscale options
        #'Greys' | 'YlGnBu' | 'Greens' | 'YlOrRd' | 'Bluered' | 'RdBu' |
        #'Reds' | 'Blues' | 'Picnic' | 'Rainbow' | 'Portland' | 'Jet' |
        #'Hot' | 'Blackbody' | 'Earth' | 'Electric' | 'Viridis' |
        colorscale='Rainbow',
        reversescale=True,
        color=node_colours,
        size=node_sizes,
        colorbar=dict(
            thickness=15,
            # the flat `titleside` attribute is deprecated and rejected by
            # plotly 6; the nested title.side form works on old and new versions
            title=dict(text='Node Connections', side='right'),
            xanchor='left',
        ),
        line_width=2,
    ),
)

In [None]:
# Assemble the figure: edges first so the node markers are drawn on top.
fig = go.Figure(
    data=[edge_trace, node_trace],
    layout=go.Layout(
        # the flat `titlefont_size` attribute is deprecated and rejected by
        # plotly 6; the nested title.font form works on old and new versions
        title=dict(
            text='<br>Network graph of views, tables, and reports for BritBox',
            font=dict(size=16),
        ),
        showlegend=False,
        hovermode='closest',
        margin=dict(b=20, l=5, r=5, t=40),
        # footnote explaining how to read the direction of an edge
        annotations=[dict(
            text="For any given pair of nodes connected by an edge the right-hand node is built in some way from the left-hand node",
            showarrow=False,
            xref="paper", yref="paper",
            x=0.005, y=-0.002,
        )],
        # hide the axes - the coordinates only encode the layout
        xaxis=dict(showgrid=False, zeroline=False, showticklabels=False),
        yaxis=dict(showgrid=False, zeroline=False, showticklabels=False),
    ),
)
fig.show()

In [None]:
# show the nodes of WG (the notebook displays the result of this expression)
WG.nodes()

How should this all work? 
* I think: put in the unexploded graph
* give it attributes (colours etc)
* explode it for drawing (so the positioning attributes should come last)
* draw it

This way attributes can persist if you want them to (i.e. if we have edge attributes in the future),
and the usability of the graph remains (as we're not dealing with phantom nodes that exist purely for drawing).


# Example: 
Full graph, collapsed, annotated, exploded, drawn

In [73]:
# take a copy of the full graph so the steps below start from S rather than T
S = T.copy()

In [74]:
# collapse nodes; the helper's result is kept as a separate graph, S1
S1 = nf.collapse_nodes(S)

In [75]:
# Annotate the collapsed graph with node text and node colouring.
# NOTE(review): the return values are not used, so both helpers presumably
# store their results as node attributes on S1 (the drawing cell later reads
# 'node_text' and 'node_colouring') - confirm in network_functions.
nf.set_node_text(S1, shortmessage = False)
nf.node_type(S1, colourings)

In [76]:
# explode the annotated graph, using its own x positions
S1_x_pos = nf.xpos_left_aligned(S1)
S2 = nf.explode_graph(S1, S1_x_pos)

# lay out the exploded graph: x first, then y given those x values
S2_x_pos = nf.xpos_left_aligned(S2)
S2_y_pos = nf.ypos_(S2, S2_x_pos)
S2_pos = {node: [x, S2_y_pos[node]] for node, x in S2_x_pos.items()}

# store each node's position on the graph as the "pos" attribute
nx.set_node_attributes(S2, values=S2_pos, name="pos")

In [77]:
# draw it - Edges

## Create Edges
## All edges go into a single trace as disconnected line segments.
# The positions already live on the graph as the 'pos' attribute; they are
# copied out into flat x / y lists here for the scatter trace.
edge_x = []
edge_y = []
for start, end in S2.edges():
    x0, y0 = S2.nodes[start]['pos']
    x1, y1 = S2.nodes[end]['pos']
    # the trailing None separates this segment from the next one
    edge_x.extend([x0, x1, None])
    edge_y.extend([y0, y1, None])

edge_trace = go.Scatter(
    x=edge_x,
    y=edge_y,
    mode='lines',
    hoverinfo='none',
    line={'width': 0.5, 'color': '#888'},
)

In [78]:
# draw it - Nodes

## Add nodes as a scatter trace
# Pull the node attributes out of the graph into parallel lists.
# Position is always expected to be present; text, colour and size may be
# missing on a node, so each of those falls back to a default.
node_x = []
node_y = []
node_texts = []
node_colours = []
node_sizes = []

for node, attrs in S2.nodes(data=True):

    x, y = attrs['pos']
    node_x.append(x)
    node_y.append(y)

    # NODE TEXT: empty hover text when the node has none
    node_texts.append(attrs.get('node_text', ''))

    # COLOURS: 0 when unset (would need more thought if non-integer colours
    # were ever allowed)
    node_colours.append(attrs.get('node_colouring', 0))

    # NODE SIZE: scaled by the is_real_node flag, 0 when the flag is missing
    node_sizes.append(attrs.get('is_real_node', 0) * 6)

# this bit actually defines the scatter
node_trace = go.Scatter(
    x=node_x, y=node_y,
    mode='markers',
    hoverinfo='text',
    text=node_texts,
    marker=dict(
        showscale=False,
        # colorscale options
        #'Greys' | 'YlGnBu' | 'Greens' | 'YlOrRd' | 'Bluered' | 'RdBu' |
        #'Reds' | 'Blues' | 'Picnic' | 'Rainbow' | 'Portland' | 'Jet' |
        #'Hot' | 'Blackbody' | 'Earth' | 'Electric' | 'Viridis' |
        colorscale='Rainbow',
        reversescale=True,
        color=node_colours,
        size=node_sizes,
        colorbar=dict(
            thickness=15,
            # the flat `titleside` attribute is deprecated and rejected by
            # plotly 6; the nested title.side form works on old and new versions
            title=dict(text='Node Connections', side='right'),
            xanchor='left',
        ),
        line_width=2,
    ),
)

In [79]:
# Assemble the figure: edges first so the node markers are drawn on top.
fig = go.Figure(
    data=[edge_trace, node_trace],
    layout=go.Layout(
        # the flat `titlefont_size` attribute is deprecated and rejected by
        # plotly 6; the nested title.font form works on old and new versions
        title=dict(
            text='<br>Network graph of views, tables, and reports for BritBox',
            font=dict(size=16),
        ),
        showlegend=False,
        hovermode='closest',
        margin=dict(b=20, l=5, r=5, t=40),
        # footnote explaining how to read the direction of an edge
        annotations=[dict(
            text="For any given pair of nodes connected by an edge the right-hand node is built in some way from the left-hand node",
            showarrow=False,
            xref="paper", yref="paper",
            x=0.005, y=-0.002,
        )],
        # hide the axes - the coordinates only encode the layout
        xaxis=dict(showgrid=False, zeroline=False, showticklabels=False),
        yaxis=dict(showgrid=False, zeroline=False, showticklabels=False),
    ),
)
fig.show()