In [8]:
# Set up modules for Google functionality
from google.cloud import bigquery # To run BQ statements
from google_auth_oauthlib import flow # To authorise as user
from googleapiclient.discovery import build # To pull in from sheets, slides etc. API
from google.auth.transport.requests import Request
from google.cloud.bigquery import magics

# Display
import pprint

# Operating system stuff
import pickle
import os.path
import sys

# Data handling
import json
import requests
from pandas import read_csv
from pandas import datetime
import re

# Stats, models, datasheets
import pandas as pd
import pyreadstat
import math

# Visualisation
from matplotlib import pyplot
import matplotlib.pyplot as plt
import matplotlib_venn # For venn diagrams
from pandas.plotting import autocorrelation_plot

# Network graphs
import networkx as nx

# Misc
from xlsxwriter.utility import xl_rowcol_to_cell # Used to create cell references
import itertools


# Load custom scripts in reusable_code folder
# Put the shared helper folder on the import path; this must run before the
# project-local import below or that import cannot be resolved.
sys.path.append(r'/home/jupyter/reusable_code')

import google_api_functions as gaf
# Obtain Google credentials via the project helper, passing it the helper folder.
# NOTE(review): presumably the folder holds the client secrets / cached token — confirm in google_api_functions.
creds=gaf.Authenticate_Google(r'/home/jupyter/reusable_code/')


In [12]:
# Shell escape: print pip's metadata for the installed package named `bq`.
# NOTE(review): the BigQuery client imported in this notebook is distributed as
# `google-cloud-bigquery` — confirm that `bq` is really the package meant here.
!pip show bq


In [10]:
# BigQuery client "bq", bound to the analytics dev project and the user credentials.
bq = bigquery.Client(
    credentials=creds,
    project='itv-bde-analytics-dev',
)

# Reuse the same credentials for the BigQuery cell-magic syntax.
magics.context.credentials = creds


In [11]:
# Number of distinct screen names per screen type on dotcom for one day,
# most varied screen types first.
query="""

select screen_type,
count(distinct screen_name) 
from `itv-bde-svod-prd.cpt.structured_cpt`
where date(timestamp)='2020-04-02'
and platform_id='dotcom'
group by 1
order by 2 desc;
"""
screen_type_job = bq.query(query)
df = screen_type_job.to_dataframe()
df

In [147]:
# Display the current contents of `df` (whatever the last executed query cell assigned).
df

In [66]:
# Page counts on dotcom for one day, broken down by login status, screen type,
# a derived screen id and form id.
query=""" 
select 
case when user_itv_id is null then 'Logged out' else 'Logged in' end as login_status,
screen_type,
case when screen_type in ('player','program-detail','show-detail','movie-detail') then screen_type
when screen_type ='sub-category' then 'actor-pages'
else replace(screen_name,'dotcom.','') end as screen_id,
form_id,
--referrer
count(*) as N
from `itv-bde-svod-prd.cpt.structured_cpt`
where date(timestamp)='2020-04-02'
and platform_id='dotcom'
group by 1,2,3,4
order by 1,2,3,4;
"""
login_breakdown_job = bq.query(query)
df = login_breakdown_job.to_dataframe()
df


##NOTE
When using \ in the SQL code, Python treats it as an escape character, which causes errors when the query is copy-pasted directly from the GCP BQ editor. One option is to escape the escape: \ in BQ becomes \\ in Python, and \\ in BQ becomes \\\\ in Python.

The other option is to use the r""" """ syntax so that Python takes the text as a raw string and applies no escaping, the same as you would do when writing a file path.

In [49]:
# Build the transition table used for the Markov-chain work.
#   base    : one row per auto screen-load event on dotcom for one partition date,
#             with screen_id = screen type + login status.
#   paths   : per device, flags whether an event starts a session and which
#             screen follows it ('Bounce' when nothing follows within the session).
#   results : 'Start' -> first screen counts, plus screen -> next screen counts.
# The final select adds each transition's share of all transitions leaving from_node.
#
# A gap of more than 180 minutes starts a new session. The next-screen test uses
# <=180 so that it is the exact complement of the >180 session-start test; with
# the previous <180 a gap of exactly 180 minutes marked the earlier page as
# 'Bounce' while the later page was still counted as 'In Session'.
#
# Backslashes are doubled because this is a normal (non-raw) Python string.
query="""
create or replace table `itv-bde-analytics-dev.britbox_sandbox.Markov_test` as
with base as 
(select *,
/*
case 
when screen_type in ('player','program-detail','show-detail','movie-detail','feature') then screen_type
when screen_type ='sub-category' then 'actor-pages'
else screen_name||first_form
end||"-"||lgin_status
 as screen_id
*/
screen_type||"-"||lgin_status as screen_id
 from
(select 
TRIM(JSON_EXTRACT(raw,'$.cid'),"\\"") as device_id,
ifnull(TRIM(JSON_EXTRACT(raw,'$.screen.type'),"\\""),'Unknown') as screen_type,
ifnull(replace(TRIM(JSON_EXTRACT(raw,'$.screen.name'),"\\""),'dotcom.',''),'Unknown') as screen_name,
case when coalesce(TRIM(JSON_EXTRACT(raw,'$.user.itv_id'),"\\""),TRIM(JSON_EXTRACT(raw,'$.user.id'),"\\""))  
is null then 'LgdOut' 
else 'LgdIn' end as lgin_status,
ifnull(regexp_extract_all(
regexp_extract_all(REGEXP_EXTRACT(TRIM(JSON_EXTRACT(raw,'$.forms'),"\\""),
'({.*})'),'(\\\\{.*?\\\\})')[safe_offset(0)],'"id":"(.*?)",')[safe_offset(0)],'') as first_form,
timestamp
from `itv-bde-svod-prd.svod.svod_page_raw_in`
where _PARTITIONDATE='2020-04-04'
and TRIM(JSON_EXTRACT(raw,'$.platform.id'),"\\"")='dotcom'
and TRIM(JSON_EXTRACT(raw,'$.event'),"\\"")='screen.load.auto.dataLayer.load'
)
)
,paths as (select device_id, screen_id, timestamp,
case 
when lag(timestamp) over (partition by device_id order by timestamp) is null then 'Start'
when timestamp_diff(timestamp,lag(timestamp) over (partition by device_id order by timestamp),MINUTE)>180
then 'Start' else 'In Session' end as sesh_Start,
case 
when lead(timestamp) over (partition by device_id order by timestamp) is null then 'Bounce'
when timestamp_diff(lead(timestamp) over (partition by device_id order by timestamp),timestamp,MINUTE)<=180
then lead(screen_id) over (partition by device_id order by timestamp) else 'Bounce' end as next_screen
from base)

,results as (select 
sesh_start as from_node,
screen_id as to_node,
count(*) as pages,
count(distinct device_id) as devices
from paths
where sesh_start='Start'
group by 1,2
union all
select 
screen_id as from_node,
next_screen as to_node,
count(*) as pages,
count(distinct device_id) as devices
from paths
group by 1,2)

select results.*,
pages/sum(pages) over (partition by from_node) as pc_transitions
from results
;
"""
# Run the DDL statement and wait for it to finish.
bq.query(query).to_dataframe()


In [47]:
# Placeholder: a raw (r-prefixed) triple-quoted string containing only a newline.
# In a raw string backslashes are kept literally, so SQL can be pasted unescaped.
query=r"""
"""

In [50]:
# Pull the whole Markov_test transition table into a DataFrame.
query="""select * from `itv-bde-analytics-dev.britbox_sandbox.Markov_test`"""
markov_test_job = bq.query(query)
df = markov_test_job.to_dataframe()
df

In [84]:
# Rank the transitions by page volume, busiest first.
sorted_df = df.sort_values(by='pages', ascending=False)

# Keep only the first N rows to limit graph size, as a list of row dicts.
# `.head` is positional. The previous `.loc[:250]` sliced by index *label* on a
# frame whose index was no longer in order after sorting, so it returned every
# row up to wherever label 250 happened to land rather than the top rows.
df_dict = sorted_df.head(250).to_dict(orient='records')


In [85]:
# Turn the selected transition rows into graph inputs:
#   edges - (from_node, to_node, weight) with weight = log10(pages)
#   nodes - every node name that appears at either end of an edge
nodes = []
edges = []

for row in df_dict:
    edges.append((row['from_node'], row['to_node'], math.log10(row['pages'])))
    nodes.extend((row['from_node'], row['to_node']))

# Dedupe while keeping first-seen order. A set would return the names in hash
# order, which differs between kernel sessions and so changes any layout that
# depends on node order.
nodes = list(dict.fromkeys(nodes))

In [86]:
# Build the graph of screen transitions from the node and edge lists.
# NOTE(review): nx.Graph is undirected, so A->B and B->A share one edge and the
# later weight replaces the earlier one — confirm a DiGraph is not wanted here.
G = nx.Graph()
for node_name in nodes:
    G.add_node(node_name)
for src, dst, log_pages in edges:
    G.add_edge(src, dst, weight=log_pages)

In [87]:
# Quick static preview with the nodes laid out on a circle.
circle_positions = nx.circular_layout(G)
nx.draw(G, pos=circle_positions)

In [81]:
# Node count of the longest simple (no repeated node) path from 'Start' to 'Bounce'.
# default=0 covers the case where no such path exists; a bare max() would raise
# ValueError on the empty generator.
longest_non_repeating_route = max(
    (len(path) for path in nx.all_simple_paths(G, source='Start', target='Bounce')),
    default=0,
)

## Trying to do an interactive graph using the instructions from here:
https://pyvis.readthedocs.io/en/latest/

In [88]:
#%pip install pyvis
from pyvis.network import Network

# Interactive pyvis canvas, configured for rendering inside the notebook.
net = Network(
    height=1000,
    width=1000,
    notebook=True,
)


In [89]:
# Register every node name with the pyvis network.
net.add_nodes(nodes)

In [90]:
# Add each (from, to, weight) triple to the pyvis network as a weighted edge.
for src, dst, log_pages in edges:
    net.add_edge(src, dst, weight=log_pages)

In [93]:
# Render the pyvis network to an HTML file.
# help(net) lists the available Network options.
# Switch the physics simulation off.
net.toggle_physics(False)
# Request the configuration panels for the 'physics' and 'edges' option groups.
net.show_buttons(['physics','edges'])
# Write the graph to mygraph.html and display it.
net.show("mygraph.html")


## Trying to do an interactive graph using the instructions from here:

https://medium.com/kenlok/how-to-draw-an-interactive-network-graph-using-dash-b6b744f60931

In [7]:
#%pip install dash 
import dash
import dash_html_components as html
import dash_core_components as dcc
from dash.dependencies import Input, Output
import plotly.graph_objs as go
import networkx as nx

In [18]:
# Force-directed layout. The fixed seed makes node positions, and therefore the
# figure, reproducible across kernel restarts.
pos = nx.layout.spring_layout(G, seed=42)

# Collect the line-segment coordinates for every edge. The None after each pair
# breaks the line so consecutive edges are not joined together.
edge_x = []
edge_y = []
for src, dst in G.edges():
    x0, y0 = pos[src]
    x1, y1 = pos[dst]
    edge_x.extend([x0, x1, None])
    edge_y.extend([y0, y1, None])

#Create Edges
# Built once from the finished lists instead of growing the trace's tuples with
# += on every iteration, which copied the whole tuple each time.
edge_trace = go.Scatter(
    x=edge_x,
    y=edge_y,
    line=dict(width=0.5,color='#888'),
    hoverinfo='none',
    mode='lines')



In [20]:
# Marker coordinates for every node, in graph iteration order.
node_x = [pos[node][0] for node in G.nodes()]
node_y = [pos[node][1] for node in G.nodes()]

# Colour bar shown next to the plot.
node_colorbar = dict(
    thickness=15,
    title='Node Connections',
    xanchor='left',
    titleside='right'
)

# Marker styling; `color` starts empty and is filled in by a later cell.
node_marker = dict(
    showscale=True,
    colorscale='YlGnBu',
    reversescale=True,
    color=[],
    size=10,
    colorbar=node_colorbar,
    line=dict(width=2))

# Scatter trace holding one marker per node; hover `text` starts empty as well.
node_trace = go.Scatter(
    x=node_x,
    y=node_y,
    text=[],
    mode='markers',
    hoverinfo='text',
    marker=node_marker)

In [21]:
#add color to node points
for node, adjacencies in enumerate(G.adjacency()):
    node_trace['marker']['color']+=tuple([len(adjacencies[1])])
    node_info = 'Name: ' + str(adjacencies[0]) + '<br># of connections: '+str(len(adjacencies[1]))
    node_trace['text']+=tuple([node_info])

In [24]:
# Axis styling shared by x and y: no grid, zero line or tick labels.
hidden_axis = dict(showgrid=False, zeroline=False, showticklabels=False)

# Single annotation slot anchored to the paper; it carries no text.
footer_note = dict(
    showarrow=False,
    xref="paper", yref="paper",
    x=0.005, y=-0.002 )

flow_layout = go.Layout(
    title='Flow through website',
    titlefont=dict(size=16),
    showlegend=False,
    hovermode='closest',
    margin=dict(b=20,l=5,r=5,t=40),
    annotations=[ footer_note ],
    xaxis=dict(hidden_axis),
    yaxis=dict(hidden_axis))

# Final figure: edges drawn first, node markers on top.
fig = go.Figure(data=[edge_trace, node_trace], layout=flow_layout)

In [25]:
# Create the Dash application. The layout and callback below attach to this
# object; without it both raise NameError on a fresh kernel.
app = dash.Dash(__name__)

# Page layout: the network figure on top, then a row holding overall graph
# statistics and a panel that the callback fills with the current selection.
app.layout = html.Div([
                html.Div(dcc.Graph(id='Graph',figure=fig)),
                html.Div(className='row', children=[
                    html.Div([html.H2('Overall Data'),
                              html.P('Num of nodes: ' + str(len(G.nodes))),
                              html.P('Num of edges: ' + str(len(G.edges)))],
                              className='three columns'),
                    html.Div([
                            html.H2('Selected Data'),
                            html.Div(id='selected-data'),
                        ], className='six columns')
                    ])
                ])


@app.callback(
    Output('selected-data', 'children'),
    [Input('Graph','selectedData')])
def display_selected_data(selectedData):
    """Summarise the points currently selected on the network figure.

    Args:
        selectedData: value of the graph's ``selectedData`` property. It is
            None until the user makes a selection; otherwise a dict whose
            'points' entry lists the selected points.

    Returns:
        A list of ``html.P`` elements: the number of selected points followed
        by the name of each selected node.
    """
    # Nothing is selected on first load, so report zero instead of indexing None.
    points = (selectedData or {}).get('points') or []
    text = [html.P('Num of nodes selected: '+str(len(points)))]

    name_prefix = 'Name: '
    for point in points:
        # Hover text has the form 'Name: <node><br># of connections: <n>'.
        # Node names are screen ids (strings), so keep them as text rather than
        # slicing a fixed 10 characters and converting to int.
        label = str(point.get('text', '')).split('<br>')[0]
        if label.startswith(name_prefix):
            label = label[len(name_prefix):]
        if label:
            text.append(html.P(label))
    return text

## WRITE A FUNCTION WHICH
- Resizes axis
- Do weighted edges
- Labels graph
-  Applies a colour scheme
- Allows you to resize nodes

In [112]:
#More precise drawing options
import numpy as np
#Position the nodes on the graph by the centrality metrics of the node
dc=nx.degree_centrality(G)
bc=nx.betweenness_centrality(G)
pos={}
for i in bc:
    pos[i]=np.asarray([bc[i],dc[i]])
    
#Position the nodes on the graph using one of the defaults
pos = nx.circular_layout(G)  # positions for all nodes
labelpos=[i+1 for i in pos]
#print(pos)

axsize = plt.subplots(figsize=(15,15))[1]# Set graph size for picture
#axsize.set_facecolor(bb_palette['Light Cloud']) # Set graph background colour
plt.axis('off')


#Draw nodes at specified positions
nx.draw_networkx_nodes(G, pos, node_size=2)


edgelist=[(i[0],i[1]) for i in edges]
width=[i[2] for i in edges]
nx.draw_networkx_edges(G,pos,edgelist=edges,width=width)
    
# labels
nx.draw_networkx_labels(G, pos, font_size=7, font_family='sans-serif')
nx.draw_networkx_labels(G,pos=pos,label_pos=20,font_size=7,\
                            #font_color=bb_palette['Dark Storm'],font_family='monserrat'
                       )
plt.show()

## Write a "pos" function

In [None]:
# For directed graph (edges only go one way) or undirected graph with start and end nodes- equivalent of Matrices with absorbing nodes
# Initialise all positions as -1,-1
# Identify start nodes (no inputs, some outputs)
# Identify end nodes (only inputs, no outputs)
# Position start nodes at -1 + padding
# Position end nodes at 1 - padding

# For all other nodes:
# Identify longest path e.g. 12 and set to P
# Create P X bands evenly distributed 
# Identify average length to start nodes and end nodes



# NODE SIZE as number of connections? Number of edges OR sum of weights

In [115]:
# Pos goes between -1 and 1 on both axes from the looks of it...
# Print the coordinate array of every node to check.
for coords in pos.values():
    print(coords)


# Make an array for Markov chains.
For this we need symmetry, so all nodes need to be captured. This can be achieved with a cross join.
We also need to make "bounce" absorbing; this can be unioned in.

In [124]:
# Full from_node x to_node grid of transition probabilities:
#   nodes      - every node seen as either a source or a destination
#   cross_join - all ordered pairs of those nodes
# Pairs with no observed transition get 0; Bounce -> Bounce is forced to 1.
query="""
with nodes as 
(select distinct nodes from
(
select distinct from_node as nodes from `itv-bde-analytics-dev.britbox_sandbox.Markov_test`
union all
select distinct to_node as nodes from `itv-bde-analytics-dev.britbox_sandbox.Markov_test`
))

,cross_join as (select a.nodes as from_node,b.nodes as to_node from nodes a cross join nodes b)

select x.from_node,
x.to_node,
ifnull(case 
    when x.from_node='Bounce' and x.to_node='Bounce' then 1
    else y.pc_transitions
end,0) as transition
from cross_join x
left join 
`itv-bde-analytics-dev.britbox_sandbox.Markov_test` y
on x.from_node=y.from_node
and x.to_node=y.to_node
order by x.to_node,y.to_node
"""
markov_grid_job = bq.query(query)
markov_base_df = markov_grid_job.to_dataframe()
markov_base_df

In [125]:
# Flag absorbing states: rows where a node transitions to itself with probability 1.
is_certain = markov_base_df['transition'] == 1
is_self_loop = markov_base_df['from_node'] == markov_base_df['to_node']
markov_base_df['Absorbing'] = is_certain & is_self_loop

In [126]:
# Total outgoing transition probability per from_node (first few rows).
outgoing_totals = markov_base_df.groupby('from_node').agg({"transition": "sum"})
outgoing_totals.head()

In [98]:
# Non-zero transitions out of 'Start', most likely first.
from_start = markov_base_df['from_node'] == 'Start'
has_probability = markov_base_df['transition'] > 0
markov_base_df[from_start & has_probability].sort_values(by='transition', ascending=False)

In [100]:
# Cross-check against the source table: transitions out of 'Start', largest share first.
query="""select from_node,to_node, pc_transitions from `itv-bde-analytics-dev.britbox_sandbox.Markov_test` where from_node='Start'
order by pc_transitions desc
"""
start_check_job = bq.query(query)
start_check_job.to_dataframe()
#Works at this point

In [140]:
# Reshape the long transition list into a matrix: one row per from_node,
# one column per to_node, cells holding the transition probability.
pivoted_markov_df = markov_base_df.pivot(
    index='from_node',
    columns='to_node',
    values='transition',
)

In [141]:
# Display the pivoted from_node x to_node transition matrix.
pivoted_markov_df

In [103]:
# Display the pivoted matrix as a plain NumPy array (row/column labels dropped).
pivoted_markov_df.to_numpy()

In [104]:
# Transition matrix as a NumPy array: rows are from_node, columns are to_node.
# The stray `landing-page-LgdOut` argument was a pasted node name; Python read
# it as a subtraction of undefined names and raised NameError.
P = pivoted_markov_df.to_numpy()

In [106]:
import numpy as np

# Two-step transition probabilities: the transition matrix raised to the power 2.
two_step_transitions = np.linalg.matrix_power(P, 2)
two_step_transitions

In [109]:
# Show the row labels (from_node values) of the pivoted matrix.
pivoted_markov_df.index

In [123]:
# Show only the rows flagged as absorbing states.
absorbing_mask = markov_base_df['Absorbing']
markov_base_df[absorbing_mask]

In [None]:
|