In [22]:
# Set up modules for Google functionality
from google.cloud import bigquery # To run BQ statements
from google_auth_oauthlib import flow # To authorise as user
from googleapiclient.discovery import build # To pull in from sheets, slides etc. API
from google.auth.transport.requests import Request

# Display
import pprint

# Operating system stuff
import pickle
import os.path
import sys

# Data handling
import json
import requests
from pandas import read_csv
from pandas import datetime

# Stats, models, datasheets
import pandas as pd
import pyreadstat

# Visualisation
from matplotlib import pyplot
import matplotlib.pyplot as plt
import matplotlib_venn # For venn diagrams
from pandas.plotting import autocorrelation_plot
import wordcloud


# Text processing
#!pip install nltk
import nltk
from nltk.corpus import stopwords
#nltk.download('stopwords')

# Network graphs
import networkx as nx


# Misc
from xlsxwriter.utility import xl_rowcol_to_cell # Used to create cell references
import itertools

# Load custom scripts in reusable_code folder
sys.path.append(r'/home/jupyter/reusable_code')

import google_api_functions as gaf



In [2]:
# Location handed to the shared auth helper and the GCP project that queries run in.
REUSABLE_CODE_DIR = r'/home/jupyter/reusable_code/'
BQ_PROJECT = 'itv-bde-analytics-dev'

# NOTE(review): Authenticate_Google lives in google_api_functions, which is not
# visible here; presumably it returns user credentials — confirm against the helper.
creds = gaf.Authenticate_Google(REUSABLE_CODE_DIR)

# BigQuery client used by every query cell below.
bq = bigquery.Client(project=BQ_PROJECT, credentials=creds)

## WordClouds from programme Synopses

In [61]:
# One row per (macro_segment, segment): the `synopsis.prog.thousand` text of every
# distinct catalogue row in that segment, joined with spaces and lower-cased.
# Null segments are labelled 'No segment'; null synopses contribute a single blank.
query=""" 
select macro_segment,
segment,
lower(array_to_string(array_agg(ifnull(thousand,' ')),' ')) as synopses
from
  (select distinct  ifnull(macro_segment,'No segment') as macro_segment
  , ifnull(segment,'No segment') as Segment,
  ccid.programme_id,synopsis.prog.thousand
  from `itv-bde-analytics-dev.britbox_sandbox.SW_ContentSegPOC_Catalogue` )
group by 1,2
"""

# Submit the query, then materialise the result as a pandas DataFrame.
synopses_job = bq.query(query)
df = synopses_job.to_dataframe()
df

In [64]:
# Render, display and save one word cloud per row of `df`, built from that
# segment's concatenated synopses text.
STOPWORDS=stopwords.words('english')

# Loop-invariant: domain-specific filler words plus the NLTK English stop words.
synopsis_stopwords = ['starring','of','the','and','to'] + list(STOPWORDS)

# to_file() does not create missing directories, so make sure the target exists.
os.makedirs("WordClouds", exist_ok=True)

for record in df.to_dict('records'):
    try:
        wc = wordcloud.WordCloud(
            max_font_size=50,
            max_words=50,
            background_color='white',
            stopwords=synopsis_stopwords,
        ).generate(record['synopses'])
    except ValueError:
        # generate() raises ValueError when no words are left to plot (e.g. a
        # segment whose synopses are all blank); skip it instead of aborting the run.
        print('Skipping {} / {}: no words to plot'.format(record['macro_segment'], record['segment']))
        continue

    # Display the generated image on its own figure.
    fig, ax = plt.subplots()
    ax.imshow(wc, interpolation='bilinear')
    ax.axis("off")
    # Font family was misspelled 'Monserrat', which made matplotlib fall back to its default.
    ax.set_title(record['segment']).set_fontfamily('Montserrat')
    plt.show()
    plt.close(fig)  # release the figure; one is created per segment

    # Segment names are used in the file name, so neutralise path separators.
    file_label = ('Synopses-'+record['macro_segment']+'-'+record['segment']).replace('/', '-')
    wc.to_file("WordClouds/{}.png".format(file_label))

## WordClouds from subgenre

In [60]:
# One row per (macro_segment, segment): the sub-genre labels of every distinct
# catalogue row in that segment, joined with spaces and lower-cased.
# The inner select flattens the `sub_genre` array to a space-separated string;
# null segments are labelled 'No segment' and null sub-genres contribute a blank.
query=""" 
select macro_segment,
segment,
lower(array_to_string(array_agg(ifnull(subgenre,' ')),' ')) as subgenre
from
  (select distinct  ifnull(macro_segment,'No segment') as macro_segment
  , ifnull(segment,'No segment') as Segment,
  ccid.programme_id,array_to_string(sub_genre,' ') as subgenre
  from `itv-bde-analytics-dev.britbox_sandbox.SW_ContentSegPOC_Catalogue` )
group by 1,2
"""

# Submit the query, then materialise the result as a pandas DataFrame.
subgenre_job = bq.query(query)
df_subg = subgenre_job.to_dataframe()
df_subg

In [63]:
# Render, display and save one word cloud per row of `df_subg`, built from that
# segment's concatenated sub-genre labels.
STOPWORDS=stopwords.words('english')

# Loop-invariant: domain-specific filler words plus the NLTK English stop words.
subgenre_stopwords = ['starring','of','the','and','to'] + list(STOPWORDS)

# to_file() does not create missing directories, so make sure the target exists.
os.makedirs("WordClouds", exist_ok=True)

for record in df_subg.to_dict('records'):
    try:
        wc = wordcloud.WordCloud(
            max_font_size=50,
            max_words=50,
            background_color='white',
            stopwords=subgenre_stopwords,
        ).generate(record['subgenre'])
    except ValueError:
        # generate() raises ValueError when no words are left to plot (e.g. a
        # segment with no sub-genres at all); skip it instead of aborting the run.
        print('Skipping {} / {}: no words to plot'.format(record['macro_segment'], record['segment']))
        continue

    # Display the generated image on its own figure.
    fig, ax = plt.subplots()
    ax.imshow(wc, interpolation='bilinear')
    ax.axis("off")
    # Font family was misspelled 'Monserrat', which made matplotlib fall back to its default.
    ax.set_title(record['segment']).set_fontfamily('Montserrat')
    plt.show()
    plt.close(fig)  # release the figure; one is created per segment

    # Segment names are used in the file name, so neutralise path separators.
    file_label = ('SubGenre-'+record['macro_segment']+'-'+record['segment']).replace('/', '-')
    wc.to_file("WordClouds/{}.png".format(file_label))