In [20]:
# Optional one-off installs, left disabled; run manually if the kernel lacks these packages.
#!pip install -U scikit-learn
# !pip install -U plotly
import sklearn
from numpy import unique
from numpy import where
from sklearn.datasets import make_classification
from sklearn.cluster import AgglomerativeClustering,KMeans, DBSCAN, OPTICS, Birch
from sklearn.mixture import GaussianMixture
from matplotlib import pyplot as plt
# Silhouette score compares each point's mean distance to its own cluster with its mean
# distance to the nearest other cluster. It ranges from -1 to 1; higher values mean
# tighter, better-separated clusters, so larger is better.
from sklearn.metrics import silhouette_score
import plotly.express as px
import plotly.graph_objects as go

import numpy as np
import sys
# Make the shared helper modules importable.
# NOTE(review): absolute, machine-specific path - confirm it exists on the target environment.
sys.path.append(r'/home/jupyter/reusable_code')
import google_api_functions as gaf
from google.cloud import bigquery # To run BQ statements
creds=gaf.Authenticate_Google(r'/home/jupyter/reusable_code/') # Return logged-in credentials


In [2]:
# BigQuery client for the analytics project, authenticated with the credentials from the setup cell.
bq = bigquery.Client(
    project='itv-bde-analytics-dev',
    credentials=creds,
)

# Select every column of the first-views table and add two derived columns:
#   FIRSTVIEWINDEX = pc_first_views / pc_notfirst_views
#   dummyField     = constant 1 (used later as a flat y-axis)
# NOTE(review): a plain `/` in BigQuery errors on a zero denominator - confirm
# pc_notfirst_views is never 0 in this table.
query='''select *,pc_first_views/pc_notfirst_views AS FIRSTVIEWINDEX, 1 as dummyField from
`itv-bde-analytics-dev.britbox_sandbox.SW_Viewing_FirstViewsProg`'''

# Run the query and materialise the result as a pandas DataFrame.
df = bq.query(query).to_dataframe()

# Preview the first rows.
df.head()

In [26]:
# Quick static scatter of first-view share against non-first-view share.
df.plot(kind='scatter', x='pc_first_views', y='pc_notfirst_views')

In [35]:
# Marker-size weight: 5 for the six highlighted programmes, 1 for every other row.
df['Big6'] = (
    df['programme']
    .isin([
        'New Spitting Image',
        'Broadchurch',
        'Vera',
        'The Only Way is Essex',
        'Doctor Who Classic',
        'Love Island',
    ])
    .map({True: 5, False: 1})
)

In [44]:
# Transparent-background layout. It is built here but not applied to the figure below,
# which uses the "plotly_white" template instead.
# Layout lives in plotly.graph_objects (imported as `go`); the bare name `Layout` is
# never imported, so referencing it directly raises NameError on a fresh kernel.
layout = go.Layout(paper_bgcolor='rgba(0,0,0,0)',plot_bgcolor='rgba(0,0,0,0)')


# Interactive scatter of first-view share (log x-axis) against non-first-view share.
# Colour encodes FIRSTVIEWINDEX and marker size encodes the Big6 weight.
# An OLS trendline (trendline="ols") was tried here and left disabled.
fig = px.scatter(
    df,
    x='pc_first_views',
    y='pc_notfirst_views',
    log_x=True,
    size_max=20,
    hover_data=['first_view_rank','notfirst_view_rank','programme'],
    color='FIRSTVIEWINDEX',
    size='Big6',
    template="plotly_white",
)
fig.update_traces(textposition='top center')
fig.update_layout(title_text='Programme Groupings', title_x=0.5)  # title_x=0.5 centres the title
fig.show()

In [5]:
# Quick static scatter of first-view rank against non-first-view rank.
df.plot(kind='scatter', x='first_view_rank', y='notfirst_view_rank')

In [6]:
# Interactive rank-vs-rank scatter coloured by FIRSTVIEWINDEX, with an OLS trendline.
# The plotly update_* methods return the figure itself, so the calls are chained.
fig = (
    px.scatter(
        df,
        x='first_view_rank',
        y='notfirst_view_rank',
        log_x=False,
        size_max=100,
        hover_data=['first_view_rank','notfirst_view_rank','programme'],
        trendline="ols",
        color='FIRSTVIEWINDEX',
    )
    .update_traces(textposition='top center')
    .update_layout(title_text='Programme Groupings', title_x=0.5)
)
fig.show()

In [7]:
# One-dimensional view of FIRSTVIEWINDEX: every programme is drawn at the constant
# dummyField value on the y-axis, so only the x position carries information.
# The plotly update_* methods return the figure itself, so the calls are chained.
fig = (
    px.scatter(
        df,
        x='FIRSTVIEWINDEX',
        y='dummyField',
        log_x=False,
        size_max=100,
        hover_data=['first_view_rank','notfirst_view_rank','programme'],
    )
    .update_traces(textposition='top center')
    .update_layout(title_text='Programme Groupings', title_x=0.5)
)
fig.show()

In [8]:
# Feature matrix for clustering: the view index plus the constant dummy column.
X = df.loc[:, ['FIRSTVIEWINDEX', 'dummyField']]

https://machinelearningmastery.com/clustering-algorithms-with-python/

https://scikit-learn.org/stable/modules/clustering.html

In [9]:
# Elbow method: fit KMeans for every k in 2..kmax and record the inertia
# (within-cluster sum of squares) of each fit.
kmax = 10
k_values = range(2, kmax + 1)  # single source of truth for both the loop and the x-axis

elbow = []
for k in k_values:
    # Fixed seed so the curve is identical after a kernel restart.
    kmeans = KMeans(n_clusters=k, random_state=0).fit(X)
    elbow.append(kmeans.inertia_)

plt.figure(figsize=(8,6))
# The x-axis is derived from k_values, so changing kmax cannot desynchronise x and y lengths.
plt.plot(list(k_values), elbow)
plt.xlabel('Number of Clusters')
plt.ylabel('Inertia (Intra cluster sum of squares)')
plt.title('Inertia vs n_clusters to determine optimal cluster size', fontweight='bold')
plt.show()

https://towardsdatascience.com/opening-the-black-box-of-clustering-kmeans-e970062ff415

In [10]:

# Silhouette analysis: fit KMeans for every k in 2..kmax and record both the
# inertia and the mean silhouette score of the resulting labelling.
kmax = 10
k_values = range(2, kmax + 1)  # single source of truth for both the loop and the x-axis

sil = []
elbow = []
for k in k_values:
    # Fixed seed so the scores are identical after a kernel restart.
    kmeans = KMeans(n_clusters=k, random_state=0).fit(X)
    elbow.append(kmeans.inertia_)
    labels = kmeans.labels_
    sil.append(silhouette_score(X, labels, metric = 'euclidean'))

plt.figure(figsize=(8,6))
# Only the silhouette curve is drawn; the axis label and title now describe that metric
# (they previously said "Inertia" although `sil` was plotted).
plt.plot(list(k_values), sil)
plt.xlabel('Number of Clusters')
plt.ylabel('Silhouette score')
plt.title('Silhouette score vs n_clusters to determine optimal cluster size', fontweight='bold')
plt.show()




In [23]:
# Cluster the programmes on X and visualise the resulting segments.

# Candidate models that were tried on X, with the qualitative verdict for each.
# Each entry is a zero-argument factory, so only the selected estimator is built
# (previously every estimator was constructed and immediately overwritten).
# Stochastic models are seeded so cluster ids are stable between runs.
candidate_models = {
    'gaussian_mixture_7': lambda: GaussianMixture(n_components=7, random_state=0),  # Decent
    'kmeans_5': lambda: KMeans(n_clusters=5, random_state=0),  # Decent (ish) results
    # Best model IMO. Single linkage is the worst (picks up on the outliers).
    'agglomerative_complete_6': lambda: AgglomerativeClustering(n_clusters=6, linkage='complete'),
    'dbscan': lambda: DBSCAN(eps=0.01, min_samples=1),  # Poor Results
    'optics': lambda: OPTICS(),  # Poor Results
    'birch_6': lambda: Birch(threshold=0.01, n_clusters=6),  # Fair Results
    'kmeans_3': lambda: KMeans(n_clusters=3, random_state=0),  # Decent (ish) results
}

# Model actually used: the same estimator the last assignment selected before (3-cluster KMeans).
selected_model = 'kmeans_3'
model = candidate_models[selected_model]()

# Fit the model and get one cluster id per row of X.
yhat = model.fit_predict(X)
# Distinct cluster ids that were produced.
clusters = unique(yhat)

# Map clusters back onto the dataframe.
df['clusterNum']=yhat

# Text labels: programme name for rows in any cluster with id > 0, NaN for cluster 0
# (and for any negative id), so those points are drawn without a label.
df['Labels']=df['programme'].where(df['clusterNum']>0)

# Visualise segments: FIRSTVIEWINDEX on x, constant dummyField on y, coloured by cluster.
fig = px.scatter(
    df,
    x='FIRSTVIEWINDEX',
    y='dummyField',
    log_x=False,
    size_max=100,
    hover_data=['first_view_rank','notfirst_view_rank','programme'],
    color='clusterNum',
    text='Labels',
)
fig.show()

# Preview the dataframe with the new clusterNum / Labels columns.
df.head()

In [12]:
# Exploratory lookup: prints the documentation of the current figure's update_traces method.
# NOTE(review): interactive debugging aid - not needed for the analysis itself.
help(fig.update_traces)

In [13]:
# Write the clustered dataframe back to BigQuery, replacing any previous copy of the table.

# Build the destination reference explicitly; Client.dataset() is deprecated in
# google-cloud-bigquery in favour of constructing a DatasetReference.
dataset = bigquery.DatasetReference(bq.project, 'britbox_sandbox')
table_ref = dataset.table("SW_Viewing_FirstViewsProg_withClust")

# Normalise the column names before upload: spaces become underscores and
# the characters / ? - . are removed.
newcol_names={x:x.replace(" ", "_").replace("/","").replace("?","").replace("-","").replace(".","") for x in df.columns}
df_for_bq=df.rename(columns=newcol_names)

# Drop the previous table if it exists. A missing table is fine, but any other failure
# (permissions, network) must surface: swallowing it would let the load below append
# to the stale table instead of replacing it.
bq.delete_table(table_ref, not_found_ok=True)

job = bq.load_table_from_dataframe(df_for_bq, table_ref)

job.result()  # Waits for table load to complete.
print("Loaded dataframe to {}".format(table_ref.path))