In [1]:
# Notebook setup: third-party imports, shared helper module, Google authentication and pandas display options.
# Statement order matters below: the sys.path entry must be added before google_api_functions is imported.
#!pip install -U scikit-learn
import sklearn
from numpy import unique
from numpy import where
from sklearn.datasets import make_classification
from sklearn.cluster import AgglomerativeClustering,KMeans, DBSCAN, OPTICS, Birch
from sklearn.mixture import GaussianMixture
from matplotlib import pyplot as plt
from sklearn.metrics import silhouette_score # Silhouette score (range -1 to 1) compares each point's distance to its own cluster with its distance to the nearest other cluster; higher means tighter, better-separated clusters
import plotly.express as px

import numpy as np
import sys
# Make the shared helper modules importable; this has to run before the google_api_functions import below
# NOTE(review): absolute path is specific to this environment - consider making it configurable
sys.path.append(r'/home/jupyter/reusable_code')
import google_api_functions as gaf
from google.cloud import bigquery # To run BQ statements
creds=gaf.Authenticate_Google(r'/home/jupyter/reusable_code/') # Return logged-in credentials
from sklearn.linear_model import LinearRegression
import pandas as pd
# Allow up to 999 rows to be shown when a dataframe is displayed
pd.options.display.max_rows = 999

In [2]:
# Open a BigQuery client on the analytics dev project using the credentials obtained during setup
bq = bigquery.Client(
    credentials=creds,
    project='itv-bde-analytics-dev',
)

# Pull every column of the programme score table into a dataframe
query='''select * from
`itv-bde-analytics-dev.britbox_sandbox.SW_Viewing_Programme_Score_4`'''
query_job = bq.query(query)
df = query_job.to_dataframe()

# Preview the first rows
df.head()

In [3]:
# Quick static look at depth rate against cross-sell percentage
df.plot(kind='scatter', x='depth_rate', y='pc_cross_Sell')

In [4]:


# Create a "graph labels" column that holds the programme name only for rows meeting the criteria below
# (low cross-sell or high depth rate); other rows are left unset.
label_mask = (df['pc_cross_Sell'] <= 0.83) | (df['depth_rate'] >= 0.65)
df.loc[label_mask, 'Graph_labels'] = df['programme']


# Plot. The label column is passed as `text` so the names are actually drawn on the chart;
# without it the textposition setting below has nothing to position. Unset labels are
# rendered as empty strings so unlabelled points stay clean.
fig = px.scatter(
    df,
    x='depth_rate',
    y='pc_cross_Sell',
    text=df['Graph_labels'].fillna(''),
    log_x=False,
    size_max=100,
)
fig.update_traces(textposition='top center')
fig.update_layout(title_text='Programme Groupings', title_x=0.5)
fig.show()

In [5]:

# Plot viewers against watched depth with an ordinary-least-squares trendline overlaid
hover_fields = ['watched','watched_depth','programme']
fig = px.scatter(
    df,
    x='watched',
    y='watched_depth',
    hover_data=hover_fields,
    trendline="ols",
    log_x=False,
    size_max=100,
)
fig.update_traces(textposition='top center')
fig.update_layout(title_text='Programme Groupings', title_x=0.5)
fig.show()

In [6]:
# Regress watched_depth on watched; scikit-learn expects the single feature as a column vector
x = np.array(df['watched']).reshape(-1, 1)
y = df['watched_depth']
model = LinearRegression().fit(x, y)

# Coefficient of determination of the fit on the same data it was trained on
r_sq = model.score(x, y)
r_sq


In [7]:
# Predicted depth from the fitted regression; any prediction that is not positive is replaced by 1
# so the ratio below stays well defined
raw_depth_pred = model.predict(x)
df['Watched_depth_pred'] = np.where(raw_depth_pred > 0, raw_depth_pred, 1)

# Relative over/under-performance of actual depth against the prediction (0 = exactly as predicted)
df['Watched_depth_performance'] = df['watched_depth'] / df['Watched_depth_pred'] - 1

# Restrict both the chart and the table to programmes with Viewer_rank below 100
top_ranked = df[df['Viewer_rank'] < 100]

# Plot
fig = px.scatter(
    top_ranked,
    x='watched',
    y='watched_depth',
    color='Watched_depth_performance',
    hover_data=['watched','watched_depth','programme'],
    log_x=True,
    size_max=100,
)
fig.update_traces(textposition='top center')
fig.update_layout(title_text='Programme Groupings', title_x=0.5)
fig.show()

# Table of the same programmes, worst performers first
performance_columns = ['programme','watched','Watched_depth_pred','watched_depth','Watched_depth_performance','depth_rate']
top_ranked[performance_columns].sort_values(by='Watched_depth_performance')

In [8]:
# Feature matrix for clustering: one row per programme, two columns (depth rate and cross-sell percentage)
cluster_features = ['depth_rate', 'pc_cross_Sell']
X = df[cluster_features]

Further reading - clustering algorithms with Python: https://machinelearningmastery.com/clustering-algorithms-with-python/

scikit-learn clustering user guide: https://scikit-learn.org/stable/modules/clustering.html

In [9]:
# Elbow method: fit KMeans for k = 2..kmax and record the inertia (within-cluster sum of squares) of each fit.
# A fixed random_state makes the curve reproducible from run to run, and the x-axis is derived
# from kmax so the plot stays aligned if kmax is changed.
elbow = []
kmax = 10
k_values = list(range(2, kmax+1))
for k in k_values:
    kmeans = KMeans(n_clusters = k, random_state = 0).fit(X)
    elbow.append(kmeans.inertia_)

plt.figure(figsize=(8,6))
plt.plot(k_values, elbow)
plt.xlabel('Number of Clusters')
plt.ylabel('Inertia (Intra cluster sum of squares)')
plt.title('Inertia vs n_clusters to determine optimal cluster size', fontweight='bold')
plt.show()

Background reading on k-means: https://towardsdatascience.com/opening-the-black-box-of-clustering-kmeans-e970062ff415

In [10]:

# Silhouette analysis: fit KMeans for k = 2..kmax and record the mean silhouette score of each fit
# (higher is better). Inertia is still collected in `elbow` for reference but is not plotted here.
# A fixed random_state makes the curve reproducible, and the x-axis is derived from kmax.
sil = []
elbow = []
kmax = 10
k_values = list(range(2, kmax+1))
for k in k_values:
    kmeans = KMeans(n_clusters = k, random_state = 0).fit(X)
    elbow.append(kmeans.inertia_)
    labels = kmeans.labels_
    sil.append(silhouette_score(X, labels, metric = 'euclidean'))

plt.figure(figsize=(8,6))
plt.plot(k_values, sil)
plt.xlabel('Number of Clusters')
# Axis label and title describe the series that is actually plotted (silhouette, not inertia)
plt.ylabel('Silhouette score')
plt.title('Silhouette score vs n_clusters to determine optimal cluster size', fontweight='bold')
plt.show()




In [11]:

# Create a function to map a colour to cluster number
def get_colour(x):
    """Return the colour name assigned to cluster number ``x``.

    Cluster numbers 0 to 8 each have their own colour; any other value
    falls back to 'olive'.
    """
    # Position in the tuple is the cluster number the colour belongs to
    palette = ('red', 'blue', 'purple', 'orange', 'yellow', 'green', 'brown', 'pink', 'cyan')
    for cluster_id, colour_name in enumerate(palette):
        if x == cluster_id:
            return colour_name
    return 'olive'



In [12]:
# Cluster the programmes on the features in X and visualise the resulting segments.
#
# Alternatives that were tried, with the author's notes:
#   GaussianMixture(n_components=7)        - decent
#   KMeans(n_clusters=5)                   - decent (ish) results
#   DBSCAN(eps=0.01, min_samples=1)        - poor results
#   OPTICS()                               - poor results
#   Birch(threshold=0.01, n_clusters=6)    - fair results
# Only the chosen model is instantiated; the alternatives used to be constructed and
# immediately overwritten, which did nothing.

# Agglomerative clustering with complete linkage: best model IMO.
# Single linkage is the worst (picks up on the outliers).
model = AgglomerativeClustering(n_clusters=6,linkage='complete')

# fit model and predict clusters
yhat = model.fit_predict(X)
# retrieve unique clusters
clusters = unique(yhat)

# Map clusters back onto the dataframe
df['clusterNum']=yhat

# Add a column to the dataframe to say what colour to use
df['color']= df['clusterNum'].apply(get_colour)

# Columns shown in the hover tooltip of both charts
hover_columns = ['pc_cross_Sell','depth_rate','programme','watched','clusterNum']

# Visualise segments on scatter plot, marker size scaled by log(number of episodes)
# NOTE(review): np.log gives -inf for 0 and negative values below 1 - confirm the size columns are always >= 1
fig = px.scatter(df, x='depth_rate', y='pc_cross_Sell', log_x=False, size_max=10, size=np.log(df['total_episodes_in_programme']),color='clusterNum',hover_data=hover_columns)
fig.update_traces(textposition='top center')
fig.update_layout(title_text='Programme Groupings', title_x=0.5)
fig.show()

# Same view with marker size scaled by log(viewers); titled like the first chart so it stands alone
fig2 = px.scatter(df, x='depth_rate', y='pc_cross_Sell', log_x=False, size_max=10, size=np.log(df['watched']),color='clusterNum',hover_data=hover_columns)
fig2.update_layout(title_text='Programme Groupings', title_x=0.5)
fig2.show()

# Visualise df
df.head()

In [13]:
# Write the clustered dataframe back to BigQuery as britbox_sandbox.SW_Viewing_Programme_Score_5,
# replacing any previous copy of the table.
dataset=bq.dataset('britbox_sandbox')
table_ref = dataset.table("SW_Viewing_Programme_Score_5")

# Make the column names safe for the destination table: spaces become underscores and / ? - . are dropped
newcol_names={x:x.replace(" ", "_").replace("/","").replace("?","").replace("-","").replace(".","") for x in df.columns}
df_for_bq=df.rename(columns=newcol_names)

# Drop the previous table if there is one. A missing table is expected on the first run and is
# tolerated via not_found_ok; any other failure (permissions, network, ...) is now raised instead
# of being silently swallowed, so the load below never runs against a table that failed to delete.
bq.delete_table(table_ref, not_found_ok=True)
job = bq.load_table_from_dataframe(df_for_bq, table_ref)

job.result()  # Waits for table load to complete.
print("Loaded dataframe to {}".format(table_ref.path))

In [None]:
# Programmes with Viewer_rank of 19 or better, deepest-watched first
df.loc[df['Viewer_rank'] <= 19].sort_values('watched_depth', ascending=False)

In [None]:
# Programmes with Viewer_rank of 30 or better, highest depth rate first
df.loc[df['Viewer_rank'] <= 30].sort_values('depth_rate', ascending=False)


In [37]:
# Hand-picked programme groupings used for the presentation chart; anything not listed is 'Other'
classification_groups = {
    'Bad Content': ['First Tuesday','Strictly Come Dancing','All Creatures Great and Small','Famalam','Royal Shakespeare Company',"Hugh's Fat Fight"],
    'Good Content': ['DCI Banks','Brief Encounters','Life Begins','Silk','The Syndicate','Scott & Bailey'],
    'Top 7': ['Love Island','New Spitting Image','The Only Way is Essex','Broadchurch','Doctor Who Classic','Vera','Only Fools and Horses'],
}
# Invert to a programme -> label lookup (the lists do not overlap, so each programme has one label)
programme_to_label = {
    programme_name: label
    for label, programme_names in classification_groups.items()
    for programme_name in programme_names
}

# Work on a copy so the presentation-only column is not added to df
dfforPresentation = df.copy()
dfforPresentation['Classification'] = df['programme'].apply(
    lambda programme_name: programme_to_label.get(programme_name, 'Other')
)

# Visualise segments on scatter plot, coloured by the manual classification
fig = px.scatter(
    dfforPresentation,
    x='depth_rate',
    y='pc_cross_Sell',
    color='Classification',
    size=np.log(dfforPresentation['total_episodes_in_programme']),
    size_max=10,
    log_x=False,
    hover_data=['pc_cross_Sell','depth_rate','programme','watched','clusterNum'],
)
fig.update_traces(textposition='top center')
fig.update_layout(title_text='Programme Groupings', title_x=0.5)
fig.show()

# Visualise df
df.head()