# Clustering Models

In [1]:
import numpy as np
import pandas as pd
import sklearn
import pickle
import matplotlib.pyplot as plt
import seaborn as sns
import dtale
import plotly.express as px
import plotly.graph_objects as go
from sklearn.preprocessing import MinMaxScaler, StandardScaler, RobustScaler
from sklearn.decomposition import PCA
from sklearn.cluster import *
from sklearn.model_selection import train_test_split
from sklearn.manifold import TSNE
from mpl_toolkits.mplot3d import Axes3D

pd.set_option('display.max_columns', 300)

In [2]:
# Load the two pickled inputs: df1 from 'dataframe.p' and df2 from 'data.p'.
# NOTE: pickle.load can execute arbitrary code embedded in the file; only load
# pickles produced by this project, never files from an untrusted source.
# The context managers close each handle even if unpickling raises.
with open('dataframe.p', 'rb') as file:
    df1 = pickle.load(file)
with open('data.p', 'rb') as file:
    df2 = pickle.load(file)

In [3]:
# Set df1 as article text data
# Keeps only the first five columns of df1 (positional slice). Which columns
# those are is not visible here — presumably the article text fields; confirm.
df1 = df1.iloc[:, :5]

In [4]:
# model_df is a second name for the same object as df2 (no copy is made).
model_df = df2

In [5]:
# Feature matrix: every column of model_df except 'source' and 'total_sentences'.
X_model = model_df.drop(columns=['source', 'total_sentences'])
# News-source label of each row; kept separate so the clustering never sees it.
y_model = model_df['source']

## Pre-Model Scaling and PCA

In [6]:
# Scale the features with RobustScaler before PCA.
# X_model is overwritten with the scaled array, so re-running this cell on its
# own would scale already-scaled data; re-run from the X_model definition instead.
scaler = RobustScaler()
X_model = scaler.fit_transform(X_model)

In [7]:
# Print the cumulative explained-variance ratio for 2 through 9 principal
# components, so the smallest count above 95% can be read off the output.
for n in range(2, 10):
    pca_ = PCA(n_components=n).fit(X_model)
    explained = pca_.explained_variance_ratio_.sum()
    print(f'{n} Components: {explained}')

2 Components: 0.6988869143328782
3 Components: 0.8814884095785006
4 Components: 0.9290281999324606
5 Components: 0.9461266570171378
6 Components: 0.9599591787469713
7 Components: 0.9693773194268559
8 Components: 0.9784806907902068
9 Components: 0.9842831777377676


In [8]:
# Project onto 6 principal components: in the sweep output above, 6 is the
# smallest count whose cumulative explained variance exceeds 0.95 (about 0.960).
# X_model is overwritten with the projected array, so this cell is not idempotent.
pca_model = PCA(n_components=6)
X_model = pca_model.fit_transform(X_model)

## Models

### Hierarchical and K-Means

In [None]:
# Find best n-clusters for AggClustering and K-Means
# Sweeps 2-5 clusters and prints the silhouette score of each model.
for n in range(2, 6):
    # Ward linkage only supports Euclidean distances, which is already the
    # default, so the distance keyword is omitted. The old `affinity=` keyword
    # was deprecated and later removed from AgglomerativeClustering.
    hc = AgglomerativeClustering(n_clusters=n, linkage='ward')
    kc = KMeans(n_clusters=n)
    # fit() returns the fitted estimator itself; the labels live in .labels_
    y_hc = hc.fit(X_model)
    y_kc = kc.fit(X_model)
    print(f'---{n} Clusters---')
    print('Hierarchical Clustering:', sklearn.metrics.silhouette_score(X_model, y_hc.labels_))
    print('K-Means Clustering:', sklearn.metrics.silhouette_score(X_model, y_kc.labels_))

### DBSCAN and OPTICS

In [None]:
# DBSCAN
# Sweep eps over 1.5, 1.6, ..., 2.2 and print the silhouette score for each.
# np.linspace fixes the number of steps; np.arange with a fractional step can
# gain or lose the end value to floating-point rounding.
for n in np.linspace(1.5, 2.2, 8):
    dbc = DBSCAN(eps=n)
    y_dbc = dbc.fit(X_model)  # fit() returns the estimator; labels are in .labels_
    # The noise label (-1) counts as a label here. silhouette_score raises
    # ValueError with fewer than 2 distinct labels, which would abort the sweep.
    if len(set(y_dbc.labels_)) < 2:
        print(f'Sil. Score (eps={round(n, 1)}): n/a (fewer than 2 labels)')
        continue
    print(f'Sil. Score (eps={round(n, 1)}): {sklearn.metrics.silhouette_score(X_model, y_dbc.labels_)}')

In [None]:
# OPTICS
# Sweep min_samples over 24-29 (correlation metric) and print each silhouette score.
# NOTE(review): clustering uses metric='correlation' while silhouette_score is
# left at its default metric — confirm that mismatch is intended.
for n in range(24, 30, 1):
    opc = OPTICS(min_samples=n, metric='correlation')
    y_opc = opc.fit(X_model)  # fit() returns the estimator; labels are in .labels_
    # silhouette_score raises ValueError with fewer than 2 distinct labels
    # (e.g. every point marked as noise), which would abort the sweep.
    if len(set(y_opc.labels_)) < 2:
        print(f'Sil. Score (min_samples={n}): n/a (fewer than 2 labels)')
        continue
    print(f'Sil. Score (min_samples={n}): {sklearn.metrics.silhouette_score(X_model, y_opc.labels_)}')

### Mean Shift and Spectral Clustering

In [None]:
# Mean Shift
# Sweep the bandwidth over 3.3, 3.4, ..., 3.8 and print each silhouette score.
# np.linspace fixes the number of steps; np.arange with a fractional step can
# include the stop value because of floating-point rounding.
for n in np.linspace(3.3, 3.8, 6):
    msc = MeanShift(bandwidth=n)
    y_msc = msc.fit(X_model)  # fit() returns the estimator; labels are in .labels_
    # A wide bandwidth can merge everything into one cluster; silhouette_score
    # raises ValueError with fewer than 2 distinct labels.
    if len(set(y_msc.labels_)) < 2:
        print(f'Sil. Score (bw={round(n, 1)}): n/a (fewer than 2 labels)')
        continue
    print(f'Sil. Score (bw={round(n, 1)}): {sklearn.metrics.silhouette_score(X_model, y_msc.labels_)}')

In [None]:
# Spectral Clustering
# Try 2 through 7 clusters and report the silhouette score of each fit.
for n in range(2, 8):
    scc = SpectralClustering(n_clusters=n)
    y_scc = scc.fit(X_model)  # the fitted estimator; labels are in .labels_
    score = sklearn.metrics.silhouette_score(X_model, y_scc.labels_)
    print(f'Sil. Score ({n} clusters): {score}')

### Affinity Propagation

In [None]:
# Affinity Propagation
# Sweep damping over 0.86, 0.87, ..., 0.91 and print each silhouette score.
# np.linspace fixes the number of steps; np.arange with a fractional step can
# include the stop value because of floating-point rounding.
for n in np.linspace(0.86, 0.91, 6):
    apc = AffinityPropagation(damping=n)
    y_apc = apc.fit(X_model)  # fit() returns the estimator; labels are in .labels_
    # silhouette_score raises ValueError with fewer than 2 distinct labels;
    # skip such a fit instead of aborting the sweep.
    if len(set(y_apc.labels_)) < 2:
        print(f'Sil. Score (damping={round(n, 2)}): n/a (fewer than 2 labels)')
        continue
    print(f'Sil. Score (damping={round(n, 2)}): {sklearn.metrics.silhouette_score(X_model, y_apc.labels_)}')

In [9]:
# Top clustering models
# Ward linkage only supports Euclidean distances, which is the default, so no
# distance keyword is passed (the old `affinity=` keyword no longer exists in
# current scikit-learn).
hc_model = AgglomerativeClustering(n_clusters=3, linkage='ward')
kc_model = KMeans(n_clusters=3)
dbc_model = DBSCAN(eps=1.9)
# NOTE(review): the OPTICS parameter sweep used metric='correlation', but this
# model uses the default metric — confirm which one is intended.
opc_model = OPTICS(min_samples=26)
msc_model = MeanShift(bandwidth=3.4)
scc_model = SpectralClustering(n_clusters=2)
apc_model = AffinityPropagation(damping=0.87)

# Labels from models
# Each label vector becomes an (n_samples, 1) column so the columns can later
# be stacked side by side; -1 lets numpy infer the row count from the data.
y_hc = hc_model.fit(X_model).labels_.reshape(-1, 1)
y_kc = kc_model.fit(X_model).labels_.reshape(-1, 1)
y_dbc = dbc_model.fit(X_model).labels_.reshape(-1, 1)
y_opc = opc_model.fit(X_model).labels_.reshape(-1, 1)
y_msc = msc_model.fit(X_model).labels_.reshape(-1, 1)
y_scc = scc_model.fit(X_model).labels_.reshape(-1, 1)
y_apc = apc_model.fit(X_model).labels_.reshape(-1, 1)

In [10]:
# Pair each model name with its label column and report how many distinct
# labels it produced (the noise label -1, where present, is counted too).
labels = [y_hc, y_kc, y_dbc, y_opc, y_msc, y_scc, y_apc]
models = ['Hierarchical', 'K-Means', 'DBSCAN', 'OPTICS', 'Mean Shift', 'Spectral', 'Aff. Prop.']
mod_lab = list(zip(models, labels))
for model_name, label_col in mod_lab:
    # np.unique flattens its input, so no hard-coded sample count is needed.
    print(f'{model_name} - Number of Labels: {np.unique(label_col).size}')

Hierarchical - Number of Labels: 3
K-Means - Number of Labels: 3
DBSCAN - Number of Labels: 2
OPTICS - Number of Labels: 2
Mean Shift - Number of Labels: 2
Spectral - Number of Labels: 2
Aff. Prop. - Number of Labels: 44


### Notes
- DBSCAN and OPTICS each produced 2 labels: one actual cluster plus the outlier/noise group (label -1)
- Mean Shift had 2 clusters: Cluster 0 had 1297 observations, Cluster 1 had 44 observations (43 were CCTV articles)
- Spectral Clustering had 2 clusters: Cluster 0 had 1319 observations, Cluster 1 had 22 observations (all CCTV)
- Affinity Propagation produced 44 clusters

## Model Visualizations

In [11]:
# Column names for the six principal components: 'pc1' ... 'pc6'.
data_col = [f'pc{k}' for k in range(1, 7)]

In [12]:
# Put the seven label columns next to the PCA components in a single frame.
# Stacking integer labels with float components yields one float array, so
# the labels are stored as floats here.
stacked = np.hstack([y_hc, y_kc, y_dbc, y_opc, y_msc, y_scc, y_apc, X_model])
mod_lab_df = pd.DataFrame(stacked, columns=models + data_col)

In [28]:
# Attach the original features to the labels/components frame, row by row.
# pd.concat(axis=1) aligns on index labels; mod_lab_df has a fresh 0..n-1
# index, so model_df's index is reset to match. Without this, a non-default
# index on model_df would misalign rows and introduce NaNs.
viz_df = pd.concat((mod_lab_df, model_df.reset_index(drop=True)), axis=1)

In [14]:
# file = open('viz_df.p', 'wb')      
# pickle.dump(viz_df, file)
# file.close()

In [29]:
# Preview the first rows; the cluster labels are still floats at this point.
viz_df.head()

Unnamed: 0,Hierarchical,K-Means,DBSCAN,OPTICS,Mean Shift,Spectral,Aff. Prop.,pc1,pc2,pc3,pc4,pc5,pc6,source,protest,econ,poli,gov,protest_mention,econ_mention,poli_mention,gov_mention,total_sentences,w_protest,w_econ,w_gov,w_poli,hl_sent,protest_ratio,econ_ratio,poli_ratio,gov_ratio
0,0.0,1.0,0.0,-1.0,0.0,0.0,8.0,1.844696,-0.109955,-0.418385,-1.005067,-0.015804,-0.23037,SCMP,0.97,0.0,0.62,1.14,1,0,1,2,4,0.2425,0.0,0.57,0.155,0.296,0.25,0.0,0.25,0.5
1,0.0,1.0,0.0,-1.0,0.0,0.0,1.0,-0.830036,-1.477628,-1.431293,0.202932,0.739912,0.411129,SCMP,1.78,0.53,0.77,0.0,4,1,1,0,6,1.186667,0.088333,0.0,0.128333,0.7351,0.666667,0.166667,0.166667,0.0
2,0.0,1.0,0.0,-1.0,0.0,0.0,11.0,-0.275266,-2.661202,-0.377432,0.792375,0.457879,0.41172,SCMP,2.13,0.0,1.57,0.0,4,0,2,0,6,1.42,0.0,0.0,0.523333,0.6249,0.666667,0.0,0.333333,0.0
3,0.0,0.0,0.0,-1.0,0.0,0.0,9.0,-2.964947,1.68331,-0.319508,-0.967357,0.345269,0.703037,SCMP,0.28,1.13,0.0,0.0,1,4,0,0,5,0.056,0.904,0.0,0.0,0.4588,0.2,0.8,0.0,0.0
4,0.0,1.0,0.0,-1.0,0.0,0.0,27.0,1.765604,-0.257575,-0.894021,-0.317798,1.245709,-0.702622,SCMP,1.04,0.0,0.8,1.19,3,0,1,3,7,0.445714,0.0,0.51,0.114286,0.8979,0.428571,0.0,0.142857,0.428571


In [30]:
# The cluster labels were stored as floats (0.0, -1.0, ...) when they were
# stacked next to the PCA components. Convert them to integer-valued strings
# ('0', '-1', ...) so plotly treats them as discrete categories.
# Going through float first makes the cell safe to re-run on labels that have
# already been converted to strings.
label_cols = ['Hierarchical', 'K-Means', 'DBSCAN', 'OPTICS',
              'Mean Shift', 'Spectral', 'Aff. Prop.']
viz_df[label_cols] = viz_df[label_cols].astype(float).round().astype(int).astype(str)

In [44]:
# Preview again to check that the label columns now hold integer-like strings.
viz_df.head()

Unnamed: 0,Hierarchical,K-Means,DBSCAN,OPTICS,Mean Shift,Spectral,Aff. Prop.,pc1,pc2,pc3,pc4,pc5,pc6,source,protest,econ,poli,gov,protest_mention,econ_mention,poli_mention,gov_mention,total_sentences,w_protest,w_econ,w_gov,w_poli,hl_sent,protest_ratio,econ_ratio,poli_ratio,gov_ratio
0,0,1,0,-1,0,0,8,1.844696,-0.109955,-0.418385,-1.005067,-0.015804,-0.23037,SCMP,0.97,0.0,0.62,1.14,1,0,1,2,4,0.2425,0.0,0.57,0.155,0.296,0.25,0.0,0.25,0.5
1,0,1,0,-1,0,0,1,-0.830036,-1.477628,-1.431293,0.202932,0.739912,0.411129,SCMP,1.78,0.53,0.77,0.0,4,1,1,0,6,1.186667,0.088333,0.0,0.128333,0.7351,0.666667,0.166667,0.166667,0.0
2,0,1,0,-1,0,0,11,-0.275266,-2.661202,-0.377432,0.792375,0.457879,0.41172,SCMP,2.13,0.0,1.57,0.0,4,0,2,0,6,1.42,0.0,0.0,0.523333,0.6249,0.666667,0.0,0.333333,0.0
3,0,0,0,-1,0,0,9,-2.964947,1.68331,-0.319508,-0.967357,0.345269,0.703037,SCMP,0.28,1.13,0.0,0.0,1,4,0,0,5,0.056,0.904,0.0,0.0,0.4588,0.2,0.8,0.0,0.0
4,0,1,0,-1,0,0,27,1.765604,-0.257575,-0.894021,-0.317798,1.245709,-0.702622,SCMP,1.04,0.0,0.8,1.19,3,0,1,3,7,0.445714,0.0,0.51,0.114286,0.8979,0.428571,0.0,0.142857,0.428571


In [83]:
# Distinct DBSCAN labels as strings; '-1' is the noise label.
list(viz_df['DBSCAN'].unique())

['0', '-1']

In [90]:
def two_d(mod, viz_df=viz_df, components=['pc1', 'pc3']):
    """Scatter two principal components coloured by one model's cluster labels.

    Every label except the noise label '-1' is also overlaid with a translucent
    ellipse inscribed in the bounding box of that cluster's points. The figure
    is displayed with ``fig.show()``; nothing is returned.

    Parameters
    ----------
    mod : str
        Name of the column in `viz_df` that holds the cluster labels as
        strings (for example 'DBSCAN').
    viz_df : pandas.DataFrame
        Frame containing the label column and the component columns. The
        default is the module-level `viz_df` as it exists when this ``def``
        is executed.
    components : list of str
        Two column names plotted on the x and y axes, in that order.
    """
    c_label = list(viz_df[mod].unique())
    # The count shown in the title includes the noise label '-1' when present.
    n_clusters = len(c_label)

    # Fixed colours for the common labels. Any other label (e.g. the many
    # Affinity Propagation clusters) gets a colour from plotly's default
    # qualitative palette, cycling if needed, so that looking a colour up below
    # never fails and the ellipses match the scatter points.
    color_map = {'0': 'green', '1': 'blue', '2': 'red', '-1': 'fuchsia'}
    palette = px.colors.qualitative.Plotly
    unmapped = [f'{lab}' for lab in c_label if f'{lab}' not in color_map]
    for k, key in enumerate(unmapped):
        color_map[key] = palette[k % len(palette)]

    fig = px.scatter(viz_df, x=components[0], y=components[1],
                     color=mod,
                     color_discrete_map=color_map, opacity=0.8)

    shapes = []
    for lab in c_label:
        if lab == '-1':
            continue  # noise points get no ellipse
        key = f'{lab}'
        members = viz_df.loc[viz_df[mod] == key]  # filter once per cluster
        xs = members[components[0]]
        ys = members[components[1]]
        shapes.append(dict(type="circle", xref="x", yref="y",
                           x0=xs.min(), y0=ys.min(),
                           x1=xs.max(), y1=ys.max(),
                           opacity=0.2, fillcolor=color_map[key],
                           line_color=color_map[key]))

    fig.update_layout(
        legend_orientation="h",
        shapes=shapes,
        title={'text': f'{mod} (n={n_clusters}) in 2D (n-components=6)',
               'y':0.9,
               'x':0.5,
               'xanchor': 'center',
               'yanchor': 'top'},
        xaxis_title=f"Principal Component: {components[0]}",
        yaxis_title=f"Principal Component: {components[1]}",
        # A single figure-wide font; size 12 applies to the title as well.
        font=dict(family='Arial',
                  size=12,
                  color='#7f7f7f'))

    fig.show()

In [91]:
# Plot the Mean Shift clusters on the pc1 / pc6 plane.
mod = 'Mean Shift'
two_d(mod, components=['pc1', 'pc6'])

In [92]:
# Plot the DBSCAN clusters on the pc1 / pc3 plane.
mod = 'DBSCAN'
two_d(mod, components=['pc1', 'pc3'])

In [None]:
# Convert source category to int: 0 - SCMP, 1 - ABC (Australia), 2 - Reuters, 3 - CCTV, 4 - CNN
# cond = [df['source'] == 'SCMP',
#         df['source'] == 'ABC (Australia)',
#         df['source'] == 'Reuters',
#         df['source'] == 'CCTV',
#         df['source'] == 'CNN']
# choice = [0, 1, 2, 3, 4]
# df['source'] = np.select(cond, choice)

In [None]:
# df['source'] = df['source'].astype('category')

In [None]:
# X = df.drop(columns=['source'])
# y = df['source']

In [None]:
# X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=0, test_size=0.2)

In [None]:
# scaler = RobustScaler()
# X_train = scaler.fit_transform(X_train)

In [None]:
# pca_1 = PCA(n_components=3)
# pca_2 = PCA(n_components=4)
# pca_3 = PCA(n_components=5)

# principalComponents = pca_1.fit_transform(X_model)
# principalComponents = pca_2.fit_transform(X_model)
# principalComponents = pca_3.fit_transform(X_model)

# print(np.sum(pca_1.explained_variance_ratio_))
# print(np.sum(pca_2.explained_variance_ratio_))
# print(np.sum(pca_3.explained_variance_ratio_))


In [None]:
# Project X_train onto 6 principal components.
# NOTE(review): X_train is not assigned by any active cell visible in this
# notebook (only by commented-out cells), so this raises NameError on a fresh
# kernel — restore the train/test split or remove this section.
pca = PCA(n_components=6)
comp = pca.fit_transform(X_train)

In [None]:
# Append the source labels as a final column next to the PCA components.
# reshape(-1, 1) lets numpy infer the row count instead of hard-coding 1072.
# NOTE(review): if y_train holds strings, numpy will coerce the whole array
# (components included) to strings — confirm the dtype of y_train.
X_array = np.concatenate((comp, np.array(y_train).reshape(-1, 1)), axis=1)

In [None]:
# Wrap the combined array in a frame: six component columns plus 'source'.
comp_df = pd.DataFrame(data=X_array, columns=['pc1', 'pc2', 'pc3', 'pc4', 'pc5', 'pc6', 'source'])

In [None]:
# Store 'source' as a pandas categorical column.
comp_df['source'] = comp_df['source'].astype('category')

In [None]:
# X1: the six component columns used for clustering; y1: the source column.
X1 = comp_df.drop(columns=['source'])
y1 = comp_df['source']

In [None]:
# Four-cluster hierarchical (Ward) and K-Means models.
# Ward linkage only supports Euclidean distances, which is the default, so no
# distance keyword is passed (the old `affinity=` keyword no longer exists in
# current scikit-learn).
n = 4
hc = AgglomerativeClustering(n_clusters=n, linkage='ward')
kc = KMeans(n_clusters=n)

In [None]:
# X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=0)

In [None]:
# scaler = MinMaxScaler()
# X_train = scaler.fit_transform(X_train)
# X_test = scaler.transform(X_test)

In [None]:
# pca = PCA(n_components=3)
# comp_train = pca.fit_transform(X_train)
# comp_test = pca.transform(X_test)

In [None]:
# X = df.iloc[:, 1:5].values
# y = df['source']

In [None]:
# comp_train_df = pd.DataFrame(data = comp_train, columns = ['pc1', 'pc2', 'pc3'])
# comp_test_df = pd.DataFrame(data = comp_test, columns = ['pc1', 'pc2', 'pc3'])

In [None]:
# X1 = comp_train_df.values

In [None]:
# estimators = [('k_means_top_senti_2', KMeans(n_clusters=2)),
#               ('k_means_top_senti_3', KMeans(n_clusters=3)),
#               ('k_means_top_senti_5', KMeans(n_clusters=5))]

# fignum = 1
# titles = ['2 clusters', '3 clusters', '5 clusters']
# for name, est in estimators:
#     fig = plt.figure(fignum, figsize=(15, 10))
#     ax = Axes3D(fig, rect=[0, 0, .95, 1], elev=48, azim=60)
#     est.fit(X_array)
#     labels = est.labels_

#     ax.scatter(X_array[:, 0], X_array[:, 3], X_array[:, 4],
#                c=labels.astype(np.float), edgecolor='k')

#     ax.w_xaxis.set_ticklabels([])
#     ax.w_yaxis.set_ticklabels([])
#     ax.w_zaxis.set_ticklabels([])
#     ax.set_xlabel('PC1')
#     ax.set_ylabel('PC2')
#     ax.set_zlabel('PC3')
#     ax.set_title(titles[fignum - 1])
#     ax.dist = 12
#     fignum = fignum + 1

# # # Plot the ground truth
# fig = plt.figure(figsize=(15, 10))
# ax = Axes3D(fig, rect=[0, 0, .95, 1], elev=50, azim=60)

# for name, label in [('SCMP', 0),
#                     ('ABC', 1),
#                     ('Reuters', 2),
#                     ('CCTV', 3),
#                     ('CNN', 4)]:
#     ax.text3D(X_array[y1 == label, 0].mean(),
#               X_array[y1 == label, 3].mean(),
#               X_array[y1 == label, 4].mean(), name,
#               horizontalalignment='center',
#               bbox=dict(alpha=1, edgecolor='w', facecolor='w'))
# # Reorder the labels to have colors matching the cluster results
# # y = np.choose(y, [1, 3, 4])
# ax.scatter(X_array[:, 0], X_array[:, 3], X_array[:, 4], c=y1, edgecolor='k', alpha=0)

# ax.w_xaxis.set_ticklabels([])
# ax.w_yaxis.set_ticklabels([])
# ax.w_zaxis.set_ticklabels([])
# ax.set_xlabel('PC1')
# ax.set_ylabel('PC4')
# ax.set_zlabel('PC5')
# ax.set_title('Ground Truth')
# ax.dist = 12

# fig.show()

In [None]:
# Boolean row masks, one per news outlet.
# NOTE(review): none of these masks is referenced by a later visible cell.
cond1 = (comp_df['source'] == 'CCTV')
cond2 = (comp_df['source'] == 'CNN')
cond3 = (comp_df['source'] == 'ABC (Australia)')
cond4 = (comp_df['source'] == 'SCMP')
cond5 = (comp_df['source'] == 'Reuters')

In [None]:
# Four-cluster hierarchical (Ward) and K-Means models.
# Ward linkage only supports Euclidean distances, which is the default, so no
# distance keyword is passed (the old `affinity=` keyword no longer exists in
# current scikit-learn).
n = 4
hc = AgglomerativeClustering(n_clusters=n, linkage='ward')
kc = KMeans(n_clusters=n)

In [None]:
# Fit both models on the component columns. fit() returns the fitted
# estimator, so y_hc / y_kc are estimators here; labels are in .labels_.
y_hc = hc.fit(X1)
y_kc = kc.fit(X1)

In [None]:
# Store each model's cluster labels as a column of comp_df.
comp_df['hc_labels'] = y_hc.labels_
comp_df['kc_labels'] = y_kc.labels_

In [None]:
# Silhouette score of each labelling, computed on the component columns X1.
print('Hierarchical Clustering:', sklearn.metrics.silhouette_score(X1, comp_df['hc_labels']))
print('K-Means Clustering:', sklearn.metrics.silhouette_score(X1, comp_df['kc_labels']))

In [None]:
# Two-cluster hierarchical (Ward) and K-Means models.
# Ward linkage only supports Euclidean distances, which is the default, so no
# distance keyword is passed (the old `affinity=` keyword no longer exists in
# current scikit-learn).
n = 2
hc2 = AgglomerativeClustering(n_clusters=n, linkage='ward')
kc2 = KMeans(n_clusters=n)

In [None]:
# Work on a copy so the two-cluster labels written below do not overwrite the
# 'hc_labels' / 'kc_labels' columns already stored in comp_df. A plain
# assignment would make plot2_df another name for the same frame.
plot2_df = comp_df.copy()
y_hc2 = hc2.fit(X1)  # fit() returns the estimator; labels are in .labels_
y_kc2 = kc2.fit(X1)
plot2_df['hc_labels'] = y_hc2.labels_
plot2_df['kc_labels'] = y_kc2.labels_

In [None]:
# Silhouette score of the labellings stored in plot2_df, computed on X1.
print('Hierarchical Clustering:', sklearn.metrics.silhouette_score(X1, plot2_df['hc_labels']))
print('K-Means Clustering:', sklearn.metrics.silhouette_score(X1, plot2_df['kc_labels']))

In [None]:
# Cast the label columns to strings so plotly treats them as discrete categories.
# NOTE(review): plot_df is not defined by any visible cell, so this raises
# NameError on a fresh kernel — presumably it was a frame holding the
# four-cluster labels; confirm and define it explicitly.
plot_df['kc_labels'] = plot_df['kc_labels'].astype('str')
plot2_df['kc_labels'] = plot2_df['kc_labels'].astype('str')
plot_df['hc_labels'] = plot_df['hc_labels'].astype('str')
plot2_df['hc_labels'] = plot2_df['hc_labels'].astype('str')

In [None]:
# 2D scatter (pc1 vs pc3) of the hierarchical labels stored in plot2_df.
color_map = {'0': 'lightslategray', '1': 'crimson'}
fig = px.scatter(plot2_df, x='pc1', y='pc3',
                 color='hc_labels',
                 color_discrete_map=color_map, opacity=0.8)

# One translucent ellipse per cluster, inscribed in the bounding box of that
# cluster's points and drawn in the cluster's own colour.
cluster_shapes = []
for cluster_id, shade in color_map.items():
    members = plot2_df.loc[plot2_df['hc_labels'] == cluster_id]
    cluster_shapes.append(dict(type="circle",
                               xref="x",
                               yref="y",
                               x0=min(members['pc1']),
                               y0=min(members['pc3']),
                               x1=max(members['pc1']),
                               y1=max(members['pc3']),
                               opacity=0.2,
                               fillcolor=shade,
                               line_color=shade))

fig.update_layout(legend_orientation="h", shapes=cluster_shapes)

fig.update_layout(
    title={'text': 'Hierarchical Clusters (n=2) in 2D (n-components=6)',
           'y':0.9,
           'x':0.5,
           'xanchor': 'center',
           'yanchor': 'top'},
    font=dict(family='Arial', size=18, color='#7f7f7f'))
# This second font setting replaces the size set just above, so 12 is the
# size that ends up on the figure.
fig.update_layout(
    xaxis_title="Principal Component 1",
    yaxis_title="Principal Component 3",
    font=dict(family='Arial', size=12, color='#7f7f7f'))

fig.show()

In [None]:
# Number of articles per hierarchical cluster label, broken down by source.
plot2_df.groupby('source')['hc_labels'].value_counts()

In [None]:
# Bar chart of Cluster 1 membership: CCTV versus all other outlets combined.
outlets = ['CCTV', 'Everyone else']
colors = ['crimson', 'indianred']  # the CCTV bar is highlighted
# NOTE(review): the bar heights 45 and 6 are hard-coded — presumably read off
# the value counts by source; they will not follow a re-run of the clustering.
fig = go.Figure(data=[go.Bar(x=outlets, y=[45, 6], marker_color=colors)])

fig.update_layout(
    title={'text': 'Cluster 1 (Agglomerative Clustering, n=2)',
           'y':0.9,
           'x':0.5,
           'xanchor': 'center',
           'yanchor': 'top'},
    font=dict(family='Arial', size=18, color='#7f7f7f'))
# This second font setting replaces the size set just above, so 12 is the
# size that ends up on the figure.
fig.update_layout(
    xaxis_title="Sources",
    yaxis_title="Total Counts",
    font=dict(family='Arial', size=12, color='#7f7f7f'))

fig.show()

In [None]:
# 3D scatter on pc1 / pc3 / pc4: colour = hierarchical label, marker symbol =
# source; the legend is hidden.
# NOTE(review): plot_df is not defined by any visible cell (NameError on a
# fresh kernel).
# color_map = {'CCTV': 'red', 'SCMP': 'darkslategrey', 'ABC (Australia)': 'khaki'}
fig = px.scatter_3d(plot_df, x='pc1', y='pc3', z='pc4', color='hc_labels', symbol='source', opacity=0.8)
fig.update_layout(showlegend=False)
fig.show()

In [None]:
# Two-dimensional t-SNE embedding of X_train.
# NOTE(review): X_train is not assigned by any active visible cell, and no
# random_state is set, so the embedding differs between runs.
tsne_comp = TSNE(n_components=2, perplexity=100, early_exaggeration=100).fit_transform(X_train)

In [None]:
# Append the y_train columns to the t-SNE coordinates. The row count is taken
# from tsne_comp instead of a hard-coded 1072, and the column count is inferred.
# NOTE(review): the next cell names four columns, so y_train must supply two
# columns here (source, date), whereas the earlier PCA cell treats y_train as
# a single column — confirm which shape y_train really has.
tsne_array = np.concatenate((tsne_comp, np.array(y_train).reshape(len(tsne_comp), -1)), axis=1)

In [None]:
# Wrap the array in a frame: two t-SNE coordinates plus 'source' and 'date'.
tsne_df = pd.DataFrame(data=tsne_array, columns=['tsne1', 'tsne2', 'source', 'date'])

In [None]:
# Attach cluster labels from the fitted y_hc / y_kc estimators.
# NOTE(review): which fit these refer to depends on execution order — the
# latest visible assignment is the n=4 fit on X1; confirm.
tsne_df['hc_labels'] = y_hc.labels_
tsne_df['kc_labels'] = y_kc.labels_

In [None]:
# Scatter of the t-SNE embedding, coloured by the K-Means label.
fig = px.scatter(tsne_df, x='tsne1', y='tsne2',
                 color='kc_labels', opacity=0.8,
                 title='2-D TSNE Distribution of Articles')
fig.show()

In [None]:
# Bar chart of the share of each K-Means label within each source
# (value_counts normalised per source, in unsorted order).
fig, ax = plt.subplots(figsize=(15,7))
comp_df.groupby('source')['kc_labels'].value_counts(normalize=True, sort=False).plot(ax=ax, kind='bar')
plt.show()

In [None]:
# Share of each K-Means label within each source, computed once and then
# sliced per outlet; each result is an array of rounded percentages.
kc_share = comp_df.groupby('source')['kc_labels'].value_counts(normalize=True, sort=False)
abc, cctv, cnn, reuters, scmp = (
    np.array(round(kc_share[outlet] * 100))
    for outlet in ('ABC (Australia)', 'CCTV', 'CNN', 'Reuters', 'SCMP')
)

# label_source.loc['ABC (Australia)'] = abc
# label_source.loc['CCTV'] = cctv
# label_source.loc['CNN'] = cnn
# label_source.loc['Reuters'] = reuters
# label_source.loc['SCMP'] = scmp

In [None]:
# Row labels (sources) and segment labels (clusters) for the stacked bar chart.
# Note: `labels` rebinds a name used earlier in the notebook for the list of
# cluster-label arrays.
sources = ['ABC (Australia)', 'CCTV', 'CNN', 'Reuters', 'SCMP']
labels = ['Cluster 0', 'Cluster 1', 'Cluster 2', 'Cluster 3']

In [None]:
# One row per source, one column per cluster. np.vstack requires every
# per-source array to have the same length, so a source that is missing a
# cluster raises an error instead of being silently reshaped into the wrong
# cells; it also removes the hard-coded (5, 4) shape.
ls = np.vstack((abc, cctv, cnn, reuters, scmp))

In [None]:
# Display the source-by-cluster percentage matrix.
ls

In [None]:
# Horizontal stacked bar chart: one bar per source, one segment per cluster,
# with each segment's percentage written at its centre.
x_data = ls
y_data = sources
top_labels = labels
colors = ['rgba(38, 24, 74, 0.8)', 'rgba(71, 58, 131, 0.8)',
          'rgba(122, 120, 168, 0.8)', 'rgba(164, 163, 204, 0.85)']

fig = go.Figure()

# Traces are added cluster by cluster (outer loop) and source by source
# (inner loop), which fixes the stacking order of the segments.
for seg in range(len(x_data[0])):
    seg_marker = dict(color=colors[seg],
                      line=dict(color='rgb(248, 248, 249)', width=1))
    for row_vals, source_name in zip(x_data, y_data):
        fig.add_trace(go.Bar(x=[row_vals[seg]], y=[source_name],
                             orientation='h', marker=seg_marker))

# Both axes are drawn without grid, line, tick labels or zero line; the x
# axis leaves room on the left for the source names.
hidden_axis = dict(showgrid=False, showline=False,
                   showticklabels=False, zeroline=False)
fig.update_layout(
    xaxis=dict(hidden_axis, domain=[0.15, 1]),
    yaxis=dict(hidden_axis),
    barmode='stack',
    paper_bgcolor='rgb(248, 248, 255)',
    plot_bgcolor='rgb(248, 248, 255)',
    margin=dict(l=10, r=40, t=140, b=80),
    showlegend=False,
)

dark_font = dict(family='Arial', size=10, color='rgb(67, 67, 67)')
light_font = dict(family='Arial', size=10, color='rgb(248, 248, 255)')
last_source = y_data[-1]

annotations = []
for source_name, row_vals in zip(y_data, x_data):
    # source name to the left of the bar
    annotations.append(dict(xref='paper', yref='y',
                            x=0.14, y=source_name,
                            xanchor='right',
                            text=str(source_name),
                            font=dark_font,
                            showarrow=False, align='right'))
    offset = 0  # running width of the segments already placed in this bar
    for seg, value in enumerate(row_vals):
        center = offset + value / 2
        # percentage text in the middle of the segment
        annotations.append(dict(xref='x', yref='y',
                                x=center, y=source_name,
                                text=str(value) + '%',
                                font=light_font,
                                showarrow=False))
        # cluster captions along the top, positioned from the last source's bar
        if source_name == last_source:
            annotations.append(dict(xref='x', yref='paper',
                                    x=center, y=1.1,
                                    text=top_labels[seg],
                                    font=dark_font,
                                    showarrow=False))
        offset += value

fig.update_layout(annotations=annotations)
fig.update_layout(
    title={'text': 'Cluster Groups by Source',
           'y':0.9,
           'x':0.5,
           'xanchor': 'center',
           'yanchor': 'top'},
    font=dict(family='Arial', size=18, color='#7f7f7f'))
fig.show()

In [None]:

# Re-apply stacked bar mode and show the figure again. The preceding cell
# already sets barmode='stack', so this only re-displays the same figure.
fig.update_layout(barmode='stack')
fig.show()