# Topic Modeling and Sentiment Analysis Rework

In [1]:
import numpy as np
import pandas as pd
import pickle
import gensim
import matplotlib.pyplot as plt
import plotly.express as px
import plotly.graph_objects as go
import scipy.stats as stats

from statsmodels.nonparametric.kernel_density import KDEMultivariate
from sklearn.decomposition import PCA
from sklearn.preprocessing import RobustScaler, MinMaxScaler, StandardScaler, Normalizer
from sklearn.cluster import KMeans, AgglomerativeClustering
from sklearn.metrics import silhouette_score
from plotly.subplots import make_subplots
from textblob import TextBlob
from pprint import pprint
from gensim import corpora, models
from gensim.models import CoherenceModel

from topic_model import compute_coherence_values, plot_c_v, topics_in_doc

## Changes to the final project
In the original "the Language in News" project, dominant-topic bias and class imbalance possibly hindered potential insights. In an attempt to improve topic coherence and clustering, several changes will be made, including:
- Class imbalance will be dealt with by removing all South China Morning Post (SCMP) articles since it is the only local news source and produced various forms of bias in the data.
- Certain features (i.e. sentiment scores of topics) will be scaled in order to help alleviate bias from dominant topics.
- The entire articles will be used instead of the first 10 sentences.
- Bigram model created across all articles instead of per article (incorrectly applied in the final project).

## Additional cleaning of articles
A script and functions were written to remove 'SCMP' articles, articles published before the protests, and other unwanted articles. A pickled DataFrame ```df_topic.p``` with additional preprocessing (tokenization, bigram creation, and lemmatization) is returned by running the ```prep_for_tm.py``` file.

In [None]:
# %run 'prep_for_tm.py'

# Load the preprocessed article DataFrame written by prep_for_tm.py.
# NOTE: pickle.load can execute arbitrary code -- only unpickle files that
# were generated locally by this project.
# The context manager closes the handle even if unpickling raises.
with open('df_topic.p', 'rb') as f:
    df = pickle.load(f)

In [None]:
# Text for topic modeling: the 'word_tokens' column of the preprocessed
# DataFrame. It feeds the dictionary, the bag-of-words corpus and the
# coherence computations in the cells that follow.
data = df['word_tokens']

In [None]:
# Vocabulary (token <-> integer id) built from every article's tokens.
id2word = corpora.Dictionary(data)

# Prune the vocabulary in place using document-frequency bounds
# (no_below=5, no_above=0.5) and cap it at 10,000 entries.
id2word.filter_extremes(no_below=5, no_above=0.5, keep_n=10000)

# Bag-of-words vector for each article, in the same order as `data`.
corpus = list(map(id2word.doc2bow, data))

In [None]:
# Number of articles; forwarded to compute_coherence_values and used as the
# LDA chunksize below.
num_docs = len(data)

## Topic modeling
Modeled on the collection of articles with standard LDA and LDA Mallet. There were local maxima for the LDA and LDA Mallet coherence values at 3 topics (0.388 and 0.455, respectively) that had far greater interpretability than the possible global maxima at 12 and 19 topics. The topics were manually labeled, in topic order, as economic, protests, political for LDA and as protests, political, economic for LDA Mallet. Ultimately the LDA Mallet model was chosen for its higher coherence values and smaller vocabulary overlap between topics.

Additionally, the original project tried to assign a topic to individual sentences; however, after looking at topic probabilities, the model had difficulty determining which topic was most probable (most topic probabilities by sentence were between 30-40%). By contrast, the model was more capable of finding a most probable topic (often >50% for certain articles) when evaluating an entire article.

Headlines did not appear to have much subjectivity or polarity.

In [None]:
import os

# Location of the MALLET launcher used by the LdaMallet wrapper below.
# Set the MALLET_PATH environment variable to point at a different install;
# the fallback is the original hard-coded path, so existing setups keep
# working unchanged.
mallet_path = os.environ.get(
    'MALLET_PATH',
    '/Users/waynelam/Documents/DevStuff/mallet-2.0.8/bin/mallet',
)

In [None]:
# Sweep the number of topics over range(start, limit, step) and collect one
# model plus one coherence value per setting (standard LDA: no mallet_path).
start, limit, step = 3, 20, 1
model_list, coherence_values = compute_coherence_values(
    id2word,
    corpus,
    data,
    start=start,
    limit=limit,
    step=step,
    num_docs=num_docs,
)

In [None]:
# Topic counts that were swept above, plotted against their coherence values.
x = range(start, limit, step)
plot_c_v(x, coherence_values)

In [None]:
# Standard LDA with 3 topics, trained with a fixed seed. chunksize equals the
# number of articles, so every pass processes the corpus as a single chunk.
# NOTE(review): minimum_probability=0.75 looks like it would hide every topic
# whose probability for a document is below 75% when querying this model --
# confirm this is intended. workers=6 is machine-specific.
lda_model = gensim.models.LdaMulticore(corpus=corpus,
                                       id2word=id2word,
                                       workers=6,
                                       num_topics=3,
                                       minimum_probability=0.75,
                                       random_state=100,
                                       chunksize=num_docs,
                                       passes=10)

In [None]:
# Show the topics of the standard LDA model for manual labeling.
pprint(lda_model.print_topics())

In [None]:
# Log-perplexity of the standard LDA model, evaluated on the training corpus.
print('\nPerplexity: ', lda_model.log_perplexity(corpus))

# c_v coherence of the same model over the tokenized articles.
coherence_model_lda = CoherenceModel(
    model=lda_model,
    texts=data,
    dictionary=id2word,
    coherence='c_v',
)
coherence_lda = coherence_model_lda.get_coherence()
print('\nCoherence Score: ', coherence_lda)

In [None]:
# Same topic-count sweep as for standard LDA, but passing mallet_path so the
# helper uses the Mallet implementation.
start, limit, step = 3, 20, 1
model_list, coherence_values = compute_coherence_values(
    id2word,
    corpus,
    data,
    start=start,
    limit=limit,
    step=step,
    num_docs=num_docs,
    mallet_path=mallet_path,
)

In [None]:
# Topic counts from the Mallet sweep plotted against their coherence values.
x = range(start, limit, step)
plot_c_v(x, coherence_values)

In [None]:
# Final 3-topic LDA Mallet model, seeded for reproducibility.
# NOTE(review): the gensim.models.wrappers namespace is not available in
# every gensim release -- confirm the pinned gensim version provides it.
ldamallet = gensim.models.wrappers.LdaMallet(
    mallet_path,
    corpus=corpus,
    random_seed=100,
    num_topics=3,
    topic_threshold=0.75,
    workers=6,
    id2word=id2word,
)
ldamallet.print_topics()

In [None]:
# c_v coherence of the Mallet model, for comparison with standard LDA.
coherence_model_ldamallet = CoherenceModel(
    model=ldamallet, texts=data, dictionary=id2word, coherence='c_v'
)
coherence_ldamallet = coherence_model_ldamallet.get_coherence()
print('\nCoherence Score: ', coherence_ldamallet)

In [None]:
# Convert the Mallet wrapper into a regular gensim LDA model object; this is
# the model that is pickled and used for per-article topic inference below.
mallet_model = gensim.models.wrappers.ldamallet.malletmodel2ldamodel(ldamallet)

In [None]:
# file = open('mallet.p', 'wb')      
# pickle.dump(mallet_model, file)
# file.close()

In [None]:
# Reload the converted Mallet model from disk instead of retraining it.
# NOTE: pickle.load can execute arbitrary code -- only unpickle files that
# were generated locally by this project.
# The context manager closes the handle even if unpickling raises.
with open('mallet.p', 'rb') as file:
    mallet_model = pickle.load(file)

In [None]:
# Run topic inference on each article's tokens. The column is indexed as
# x[k][1] in the next cell, i.e. one (id, probability)-like pair per topic.
df['topics'] = df['word_tokens'].map(lambda x: topics_in_doc(x, id2word, mallet_model))

In [None]:
# Spread the per-article topic output into one probability column per topic.
# Position k of df['topics'] is read as topic k, in the manually assigned
# label order protest, political, economic.
# NOTE(review): assumes topics_in_doc always returns all three topics in
# topic-id order -- verify against the helper.
for position, label in enumerate(('protest', 'political', 'economic')):
    df[label] = df['topics'].map(lambda pairs, k=position: pairs[k][1])

In [None]:
# Map polarity and subjectivity to individual features.
# TextBlob analysis is the expensive step, so run it once per article and
# split the resulting (polarity, subjectivity) pair afterwards rather than
# analyzing every body twice.
body_sentiment = df['body'].map(lambda text: TextBlob(text).sentiment)
df['polarity'] = body_sentiment.map(lambda pair: pair[0])
df['subjectivity'] = body_sentiment.map(lambda pair: pair[1])

In [None]:
# Keep only the article metadata and the derived topic/sentiment features;
# intermediate columns such as 'word_tokens' and 'topics' are dropped here.
df = df[['headline', 'body', 'url', 'date', 'source', 'protest', 'political', 'economic', 'polarity', 'subjectivity']]

In [None]:
# file = open('df4cluster.p', 'wb')
# pickle.dump(df, file)
# file.close()

## Topic and Sentiment EDA

In [43]:
# Reload the feature DataFrame saved after topic and sentiment extraction.
# NOTE: pickle.load can execute arbitrary code -- only unpickle files that
# were generated locally by this project.
# The context manager closes the handle even if unpickling raises.
with open('df4cluster.p', 'rb') as file:
    df = pickle.load(file)

In [36]:
# Replace polarity by its magnitude, discarding the positive/negative sign.
df['polarity'] = df['polarity'].abs()

In [44]:
# Shorten the one long outlet name so labels match the other sources.
df['source'] = df['source'].mask(df['source'] == 'ABC (Australia)', 'ABC')

### Significant Events:
- Major spikes in coverage, sentiment, and topic compositions appear to coincide with major events:
    - Early July coincides with the storming of the Legislative Council
    - Mid August coincides with more aggressive police tactics (beanbag rounds, increased tear gas, etc.)
    - Mid October coincides with the banning of facemasks and other regulations
    - Mid November coincides with university sieges and increasing violence

In [160]:
# Article counts over time, coloured by outlet, using 293 bins.
fig = px.histogram(df, x='date', color='source', nbins=293)
fig.update_layout(title=dict(text='Number of Articles by Day (293 days)'))
fig.show()

In [161]:
# Outlets paired with the colour used for them in multi-source figures.
_source_palette = [
    ('ABC', '#636EFA'),
    ('CCTV', '#EF553B'),
    ('CNN', '#00CC96'),
    ('Reuters', '#AB63FA'),
]
sources = [name for name, _ in _source_palette]
colors = [shade for _, shade in _source_palette]

# Feature groups plotted in the EDA figures below.
topics = ['economic', 'political', 'protest']
sentiments = ['polarity', 'subjectivity']

In [162]:
# One figure per outlet: daily mean topic shares smoothed with a 7-row
# rolling mean.
# NOTE(review): rolling(7) counts rows (days that have at least one article),
# not calendar days -- confirm this matches the "7-day" wording in the title.
for source in sources:
    # Average only the topic columns: a bare .mean() would also try to
    # average the text columns, which raises on recent pandas versions.
    # A dedicated name is used so the `data` token series from the topic
    # modeling section is not overwritten.
    daily = (
        df.loc[df['source'] == source]
        .groupby('date')[topics]
        .mean()
        .rolling(7)
        .mean()
        .reset_index()
    )
    fig = go.Figure()
    for topic in topics:
        fig.add_trace(
            go.Scatter(
                x=daily['date'],
                y=daily[topic],
                mode='lines+markers',
                name=topic,
            )
        )
    # The title does not depend on the topic, so set it once per figure.
    fig.update_layout(
        title={
            'text': f'Topic Composition of Average {source} Article by Day (7-day Rolling Window)',
        }
    )

    fig.show()

In [163]:
# Side-by-side panels, one per entry of `sentiments` and in the same order.
# The second panel plots 'subjectivity', so it is titled accordingly.
fig = make_subplots(
    rows=1,
    cols=2,
    subplot_titles=['Polarity', 'Subjectivity'],
)

# The smoothed daily means do not depend on the panel being drawn, so compute
# them once per source. Selecting the sentiment columns before .mean() avoids
# averaging text columns, which raises on recent pandas versions.
smoothed_by_source = {
    source: (
        df.loc[df['source'] == source]
        .groupby('date')[sentiments]
        .mean()
        .rolling(7)
        .mean()
        .reset_index()
    )
    for source in sources
}

for col, sentiment in enumerate(sentiments, start=1):
    for source, color in zip(sources, colors):
        smoothed = smoothed_by_source[source]
        fig.add_trace(
            go.Scatter(
                x=smoothed['date'],
                y=smoothed[sentiment],
                mode='lines+markers',
                name=source,
                marker=dict(color=color),
                # Each source appears in both panels; list it in the legend
                # only once (from the first panel).
                showlegend=(col == 1),
            ),
            row=1,
            col=col,
        )

fig.show()

In [164]:
# Mean of every numeric feature per outlet. numeric_only=True skips the text
# columns (headline, body, url), which a bare .mean() cannot average on
# recent pandas versions.
metrics_by_source = df.groupby('source').mean(numeric_only=True)

In [165]:
# Horizontal stacked bar: mean topic proportions of an article, per outlet.
fig = go.Figure()
annotations = []

# Topic labels sit above the plot area (yref='paper', y=1.1), so they are
# centred over the segments of the top bar. Plotly draws the first category
# at the bottom of a horizontal bar chart, so the top bar is the last source.
left_edge = 0.0  # x position where the next segment of the top bar starts

for topic in topics:
    # .loc gives the scalar mean directly instead of calling float() on a
    # boolean-masked one-element Series.
    source_data = [
        round(float(metrics_by_source.loc[source, topic]), 3)
        for source in sources
    ]
    fig.add_trace(go.Bar(
        name=f'{topic.capitalize()}',
        x=source_data,
        y=sources,
        orientation='h',
    ))

    # Centre the label over this topic's segment: width of all previously
    # stacked segments plus half of the current one.
    top_value = source_data[-1]
    annotations.append(dict(
        xref='x',
        yref='paper',
        x=left_edge + top_value / 2,
        y=1.1,
        text=f'{topic.capitalize()}',
        showarrow=False,
        font=dict(
            family='Arial',
            size=14,
            color='rgb(67, 67, 67)',
        ),
    ))
    left_edge += top_value

fig.update_layout(
    xaxis=dict(
        showgrid=False,
        showline=False,
        showticklabels=False,
        zeroline=False,
    ),
    yaxis=dict(
        showgrid=False,
        showline=False,
        showticklabels=True,
        zeroline=False,
    ),
    barmode='stack',
    paper_bgcolor='rgb(255, 255, 255)',
    plot_bgcolor='rgb(255, 255, 255)',
    xaxis_title='Topic Proportion of Average Article by Source',
    annotations=annotations,
)

fig.show()

### Statistically Significant Differences:
- For each metric a One-Way ANOVA was run to determine if there was a statistically significant difference between source metric means:
    - $H_{0}$: $\mu_{1}$ = $\mu_{2}$ = $\mu_{3}$ = $\mu_{4}$
    - $H_{A}$: At least one source mean differs
    - $\alpha$ = 0.05
- All 5 null hypotheses were rejected
- 'Polarity' and 'subjectivity' appear unimodal

In [110]:
# One-way ANOVA per metric: do the four outlets share the same mean?
metrics = ['protest', 'political', 'economic', 'polarity', 'subjectivity']
alpha = 0.05
outlets = ('ABC', 'CCTV', 'CNN', 'Reuters')

for metric in metrics:
    # One sample of the metric per outlet, in the fixed outlet order.
    samples = [df.loc[df['source'] == outlet, metric] for outlet in outlets]
    f_stat, p_value = stats.f_oneway(*samples)

    print('Metric:', metric)
    print('F-Statistic:', f_stat)
    print('P-Value:', p_value)
    print('Reject Null' if p_value < alpha else 'Cannot Reject Null')
    print(30 * '-')

Metric: protest
F-Statistic: 11.961832620992647
P-Value: 1.1485294659482508e-07
Reject Null
------------------------------
Metric: political
F-Statistic: 115.05091320760104
P-Value: 7.037753253379389e-62
Reject Null
------------------------------
Metric: economic
F-Statistic: 38.50715476142313
P-Value: 3.062004048707377e-23
Reject Null
------------------------------
Metric: polarity
F-Statistic: 3.1356642690879686
P-Value: 0.024895003714769572
Reject Null
------------------------------
Metric: subjectivity
F-Statistic: 18.837710726095203
P-Value: 8.238657939633586e-12
Reject Null
------------------------------


In [104]:
# Distribution of article subjectivity, coloured by outlet.
fig = px.histogram(
    df,
    x='subjectivity',
    color='source',
    nbins=100,
)
fig.show()

In [105]:
# Distribution of article polarity, coloured by outlet.
fig = px.histogram(
    df,
    x='polarity',
    color='source',
    nbins=100,
)
fig.show()

In [109]:
# Pairwise scatter plots of all topic and sentiment features.
fig = px.scatter_matrix(
    df,
    dimensions=[
        'protest',
        'political',
        'economic',
        'polarity',
        'subjectivity',
    ],
)
fig.show()

## Clustering

In [111]:
# Clustering uses only the three topic proportions as features.
metrics = ['protest', 'political', 'economic']
X = df.loc[:, metrics]

In [112]:
# Candidate preprocessing steps to compare in the clustering search.
preprocess = [StandardScaler(), MinMaxScaler(), RobustScaler(), Normalizer()]

# Raw features first, followed by one transformed copy per step, in the same
# order as `preprocess`.
scaled_Xs = [X, *(step.fit_transform(X) for step in preprocess)]

In [113]:
# Labels for the entries of scaled_Xs, in the same order.
p_labels = ['Raw', 'Standard Scaler', 'MinMax Scaler', 'Robust Scaler', 'Normalizer']

# Each best_* tuple holds (silhouette score, preprocessing label, n_clusters).
# Silhouette scores lie in [-1, 1], so start below that range; a start value
# of 0 would leave the result at (0, None, None) if every score were negative.
best_hscore = (float('-inf'), None, None)
best_kscore = (float('-inf'), None, None)

# Search 2-9 clusters for every preprocessing variant with both algorithms.
# NOTE(review): silhouette scores are compared across differently scaled
# feature spaces -- confirm that comparison is intended.
for n in range(2, 10):
    for p_label, features in zip(p_labels, scaled_Xs):
        # Ward linkage uses Euclidean distance, which is already the default.
        # The explicit `affinity` keyword is omitted because newer
        # scikit-learn releases no longer accept it.
        hc = AgglomerativeClustering(n_clusters=n, linkage='ward')
        # Fixed seed (same value as the LDA models) so the reported best
        # configuration is reproducible between runs.
        kc = KMeans(n_clusters=n, random_state=100)

        h_score = silhouette_score(features, hc.fit_predict(features))
        k_score = silhouette_score(features, kc.fit_predict(features))

        if h_score > best_hscore[0]:
            best_hscore = (h_score, p_label, n)
        if k_score > best_kscore[0]:
            best_kscore = (k_score, p_label, n)

print('Hierarchical Clustering:')
print(f'Best Score - {best_hscore[0]}')
print(f'Best Scale - {best_hscore[1]}')
print(f'n-clusters - {best_hscore[2]}')
print(30*'-')
print('K-Means Clustering:')
print(f'Best Score - {best_kscore[0]}')
print(f'Best Scale - {best_kscore[1]}')
print(f'n-clusters - {best_kscore[2]}')

Hierarchical Clustering:
Best Score - 0.5542620366196466
Best Scale - Robust Scaler
n-clusters - 2
------------------------------
K-Means Clustering:
Best Score - 0.5701576419445975
Best Scale - Robust Scaler
n-clusters - 2


In [114]:
# Fit the preprocessing variant that scored best above (Robust Scaler), then
# apply it to the same features.
scaler = RobustScaler().fit(X)
X1 = scaler.transform(X)

In [115]:
# Final two-cluster models.
# Ward linkage uses Euclidean distance, which is already the default; the
# explicit `affinity` keyword is omitted because newer scikit-learn releases
# no longer accept it.
hc_model = AgglomerativeClustering(n_clusters=2, linkage='ward')
# Fixed seed (same value as the LDA models) so cluster assignments are
# reproducible between runs.
kc_model = KMeans(n_clusters=2, random_state=100)

In [116]:
# Fit both models and turn each label vector into a single column so it can
# be concatenated with the feature matrix. reshape(-1, 1) infers the row
# count, so this keeps working when the number of articles is not 799.
y_hc = hc_model.fit(X1).labels_.reshape(-1, 1)
y_kc = kc_model.fit(X1).labels_.reshape(-1, 1)

In [117]:
# Column names for the two label vectors; `models` becomes the first two
# column names of mod_lab_df in the next cell.
models = ['Hierarchical', 'K-Means']
labels = [y_hc, y_kc]

In [118]:
# Cluster labels and scaled topic features in one frame, one row per article.
# The labels are stored as floats because they share an array with X1.
mod_lab_df = pd.DataFrame(
    np.concatenate((y_hc, y_kc, X1), axis=1),
    columns=models + metrics,
)
# Rows follow df's row order but carry a fresh 0..n-1 index. Assign the
# sources positionally so an index on df that is not 0..n-1 (e.g. after
# filtering articles) cannot misalign them or introduce NaN values.
mod_lab_df['source'] = df['source'].values

### Clustering Notes:
- As illustrated below, there were 2 clusters: highly economic articles and less economic articles
- Most inter-source differences do not seem to be attributable to article clusters
- Reuters has the most highly economic articles (already illustrated by mean article topic proportions)

In [146]:
# Scatter plot for every unordered pair of clustering features, coloured by
# the K-Means label.
pairs = [
    (x_name, y_name)
    for i, x_name in enumerate(metrics)
    for y_name in metrics[i + 1:]
]
subplot_titles = [f'{y_name} vs. {x_name}' for x_name, y_name in pairs]

# One column per pair. The column comes from the pair's position rather than
# from i + j, which is only a valid column number for exactly three metrics.
fig = make_subplots(rows=1, cols=len(pairs), subplot_titles=subplot_titles)

for col, (x_name, y_name) in enumerate(pairs, start=1):
    fig.add_trace(
        go.Scatter(
            x=mod_lab_df[x_name],
            y=mod_lab_df[y_name],
            mode='markers',
            marker=dict(color=mod_lab_df['K-Means']),
            showlegend=False,
        ),
        row=1,
        col=col,
    )

fig.show()

In [120]:
# Mean scaled topic features per K-Means cluster. numeric_only=True skips the
# string 'source' column, which a bare .mean() cannot average on recent
# pandas versions.
mod_lab_df.drop(columns='Hierarchical').groupby('K-Means').mean(numeric_only=True)

Unnamed: 0_level_0,protest,political,economic
K-Means,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
0.0,0.305817,0.234378,-0.087839
1.0,-0.511935,-0.569541,2.168645


In [147]:
# Number of articles in each K-Means cluster.
mod_lab_df['K-Means'].value_counts()

0.0    638
1.0    161
Name: K-Means, dtype: int64

In [148]:
# Mean scaled topic features per hierarchical cluster. numeric_only=True
# skips the string 'source' column, which a bare .mean() cannot average on
# recent pandas versions.
mod_lab_df.drop(columns='K-Means').groupby('Hierarchical').mean(numeric_only=True)

Unnamed: 0_level_0,protest,political,economic
Hierarchical,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
0.0,0.252494,0.189999,0.047944
1.0,-0.620572,-0.731298,2.54601


In [149]:
# Number of articles in each hierarchical cluster.
mod_lab_df['Hierarchical'].value_counts()

0.0    697
1.0    102
Name: Hierarchical, dtype: int64

In [157]:
# K-Means cluster membership counts broken down by outlet.
mod_lab_df.groupby('source')['K-Means'].value_counts()

source   K-Means
ABC      0.0         94
         1.0          8
CCTV     0.0        174
         1.0          8
CNN      0.0         70
         1.0         11
Reuters  0.0        300
         1.0        134
Name: K-Means, dtype: int64

In [158]:
# Hierarchical cluster membership counts broken down by outlet.
mod_lab_df.groupby('source')['Hierarchical'].value_counts()

source   Hierarchical
ABC      0.0              99
         1.0               3
CCTV     0.0             180
         1.0               2
CNN      0.0              78
         1.0               3
Reuters  0.0             340
         1.0              94
Name: Hierarchical, dtype: int64