# Topic Modeling and Sentiment Analysis Rework

In [189]:
import numpy as np
import pandas as pd
import pickle
import gensim
import matplotlib.pyplot as plt
import plotly.express as px
import plotly.graph_objects as go

from plotly.subplots import make_subplots
from textblob import TextBlob
from pprint import pprint
from gensim import corpora, models
from gensim.models import CoherenceModel

from topic_model import compute_coherence_values, plot_c_v, topics_in_doc

## Changes to the final project
In the original "the Language in News" project, dominant-topic bias and class imbalance possibly hindered potential insights.  In an attempt to improve topic coherence and clustering, several methods and changes will be employed, including:
- Class imbalance will be dealt with by removing all South China Morning Post (SCMP) articles since it is the only local news source and produced various forms of bias in the data.
- Certain features (i.e. sentiment scores of topics) will be scaled in order to help alleviate bias from dominant topics.
- Entire articles will be used instead of only the first 10 sentences.
- A bigram model will be created across all articles instead of per article (the per-article approach was incorrectly applied in the final project).

## Additional cleaning of articles
A script and functions were written to remove 'SCMP' articles, articles published before the protests, and other unwanted articles.  Running the ```prep_for_tm.py``` file returns a pickled DataFrame ```df_topic.p``` with additional preprocessing (tokenization, bigram creation, and lemmatization).

In [None]:
# %run 'prep_for_tm.py'

# Load the preprocessed articles produced by prep_for_tm.py.  The context
# manager closes the file even if unpickling raises.
# NOTE(review): pickle.load can execute arbitrary code from the file; only
# load pickles generated locally by this project.
with open('df_topic.p', 'rb') as f:
    df = pickle.load(f)

In [None]:
# Text for topic modeling: the 'word_tokens' column of the preprocessed
# DataFrame (presumably one token list per article — it is fed straight to
# corpora.Dictionary and doc2bow below).
data = df['word_tokens']

In [None]:
# Build the vocabulary from the tokenised articles, prune it, then encode
# every article as a bag of words.
id2word = corpora.Dictionary(data)
# Drop tokens found in fewer than 5 documents or in more than half of them,
# and cap the vocabulary at 10,000 entries.
id2word.filter_extremes(keep_n=10000, no_above=0.5, no_below=5)
corpus = list(map(id2word.doc2bow, data))

In [None]:
# Number of articles; passed to the coherence sweeps and used as the
# chunksize of the LDA model below.
num_docs = len(data)

## Topic modeling
The collection of articles was modeled with standard LDA and LDA Mallet.  There were local maxima in the LDA and LDA Mallet coherence values at 3 topics (0.388 and 0.455, respectively), and these models were far more interpretable than those at the possible global maxima at 12 and 19 topics.  The three topics were manually labeled as economic, protests, and political for LDA, and as protests, political, and economic for LDA Mallet.  Ultimately the LDA Mallet model was chosen for its higher coherence values and smaller vocabulary overlap between topics.

Additionally, the original project tried to assign a topic to individual sentences; however, after looking at topic probabilities, the model had difficulty determining which topic was most probable (most topic probabilities by sentence were between 30-40%).  In contrast, the model was more capable of finding a most probable topic (often >50% for certain articles) when evaluating an entire article.

Headlines did not appear to have much subjectivity or polarity.

In [None]:
import os

# Location of the MALLET launcher script.  Set the MALLET_PATH environment
# variable to point at a local install; the original author's path remains
# the fallback so existing setups keep working unchanged.
mallet_path = os.environ.get(
    'MALLET_PATH',
    '/Users/waynelam/Documents/DevStuff/mallet-2.0.8/bin/mallet',
)

In [None]:
# Coherence sweep over candidate topic counts (range(start, limit, step)).
# No mallet_path is passed here — presumably this selects the standard LDA
# branch of compute_coherence_values; verify against topic_model.py.
start, limit, step = 3, 20, 1
sweep_kwargs = dict(start=start, limit=limit, step=step, num_docs=num_docs)
model_list, coherence_values = compute_coherence_values(
    id2word, corpus, data, **sweep_kwargs
)

In [None]:
# Plot the coherence values against the topic counts tried in the sweep.
x = range(start, limit, step)
plot_c_v(x, coherence_values)

In [None]:
# Fit the 3-topic standard LDA model with a fixed seed.
# chunksize equals the corpus size, so each pass sees all articles as a
# single chunk.
# NOTE(review): minimum_probability=0.75 hides any topic below 75% when the
# model reports a document's topics — confirm this is intended with only
# three topics.
lda_params = dict(
    corpus=corpus,
    id2word=id2word,
    num_topics=3,
    passes=10,
    chunksize=num_docs,
    minimum_probability=0.75,
    random_state=100,
    workers=6,
)
lda_model = models.LdaMulticore(**lda_params)

In [None]:
# Pretty-print the topics of the fitted LDA model for manual labeling.
pprint(lda_model.print_topics())

In [None]:
# log_perplexity returns the per-word likelihood bound, not the perplexity
# itself; gensim derives its perplexity estimate as 2 ** (-bound).  Report
# both so neither number is mislabeled.
per_word_bound = lda_model.log_perplexity(corpus)
print('\nPer-word likelihood bound: ', per_word_bound)
print('Perplexity: ', np.exp2(-per_word_bound))

# c_v topic coherence of the LDA model, scored against the tokenised articles.
coherence_model_lda = CoherenceModel(
    model=lda_model,
    texts=data,
    dictionary=id2word,
    coherence='c_v',
)
coherence_lda = coherence_model_lda.get_coherence()
print('\nCoherence Score: ', coherence_lda)

In [None]:
# Same coherence sweep, this time handing mallet_path to the helper —
# presumably that switches compute_coherence_values to the LDA Mallet
# wrapper; verify against topic_model.py.
# Note: this rebinds model_list and coherence_values from the earlier sweep.
start, limit, step = 3, 20, 1
mallet_sweep_kwargs = dict(
    start=start,
    limit=limit,
    step=step,
    num_docs=num_docs,
    mallet_path=mallet_path,
)
model_list, coherence_values = compute_coherence_values(
    id2word, corpus, data, **mallet_sweep_kwargs
)

In [None]:
# Plot the coherence values against the topic counts tried in the sweep.
x = range(start, limit, step)
plot_c_v(x, coherence_values)

In [None]:
# Fit the 3-topic LDA Mallet model with a fixed seed and show its topics.
# NOTE(review): gensim.models.wrappers was removed in gensim 4.0, so this
# cell needs gensim < 4 — confirm the pinned version.
mallet_params = dict(
    corpus=corpus,
    id2word=id2word,
    num_topics=3,
    topic_threshold=0.75,
    random_seed=100,
    workers=6,
)
ldamallet = gensim.models.wrappers.LdaMallet(mallet_path, **mallet_params)
ldamallet.print_topics()

In [None]:
# c_v topic coherence of the Mallet model, scored against the tokenised
# articles.
coherence_model_ldamallet = CoherenceModel(
    coherence='c_v', dictionary=id2word, texts=data, model=ldamallet
)
coherence_ldamallet = coherence_model_ldamallet.get_coherence()
print('\nCoherence Score: ', coherence_ldamallet)

In [None]:
# Convert the Mallet wrapper into a gensim LDA model; this converted model
# is what gets pickled to mallet.p and used to score articles below.
mallet_model = gensim.models.wrappers.ldamallet.malletmodel2ldamodel(ldamallet)

In [None]:
# One-time export of the converted model (uncomment to regenerate mallet.p):
# with open('mallet.p', 'wb') as file:
#     pickle.dump(mallet_model, file)

In [None]:
# Reload the previously exported model from mallet.p so Mallet does not have
# to be re-run.  The context manager closes the file even if unpickling
# raises.
# NOTE(review): pickle.load can execute arbitrary code from the file; only
# load pickles generated locally by this project.
with open('mallet.p', 'rb') as file:
    mallet_model = pickle.load(file)

In [None]:
def _article_topics(tokens):
    # Score one article's tokens with the converted Mallet model; the result
    # is presumably a sequence of (topic, probability) pairs, judging by how
    # the next cell indexes it — verify against topic_model.topics_in_doc.
    return topics_in_doc(tokens, id2word, mallet_model)


df['topics'] = df['word_tokens'].map(_article_topics)

In [None]:
# Map topic probabilities to individual features: the entry at position 0,
# 1 and 2 of each article's topic list becomes the protest, political and
# economic column respectively.
# NOTE(review): this indexes by position, so it assumes topics_in_doc always
# returns all three topics in topic-id order — verify against the helper.
for slot, feature in enumerate(('protest', 'political', 'economic')):
    df[feature] = df['topics'].map(lambda pairs, slot=slot: pairs[slot][1])

In [None]:
# Map polarity and subjectivity to individual features.
# Each article body is analysed once and both scores are read from the same
# result, rather than running TextBlob twice per article.
body_sentiment = [TextBlob(text).sentiment for text in df['body']]
df['polarity'] = [scores[0] for scores in body_sentiment]
df['subjectivity'] = [scores[1] for scores in body_sentiment]

In [None]:
# Keep only the article metadata plus the engineered topic and sentiment
# features; intermediate columns (tokens, raw topic lists) are dropped.
metadata_columns = ['headline', 'body', 'url', 'date', 'source']
topic_columns = ['protest', 'political', 'economic']
sentiment_columns = ['polarity', 'subjectivity']
df = df[metadata_columns + topic_columns + sentiment_columns]

In [None]:
# One-time export of the feature table (uncomment to regenerate df4cluster.p):
# with open('df4cluster.p', 'wb') as file:
#     pickle.dump(df, file)

## Topic and Sentiment EDA

In [175]:
# Load the saved feature table for the EDA below.  The context manager
# closes the file even if unpickling raises.
# NOTE(review): pickle.load can execute arbitrary code from the file; only
# load pickles generated locally by this project.
with open('df4cluster.p', 'rb') as file:
    df = pickle.load(file)

In [176]:
# Discard the sign of the polarity score so the column holds its magnitude.
df['polarity'] = df['polarity'].abs()

In [177]:
# Shorten the 'ABC (Australia)' source label to 'ABC'; all other sources are
# left as they are.
is_abc_australia = df['source'] == 'ABC (Australia)'
df.loc[is_abc_australia, 'source'] = 'ABC'

### Significant Events:
- Major spikes in coverage, sentiment, and topic compositions appear to coincide with major events:
    - Early July coincides with the storming of the Legislative Council
    - Mid August coincides with more aggressive police tactics (beanbag rounds, increased tear gas, etc.)
    - Mid October coincides with the banning of face masks and other regulations
    - Mid November coincides with university sieges and increasing violence

In [190]:
# Histogram of article dates, colored by news source.  The bin count is
# hard-coded to 293, matching the day count quoted in the title.
fig = px.histogram(df, x='date', color='source', nbins=293)
fig.update_layout(title_text='Number of Articles by Day (293 days)')
fig.show()

In [200]:
# Shared plot configuration.  Each news source is paired with the line color
# used for it; dict insertion order keeps `sources` and `colors` aligned.
source_colors = {
    'Reuters': '#636EFA',
    'CNN': '#EF553B',
    'CCTV': '#00CC96',
    'ABC': '#AB63FA',
}
sources = list(source_colors)
colors = list(source_colors.values())

# Feature columns plotted per source.
topics = ['economic', 'political', 'protest']
sentiments = ['polarity', 'subjectivity']

In [192]:
# One figure per news source: the rolling mean (window of 7 rows) of the
# daily average topic probabilities.
# NOTE(review): rolling(7) spans seven *rows*; dates on which a source
# published nothing are absent after the groupby, so a window may cover more
# than seven calendar days — confirm this matches the "7-day" title.
for source in sources:
    # Average only the topic columns: the frame also holds text columns
    # (headline, body, url, source), and averaging those raises TypeError in
    # pandas >= 2.0.
    # A dedicated name is used instead of `data` so the token corpus bound to
    # `data` for the topic-model cells is not overwritten when cells are
    # re-run.
    daily_topics = (
        df.loc[df['source'] == source]
        .groupby('date')[topics]
        .mean()
        .rolling(7)
        .mean()
        .reset_index()
    )
    fig = go.Figure()
    for topic in topics:
        fig.add_trace(
            go.Scatter(
                x=daily_topics['date'],
                y=daily_topics[topic],
                mode='lines+markers',
                name=topic,
            )
        )
    # The title does not depend on the topic, so it is set once per figure.
    fig.update_layout(
        title={
            'text': f'Topic Composition of Average {source} Article by Day (7-day Rolling Window)',
        }
    )
    fig.show()

In [204]:
# Side-by-side subplots, one per sentiment feature, each with one line per
# news source.  Titles are derived from `sentiments` so they always name the
# column actually plotted (the second panel shows subjectivity).
fig = make_subplots(
    rows=1,
    cols=len(sentiments),
    subplot_titles=[sentiment.capitalize() for sentiment in sentiments],
)

# Rolling mean (window of 7 rows) of the daily average sentiment, computed
# once per source and reused for every subplot.  Only the sentiment columns
# are averaged: the frame also holds text columns, and averaging those raises
# TypeError in pandas >= 2.0.  The name `data` is deliberately not reused so
# the token corpus bound to it for the topic-model cells stays intact.
rolling_by_source = {
    source: (
        df.loc[df['source'] == source]
        .groupby('date')[sentiments]
        .mean()
        .rolling(7)
        .mean()
        .reset_index()
    )
    for source in sources
}

for col, sentiment in enumerate(sentiments, start=1):
    for source, color in zip(sources, colors):
        daily_sentiment = rolling_by_source[source]
        fig.add_trace(
            go.Scatter(
                x=daily_sentiment['date'],
                y=daily_sentiment[sentiment],
                mode='lines+markers',
                name=source,
                marker=dict(color=color),
                # Each source appears in every subplot; list it in the legend
                # only once (from the first subplot).
                showlegend=(col == 1),
            ),
            row=1,
            col=col,
        )
fig.show()

### Statistical Significance Notes:
- 