# Feature Engineering and EDA

In [None]:
import re
import numpy as np
import pandas as pd
import pickle
import gensim
import spacy
import pyLDAvis
import pyLDAvis.gensim
import matplotlib.pyplot as plt
import seaborn as sns
from pprint import pprint
from gensim.utils import simple_preprocess
from gensim import corpora, models
from gensim.parsing.preprocessing import STOPWORDS
from nltk.corpus import stopwords
from gensim.models import CoherenceModel
from nltk.tokenize import sent_tokenize
from textblob import TextBlob
from vaderSentiment.vaderSentiment import SentimentIntensityAnalyzer
from sklearn.preprocessing import MinMaxScaler, StandardScaler, RobustScaler
from sklearn.decomposition import PCA

import fe

%matplotlib inline

In [None]:
# Load the pickled articles DataFrame. The context manager closes the file
# even if unpickling raises.
# NOTE: pickle.load can execute arbitrary code -- only load trusted files.
with open('articles.p', 'rb') as file:
    df = pickle.load(file)

In [None]:
# Remove, in place, every article dated before 15 March 2019.
cutoff = pd.Timestamp(2019, 3, 15)
drop_indices = df.index[df['date'] < cutoff]
df.drop(index=drop_indices, inplace=True)

In [None]:
# Drop update articles, market wraps, press digests and off-topic site sections.
# The alternatives are combined into one pattern per column so each column is
# scanned once; `na=False` keeps rows whose headline/url is missing instead of
# raising on them.
headline_exclude = r'UPDATE|US STOCKS|PRESS'
url_exclude = (r'/education/|/politics/|/diplomacy/|/letters/|health-|'
               r'/money/|/transport/|investing|/society/')
excluded = (df['headline'].str.contains(headline_exclude, regex=True, na=False)
            | df['url'].str.contains(url_exclude, regex=True, na=False))

# reset_index(drop=True) renumbers the rows from 0 without creating a throwaway
# 'index' column, and returns a fresh frame rather than a slice of the original.
df = df.loc[~excluded].reset_index(drop=True)

In [None]:
# Display the number of remaining articles for each value of 'source'.
df['source'].value_counts()

In [None]:
# Set the 'url' column aside as its own Series and remove it from the frame.
df_urls = df.pop('url')

In [None]:
# file = open('df_urls.p', 'wb')      
# pickle.dump(df_urls, file)
# file.close()

## Add sentences and words features

In [None]:
# Additional cleaning of the article body: every 'body' value is replaced by
# the result of fe.replace_words (project helper; its exact substitutions are
# defined in the `fe` module, not here).
df['body'] = df['body'].map(fe.replace_words)

In [None]:
# Sentence feature: split each body into sentences and keep only the first ten.
df['sentences'] = df['body'].map(sent_tokenize).map(lambda sents: sents[:10])

In [None]:
# Run the project's sentence preprocessing over each article's sentence list.
sentence_tokens = df['sentences'].map(fe.preprocess_sent)
df['sentence_tokens'] = sentence_tokens

# Flatten the nested per-sentence results into one flat token list per
# article; this is the representation used for training.
df['word_tokens'] = sentence_tokens.map(
    lambda sents: [token for sent in sents for token in sent]
)

## Dictionary and bigram models
- This section heavily borrows from https://www.machinelearningplus.com/nlp/topic-modeling-gensim-python/

In [None]:
# Create bigrams in df['word_tokens'] (flat token list per article).
# fe.make_bigrams is a project helper; presumably it merges frequent word
# pairs into single tokens -- confirm in the `fe` module.
df['word_tokens'] = df['word_tokens'].map(fe.make_bigrams)

# Create bigrams in df['sentence_tokens'] using the sentence-level variant of
# the helper, since this column holds nested per-sentence token lists.
df['sentence_tokens'] = df['sentence_tokens'].map(fe.make_bigrams_sent)

In [None]:
# Lemmatize df['word_tokens'] (flat token list per article) with the project
# helper fe.lemmatization.
df['word_tokens'] = df['word_tokens'].map(fe.lemmatization)

# Lemmatize df['sentence_tokens'] with the sentence-level helper, since this
# column holds nested per-sentence token lists.
df['sentence_tokens'] = df['sentence_tokens'].map(fe.lemmatize_sent)

In [None]:
# Partition the articles by publisher: SCMP versus every other source.
is_scmp = df['source'] == 'SCMP'
df1 = df.loc[is_scmp]   # SCMP
df2 = df.loc[~is_scmp]  # Not SCMP

In [None]:
# Choose corpus for training: the topic models below are fitted on the
# lemmatized word tokens of the non-SCMP articles only (df2).
data_lemmatized = df2['word_tokens'] # Not SCMP

In [None]:
# Build the gensim dictionary (token <-> integer id) from the training documents.
id2word = corpora.Dictionary(data_lemmatized)

# Prune the vocabulary: drop tokens found in fewer than 15 documents or in
# more than half of them, then keep at most the 100000 most frequent tokens.
id2word.filter_extremes(no_below=15, no_above=0.5, keep_n=100000)

# The training documents double as the corpus texts.
texts = data_lemmatized

# Bag-of-words (term id, count) representation of every document.
corpus = list(map(id2word.doc2bow, texts))

In [None]:
# Create TF-IDF corpus: fit a TF-IDF model on the bag-of-words corpus, then
# apply it to that same corpus to obtain the re-weighted representation.
tfidf = models.TfidfModel(corpus)
corpus_tfidf = tfidf[corpus]

In [None]:
# Pickle id2word
# file = open('id2word.p', 'wb')      
# pickle.dump(id2word, file)
# file.close()

## LDA Models

### Standard LDA

In [None]:
# Train a 4-topic LDA model on the raw bag-of-words corpus using two worker
# processes; the fixed random_state keeps runs reproducible.
lda_params = dict(
    corpus=corpus,
    id2word=id2word,
    workers=2,
    num_topics=4,
    random_state=100,
    passes=10,
    per_word_topics=True,
)
lda_model = gensim.models.LdaMulticore(**lda_params)

In [None]:
# Pretty-print the topics of the standard LDA model, then apply the model to
# the bag-of-words corpus.
pprint(lda_model.print_topics())
doc_lda = lda_model[corpus]

In [None]:
# Report the standard LDA model's log perplexity on the training corpus.
print('\nPerplexity: ', lda_model.log_perplexity(corpus))

# Score the same model with the c_v coherence measure over the training texts.
coherence_model_lda = CoherenceModel(
    model=lda_model,
    texts=data_lemmatized,
    dictionary=id2word,
    coherence='c_v',
)
coherence_lda = coherence_model_lda.get_coherence()
print('\nCoherence Score: ', coherence_lda)

#### Notes:
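- Topics (the model above is trained with `num_topics=4`; only three are labelled here):
    - Topic 0: Protests
    - Topic 1: Economic
    - Topic 2: Government
    - Topic 3: (not labelled)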

### TF-IDF LDA

In [None]:
# Train a second 4-topic LDA model, this time on the TF-IDF weighted corpus,
# with four workers and small (100-document) training chunks.
tfidf_lda_params = dict(
    corpus=corpus_tfidf,
    id2word=id2word,
    workers=4,
    num_topics=4,
    chunksize=100,
    random_state=100,
    passes=10,
    per_word_topics=False,
)
tfidf_lda_model = gensim.models.LdaMulticore(**tfidf_lda_params)

In [None]:
# Pretty-print the topics of the TF-IDF LDA model, then infer topic
# distributions on the TF-IDF corpus: the model was trained on TF-IDF weights,
# so it should be applied to the same representation rather than raw counts.
pprint(tfidf_lda_model.print_topics())
doc_tfidf_lda = tfidf_lda_model[corpus_tfidf]

In [None]:
# Evaluate perplexity on the representation the model was trained on
# (TF-IDF weights); scoring it on raw counts gives a number that is not
# comparable with the training objective. The transformed corpus is
# materialised into a list because log_perplexity needs a sized chunk.
print('\nPerplexity: ', tfidf_lda_model.log_perplexity(list(corpus_tfidf)))

# c_v coherence depends only on the topics' top words and the reference texts.
coherence_model_tfidf_lda = CoherenceModel(model=tfidf_lda_model, texts=data_lemmatized, dictionary=id2word, coherence='c_v')
coherence_tfidf_lda = coherence_model_tfidf_lda.get_coherence()
print('\nCoherence Score: ', coherence_tfidf_lda)

#### Notes:
- Topics:
    - Topic 0: Government
    - Topic 1: Legal Issues
    - Topic 2: Economic
    - Topic 3: Protests

### LDA Mallet

In [None]:
import os

# Path to the MALLET launcher script. Set the MALLET_PATH environment variable
# to point at a local install; the original hard-coded location is kept as the
# fallback so existing setups keep working.
mallet_path = os.environ.get(
    'MALLET_PATH',
    '/Users/waynelam/Documents/DevStuff/mallet-2.0.8/bin/mallet',
)

In [None]:
# Train a 4-topic LDA model through gensim's MALLET wrapper, using the
# executable at `mallet_path` and a fixed seed for reproducibility.
# NOTE(review): `gensim.models.wrappers` only exists in older gensim releases
# -- confirm the pinned gensim version before upgrading.
ldamallet = gensim.models.wrappers.LdaMallet(mallet_path,
                                             corpus=corpus,
                                             random_seed=123,
                                             num_topics=4,
                                             id2word=id2word)

In [None]:
# Pretty-print the topics found by the MALLET model.
pprint(ldamallet.show_topics())

In [None]:
# Score the MALLET model with the c_v coherence measure over the training texts.
coherence_model_ldamallet = CoherenceModel(
    model=ldamallet, texts=data_lemmatized, dictionary=id2word, coherence='c_v'
)
coherence_ldamallet = coherence_model_ldamallet.get_coherence()
print('\nCoherence Score: ', coherence_ldamallet)

#### Notes:
- Topics:
    - Topic 0: Protest
    - Topic 1: Economy
    - Topic 2: Politics
    - Topic 3: Government

### Coherence

In [None]:
# Sweep over candidate topic counts with the project helper and collect the
# fitted models together with their coherence scores.
# NOTE(review): presumably one model/score per value in range(start, limit,
# step), i.e. 3..14 topics -- confirm against fe.compute_coherence_values.
model_list, coherence_values = fe.compute_coherence_values(id2word,
                                                           corpus,
                                                           data_lemmatized,
                                                           start=3,
                                                           limit=15,
                                                           step=1)

In [None]:
# Plot the coherence score against the number of topics. start/limit/step
# must match the values passed to fe.compute_coherence_values so that the
# x axis lines up with the scores.
limit = 15
start = 3
step = 1
x = range(start, limit, step)
plt.plot(x, coherence_values)
plt.xlabel("Num Topics")
plt.ylabel("Coherence score")
# The labels argument must be a sequence of strings: ("coherence_values")
# without the trailing comma is a plain string, which matplotlib iterates
# character by character and so labels the line "c".
plt.legend(("coherence_values",), loc='best')
plt.show()

In [None]:
# List the coherence score (rounded to 4 decimals) for each candidate topic count.
for n_topics, score in zip(x, coherence_values):
    print("Num Topics =", n_topics, " has Coherence Value of", round(score, 4))

In [None]:
# Use weights from LDA Mallet and transfer to standard LDA Model, so the
# result can be used with tooling that expects a native gensim LDA model.
mallet_model = gensim.models.wrappers.ldamallet.malletmodel2ldamodel(ldamallet)

In [None]:
# pyLDAvis.enable_notebook()
# vis = pyLDAvis.gensim.prepare(mallet_model, corpus, id2word)
# vis.show()

In [None]:
# Build the pyLDAvis visualisation before saving it: `vis` is otherwise
# undefined because the only cell that prepares it is commented out.
vis = pyLDAvis.gensim.prepare(mallet_model, corpus, id2word)
pyLDAvis.save_html(vis, 'topic_words.html')

In [None]:
# Pickle LDA Mallet model
# file = open('mallet.p', 'wb')      
# pickle.dump(mallet_model, file)
# file.close()

In [None]:
# file = open('ldavis.p', 'wb')      
# pickle.dump(vis, file)
# file.close()

## Sentiment

In [None]:
# Create column for sentiment analysis.
# Each row pairs the article's raw sentences (used for sentiment analysis)
# with its preprocessed sentence tokens (used for topic analysis) as a
# two-item list. zip pairs the rows positionally, so this does not depend on
# the index labels being exactly 0..n-1 the way `s[i]` lookups did.
df['combine'] = [
    [sentences, tokens]
    for sentences, tokens in zip(df['sentences'], df['sentence_tokens'])
]

In [None]:
# Score every article with the project helper fe.sentiment_doc, which receives
# the [sentences, sentence_tokens] pair stored in 'combine'.
# NOTE(review): the cells below read the result as x[topic][0] and x[topic][1]
# for topics 0-3 -- confirm that shape against fe.sentiment_doc.
df['sentiment'] = df['combine'].map(fe.sentiment_doc)

In [None]:
# Topic 0: Protests
# Topic 1: Econ
# Topic 2: Poli
# Topic 3: Gov

In [None]:
# Unpack the per-topic entries of df['sentiment'] into flat columns.
# Entry k of a sentiment value belongs to topic k: element 0 goes to the score
# column and element 1 to the matching *_mention column.
# Creation order (all score columns, then all mention columns) is preserved
# because later cells select columns by position.
score_columns = ['protest', 'econ', 'poli', 'gov']
mention_columns = ['protest_mention', 'econ_mention', 'poli_mention', 'gov_mention']

for topic_idx, column in enumerate(score_columns):
    df[column] = df['sentiment'].map(lambda x, k=topic_idx: x[k][0])

for topic_idx, column in enumerate(mention_columns):
    df[column] = df['sentiment'].map(lambda x, k=topic_idx: x[k][1])

In [None]:
# Total sentences column: element-wise sum of the four per-topic mention
# columns, added left to right starting from the protest column.
other_mentions = ['econ_mention', 'poli_mention', 'gov_mention']
df['total_sentences'] = sum((df[column] for column in other_mentions), df['protest_mention'])

## Data Visualization

In [None]:
# Weighted scores: each topic's sentiment score multiplied by that topic's
# share of the article's total mentions.
# The creation order (protest, econ, gov, poli) is kept because later cells
# select columns by position.
weighted_specs = [
    ('w_protest', 'protest', 'protest_mention'),
    ('w_econ', 'econ', 'econ_mention'),
    ('w_gov', 'gov', 'gov_mention'),
    ('w_poli', 'poli', 'poli_mention'),
]
for weighted_col, score_col, mention_col in weighted_specs:
    df[weighted_col] = df[score_col] * (df[mention_col] / df['total_sentences'])

In [None]:
# file = open('dataframe.p', 'wb')      
# pickle.dump(df, file)
# file.close()

In [None]:
# Rebuild df2 (replacing the earlier non-SCMP subset) from the column at
# position 3 plus every column from position 9 onwards, side by side.
# NOTE(review): later cells group df2 by 'source', so position 3 is presumably
# the 'source' column -- positional selection breaks if columns are reordered.
leading_col = df.iloc[:, 3:4]
trailing_cols = df.iloc[:, 9:]
df2 = pd.DataFrame(pd.concat([leading_col, trailing_cols], axis=1))

In [None]:
# Sentiment analysis of headline: the absolute value of VADER's compound
# score, so strongly positive and strongly negative headlines both rate high.
# `vader_analysis` is not defined anywhere in this notebook, so the imported
# SentimentIntensityAnalyzer is used directly; one analyzer is built up front
# and reused for every headline.
vader = SentimentIntensityAnalyzer()
df2['hl_sent'] = df['headline'].map(lambda x: abs(vader.polarity_scores(x)['compound']))

In [None]:
# Display the mean headline sentiment magnitude for each source.
df2.groupby('source')['hl_sent'].mean()

In [None]:
# Share of each article's total mentions that belongs to each topic.
ratio_specs = [
    ('protest_ratio', 'protest_mention'),
    ('econ_ratio', 'econ_mention'),
    ('poli_ratio', 'poli_mention'),
    ('gov_ratio', 'gov_mention'),
]
for ratio_col, mention_col in ratio_specs:
    df2[ratio_col] = df2[mention_col] / df2['total_sentences']

In [None]:
# Preview the first rows of the assembled feature frame.
df2.head()

### EDA

In [None]:
# Labels: the publishing source of every article.
y = df2['source']

# Features: the df2 columns at positions 1-4, scaled with RobustScaler.
X = RobustScaler().fit_transform(df2.iloc[:, 1:5].values)

In [None]:
# Two-component PCA, used to project the features onto a plane for plotting.
pca = PCA(n_components=2)

In [None]:
# Fit the PCA on the scaled features and project them onto the two components.
components = pca.fit_transform(X)

In [None]:
# Wrap the projected coordinates in a frame with one column per component.
comp_df = pd.DataFrame(components, columns=['pc1', 'pc2'])

In [None]:
# Attach the source labels to the projected points. Assignment aligns on index
# labels, so this relies on `y` sharing comp_df's default 0..n-1 index.
comp_df['source'] = y

In [None]:
# One boolean mask per news source, in the order cond1..cond5.
source_names = ('CNN', 'ABC (Australia)', 'CCTV', 'Reuters', 'SCMP')
cond1, cond2, cond3, cond4, cond5 = (
    comp_df['source'].eq(name) for name in source_names
)

In [None]:
# Display the per-source mean of every numeric feature. numeric_only=True
# keeps this working on pandas versions that raise on non-numeric columns
# instead of silently skipping them.
df2.groupby('source').mean(numeric_only=True)

In [None]:
# Scatter the two principal components for CCTV, Reuters and SCMP articles
# only, coloured by source.
fig = plt.figure(figsize=(10, 10))
sns.scatterplot(
    data=comp_df.loc[cond3 | cond4 | cond5],
    x='pc1',
    y='pc2',
    hue='source',
    alpha=0.5,
)
plt.show()

In [None]:
# Bar chart of the number of articles per source.
# Series.value_counts() replaces the deprecated top-level pd.value_counts()
# and matches how the counts are computed earlier in this notebook.
fig = plt.figure(figsize = (10,10))
df2['source'].value_counts().plot.bar()
plt.title('Article Counts by Source')
plt.show()

In [None]:
# Grouped bar chart of the mean topic ratios per source.
ratios = ['gov_ratio', 'econ_ratio', 'poli_ratio', 'protest_ratio']
fig, ax = plt.subplots(figsize=(15,7))
# Select the ratio columns before aggregating so that only they are averaged;
# averaging the whole frame first is wasteful and fails on pandas versions
# that raise on non-numeric columns.
df2.groupby('source')[ratios].mean().plot(ax=ax, kind='bar')
plt.title('Mean Topic Ratios of Articles by Source')
plt.xlabel('Source')
plt.ylabel('% of Article')
plt.show()

In [None]:
# Frame holding only the four topic ratios plus the source label.
# The column list is `ratios`: the original referenced `features`, which is
# not defined until a later cell and raises NameError on a top-to-bottom run.
# .copy() makes df_ratio an independent frame rather than a slice of df2.
ratios = ['protest_ratio', 'econ_ratio', 'poli_ratio', 'gov_ratio']
df_ratio = df2.loc[:, ratios + ['source']].copy()

In [None]:
# Display the per-source mean of every numeric feature. numeric_only=True
# keeps this working on pandas versions that raise on non-numeric columns
# instead of silently skipping them.
df2.groupby('source').mean(numeric_only=True)

In [None]:
# Second PCA input: the four topic-ratio columns, min-max scaled.
features = ['protest_ratio', 'econ_ratio', 'poli_ratio', 'gov_ratio']
y = df2['source']
X2 = MinMaxScaler().fit_transform(df2.loc[:, features].values)

In [None]:
# Refit the existing `pca` estimator on the ratio features and project them;
# fit_transform replaces whatever the estimator learned from its previous fit.
comp2 = pca.fit_transform(X2)

In [None]:
# Frame of the projected ratio features, with the source label attached.
comp2_df = pd.DataFrame(comp2, columns=['pc1', 'pc2']).assign(source=y)

In [None]:
# Rebuild the per-source boolean masks (cond1..cond5) against comp2_df.
source_names = ('CNN', 'ABC (Australia)', 'CCTV', 'Reuters', 'SCMP')
cond1, cond2, cond3, cond4, cond5 = (
    comp2_df['source'].eq(name) for name in source_names
)

In [None]:
# Scatter the two principal components of the ratio features for all sources.
fig = plt.figure(figsize=(10, 10))
sns.scatterplot(
    data=comp2_df,
    x='pc1',
    y='pc2',
    hue='source',
    alpha=0.5,
)
plt.show()

In [None]:
from wordcloud import WordCloud, STOPWORDS
import matplotlib.colors as mcolors

In [None]:
# Word cloud visualization: one cloud per topic of the converted Mallet model,
# drawn on a 2x2 grid with at most 10 words per topic.
cols = list(mcolors.TABLEAU_COLORS.values())  # more colors: 'mcolors.XKCD_COLORS'

# `stop_words` is not defined anywhere in this notebook (NameError); pass the
# imported STOPWORDS collection instead. WordCloud ignores stopwords when
# clouds are built with generate_from_frequencies, so the output is unaffected.
cloud = WordCloud(stopwords=STOPWORDS,
                  background_color='white',
                  width=2500,
                  height=1800,
                  max_words=10,
                  colormap='tab10',
                  # `i` is looked up when a cloud is generated, i.e. inside
                  # the loop below, so every topic gets its own colour.
                  color_func=lambda *args, **kwargs: cols[i],
                  prefer_horizontal=1.0)

topics = mallet_model.show_topics(formatted=False)

fig, axes = plt.subplots(2, 2, figsize=(10,10), sharex=True, sharey=True)

for i, ax in enumerate(axes.flatten()):
    # topics[i] is (topic_id, [(word, weight), ...]); the pairs become the
    # word -> frequency mapping for this topic's cloud.
    topic_words = dict(topics[i][1])
    cloud.generate_from_frequencies(topic_words, max_font_size=300)
    # Draw on the subplot directly instead of re-adding it to the figure and
    # going through plt.gca().
    ax.imshow(cloud)
    ax.set_title('Topic ' + str(i), fontdict=dict(size=16))
    ax.axis('off')


plt.subplots_adjust(wspace=0, hspace=0)
plt.axis('off')
plt.margins(x=0, y=0)
plt.tight_layout()
plt.show()

In [None]:
# file = open('data.p', 'wb')      
# pickle.dump(df2, file)
# file.close()