# Introduction to Natural Language Processing
## 2. Unsupervised Learning
### ASI Data Science Fellowship IX - 6th October 2017

In [None]:
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd

from sklearn.feature_extraction.text import TfidfVectorizer

# Fetch the VADER lexicon used by nltk's SentimentIntensityAnalyzer in the
# sentiment analysis section at the end of this notebook.
import nltk
nltk.download('vader_lexicon') 

# Plot colours keyed by author; the plotting cells below use 'trump' for
# tweets with label 0 and 'obama' for tweets with label 1.
PARTY_COLOURS = {'trump': '#E91D0E', 'obama': '#00A6EF'}

%matplotlib inline

## Data Cleaning 

In [None]:
import re

def clean_tweet(text):
    """Normalise the raw text of a single tweet before vectorisation.

    Args:
        text: the tweet, either as text or as UTF-8 encoded bytes.

    Returns:
        The tweet as text, with commas that directly precede a digit removed,
        the HTML entity for "&" replaced by "and", and URLs stripped.
    """
    # Only byte strings need decoding; a Python 3 ``str`` has no ``decode``
    # method, so text that is already decoded is passed through untouched.
    if isinstance(text, bytes):
        text = text.decode('utf-8')
    # remove commas in numbers (else vectorizer will split on them)
    text = re.sub(r',([0-9])', r'\1', text)
    # sort out HTML escaping of "&": the entity is "&amp;", so consume the
    # optional trailing semicolon too instead of leaving "and;" behind
    text = re.sub(r'&amp;?', 'and', text)
    # strip urls
    return re.sub(r'https?://\S*', '', text)

# Load the pickled tweets and clean the 'text' column one tweet at a time.
# NOTE(review): unpickling can execute arbitrary code - only load a tweets.pkl
# that comes from a source you trust.
df = pd.read_pickle('tweets.pkl')
df['text'] = df['text'].map(clean_tweet)

## Dimension reduction

If we allow our vectorizer to infer a vocabulary from the corpus, then this will typically result in a huge number of sparsely populated features. We can often reduce the dimension while retaining the relevant information (albeit sacrificing some interpretability), which improves the efficiency of our models and analysis.

Let's visualise our tfidf vectors in a few different ways.

### Principal Component Analysis

Principal component analysis aims to find a coordinate system where correlation between features is minimized. By keeping only the coordinate directions in the new system that explain the most variance, we can reduce the dimensions of our feature space.

In [None]:
from sklearn.decomposition import PCA
import seaborn as sns

def scatter(x, colors):
    """Draw a 2-D scatter plot of embedded tweets, coloured by author.

    Rows of ``x`` where ``colors`` equals 0 are drawn as Trump and rows where
    it equals 1 as Obama; the first two columns of ``x`` are the coordinates.

    Returns:
        The ``(figure, axes)`` pair that was drawn on.
    """
    f = plt.figure(figsize=(8, 8))
    ax = plt.subplot(aspect='equal')

    # one scatter layer per author, Trump first so the legend order is stable
    for value, party, name in ((0, 'trump', 'Trump'), (1, 'obama', 'Obama')):
        mask = (colors == value)
        ax.scatter(x[mask, 0], x[mask, 1], c=PARTY_COLOURS[party], label=name, alpha=0.5)

    # hide both coordinate axes
    for axis in (ax.axes.get_yaxis(), ax.axes.get_xaxis()):
        axis.set_visible(False)
    plt.legend()

    return f, ax

# dimension reduction algorithms can be pretty slow, so let's work with a sample
# try on the whole data set if you want!
sample_trump = df.loc[df['label'] == 0, ['text', 'label']].sample(500)
sample_obama = df.loc[df['label'] == 1, ['text', 'label']].sample(500)
# DataFrame.append was deprecated in pandas 1.4 and removed in 2.0; pd.concat
# stacks the two samples in the same order and keeps their index labels.
sample = pd.concat([sample_trump, sample_obama])

# tf-idf features for the sampled tweets (English stop words removed,
# vocabulary capped at 10000 terms)
tfidf_vectorizer = TfidfVectorizer(stop_words='english', max_features=10000)
tfidf_vectors = tfidf_vectorizer.fit_transform(sample['text'])

**Use `PCA` to reduce `tfidf_vectors` to two dimensions, then plot the results using the `scatter` function. Pass the labels as colours.**

In [None]:
# dimension reduce and plot here

### t-SNE

t-SNE is another dimension reduction algorithm, but one that is generally better at preserving the local structure of the data (that is, which points are close neighbours of one another). In the case of our twitter data it does a much better job than PCA.

In [None]:
from sklearn.manifold import TSNE

# Suggested starting parameters - feel free to experiment with them.
# The seed is fixed via random_state.
tsne = TSNE(
    perplexity=800,
    random_state=42,
)

**Transform `tfidf_vectors` using `tsne` and plot them using `scatter`**.

In [None]:
# dimension reduce and plot here

## Clustering

Given a numeric representation of our data, there are many clustering algorithms we can try out. Since our feature vectors are extremely high dimensional, it is a good idea to first dimension reduce so that we do not fall foul of the curse of dimensionality.

**Use KMeans to cluster your t-SNE vectors into two classes.**

In [None]:
from sklearn.cluster import KMeans

# cluster tsne vectors using K-Means

# Exercise placeholder: replace the trailing comment with your cluster labels
# (one per row of `sample`); this line is not valid Python until you do.
sample['kmeans_labels'] = # your kmeans labels

In [None]:
# some reorganizing for plotting clusters

df_trump = sample[sample['label'] == 0]
df_obama = sample[sample['label'] == 1]

# Count tweets per cluster for each author. Reindexing on the two cluster ids
# guarantees exactly one count per bar, even when a cluster contains no tweets
# from one author (a plain groupby would drop that cluster and ax.bar would
# then fail on the length mismatch).
cluster_ids = [0, 1]
trump_counts = df_trump['kmeans_labels'].value_counts().reindex(cluster_ids, fill_value=0).values
obama_counts = df_obama['kmeans_labels'].value_counts().reindex(cluster_ids, fill_value=0).values

# grouped bar chart: one pair of bars (Trump, Obama) per cluster
f, ax = plt.subplots()
bars11 = ax.bar(np.arange(2)-0.15, trump_counts, 0.3, color=PARTY_COLOURS['trump'], label='Trump')
bars12 = ax.bar(np.arange(2)+0.15, obama_counts, 0.3, color=PARTY_COLOURS['obama'], label='Obama')
plt.legend(fontsize=12)
plt.ylabel('Count', fontsize=15)
plt.xticks(cluster_ids)
plt.xlabel('Cluster', fontsize=15)
plt.show()

### Other clustering

Let's also try some other clustering algorithms.

**Try using spectral clustering to cluster the t-SNE vectors.**

In [None]:
from sklearn.cluster import SpectralClustering

# spectral cluster the t-SNE vectors here

# Exercise placeholder: replace the trailing comment with your cluster labels
# (one per row of `sample`); this line is not valid Python until you do.
sample['scl_labels'] = # your spectral cluster labels

In [None]:
# some reorganizing for plotting clusters

df_trump = sample[sample['label'] == 0]
df_obama = sample[sample['label'] == 1]

# Count tweets per cluster for each author. Reindexing on the two cluster ids
# guarantees exactly one count per bar, even when a cluster contains no tweets
# from one author (a plain groupby would drop that cluster and ax.bar would
# then fail on the length mismatch).
cluster_ids = [0, 1]
trump_counts = df_trump['scl_labels'].value_counts().reindex(cluster_ids, fill_value=0).values
obama_counts = df_obama['scl_labels'].value_counts().reindex(cluster_ids, fill_value=0).values

# grouped bar chart: one pair of bars (Trump, Obama) per cluster
f, ax = plt.subplots()
bars11 = ax.bar(np.arange(2)-0.15, trump_counts, 0.3, color=PARTY_COLOURS['trump'], label='Trump')
bars12 = ax.bar(np.arange(2)+0.15, obama_counts, 0.3, color=PARTY_COLOURS['obama'], label='Obama')
plt.legend(fontsize=12)
plt.ylabel('Count', fontsize=15)
plt.xticks(cluster_ids)
plt.xlabel('Cluster', fontsize=15)
plt.show()

**Print a selection of Obama tweets that ended up in the Trump cluster, and a selection of Trump tweets that ended up in the Obama cluster. If the clustering is working well, the Obama tweets should look Trumpian, and the Trump tweets should look Obama-like.**

In [None]:
# print tweets that ended up in the wrong clusters

## Topic Analysis

### Latent Dirichlet Allocation

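Latent Dirichlet Allocation (LDA) is a statistical model of topic distribution that can be trained using Bayesian inference, either with Markov Chain Monte Carlo methods or with variational approximations. The scikit-learn implementation used below relies on variational Bayes.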

In [None]:
from sklearn.decomposition import LatentDirichletAllocation

In [None]:
# Term-frequency features over the full corpus: idf weighting is switched off,
# and terms are dropped if they appear in more than 95% of tweets or in fewer
# than two tweets.
tf_vectorizer = TfidfVectorizer(
    use_idf=False,
    stop_words='english',
    min_df=2,
    max_df=0.95,
    max_features=5000,
)
tf_vectors = tf_vectorizer.fit_transform(df['text'])

# Five-topic LDA model, fitted with batch updates and a fixed seed.
lda = LatentDirichletAllocation(
    n_components=5,
    learning_method='batch',
    max_iter=20,
    random_state=42,
)

# per-tweet topic weights for the whole corpus
lda_vectors = lda.fit_transform(tf_vectors)

In [None]:
def print_top_words(model, feature_names, n_top_words):
    """Print the highest-weighted words of every topic in a fitted model.

    Args:
        model: object whose ``components_`` holds one weight array per topic.
        feature_names: sequence mapping a feature index to its word.
        n_top_words: how many words to list for each topic.

    One line is printed per topic, words ordered from highest to lowest
    weight, followed by a single blank line after the last topic.
    """
    for topic_idx, weights in enumerate(model.components_):
        # argsort is ascending, so reverse it before taking the first n
        ranked = weights.argsort()[::-1][:n_top_words]
        words = [feature_names[i] for i in ranked]
        print("Topic #{}: {}".format(topic_idx, " ".join(words)))
    print()

# get_feature_names() was removed in scikit-learn 1.2 in favour of
# get_feature_names_out() (added in 1.0); use whichever this version provides.
if hasattr(tf_vectorizer, 'get_feature_names_out'):
    lda_feature_names = tf_vectorizer.get_feature_names_out()
else:
    lda_feature_names = tf_vectorizer.get_feature_names()

# show the ten highest-weighted words of each LDA topic
print_top_words(lda, lda_feature_names, 10)

In [None]:
# Project each author's tweets into topic space with the fitted vectorizer
# and LDA model (label 1 is Obama, label 0 is Trump).
obama_tf = tf_vectorizer.transform(df.loc[df['label'] == 1, 'text'])
obama_lda = lda.transform(obama_tf)

trump_tf = tf_vectorizer.transform(df.loc[df['label'] == 0, 'text'])
trump_lda = lda.transform(trump_tf)

# average weight of every topic across each author's tweets
obama_topics = obama_lda.mean(axis=0)
trump_topics = trump_lda.mean(axis=0)

In [None]:
import matplotlib.ticker as ticker

fig, ax = plt.subplots()

# get_feature_names() was removed in scikit-learn 1.2 in favour of
# get_feature_names_out() (added in 1.0); use whichever this version provides.
if hasattr(tf_vectorizer, 'get_feature_names_out'):
    feature_names = tf_vectorizer.get_feature_names_out()
else:
    feature_names = tf_vectorizer.get_feature_names()

# five highest-weighted words of each topic, used as the x tick labels
topic_words = [[feature_names[i] for i in topic.argsort()[:-6:-1]] for topic in lda.components_]


def major_formatter(x, pos):
    """Return the tick label for topic ``x``: its top words, one per line.

    ``pos`` is supplied by matplotlib's FuncFormatter and is not used.
    """
    return str('\n'.join(topic_words[int(x)][0:5]))

# one bar position per fitted topic, rather than a hard-coded count of five
topic_ids = np.arange(len(topic_words))

bars1 = ax.bar(topic_ids-0.15, obama_topics, 0.3, color=PARTY_COLOURS['obama'], label='Obama')
bars2 = ax.bar(topic_ids+0.15, trump_topics, 0.3, color=PARTY_COLOURS['trump'], label='Trump')
plt.legend(fontsize=12)
plt.ylabel('Relevance', fontsize=15)
plt.xticks(topic_ids)
plt.gca().xaxis.set_major_formatter(ticker.FuncFormatter(major_formatter))

plt.show()

## Sentiment analysis

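Sentiment analysis assigns a score to a piece of text describing how positive or negative it is. Here we use VADER, a lexicon- and rule-based analyser shipped with `nltk`: for each tweet, `polarity_scores` returns `neg`, `neu` and `pos` scores together with an overall `compound` score.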

In [None]:
from nltk.sentiment.vader import SentimentIntensityAnalyzer

sid = SentimentIntensityAnalyzer()

# Random sample of 200 tweets per author (Trump first, then Obama).
# pd.concat replaces DataFrame.append, which was removed in pandas 2.0; it
# already returns a new frame, so no extra copy is needed.
df_sentiment = pd.concat([
    df[df['label'] == 0].sample(200),
    df[df['label'] == 1].sample(200),
])

# Score every tweet once and collect the VADER scores row by row. This avoids
# Series.iteritems (removed in pandas 2.0) and the slow per-cell .loc writes.
scores = pd.DataFrame(
    [sid.polarity_scores(str(tweet)) for tweet in df_sentiment['text']]
)

# Attach the score columns in sorted order. Assigning by position (.values)
# stays correct even if index labels repeat, where a label-based .loc write
# would overwrite every row sharing that label.
for k in sorted(scores.columns):
    df_sentiment[k] = scores[k].values

df_sentiment
    

The plot below compares the distribution of negative and positive sentiment scores for Trump vs. Obama.

In [None]:
# Sentiment score columns for each author (label 0 is Trump, label 1 is Obama).
score_columns = ['compound', 'neg', 'pos']
trump_sentiment = df_sentiment.loc[df_sentiment['label'] == 0, score_columns]
obama_sentiment = df_sentiment.loc[df_sentiment['label'] == 1, score_columns]

# Sort each score ascending and reset the index so that the x axis of the
# plots below is simply the rank of the tweet within its author's sample.
trump_neg_sentiment = trump_sentiment['neg'].sort_values().reset_index(drop=True)
obama_neg_sentiment = obama_sentiment['neg'].sort_values().reset_index(drop=True)

trump_pos_sentiment = trump_sentiment['pos'].sort_values().reset_index(drop=True)
obama_pos_sentiment = obama_sentiment['pos'].sort_values().reset_index(drop=True)


# Two stacked panels: negative scores on top, positive scores underneath.
fig, ax = plt.subplots(2)
panels = (
    (ax[0], trump_neg_sentiment, obama_neg_sentiment, 'Negative sentiment'),
    (ax[1], trump_pos_sentiment, obama_pos_sentiment, 'Positive sentiment'),
)
for panel, trump_scores, obama_scores, ylabel in panels:
    panel.plot(trump_scores, c=PARTY_COLOURS['trump'], label='Trump')
    panel.plot(obama_scores, c=PARTY_COLOURS['obama'], label='Obama')
    panel.legend()
    panel.set_ylabel(ylabel, fontsize=10)


plt.tight_layout()
plt.show()