## Natural Language Processing

We will be using the VentureBeat data that we have scraped and stored. We will begin by loading the data and inspecting it, and then convert the text into numeric features. 

In [None]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt

In [None]:
# Load the stored article data; the path is relative to the notebook's working directory.
df = pd.read_csv('venturebeat.csv')
print(df.shape)  # (number of rows, number of columns)
df.head()

In [None]:
df.info()

Our task/problem here is to build a natural language processing model that can take the information of the article and determine the topic it belongs to. 

In [None]:
data = df.copy()

### 1. Data Preprocessing

We can extract the date, month and day from the URL using a regular expression and pandas' datetime functionality. We can also add `length` and `nwords` columns that hold the number of characters and the number of words in the article text, respectively. 

In [None]:
import re

In [None]:
def extract_date(string):
    """Return the first ``YYYY/M/D`` fragment found in ``string``.

    Parameters
    ----------
    string : any
        Value to search; it is converted with ``str()`` first, so non-string
        inputs (e.g. a missing value) are accepted.

    Returns
    -------
    str or None
        The matched text such as ``'2020/01/15'``, or ``None`` when the input
        contains no such fragment (previously this case raised AttributeError).
    """
    match = re.search(r'\d{4}/\d{1,2}/\d{1,2}', str(string))
    return match.group() if match else None

In [None]:
# Parse the date fragment pulled out of each URL, then derive month and day-of-month from it.
data['date'] = pd.to_datetime(data['url'].apply(extract_date))
data['month'] = data['date'].dt.month
data['day'] = data['date'].dt.day

# Character count and whitespace-delimited word count of each article text.
data['length'] = data['text'].str.len()
data['nwords'] = data['text'].str.split().str.len()
data.head()

__Lexical diversity__ is one aspect of 'lexical richness' and refers to the ratio of unique words to the total number of words. 

In [None]:
def lexical_diversity(text):
    """Return the ratio of unique words to total words in ``text``.

    Words are the whitespace-delimited pieces produced by ``str.split()``;
    comparison is case-sensitive.

    Parameters
    ----------
    text : str

    Returns
    -------
    float
        ``len(unique words) / len(words)``, or ``0.0`` when ``text`` contains
        no words (empty or whitespace-only), which would otherwise divide by zero.
    """
    words = text.split()
    if not words:
        return 0.0
    # set(words), not set(text): the latter would count unique characters.
    return len(set(words)) / len(words)

# Cast to str before applying lexical_diversity, which calls .split() on each value.
# Note: astype(str) turns a missing value into the literal text 'nan'.
data['text'] = data['text'].astype(str)
data['lex_div'] = data['text'].apply(lexical_diversity)
data.head()

In [None]:
corpus = data['text'].values.tolist()
print(len(corpus))

### 1a. Tokenization

Tokenization is the process of splitting text into meaningful elements called tokens.

In [None]:
import nltk
nltk.download('popular', quiet=True)
from nltk import word_tokenize, wordpunct_tokenize

In [None]:
example = "I haven't watched the show at the theatre."
tokenized = nltk.word_tokenize(example)
print(tokenized)

In [None]:
example.split()

In [None]:
print( wordpunct_tokenize(example) )

The simplest vector encoding model is to simply fill in the vector with the frequency of each word as it appears in the document. 

### 1b. Stopwords

In [None]:
from nltk.corpus import stopwords

In [None]:
def is_stopword(token):
    """Return True when ``token`` (case-insensitively) is an NLTK English stopword.

    The stopword list is loaded once and cached on the function object; the
    original rebuilt the set on every call, which is costly because this
    function is invoked once per token when normalizing the corpus.

    Parameters
    ----------
    token : str

    Returns
    -------
    bool
    """
    stops = getattr(is_stopword, '_stops', None)
    if stops is None:
        stops = frozenset(stopwords.words('english'))
        is_stopword._stops = stops
    return token.lower() in stops

print(tokenized)
print( [ is_stopword(i) for i in tokenized])

In [None]:
print(example.split() )
print( [ is_stopword(i) for i in example.split()])

### 1c. Punctuations

In [None]:
import unicodedata
def is_punct(token):
    """Return True when every character of ``token`` has a Unicode category starting with 'P'.

    An empty ``token`` yields True, since there is no character to fail the check.
    """
    for char in token:
        if not unicodedata.category(char).startswith('P'):
            return False
    return True

print(tokenized)
print( [ is_punct(i) for i in tokenized])

In [None]:
print( wordpunct_tokenize(example) ) 
print( [ is_punct(i) for i in wordpunct_tokenize(example)] )

### 1d. Stemming

In [None]:
from nltk.stem import SnowballStemmer

In [None]:
stemmer = SnowballStemmer('english')
stemmed = [ stemmer.stem(token) for token in tokenized ]
print( [example] )
print(tokenized)
print(stemmed)

In [None]:
def normalizer(text):
    """Lower-case, tokenize, filter and stem ``text``.

    Tokens are dropped when they are stopwords, consist only of punctuation,
    or contain non-ASCII characters. The surviving tokens are stemmed with the
    English Snowball stemmer.

    Parameters
    ----------
    text : str

    Returns
    -------
    str
        The kept stems joined by single spaces.
    """
    stem = nltk.stem.SnowballStemmer('english')

    kept = []
    for token in nltk.word_tokenize(text.lower()):
        # Test the surface form first: stemming rewrites some stopwords
        # (e.g. 'because' -> 'becaus'), after which they no longer match
        # the stopword list and used to leak into the output.
        if is_stopword(token):
            continue
        stemmed = stem.stem(token)
        if is_punct(stemmed) or is_stopword(stemmed) or not stemmed.isascii():
            continue
        kept.append(stemmed)

    return ' '.join(kept)

In [None]:
print( example )
print( '---> ' + normalizer(example) )

In [None]:
norm_corpus = [ normalizer(i) for i in corpus ]
print(corpus[0][:999])
norm_corpus[0][:999]

### 1e. Lemmatization

In [None]:
from nltk import pos_tag
from nltk.stem.wordnet import WordNetLemmatizer
from nltk.corpus import wordnet as wn

In [None]:
def is_punct(token):
    """Tell whether ``token`` is made up solely of Unicode punctuation (category 'P*').

    Returns True for an empty ``token``.
    """
    categories = map(unicodedata.category, token)
    return all(category.startswith('P') for category in categories)

def lemmatizer(token, postag):
    """Lemmatize ``token`` with WordNet, choosing the part of speech from ``postag``.

    Only the first character of ``postag`` is inspected: 'V' -> verb,
    'R' -> adverb, 'J' -> adjective; 'N' and anything else -> noun.
    """
    initial = postag[0]
    if initial == 'V':
        wordnet_pos = wn.VERB
    elif initial == 'R':
        wordnet_pos = wn.ADV
    elif initial == 'J':
        wordnet_pos = wn.ADJ
    else:
        # 'N' and every unrecognised tag fall back to noun.
        wordnet_pos = wn.NOUN

    return WordNetLemmatizer().lemmatize(token, wordnet_pos)

def normalizer_lemm(text):
    """Tokenize, POS-tag, lemmatize and filter ``text``.

    Punctuation-only and non-ASCII tokens are dropped, the rest are
    lower-cased and lemmatized, and lemmas found in the extended stopword
    list are removed.

    Parameters
    ----------
    text : str

    Returns
    -------
    str
        The kept lemmas joined by single spaces.
    """
    # Extended stopword list. The tokens compared against it are lemmas, not
    # stems, so the extra entries must be dictionary forms: the stem 'compani'
    # used previously could never match and is replaced by 'company'.
    stops = set(stopwords.words('english'))
    stops.update(['game', 'company'])

    # Tagging runs on the original casing; lower-casing happens afterwards.
    tagged_tokenized = pos_tag(wordpunct_tokenize(text))

    # Lower-case *before* lemmatizing: applying .lower() only to the result
    # left capitalised inputs (e.g. sentence-initial plurals) unlemmatized.
    tokenized = [
        lemmatizer(token.lower(), tag)
        for (token, tag) in tagged_tokenized
        if not is_punct(token) and token.isascii()
    ]

    return ' '.join(token for token in tokenized if token not in stops)

In [None]:
normlemm_corpus = [ normalizer_lemm(i) for i in corpus ]
normlemm_corpus[0][:999]

## 2. Feature Extraction: Vectorization

The simplest vector encoding model is to simply fill in the vector with the frequency of each word as it appears in the document.

In [None]:
from collections import defaultdict

In [None]:
words = defaultdict(int)
for token in word_tokenize(example):
    words[token] += 1
words 

 ### 2a. Count Vectorizer 
 
 Scikit-Learn has a CountVectorizer transformer which does this for us easily. 

In [None]:
from sklearn.feature_extraction.text import CountVectorizer

In [None]:
vectorizer = CountVectorizer()
vector = vectorizer.fit_transform(norm_corpus)
vector

In [None]:
vector.toarray()

In [None]:
# get_feature_names() was deprecated in scikit-learn 1.0 and removed in 1.2;
# get_feature_names_out() replaces it. .tolist() keeps `features` a plain list
# of str, as the old call returned.
features = vectorizer.get_feature_names_out().tolist()
nfeatures = len(features)
print(nfeatures)

In [None]:
vocab = vectorizer.vocabulary_
vocab

In [None]:
from yellowbrick.text.freqdist import FreqDistVisualizer

In [None]:
fig, ax = plt.subplots(1, 1, figsize=(15,10))
visualizer = FreqDistVisualizer(features=features, n=30, ax=ax )
visualizer.fit(vector)
visualizer.show()

In [None]:
fig, ax = plt.subplots(1, 1, figsize=(15,5))

# `vocab` maps each term to its column index in `vector`, not to a count, so
# plotting its values only draws the ramp 0, 1, 2, ... The frequencies come
# from the column sums of the document-term matrix instead.
counts = np.asarray(vector.sum(axis=0)).ravel()
terms = sorted(vocab, key=vocab.get)      # terms ordered by column index

n=30
top = counts.argsort()[::-1][:n]          # columns of the n most frequent terms
x = [terms[i] for i in top]
y = counts[top]

ax.bar(x, y)
ax.set_xlabel('term')
ax.set_ylabel('occurrences in corpus')
ax.set_title('Most frequent terms')
ax.tick_params(axis='x', rotation=45)
plt.show()

In [None]:
def normalizer(text):
    """Lower-case, tokenize, filter and stem ``text`` using an extended stopword list.

    This redefinition replaces the earlier ``normalizer`` in the notebook; it
    additionally removes the corpus-specific stems 'data' and 'compani'.

    Tokens are dropped when they are stopwords, consist only of punctuation,
    or contain non-ASCII characters.

    Parameters
    ----------
    text : str

    Returns
    -------
    str
        The kept stems joined by single spaces.
    """
    stem = nltk.stem.SnowballStemmer('english')

    base_stops = set(stopwords.words('english'))
    # The extra entries are already in stemmed form, so they are compared
    # against the stemmed token only.
    stops = base_stops | {'data', 'compani'}

    kept = []
    for token in nltk.word_tokenize(text.lower()):
        # Test the surface form first: stemming rewrites some stopwords
        # (e.g. 'because' -> 'becaus'), after which they no longer match
        # the list and used to leak into the output.
        if token in base_stops:
            continue
        stemmed = stem.stem(token)
        if is_punct(stemmed) or not stemmed.isascii() or stemmed in stops:
            continue
        kept.append(stemmed)

    return ' '.join(kept)

normalizer(example)

In [None]:
norm_corpus = [ normalizer(i) for i in corpus ]

### 2b. TFIDF Vectorizer

Again, scikit-learn provides an easy-to-use class for this. It also has an `ngram_range` parameter, which controls whether the vocabulary is built from single words, two-word phrases, or both. 

In [None]:
from sklearn.feature_extraction.text import TfidfVectorizer

In [None]:
tfidf = TfidfVectorizer(analyzer='word')
tfidf_vector = tfidf.fit_transform(norm_corpus)
tfidf_vector

In [None]:
tfidf_vector.toarray()

In [None]:
fig, ax = plt.subplots(1, 1, figsize=(15,8))
# get_feature_names() was removed in scikit-learn 1.2; get_feature_names_out() replaces it.
visualizer = FreqDistVisualizer(features=tfidf.get_feature_names_out().tolist(), n=30, ax=ax )
visualizer.fit(tfidf_vector)
visualizer.show()

## 3. MODELLING

In [None]:
X = pd.DataFrame(vector.toarray(), columns=features)
X

In [None]:
cols = ['month','day','length','nwords','lex_div']
# Rebuild the bag-of-words frame from `vector` instead of reading the previous
# value of X: with X on both sides of the assignment, re-running this cell
# prepended the metadata columns a second time.
bow = pd.DataFrame(vector.toarray(), columns=features, index=data.index)
X = pd.concat([data[cols], bow], axis=1)
print(X.shape)
X.head()

### 3a. Clustering

In [None]:
from sklearn.cluster import KMeans
from sklearn.metrics import silhouette_score # https://scikit-learn.org/stable/modules/generated/sklearn.metrics.silhouette_score.html # in class notes 1 is the best cluster. It is a highly dense. Higher the better.

In [None]:
# Fit a single KMeans model and report its two quality measures.
k = 3
clusterer = KMeans(n_clusters=k, random_state=10)
cluster_labels = clusterer.fit_predict(X)
inertia = clusterer.inertia_  # within-cluster sum of squared distances (lower is tighter)
avg_silhouette = silhouette_score(X, cluster_labels)  # mean silhouette over all rows (higher is better)
print(k, inertia, avg_silhouette)

In [None]:
from sklearn.cluster import MiniBatchKMeans

In [None]:
# Sweep k = 3..11, recording inertia and the average silhouette score for each k.
clusters = range(3,12)
silhouette_scores = []
inertia_scores = []
for k in clusters:
    clstr = MiniBatchKMeans(n_clusters=k, random_state=24) # create a MiniBatchKMeans instance for this k
    pred = clstr.fit_predict(X)
    inertia_scores.append(clstr.inertia_) 
    avg_silhouette = silhouette_score(X, pred)
    silhouette_scores.append((k, avg_silhouette))  # stored as (k, score) pairs

In [None]:
fig, ax = plt.subplots()

ax.plot(clusters, inertia_scores, 'bx-')
for i, txt in enumerate(silhouette_scores):
    ax.annotate(round(txt[1],2), (clusters[i], inertia_scores[i]))

plt.xlabel('k')
plt.ylabel('Inertia')
plt.title('The Elbow Method showing the optimal k\n (Silhouette score annotated for each k)')

plt.show()

In [None]:
from yellowbrick.cluster import KElbowVisualizer

In [None]:
# Seed KMeans so the elbow plot is reproducible across runs, consistent with
# the seeded KMeans cell earlier in this section.
visualizer = KElbowVisualizer(KMeans(random_state=10), k=(3,12))
visualizer.fit(X)        # Fit the data to the visualizer
visualizer.show()        # Finalize and render the figure

## 4. TOPIC MODELLING

In [None]:
from sklearn.decomposition import LatentDirichletAllocation as LDA

In [None]:
# higher alpha assumes documents to be made up of more topics and result in more specific topic distribution per document.
# with high beta, topics are assumed to made of up most of the words and result in a more specific word distribution per topic.
number_topics = 3
number_words = 20

def topics(model, count_vectorizer, n_top_words):
    """Print the ``n_top_words`` highest-weighted words of every topic in ``model``.

    Parameters
    ----------
    model : fitted estimator
        Must expose ``components_`` with one row of word weights per topic.
    count_vectorizer : fitted vectorizer
        Supplies the vocabulary; its column order must match ``components_``.
    n_top_words : int
        Number of words to print per topic.

    Returns
    -------
    None
        Output is written to stdout.
    """
    # get_feature_names() was removed in scikit-learn 1.2; prefer the
    # replacement and fall back only for older installations.
    if hasattr(count_vectorizer, 'get_feature_names_out'):
        words = count_vectorizer.get_feature_names_out()
    else:
        words = count_vectorizer.get_feature_names()

    for topic_idx, topic in enumerate(model.components_):
        print("\nTopic #%d:" % topic_idx)
        # argsort is ascending; walk it backwards to take the largest weights first.
        top = topic.argsort()[:-n_top_words - 1:-1]
        print(" ".join(str(words[i]) for i in top))
    

In [None]:
# Fit on the document-term counts themselves. At this point X also carries the
# metadata columns (month, day, length, nwords, lex_div) in front of the word
# columns, so a model fitted on X has component indices that no longer line up
# with `vectorizer`'s vocabulary and topics() would print the wrong words.
# random_state makes the discovered topics reproducible between runs.
lda = LDA(n_components=number_topics, n_jobs=-1, random_state=10)
lda_fit = lda.fit(vector)

# Print the topics found by the LDA model
print("Topics found via LDA:")
topics(lda, vectorizer, number_words)

In [None]:
# Re-fit TF-IDF on the lemmatized corpus and run LDA with more topics.
tfidf = TfidfVectorizer(analyzer='word')
tfidf_vector = tfidf.fit_transform(normlemm_corpus)

# get_feature_names() was removed in scikit-learn 1.2; get_feature_names_out() replaces it.
tfidf_features = tfidf.get_feature_names_out().tolist()
X = pd.DataFrame(tfidf_vector.toarray(), columns=tfidf_features)

number_topics = 6
# random_state makes the discovered topics reproducible between runs.
lda = LDA(n_components=number_topics, n_jobs=-1, random_state=10)
lda_fit = lda.fit(X)

# Print the topics found by the LDA model
print("Topics found via LDA:")
topics(lda, tfidf, number_words)

In [None]:
import pyLDAvis
import pyLDAvis.sklearn

In [None]:
# Prepare the interactive topic visualisation for the TF-IDF LDA model and save it as a standalone HTML file.
pyLDAvis.enable_notebook()
# NOTE(review): pyLDAvis 3.4 renamed the `pyLDAvis.sklearn` module to `pyLDAvis.lda_model` — confirm against the installed version.
LDAvis_prepared = pyLDAvis.sklearn.prepare(lda, tfidf_vector, tfidf)
pyLDAvis.save_html(LDAvis_prepared, 'lda.html')