<a href="https://colab.research.google.com/github/yzmsp7/NCCU-DS4CS2020/blob/master/sparkAmplify_hoemwork.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# SparkAmplify Code Challenge

### Description

We have 15,000 articles in our hands which are from N categories.

### Problem

1. Please plot the positions of the articles in two-dimensional space and cluster the articles into several groups. Then pick a representative article for each group. (Note: you have to explain how you chose the number of groups.)
2. Based on step 1, what are the top-5 keywords of each representative article?
3. Based on steps 1 and 2, please rank the top-10 documents for each group using the top-5 keywords of that group's representative article.
4. Based on steps 1 and 2, please rank the top-10 documents for each group using the content of that group's representative article.
5. Please combine the methods you applied in steps 3 and 4 and output the result in the same format.


### EDA

1. Missing article name (NaN): 397
2. Different article names but identical contents, e.g. 1259184-1259200: 160
3. Some article names are in a different language, e.g. Russian

In [None]:
import re
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import matplotlib.cm as cm

In [None]:
# Load the raw article table from a hard-coded absolute path.
# NOTE(review): presumably the CSV was uploaded to the Colab runtime's /content
# directory — confirm, or make the location configurable.
articles = pd.read_csv("/content/articles_raw.csv")
# Bare last expression: the notebook renders the frame as the cell output.
articles

In [None]:
# Rows whose `text` repeats that of an earlier row (the first occurrence is not flagged).
is_repeated_text = articles.duplicated(subset=['text'])
dup = articles[is_repeated_text]

In [None]:
# ids of the rows to discard: repeated-text rows and rows with a missing title
dup_id = dup['id'].tolist()
na_id = articles.loc[articles['title'].isna(), 'id'].tolist()

In [None]:
# Keep only the rows that are neither repeated-text rows nor missing a title.
# .copy() turns the filtered slice into an independent frame, so columns can be
# added to it later without triggering pandas' SettingWithCopyWarning.
articles_uniq = articles[~articles.id.isin(na_id+dup_id)].copy()
print("legnth: ", len(articles_uniq))

### Preprocessing

spacy: https://allenai.github.io/scispacy/

In [None]:
# Install pyLDAvis into the notebook runtime.
# NOTE(review): the version is not pinned, while a later cell imports
# `pyLDAvis.gensim` — confirm that module exists in the release that gets installed.
!pip install pyLDAvis

In [None]:
import nltk
from nltk.tokenize import RegexpTokenizer
from nltk.stem.wordnet import WordNetLemmatizer
from gensim import corpora, models
import gensim
import string
import pyLDAvis.gensim

In [None]:
# Fetch the NLTK resources used by the preprocessing cells:
# the stop-word lists and WordNet (needed by WordNetLemmatizer).
nltk.download('stopwords')
nltk.download('wordnet')

In [None]:
# A token is a run of word characters that contains no digits.
pattern = r'\b[^\d\W]+\b'
tokenizer = RegexpTokenizer(pattern)
# Stored as a set: stop-word filtering does one membership test per token,
# which is a linear scan on the list NLTK returns but a hash lookup on a set.
en_stop = set(nltk.corpus.stopwords.words('english'))
lemmatizer = WordNetLemmatizer()

In [None]:
def normalizeText(data):
    """Turn one raw document into a list of cleaned tokens.

    The input is stringified and lower-cased, split with the module-level
    ``tokenizer``, filtered against ``en_stop``, lemmatized with the
    module-level ``lemmatizer``, and finally every one-character result
    is discarded.

    Returns:
        list of token strings, in document order.
    """
    lowered = str(data).lower()

    cleaned = []
    for word in tokenizer.tokenize(lowered):
        # stop words are dropped before lemmatizing
        if word in en_stop:
            continue
        lemma = lemmatizer.lemmatize(word)
        # discard lemmas made of a single character
        if len(lemma) != 1:
            cleaned.append(lemma)

    return cleaned

In [None]:
# Normalize every remaining article. Position 3 of an itertuples() row is the third
# column (position 0 is the index) — presumably the `text` column; verify against the CSV.
texts = [normalizeText(row[3]) for row in articles_uniq.itertuples()]

In [None]:
# nlp = en_core_web_sm.load()

# doc = nlp(articles_uniq.text[0])
# spacy_words = [token.text for token in doc]
# print(f"Tokenized words: {spacy_words}")

### Document Clustering - TF-IDF + KMeans

Dimensionality reduction for visualization: PCA vs. t-SNE

In [None]:
from sklearn.cluster import MiniBatchKMeans
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.decomposition import PCA
from sklearn.manifold import TSNE

In [None]:
# Store the normalized tokens as one space-joined string per article.
# assign() returns a new frame, so this does not write into a filtered slice
# (SettingWithCopyWarning) and does not first store a list column only to overwrite it.
articles_uniq = articles_uniq.assign(nor_text=[' '.join(tokens) for tokens in texts])
tfidf = TfidfVectorizer(
    min_df = 5,
    max_df = 0.95,
    max_features = 8000,
    stop_words = 'english'
)
# fit_transform learns the vocabulary and builds the document-term matrix in one pass
contents = tfidf.fit_transform(articles_uniq.nor_text)

In [None]:
def find_optimal_clusters(data, max_k):
    """Plot the MiniBatchKMeans SSE (inertia) for k = 2, 4, ... up to ``max_k``.

    One model is fitted per candidate k; a progress line is printed after each
    fit and the inertia values are drawn as a line plot. Nothing is returned.
    """
    candidate_ks = range(2, max_k+1, 2)

    inertias = []
    for n_clusters in candidate_ks:
        model = MiniBatchKMeans(n_clusters=n_clusters, init_size=1024, batch_size=2048, random_state=20)
        model.fit(data)
        inertias.append(model.inertia_)
        print('Fit {} clusters'.format(n_clusters))

    fig, axis = plt.subplots(1, 1)
    axis.plot(candidate_ks, inertias, marker='o')
    axis.set_title('SSE by Cluster Center Plot')
    axis.set_xlabel('Cluster Centers')
    axis.set_ylabel('SSE')
    axis.set_xticks(candidate_ks)
    axis.set_xticklabels(candidate_ks)

find_optimal_clusters(contents, 20)

In [None]:
# Cluster the TF-IDF matrix into 8 groups with the same MiniBatchKMeans settings
# as the SSE sweep; fit_predict returns one cluster label per row of `contents`.
clusters = MiniBatchKMeans(n_clusters=8, init_size=1024, batch_size=2048, random_state=20).fit_predict(contents)

In [None]:
def plot_tsne_pca(data, labels, sample_size=3000, plot_size=300):
    """Scatter-plot a random subset of documents in 2-D using PCA and t-SNE.

    Args:
        data: document-term matrix (sparse, or anything ``np.asarray`` accepts)
            that supports ``data[rows, :]`` indexing.
        labels: array of cluster labels, one per row of ``data``.
        sample_size: number of rows drawn for the projections; capped at the
            number of rows available.
        plot_size: number of projected points actually drawn; capped at the
            size of the sample.

    Points are coloured by label on the ``hsv`` colormap. Nothing is returned.
    """
    n_rows = data.shape[0]
    # Cap the draw so inputs smaller than the requested size do not make
    # np.random.choice(replace=False) raise.
    sample_rows = np.random.choice(range(n_rows), size=min(sample_size, n_rows), replace=False)

    # Densify once and reuse it for both projections. toarray() yields an ndarray,
    # whereas todense() yields np.matrix, which newer scikit-learn releases reject.
    subset = data[sample_rows, :]
    dense = subset.toarray() if hasattr(subset, 'toarray') else np.asarray(subset)

    pca = PCA(n_components=2).fit_transform(dense)
    # t-SNE is run on a 50-component PCA reduction of the same sample
    tsne = TSNE().fit_transform(PCA(n_components=50).fit_transform(dense))

    idx = np.random.choice(range(pca.shape[0]), size=min(plot_size, pca.shape[0]), replace=False)
    # Labels are scaled into [0, 1] for the colormap; guard against a zero maximum label.
    max_label = max(max(labels), 1)
    label_subset = labels[sample_rows]
    label_subset = [cm.hsv(i/max_label) for i in label_subset[idx]]

    f, ax = plt.subplots(1, 2, figsize=(14, 6))

    ax[0].scatter(pca[idx, 0], pca[idx, 1], c=label_subset)
    ax[0].set_title('PCA Cluster Plot')

    ax[1].scatter(tsne[idx, 0], tsne[idx, 1], c=label_subset)
    ax[1].set_title('TSNE Cluster Plot')

plot_tsne_pca(contents, clusters)

In [None]:
 def get_top_keywords(data, clusters, labels, n_terms):
     """Print the ``n_terms`` highest-mean features of every cluster.

     Args:
         data: document-feature matrix, sparse (anything with ``toarray()``)
             or dense array-like.
         clusters: cluster label per row of ``data``.
         labels: feature names, indexable by column position.
         n_terms: number of features to print per cluster; values <= 0 print none.

     For each cluster a header line and one comma-separated line of feature
     names are printed, ordered from weakest to strongest of the selected features.
     """
     dense = data.toarray() if hasattr(data, 'toarray') else np.asarray(data)
     cluster_means = pd.DataFrame(dense).groupby(clusters).mean()

     for cluster_id, mean_weights in cluster_means.iterrows():
         if n_terms > 0:
             # argsort is ascending, so the tail holds the largest mean weights
             top_idx = np.argsort(mean_weights.to_numpy())[-n_terms:]
         else:
             # `[-0:]` would select every feature instead of none
             top_idx = []
         print('\nCluster {}'.format(cluster_id))
         print(','.join([labels[t] for t in top_idx]))
            
# TfidfVectorizer.get_feature_names() was removed in scikit-learn 1.2; use its
# replacement when the installed version provides it and fall back otherwise.
feature_names = (
    tfidf.get_feature_names_out()
    if hasattr(tfidf, 'get_feature_names_out')
    else tfidf.get_feature_names()
)
get_top_keywords(contents, clusters, feature_names, 5)

### Document Clustering - LDA

In [None]:
from sklearn.decomposition import LatentDirichletAllocation as LDA
 
def print_topics(model, vectorizer, n_top_words):
    """Print the highest-weighted words of every topic of a fitted topic model.

    Args:
        model: fitted model exposing ``components_`` (one weight row per topic).
        vectorizer: fitted vectorizer whose feature names index the weight columns.
        n_top_words: number of words printed per topic, strongest first.
    """
    # get_feature_names() was removed in scikit-learn 1.2 in favour of
    # get_feature_names_out(); support vectorizers offering either.
    if hasattr(vectorizer, 'get_feature_names_out'):
        words = vectorizer.get_feature_names_out()
    else:
        words = vectorizer.get_feature_names()
    for topic_idx, topic in enumerate(model.components_):
        print("\nTopic #%d:" % topic_idx)
        # argsort is ascending, so walk it backwards to get the largest weights first
        print(" ".join([words[i]
                        for i in topic.argsort()[:-n_top_words - 1:-1]]))
        
number_topics = 8
number_words = 5
# Create and fit the LDA model. random_state pins the otherwise random
# initialisation so the printed topics are reproducible between runs
# (same seed value as the MiniBatchKMeans cells).
lda = LDA(n_components=number_topics, n_jobs=-1, random_state=20)
lda.fit(contents)
# Print the topics found by the LDA model
print("Topics found via LDA:")
print_topics(lda, tfidf, number_words)

In [None]:
# turn our tokenized documents into a id <-> term dictionary
dictionary = corpora.Dictionary(texts)
# convert tokenized documents into a document-term matrix
corpus = [dictionary.doc2bow(text) for text in texts]

In [None]:
# Train an 8-topic gensim LDA model with 20 passes over the corpus; random_state
# makes the learned topics reproducible between runs.
ldamodel = gensim.models.ldamodel.LdaModel(corpus, num_topics=8, id2word = dictionary, passes=20, random_state=20)

In [None]:
# Switch pyLDAvis to notebook display mode
pyLDAvis.enable_notebook()
# Prepare the visualisation for the gensim LDA model; as the last expression
# of the cell, its result is what the notebook displays.
pyLDAvis.gensim.prepare(ldamodel, corpus, dictionary)

In [None]:
# Top-5 keywords per topic, keyed by topic id. show_topic() returns (word, weight)
# pairs directly, so the formatted topic string never has to be parsed: stripping
# everything except A-Za-z from it mangles non-ASCII words and words containing "_".
twords = {
    topic_id: [word for word, _ in ldamodel.show_topic(topic_id, topn=5)]
    for topic_id in range(ldamodel.num_topics)
}

twords

### Rank top 10 document with top-5 keywords

In [None]:
from gensim import similarities

# Cosine-similarity index over every document's topic distribution.
# num_features is passed explicitly: ldamodel[corpus] yields sparse vectors that
# leave out low-probability topics, so a width inferred from the data can be too small.
index = similarities.MatrixSimilarity(ldamodel[corpus], num_features=ldamodel.num_topics)

In [None]:
# Print the string representation of the similarity index built in the previous cell
print(index)

In [None]:
def get_similarity(lda, query_vector, index=None):
    """Return the similarity of ``query_vector`` to every indexed document.

    Args:
        lda: trained model; only used to transform the module-level ``corpus``
            when no prebuilt index is supplied.
        query_vector: query vector in the model's topic space.
        index: optional prebuilt similarity index. Passing one avoids
            re-transforming and re-indexing the whole corpus on every call.

    Returns:
        The result of ``index[query_vector]`` (one score per indexed document
        for a gensim similarity index).
    """
    if index is None:
        # Original behaviour: index the full corpus under `lda`. The width is
        # pinned to the model's topic count when it exposes one, because sparse
        # topic vectors can make the inferred width too small.
        index = similarities.MatrixSimilarity(
            lda[corpus], num_features=getattr(lda, 'num_topics', None)
        )
    return index[query_vector]

In [None]:
print("Top 10 documents with top-5 keywords")
for topic_i, keywords in twords.items():
    print("topic {} 's keywords: {}".format(topic_i, keywords))
    # Project the keyword list into topic space to use it as the query
    query = ldamodel[dictionary.doc2bow(keywords)]
    # Query the similarity index that was built once over the corpus instead of
    # calling get_similarity(), which re-indexes the whole corpus for every topic.
    sims = index[query]
    sims = sorted(enumerate(sims), key=lambda item: -item[1]) # ranking

    # min() guards against corpora holding fewer than 10 documents
    for i in range(min(10, len(sims))):
        # corpus positions line up with the row positions of articles_uniq
        target = articles_uniq.iloc[sims[i][0], :]
        print("id: {} / title: {}".format(target['id'], target['title']))