# Data Science Flow

### 1. Target formulation

### 2. Text cleaning

### 3. Model building

### 4. Using the results

# Text Categorization

# <img src="classify.png" style="max-width:90%; width: 80%">

# LDA

<img src="LDA.png" style="max-width:90%; width: 80%">

In [10]:

import gensim
from sklearn.datasets import fetch_20newsgroups
import logging
from collections import defaultdict

from nltk.corpus import stopwords
# NOTE: this rebinds `stopwords` from the NLTK corpus reader to a plain list
# of English stop words.
stopwords = stopwords.words("english")
from pattern.en import lemma

logging.basicConfig(format='%(asctime)s : %(levelname)s : %(message)s', level=logging.INFO)


# File the bag-of-words corpus is serialized to further down in this cell.
corpus_savepath = 'gensimldatest.mm'


# Basic preprocessing: lowercase each post, replace every non-alphabetic
# character with a space, split on whitespace and drop English stop words.
# No lemmatization is applied (`lemma` is imported above but never called).

# Build the lookup set once: the stop-word test runs for every token of every
# document, and a set gives O(1) membership instead of a list scan.
stopword_set = frozenset(stopwords)

texts = fetch_20newsgroups(subset='train').data
texts = [unicode(d.lower()) for d in texts]
texts = ["".join((char if char.isalpha() else " ") for char in text).split() for text in texts]
# Only the first 1000 tokens of each document are considered.
texts = [[word for word in text[:1000] if word not in stopword_set] for text in texts]

# Count how many times every token occurs across the whole corpus.
frequency = defaultdict(int)
for document in texts:
    for word in document:
        frequency[word] += 1

# Keep a token only when it is longer than two characters, occurs more than
# 10 times overall, and occurs fewer times than 20% of the number of documents.
# NOTE: `frequency` holds total occurrences, not per-document frequency.
max_count = len(texts) * 0.2
filtered_texts = []
for document in texts:
    kept = [word for word in document
            if len(word) > 2 and 10 < frequency[word] < max_count]
    filtered_texts.append(kept)
texts = filtered_texts

# Map tokens to integer ids and convert every document to bag-of-words form.
dictionary = gensim.corpora.Dictionary(texts)
corpus = list(map(dictionary.doc2bow, texts))

# Write the corpus to disk and read it back, so the model is trained from
# the serialized file rather than from the in-memory list.
gensim.corpora.MmCorpus.serialize(corpus_savepath, corpus)
modelled_corpus = gensim.corpora.MmCorpus(corpus_savepath)

# Fit a 10-topic LDA model.
lda1 = gensim.models.ldamodel.LdaModel(
    corpus=modelled_corpus,
    id2word=dictionary,
    num_topics=10,
    passes=20,
    update_every=100,
    eval_every=5,
    alpha='auto',
)

# Display 30 words for each of the 10 topics.
lda1.show_topics(num_topics=10, num_words=30, formatted=True)

# Results analysis

In [145]:
# Infer the topic mixture of an unseen sports headline.
news = "Indianes women hockey could never recover from three early setbacks and ended up losing the Hawkes \
Bay Cup quarterfinal 1-3 to Japan, today. "

# Tokenize the headline bound to `news` just above.
# NOTE(review): split() keeps punctuation attached to words ("japan,",
# "today."), whereas the dictionary was built from alphabetic-only tokens,
# so such tokens will not be matched by doc2bow -- confirm this is intended.
texts = [unicode(d.lower()) for d in news.split()]
text1 = dictionary.doc2bow(texts)
# `lda1` is the model trained in the LDA cell; list topics from most to
# least probable.
sorted(lda1[text1], key = lambda x: x[1], reverse = True)

[(6, 0.53758295629888098),
 (2, 0.18588873448018561),
 (8, 0.15471837618089992),
 (1, 0.10088456902754719)]

In [143]:
# Infer the topic mixture of an unseen text about religion.
new = " The bible is not the WORD OF GOD. And if we believe it as such, then we have made the bible into an idol. \
The Bible makes the claim \
of being the inspired word of God, God's message to humanity."

# NOTE(review): split() keeps punctuation attached to words ("god.",
# "god's"), whereas the dictionary was built from alphabetic-only tokens,
# so such tokens will not be matched by doc2bow -- confirm this is intended.
texts = [unicode(d.lower()) for d in new.split()]
text1 = dictionary.doc2bow(texts)
# `lda1` is the model trained in the LDA cell; list topics from most to
# least probable.
sorted(lda1[text1], key = lambda x: x[1], reverse = True)

[(7, 0.48670069241050118),
 (0, 0.24817498942542299),
 (2, 0.13441145401025376),
 (5, 0.10996778497322443)]

In [149]:
# Infer the topic mixture of an unseen text about software.
new = "Having had windows live mail working satisfactorily for some time "

texts = [unicode(d.lower()) for d in new.split()]
text1 = dictionary.doc2bow(texts)
# `lda1` is the model trained in the LDA cell. The inference is run once and
# sorted directly; a separate bare `model[text1]` expression before the last
# line of a cell would be computed and then discarded.
sorted(lda1[text1], key = lambda x: x[1], reverse = True)

[(6, 0.71185683311735759),
 (4, 0.21705128852495115),
 (9, 0.013788021255459389),
 (1, 0.011798482860266721)]

In [155]:
# Infer the topic mixture of a document consisting of one word.
text1 = dictionary.doc2bow(["government"])
# `lda1` is the model trained in the LDA cell. The inference is run once and
# sorted directly, from most to least probable topic.
sorted(lda1[text1], key = lambda x: x[1], reverse = True)

[(9, 0.77765272176558953),
 (1, 0.03776186665686139),
 (8, 0.031314062631686457),
 (0, 0.029998017363570186),
 (5, 0.025090172634431366),
 (7, 0.022590981930248007),
 (6, 0.022449774249397886),
 (3, 0.020767332524668225),
 (4, 0.016492515697128261),
 (2, 0.015882554546418556)]

# LDA2Vec

<img src="lda2.jpg" style="max-width:90%; width: 60%">

In [3]:
from lda2vec import preprocess, LDA2Vec, Corpus
from sklearn.datasets import fetch_20newsgroups
from chainer import serializers
from chainer import cuda
import numpy as np
import os.path
import logging


# Optional: moving the model to the GPU makes it ~10x faster
# set to False if you're having problems with Chainer and CUDA
gpu = cuda.available

logging.basicConfig()

from nltk.corpus import stopwords
cachedStopWords = stopwords.words("english")
# Build the lookup set once: the stop-word test runs for every token of every
# document, and a set gives O(1) membership instead of a list scan.
stopword_set = frozenset(cachedStopWords)

corpus_savepath = 'gensimldatest.mm'

# Basic preprocessing: lowercase each post, replace every non-alphabetic
# character with a space, split on whitespace and drop English stop words.
# No lemmatization is applied.
texts = fetch_20newsgroups(subset='train').data
texts = [unicode(d.lower()) for d in texts]
texts = ["".join((char if char.isalpha() else " ") for char in text).split() for text in texts]
# Only the first 2000 tokens of each document are considered.
texts = [[word for word in text[:2000] if word not in stopword_set] for text in texts]

# Turn each token list back into one space-separated unicode string for the
# tokenizer. Calling unicode() on the list itself would produce the list's
# repr (brackets, quotes and u'' prefixes) instead of the text.
texts = [u" ".join(d) for d in texts]



# Preprocess data: tokenize the cleaned texts with lda2vec and turn them into
# the flat word / document-id arrays that the model is fitted on.
max_length = 1000   # Limit of 1k words per document
tokens, vocab = preprocess.tokenize(texts, max_length, tag=False,
                                    parse=False, entity=False)
corpus = Corpus()
# Make a ranked list of rare vs frequent words
corpus.update_word_count(tokens)
corpus.finalize()
# The tokenization uses spaCy indices, and so may have gaps
# between indices for words that aren't present in our dataset.
# This builds a new compact index
compact = corpus.to_compact(tokens)
# Remove extremely rare words (min_count=50)
pruned = corpus.filter_count(compact, min_count=50)
# Words tend to have power law frequency, so selectively
# downsample the most prevalent words
# NOTE(review): `clean` is not read again in this cell -- the flattening
# below is done on `pruned`, so the subsampling has no effect on the data
# used for training. Confirm whether `clean` was meant to be used instead.
clean = corpus.subsample_frequent(pruned)
# Now flatten a 2D array of document per row and word position
# per column to a 1D array of words. This will also remove skips
# and OoV words
doc_ids = np.arange(pruned.shape[0])
flattened, (doc_ids,) = corpus.compact_to_flat(pruned, doc_ids)



# Model parameters
n_hidden = 128                  # dimensions of a single word vector
n_topics = 20                   # topics to fit
n_docs = len(texts)             # number of documents
n_words = flattened.max() + 1   # number of unique words in the vocabulary

# Count and string representation of every compact key, cut to n_words
counts = corpus.keys_counts[:n_words]
words = corpus.word_list(vocab)[:n_words]

# Build the model and register 'document_id' as a categorical feature
# with n_docs possible values and n_topics topics
model = LDA2Vec(n_words, n_hidden, counts, dropout_ratio=0.2)
model.add_categorical_feature(n_docs, n_topics, name='document_id')
model.finalize()

# Fit in 50 rounds of one epoch each. top_words_per_topic is called before
# every round, and the model is moved back to the CPU after every round.
for round_index in range(50):
    model.top_words_per_topic('document_id', words)
    if gpu:
        model.to_gpu()
    model.fit(
        flattened,
        categorical_features=[doc_ids],
        fraction=1e-3,
        epochs=1,
    )
    model.to_cpu()

# Save the fitted model
serializers.save_hdf5('model_film_20news.hdf5', model)

In [None]:
# NOTE: this cell repeats the model set-up and training code that ends the
# previous cell; running it builds a fresh model and overwrites the saved file.

# Model parameters
n_hidden = 128                  # dimensions of a single word vector
n_topics = 20                   # topics to fit
n_docs = len(texts)             # number of documents
n_words = flattened.max() + 1   # number of unique words in the vocabulary

# Count and string representation of every compact key, cut to n_words
counts = corpus.keys_counts[:n_words]
words = corpus.word_list(vocab)[:n_words]

# Build the model and register 'document_id' as a categorical feature
# with n_docs possible values and n_topics topics
model = LDA2Vec(n_words, n_hidden, counts, dropout_ratio=0.2)
model.add_categorical_feature(n_docs, n_topics, name='document_id')
model.finalize()

# Fit in 50 rounds of one epoch each. top_words_per_topic is called before
# every round, and the model is moved back to the CPU after every round.
for round_index in range(50):
    model.top_words_per_topic('document_id', words)
    if gpu:
        model.to_gpu()
    model.fit(
        flattened,
        categorical_features=[doc_ids],
        fraction=1e-3,
        epochs=1,
    )
    model.to_cpu()

# Save the fitted model
serializers.save_hdf5('model_film_20news.hdf5', model)