In [1]:
import numpy as np
import pandas as pd
from nltk.tokenize import TreebankWordTokenizer
from nltk.corpus import stopwords
from nltk.stem import PorterStemmer
from sklearn.feature_extraction.text import TfidfVectorizer
import string
import pickle
from scipy.linalg import svd
from matplotlib import pyplot as plt
import seaborn as sns
from sklearn.decomposition import TruncatedSVD
from ipywidgets import interact, interactive, fixed, interact_manual
import ipywidgets as widgets
from IPython.display import display
import gensim
from scipy.stats import entropy

In [None]:
def tokenize(data, stopwords = None):
    """Lower-case a text, strip it to ASCII, remove punctuation and tokenize it.

    Parameters
    ----------
    data : str
        Raw text of a single document.
    stopwords : collection of str, optional
        Tokens to drop after tokenization. ``None`` (the default) keeps
        every token.

    Returns
    -------
    list of str
        Tokens produced by ``TreebankWordTokenizer`` in their original
        order. Stemming is not applied.
    """
    # Lower-case, then drop every non-ASCII character. Decoding back keeps the
    # value a text string on Python 3, where ``encode`` returns ``bytes``.
    text = data.lower().encode('ascii', 'ignore').decode('ascii')

    # Punctuation removal. A membership filter replaces the Python 2 only
    # ``translate(None, chars)`` call so the function runs on Python 2 and 3.
    punctuation = set(string.punctuation)
    text = ''.join(ch for ch in text if ch not in punctuation)

    # Tokenization
    token_list = TreebankWordTokenizer().tokenize(text)

    # Stop-word removal
    if stopwords is not None:
        token_list = [word for word in token_list if word not in stopwords]

    return token_list

In [None]:
def generate_document_term_matrix(data, root_folder, data_name, stop_words, k=1000):
    """Build a TF-IDF document-term matrix limited to the ``k`` most frequent tokens.

    The matrix is also pickled to
    ``{root_folder}/{data_name}_doc_term_matrix_{k}.pkl``.

    Parameters
    ----------
    data : object with a ``Text`` attribute
        Iterable of raw document strings is read from ``data.Text``
        (the notebook passes a pandas DataFrame).
    root_folder : str
        Folder the pickled matrix is written to.
    data_name : str
        Label used in progress messages and in the pickle file name.
    stop_words : collection of str or None
        Stop words removed both while counting tokens and inside the
        vectorizer.
    k : int, optional
        Vocabulary size (number of most frequent tokens kept). Default 1000.

    Returns
    -------
    tuple
        ``(doc_term_sparse_mat, vocabulary_)`` where the first item is the
        result of ``TfidfVectorizer.fit_transform`` and the second is the
        fitted vectorizer's ``vocabulary_`` attribute.
    """
    print('Generating document term matrix for {0}....'.format(data_name))

    # Token frequencies over all documents after the cleaning done by
    # ``tokenize`` and stop-word removal (no stemming is applied).
    token_count_map = {}
    for text in data.Text:
        for token in tokenize(text, stop_words):
            token_count_map[token] = token_count_map.get(token, 0) + 1

    # Sort by decreasing count and keep the k most frequent tokens.
    sorted_item_list = sorted(token_count_map.items(), key=lambda t: t[1], reverse=True)
    vocabulary = set(token for token, _ in sorted_item_list[:k])

    # Use the caller's stop words instead of the notebook-level
    # ``english_stops`` global, so the function honours its own parameter.
    # scikit-learn documents ``stop_words`` as 'english', a list or None,
    # so a set is converted to a (sorted) list.
    if isinstance(stop_words, (set, frozenset)):
        vectorizer_stop_words = sorted(stop_words)
    else:
        vectorizer_stop_words = stop_words

    tfidf_vectorizer = TfidfVectorizer(
        tokenizer=tokenize,
        min_df=1,
        analyzer="word",
        stop_words=vectorizer_stop_words,
        vocabulary=vocabulary,
    )
    doc_term_sparse_mat = tfidf_vectorizer.fit_transform(data.Text)

    # Persist the document-term matrix.
    with open('{0}/{1}_doc_term_matrix_{2}.pkl'.format(root_folder,data_name, k), 'wb') as fp:
        pickle.dump(doc_term_sparse_mat, fp)

    print('Finished generating document term matrix for {0}....'.format(data_name))

    return doc_term_sparse_mat, tfidf_vectorizer.vocabulary_

In [None]:
def plot_document_mat(mat , data_name, save_folder, vocab, xmin=0.4, xmax=0.8, ymin=-0.8, ymax=0.8):
    """Scatter-plot the first two columns of ``mat`` and label every point.

    Note: a later cell defines another ``plot_document_mat`` (interactive),
    which replaces this one once that cell has run.

    Parameters
    ----------
    mat : numpy.ndarray, shape (n_rows, >= 2)
        Reduced representation; column 0 is drawn on the x axis and column 1
        on the y axis. Extra columns are ignored.
    data_name : str
        Label inserted into the figure title.
    save_folder : str
        Currently unused; saving the figure is disabled.
    vocab : mapping or sequence
        ``vocab[i]`` is the text label annotated at row ``i`` of ``mat``.
    xmin, xmax, ymin, ymax : float, optional
        Axis limits of the plotted window.
    """
    # Plain 1-D coordinate vectors, so each annotation receives scalar (x, y)
    # values instead of one-element array slices.
    xs = mat[:, 0]
    ys = mat[:, 1]

    plt.figure(figsize = (12,20))
    plt.title('Data representation in reduced dimension: {0}'.format(data_name))
    plt.xlabel('dim 1')
    plt.ylabel('dim 2')
    plt.xlim(xmin, xmax)
    plt.ylim(ymin,ymax)
    plt.grid()
    plt.plot(xs, ys, 'ro')
    for row_index in range(mat.shape[0]):
        plt.annotate(vocab[row_index], (xs[row_index], ys[row_index]))
    #plt.savefig('{0}/{1}_dataplot.png'.format(save_folder, data_name))
    plt.show()

In [None]:
# main initializations
english_stops = set(stopwords.words('english'))
# NOTE(review): absolute, machine-specific paths; both folders currently
# point at the same directory.
root_folder = '/home/vparambath/Desktop/iith/IR-Assignment2'
data_folder = '/home/vparambath/Desktop/iith/IR-Assignment2'


# Read data: first 10000 rows of a header-less, ':'-separated file with the
# columns TextId and Text.
data = pd.read_csv('{0}/Dataset-2.txt'.format(data_folder), sep=':', header=None, names=['TextId', 'Text'], nrows =10000)
doc_term_matrix, vocab = generate_document_term_matrix(data, root_folder, 'dataset2', english_stops)
# Reverse mapping of ``vocab`` (value -> key). ``items()`` is used because
# ``iteritems()`` exists only on Python 2, while ``items()`` works on both.
inv_vocab = {v: k for k, v in vocab.items()}

# Truncated SVD of the transposed document-term matrix, once with 5 and once
# with 2 components; the fixed random_state keeps the result repeatable.
svd_5 = TruncatedSVD(n_components=5, n_iter=7, random_state=42)
svd_2 = TruncatedSVD(n_components=2, n_iter=7, random_state=42)
reduced_mat_2 = svd_2.fit_transform(doc_term_matrix.T)
reduced_mat_5 = svd_5.fit_transform(doc_term_matrix.T)

In [None]:
# Shape of the transposed document-term matrix, i.e. of the input handed to TruncatedSVD.
doc_term_matrix.T.shape

In [None]:
#https://github.com/tirthajyoti/Interactive_Machine_Learning/blob/master/Curve_fit_widget_1.ipynb
#plot_document_mat(reduced_mat_2, 'd2', root_folder, inv_vocab)

In [None]:
def plot_document_mat(mat , data_name, save_folder, vocab):
    """Interactive scatter plot of the first two columns of ``mat``.

    Registers an ipywidgets ``interact`` callback whose four sliders set the
    axis limits; every slider change redraws the figure. This definition
    replaces the earlier static ``plot_document_mat`` of the same name.

    Parameters
    ----------
    mat : numpy.ndarray, shape (n_rows, >= 2)
        Reduced representation; column 0 is drawn on the x axis and column 1
        on the y axis. Extra columns are ignored.
    data_name : str
        Label inserted into the figure title.
    save_folder : str
        Currently unused; saving the figure is disabled.
    vocab : mapping or sequence
        ``vocab[i]`` is the text label annotated at row ``i`` of ``mat``.
    """
    # Extract the coordinates once, as plain 1-D vectors, so each annotation
    # receives scalar (x, y) values instead of one-element array slices.
    xs = mat[:, 0]
    ys = mat[:, 1]

    @interact(xmin=(0,10,0.1), xmax=(0,10,0.1), ymin=(-10,10,0.1),ymax=(-10,10,0.1))
    def plot_interact(xmin,xmax, ymin,ymax):
        plt.figure(figsize = (12,10))
        plt.title('Data representation in reduced dimension: {0}'.format(data_name))
        plt.xlabel('dim 1')
        plt.ylabel('dim 2')
        plt.xlim(xmin, xmax)
        plt.ylim(ymin,ymax)
        plt.grid()
        plt.plot(xs, ys, 'ro')
        for row_index in range(mat.shape[0]):
            plt.annotate(vocab[row_index], (xs[row_index], ys[row_index]))
        #plt.savefig('{0}/{1}_dataplot.png'.format(save_folder, data_name))
        plt.show()

In [None]:
# Plot the 2-component SVD output; inv_vocab supplies the label for each row.
plot_document_mat(reduced_mat_2, 'd2', root_folder, inv_vocab)

In [None]:
from sklearn.preprocessing import StandardScaler
from sklearn.cluster import KMeans

# Standardise the 2-component representation before clustering.
scaler = StandardScaler()
X_scaled = scaler.fit_transform(reduced_mat_2)

# Elbow method: fit KMeans for 1..19 clusters and record the inertia of each fit.
cluster_range = range( 1, 20 )
cluster_errors = []
for num_clusters in cluster_range:
    # Fixed seed (same value as the TruncatedSVD models) so the curve is
    # identical on every run.
    clusters = KMeans( n_clusters=num_clusters, random_state=42 )
    clusters.fit( X_scaled )
    cluster_errors.append( clusters.inertia_ )

clusters_df = pd.DataFrame( { "num_clusters": list(cluster_range), "cluster_errors": cluster_errors } )
plt.figure(figsize=(12,6))
plt.plot(clusters_df.num_clusters, clusters_df.cluster_errors, marker = "o")
# Ticks follow cluster_range instead of repeating its bounds by hand.
plt.xticks(list(cluster_range), list(cluster_range))
plt.xlabel('number of clusters')
plt.ylabel('inertia')
plt.title('KMeans elbow curve')
plt.show()

In [None]:
# Final clustering with 6 clusters (presumably read off the elbow curve —
# confirm). The fixed seed makes labels and centers repeatable across runs.
kmeans = KMeans(n_clusters=6, random_state=42)
kmeans.fit(X_scaled)
y_kmeans = kmeans.predict(X_scaled)
centers = kmeans.cluster_centers_


# Points coloured by assigned cluster, with the cluster centers drawn on top
# as larger, semi-transparent black markers.
plt.figure(figsize=(12,6))
plt.scatter(X_scaled[:, 0], X_scaled[:, 1], c=y_kmeans, s=50, cmap='viridis')
plt.scatter(centers[:, 0], centers[:, 1], c='black', s=200, alpha=0.5)
plt.xlabel('dim 1 (standardised)')
plt.ylabel('dim 2 (standardised)')
plt.title('KMeans clusters (k=6)')
plt.show()

In [None]:
# Cluster labels stored on the fitted KMeans model (one per row it was fitted on).
kmeans.labels_

In [2]:
# Load the previously pickled dataset1 document-term matrix and vocabulary.
# NOTE: this rebinds `doc_term_matrix` and `vocab`, which the earlier cells
# bound to the dataset2 results.
# SECURITY: pickle.load can execute arbitrary code; only load files you trust.
# NOTE(review): absolute, machine-specific paths.
with open('/home/vparambath/Desktop/iith/IR-Assignment2/dataset1_doc_term_matrix_50000.pkl','rb')as fp:
    doc_term_matrix = pickle.load(fp)
    
# presumably a term -> column-index mapping, since the LDA cell inverts it for id2word — confirm
with open('/home/vparambath/Desktop/iith/IR-Assignment2/dataset1_vocabulary_50000.pkl','rb')as fp:
    vocab = pickle.load(fp)
    
# print(doc_term_matrix.shape)
# print(vocab)

In [3]:
# Short alias for gensim's LdaModel class; the training cell calls it as `lda(...)`.
lda = gensim.models.ldamodel.LdaModel

In [4]:
# Train a 5-topic LDA model on the document-term matrix.
# - documents_columns=False: documents are the rows of the matrix.
# - id2word is `vocab` inverted (value -> key).
# - random_state pins the random initialisation so the topics are
#   reproducible between runs.
# NOTE(review): the saved run of this cell ended in KeyboardInterrupt;
# 50 passes may be too slow for this corpus — consider fewer passes.
lda_model = lda(
    gensim.matutils.Sparse2Corpus(doc_term_matrix, documents_columns=False),
    num_topics=5,
    id2word={v: k for k, v in vocab.items()},
    passes=50,
    random_state=42,
)

KeyboardInterrupt: 

In [None]:
# Print the model's topic summary: 5 topics, 10 words per topic.
print(lda_model.print_topics(num_topics=5, num_words=10))

In [None]:
# Inspect the topic with id 1, using show_topic's default number of terms.
lda_model.show_topic(1)

In [None]:
# Collect the top 50 entries of every LDA topic, keeping the first element of
# each pair returned by show_topic (presumably the term, with the weight as
# second element — confirm against the installed gensim version).
top_words = []
for topicno in range(lda_model.num_topics):
    topic_terms = lda_model.show_topic(topicno, topn=50)
    top_words.append([term for term, _ in topic_terms])
print(top_words)

In [None]:
# https://stackoverflow.com/questions/45310925/how-to-get-a-complete-topic-distribution-for-a-document-using-gensim-lda
# Wrap the sparse matrix as a gensim corpus; documents_columns=False marks the rows as documents.
corpus = gensim.matutils.Sparse2Corpus(doc_term_matrix, documents_columns=False)
# Topic probabilities returned for the first document; only the first two entries are displayed.
[prob for _, prob in lda_model.get_document_topics(corpus[0])][:2]

In [None]:
import random
# Fixed seed so the shuffled order is the same on every run.
random.seed(9001)
# Shuffle all document positions in place and preview the first five.
docs = list(range(len(corpus)))
random.shuffle(docs)
docs[:5]

In [None]:
# https://stackoverflow.com/questions/22433884/python-gensim-how-to-calculate-document-similarity-using-the-lda-model

In [None]:
# Dense document-topic matrix with exactly one row per corpus document, so
# that row i always describes document i.
# Previously, documents whose returned topic list was shorter than num_topics
# were skipped, which shifted every later row away from its document index.
num_topics = lda_model.num_topics
doc_topic_dist = np.zeros((len(corpus), num_topics))
for doc_index, bow in enumerate(corpus):
    # minimum_probability=0.0 asks gensim not to discard low-probability
    # topics; any topic still missing from the result keeps probability 0.
    for topic_id, prob in lda_model.get_document_topics(bow, minimum_probability=0.0):
        doc_topic_dist[doc_index, topic_id] = prob

In [None]:
# Spot-check a single document (position 22996): collect and print its topic probabilities.
dist = [prob for _, prob in lda_model.get_document_topics(corpus[22996])]
print(dist)
# reshape(-1, 5) only succeeds when the number of probabilities is a multiple of 5;
# presumably this document was picked because fewer than 5 topics come back — TODO confirm.
np.array(dist).reshape(-1,5).shape

In [None]:
def jensen_shannon(query, matrix):
    """
    Jensen-Shannon distance between one topic distribution and every row of
    a matrix of topic distributions.

    Parameters
    ----------
    query : array-like, shape (K,)
        Topic distribution of the query document.
    matrix : array-like, shape (M, K) or (K,)
        One topic distribution per row. A single 1-D distribution is treated
        as a matrix with one row.

    Returns
    -------
    numpy.ndarray, shape (M,)
        Square root of the Jensen-Shannon divergence (natural logarithm)
        between ``query`` and each row. Smaller values mean more similar
        distributions.
    """
    # Column vector (K, 1) so it broadcasts against every document column.
    p = np.asarray(query, dtype=float).reshape(-1, 1)
    # One column per document: (K, M).
    q = np.atleast_2d(np.asarray(matrix, dtype=float)).T
    m = 0.5*(p + q)
    divergence = 0.5*(entropy(p,m) + entropy(q,m))
    # Rounding can leave a value marginally below zero; clamp it so the
    # square root never yields NaN.
    return np.sqrt(np.maximum(divergence, 0.0))


def get_most_similar_documents(query, matrix, k=5):
    """
    Return the positional indices of the ``k`` rows of ``matrix`` that are
    closest to ``query``.

    Closeness is the value computed by ``jensen_shannon``: the smaller the
    Jensen-Shannon distance, the more similar the two distributions are.
    Indices are ordered from the closest row onwards.
    """
    # Distance from the query to every row.
    distances = jensen_shannon(query, matrix)
    # Positions sorted by ascending distance; keep the first k.
    ranked = distances.argsort()
    return ranked[:k]

In [None]:
from scipy.stats import entropy
from numpy.linalg import norm
import numpy as np

def JSD(P, Q):
    """Jensen-Shannon divergence between two non-negative weight vectors.

    Both inputs are first divided by their L1 norm. The value returned is
    the divergence itself (natural logarithm), not its square root.
    """
    p_unit = P / norm(P, ord=1)
    q_unit = Q / norm(Q, ord=1)
    # Equal-weight mixture of the two normalised vectors.
    mixture = 0.5 * (p_unit + q_unit)
    kl_p = entropy(p_unit, mixture)
    kl_q = entropy(q_unit, mixture)
    return 0.5 * (kl_p + kl_q)

In [None]:
def jsd(p, q, base=np.e):
    '''
        Pairwise Jensen-Shannon divergence, based on
        https://en.wikipedia.org/wiki/Jensen%E2%80%93Shannon_divergence

        p, q : array-like distributions of equal length.
        base : logarithm base handed to ``entropy`` (default: natural log).

        Returns the average of the two relative entropies of ``p`` and ``q``
        against their equal-weight mixture.
    '''
    p = np.asarray(p)
    q = np.asarray(q)
    m = 1./2*(p + q)
    # Uses the ``entropy`` function imported from scipy.stats; the previous
    # ``sp.stats.entropy`` spelling relied on a name ``sp`` that is never
    # imported in this notebook.
    return entropy(p, m, base=base)/2. + entropy(q, m, base=base)/2.

In [None]:
# Divergence between `query` and the third row of doc_topic_dist.
# NOTE(review): `query` is not assigned in any visible cell; on a fresh kernel this
# raises NameError unless it is defined elsewhere — confirm.
JSD(query, doc_topic_dist[2])

In [None]:
import pandas as pd
from scipy.spatial.distance import euclidean, pdist, squareform

# Pairwise JSD values between the rows of doc_topic_dist, using JSD as a custom metric.
# NOTE(review): a Python callable metric is presumably evaluated once per row pair,
# which may be slow for a large number of documents — confirm before running on the full corpus.
dists = pdist(doc_topic_dist, JSD)