In [187]:
"""
This program implements the retrieval of XKCD comics
Written by Tim Burke and Anuj Ramakrishnan
"""
import numpy as np
import pandas as pd
import xlwt
import xlrd
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import cosine_similarity
from nltk import word_tokenize # Tokenization tool


########################################################################################################################
# Read in the data
########################################################################################################################

# Load the comic table and replace every missing cell with a single space so
# that the string concatenation below never meets a NaN.
df = pd.read_csv('data/comic_final_data.csv', encoding='UTF-8').fillna(" ")

# One space-separated document per row combining all text fields; this is the
# corpus the tf-idf vocabulary is later fitted on.
df["all_text"] = (
    df["title"]
    + " " + df["topic_category"]
    + " " + df["title_text"]
    + " " + df["explanation"]
    + " " + df["transcript"]
)

# To remove duplicates of comics in multiple categories, first find all categories for all comics
def get_all_categories(row):
    """Return the space-separated categories of every row sharing ``row``'s url.

    Parameters
    ----------
    row : mapping with a ``'url'`` key (e.g. one row of ``df``).

    Returns
    -------
    str
        The ``topic_category`` values of all rows in the module-level ``df``
        whose ``url`` equals ``row['url']``, joined with single spaces in
        frame order.

    Note: each call scans the whole frame, so applying it to every row is
    quadratic. It is kept for single-row lookups; the column below is built
    with one grouped pass instead.
    """
    categories = df['topic_category'].loc[df['url'] == row['url']]
    return categories.str.cat(sep=' ')

# Same result as df.apply(get_all_categories, axis=1), but each url group is
# joined once and broadcast back to its rows rather than re-filtering the frame
# for every row. Relies on 'url' having no NaN (already filled at load time).
df['all_categories'] = (
    df.groupby('url', sort=False)['topic_category']
      .transform(lambda cats: cats.str.cat(sep=' '))
)

# Collapse the frame to one row per comic (the first occurrence of each url
# wins) and discard the per-row category column.
df = (
    df.drop_duplicates(subset='url', keep='first')
      .drop('topic_category', axis=1)
)

In [194]:
########################################################################################################################
# Turn all comic text data into tf-idf vectors
########################################################################################################################

# min_df=1 keeps every term that occurs in at least one document. That is the
# same vocabulary the previous integer min_df=0 produced, but 0 is outside the
# documented integer range and is rejected by scikit-learn releases that
# validate constructor parameters.
vectorizer = TfidfVectorizer(
    norm='l2',
    min_df=1,
    use_idf=True,
    smooth_idf=False,
    sublinear_tf=True,
    tokenizer=word_tokenize,
    lowercase=True,
)
# lowercase=True already lower-cases each document before tokenizing, so the
# corpus does not need a separate .str.lower() pass.
vectorizer = vectorizer.fit(df['all_text'])

In [195]:
# Transform each text field with the fitted vectorizer, in the order
# title, rollover (title_text), category, explanation, transcript.
(title_vectors,
 rollover_vectors,
 category_vectors,
 explanation_vectors,
 transcript_vectors) = (
    vectorizer.transform(df[field])
    for field in ('title', 'title_text', 'all_categories', 'explanation', 'transcript')
)

In [198]:
########################################################################################################################
# Get the similarity score matrix between a query and comic texts
########################################################################################################################

# The query is kept as a one-element list because the vectorizer transforms an
# iterable of documents; element 0 is the raw query string.
query = ['debugging']

# tf-idf representation of the query, built by the same fitted vectorizer.
query_vec = vectorizer.transform(query)

def jaccard_similarity(str1,str2):
    """Jaccard similarity between the token sets of two strings.

    Both strings are lower-cased before tokenizing so that matching is
    case-insensitive, consistent with the tf-idf vectorizer (lowercase=True);
    otherwise a query such as 'debugging' could never match 'Debugging'.

    Parameters
    ----------
    str1, str2 : str
        Texts to compare.

    Returns
    -------
    float
        |A & B| / |A | B| over the distinct tokens of each string, in [0, 1].
        Returns 0.0 when neither string yields any token (e.g. both blank),
        instead of raising ZeroDivisionError.
    """
    tokens1 = set(word_tokenize(str1.lower()))
    tokens2 = set(word_tokenize(str2.lower()))
    union = tokens1 | tokens2
    if not union:
        # Blank fields are stored as " " and tokenize to nothing; with an
        # empty query as well the ratio would be 0/0.
        return 0.0
    return float(len(tokens1 & tokens2)) / float(len(union))

# Cosine similarity of the query against each field, one column vector per
# field (the single query row is transposed so comics run down the rows).
(title_sim,
 rollover_sim,
 category_sim,
 explanation_sim,
 transcript_sim) = (
    cosine_similarity(query_vec, field_vectors).T
    for field_vectors in (title_vectors, rollover_vectors, category_vectors,
                          explanation_vectors, transcript_vectors)
)

# Jaccard similarity of the raw query string against each field's raw text,
# again shaped as one column per field.
(jaccard_title,
 jaccard_rollover,
 jaccard_category,
 jaccard_explanation,
 jaccard_transcript) = (
    np.array([jaccard_similarity(query[0], text) for text in df[field]]).reshape(-1, 1)
    for field in ('title', 'title_text', 'all_categories', 'explanation', 'transcript')
)

# Side-by-side matrices: one row per comic, columns in the order
# title, rollover, category, explanation, transcript.
score_matrix = np.column_stack(
    [title_sim, rollover_sim, category_sim, explanation_sim, transcript_sim]
)
jaccard_matrix = np.column_stack(
    [jaccard_title, jaccard_rollover, jaccard_category, jaccard_explanation, jaccard_transcript]
)

print(jaccard_matrix.shape,score_matrix.shape)

(398, 5) (398, 5)


In [200]:
weights = np.array([5,3,6,1,3]) # Weights in order title, rollover, category, explanation, transcript
j_weights = np.array([20,1,1,1,40])  # Same field order, for the Jaccard matrix
# Weighted sum of the per-field cosine scores: one relevance score per comic.
results = np.matmul(score_matrix,weights)
# Alternative scoring using the Jaccard matrix instead:
# results = np.matmul(jaccard_matrix,j_weights)

# argsort() returns *positions* in the score array (ascending); reverse it and
# keep the three best. The rows must therefore be fetched positionally with
# .iloc: df keeps its original labels after drop_duplicates, so label-based
# .loc[idx] could return a different comic or raise KeyError.
for count, idx in enumerate(results.argsort()[::-1][:3], start=1):
    comic = df.iloc[idx]
    print("RESULT #%i" % count,": ",comic['title'],"\n",comic,"\n")
print(results)

RESULT #1 :  1722: Debugging 
 title                                               1722: Debugging
url               http://www.explainxkcd.com/wiki/index.php/1722...
title_text        When you Google an error message and it gets n...
explanation       Cueball is telling White Hat about his attempt...
transcript        [Cueball and White Hat are walking, while Cueb...
all_text          1722: Debugging computers When you Google an e...
all_categories                                            computers
Name: 119, dtype: object 

RESULT #2 :  979: Wisdom of the Ancients 
 title                                   979: Wisdom of the Ancients
url               http://www.explainxkcd.com/wiki/index.php/979:...
title_text        All long help threads should have a sticky glo...
explanation       This comic refers to a common experience that ...
transcript        [A poem is written outside and right justified...
all_text          979: Wisdom of the Ancients computers All long...
all_categories 