In [1]:
    
import os
import re
import math
import random
import warnings

from bs4 import BeautifulSoup

import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns

from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import cosine_similarity

import nltk
nltk.download('punkt')
from nltk.stem import WordNetLemmatizer
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize

import gensim
from nltk.tokenize import word_tokenize
import dateutil.parser

import time
import pickle


%matplotlib inline


[nltk_data] Downloading package punkt to
[nltk_data]     C:\Users\jeffb\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt is already up-to-date!


In [None]:
# Read all files:

qs = pd.read_csv('data/stackoverflow/PostQuestionsFiltered_V4_parsed.tsv',delimiter='\t',keep_default_na=False, encoding='utf-8')
#comments = pd.read_csv('CommentsFiltered_v3.tsv',delimiter='\t',encoding='utf-8')
#tags = pd.read_csv('tags.csv')
answers = pd.read_csv('data/stackoverflow/PostAnswersFiltered_V4_cleaned_answer_bodies.tsv',delimiter='\t', keep_default_na=False, encoding='utf-8')


#### add new_tags to question file using space separators between tags

In [None]:
qs.head()

In [None]:
row_iterator = qs.iterrows()
keywords = [
   'javascript', 'java', 'c#', 'php', 'python', 'c++',
       'node.js', 'objective-c', 'vb.net', 'scala', 'matlab', 'perl', 'delphi',
       'matplotlib', 'animation', 'd3', 'ggplot2', 'plot', 'graph', 'chart',
       'highcharts', 'vbscript', 'colors', 'pyspark', 'dplyr', 'f#', '3d',
       'sas', 'fortran', 'maps', 'lisp', 'julia', 'powerbi', 'drawing', 'line',
       'plotly', 'bar-chart', 'visualization', 'tableau', 'seaborn',
       'geospatial', 'stata', 'plyr', 'pie-chart', 'graphviz', 'spss',
       'diagram', 'qlikview', 'altair'
]
row_keywords = []
for row in row_iterator:
    single_row_keywords = [keyword for keyword in keywords if row[1][keyword] == 1]
    single_row_keystring = " ".join(single_row_keywords)
    row_keywords.append(single_row_keystring)
row_keywords[0:10]

In [None]:
np_array_of_row_keywords = np.array(row_keywords)
qs["new_tags"] = np_array_of_row_keywords

In [13]:
qs.head()

Unnamed: 0,id,title,body,accepted_answer_id,answer_count,comment_count,community_owned_date,creation_date,favorite_count,last_activity_date,...,plyr,pie-chart,graphviz,spss,diagram,qlikview,altair,r,parsedtags,new_tags
0,15537402,using command on a gnplot script,<p>Im using a script on a mac bash shell that ...,,1,2,,2013-03-21 00:34:11.173000+00:00,,2013-03-21 00:44:12.973000+00:00,...,0,0,0,0,0,0,0,0,plot,plot
1,15856146,Applying Orthographic projection or frustum ef...,<p>I know that normalised coordinates should b...,15858157.0,1,0,,2013-04-06 21:21:10.723000+00:00,,2013-04-07 02:13:30.390000+00:00,...,0,0,0,0,0,0,0,0,graph,graph
2,15428854,How to implement both scalar and vector additi...,"<p>I'm working on a Vector2D class, and I thin...",15429296.0,3,0,,2013-03-15 09:33:56.357000+00:00,,2013-03-15 10:20:14.973000+00:00,...,0,0,0,0,0,0,0,0,c++ graph,c++ graph
3,15517350,how to increase speed of tchart refresh()?,<p>I have 16 graphs[maximum ] with 4 fastlines...,15526751.0,1,0,,2013-03-20 07:07:09.313000+00:00,0.0,2013-06-10 13:13:10.997000+00:00,...,0,0,0,0,0,0,0,0,c# chart,c# chart
4,15445313,What does $ROOT mean in a Mac Terminal?,<p>I received some command line instructions:<...,15445328.0,2,0,,2013-03-16 03:45:52.347000+00:00,,2013-03-16 03:50:07.557000+00:00,...,0,0,0,0,0,0,0,0,line,line


In [12]:
qs.to_csv('data/stackoverflow/new_qs.csv')

In [13]:
len(qs)

429665

#### create 11k and 22k random sample files, corresponding to 2.5% and 5% of the question file

In [15]:
qs22k=qs.sample(n=22000,random_state=2019)

In [16]:
len(qs22k)

22000

In [17]:
qs11k=qs.sample(n=11000,random_state=2019)

In [18]:
len(qs11k)

11000

In [72]:
qs22k.to_csv('data/stackoverflow/new_qs_22k.csv', index=False)

In [73]:
qs11k.to_csv('data/stackoverflow/new_qs_11k.csv', index=False)

#### create combined question + answers files to use in creating the models

#### combined files will be created for the full question and answer files and for the 11k and 22k sample files

#### note that all answers for each question are included, not just accpeted answers.  The corresponding question is attached 
#### to each answer for questions with multiple answers, so questions will appear to repeat in the combined file in some cases

#### for the model using just accepted answers, the data is filtered when the model is built below

In [21]:
# modified to include all answers (not just accept answers) and to specify the answers file as the 'left' file
combined = pd.merge(answers, qs, how='inner', left_on = 'parent_id', right_on = 'id')

In [22]:
len(combined)

542216

In [74]:
# write the full combined file
combined.to_csv('data/stackoverflow/combined_all_ans.csv', index=False)

In [24]:
combined.head()

Unnamed: 0,id_x,title_x,body_x,accepted_answer_id_x,answer_count_x,comment_count_x,community_owned_date_x,creation_date_x,favorite_count_x,last_activity_date_x,...,plyr,pie-chart,graphviz,spss,diagram,qlikview,altair,r,parsedtags,new_tags
0,15956654,,<p>Go to Edit --> Preferences --> Python and s...,,,0,,2013-04-11 18:56:38.187000+00:00,,2013-04-11 18:56:38.187000+00:00,...,0,0,0,0,0,0,0,0,python matplotlib plot line,python matplotlib plot line
1,15972122,,<p>The preference option only changes the <a h...,,,0,,2013-04-12 13:02:08.130000+00:00,,2014-03-31 18:15:58.977000+00:00,...,0,0,0,0,0,0,0,0,python matplotlib plot line,python matplotlib plot line
2,15957626,,"<p>Like @Baptiste said, just use facetting. An...",,,0,,2013-04-11 19:51:27.630000+00:00,,2013-04-11 19:51:27.630000+00:00,...,0,0,0,0,0,0,0,0,ggplot2 plot,ggplot2 plot
3,15958480,,<p>The answers already given would be the 'bes...,,,1,,2013-04-11 20:41:23.260000+00:00,,2013-04-11 20:41:23.260000+00:00,...,0,0,0,0,0,0,0,0,ggplot2 plot,ggplot2 plot
4,15957907,,<p>Without spending a lot of time digging thro...,,,0,,2013-04-11 20:07:16.230000+00:00,,2013-04-11 20:07:16.230000+00:00,...,0,0,1,0,0,0,0,0,graph drawing graphviz,graph drawing graphviz


In [66]:
combined11k = pd.merge(answers, qs11k, how='inner', left_on = 'parent_id', right_on = 'id')

In [67]:
len(combined11k)
# note that the length is > 11K due to some questions having multiple answers

13938

In [75]:
combined11k.to_csv('data/stackoverflow/combined_all_ans_11k.csv', index=False)

In [69]:
combined22k = pd.merge(answers, qs22k, how='inner', left_on = 'parent_id', right_on = 'id')

In [70]:
len(combined22k)
# lenght is > 22k due to some questions having multiple answers

28038

In [76]:
combined22k.to_csv('data/stackoverflow/combined_all_ans_22k.csv', index=False)

####  create model files using the combined file as input in order to get answer data too

In [3]:
combined11k=pd.read_csv('data/stackoverflow/combined_all_ans_11k.csv', keep_default_na=False, encoding='utf-8')
len(combined11k)

13938

In [4]:
# add a column for the number of images with this answer
# note that answers with no images have a num_of_images value of 2, representing the "[]" in the images_list
# for the emply image list.  So, answers with number_of_images > 2 have a image with them
combined11k['number_of_images'] = combined11k['images_list'].map(lambda x: len(x))

In [5]:
combined11k['number_of_images']

0         85
1          2
2          2
3          2
4          2
5        179
6          2
7          2
8          2
9          2
10         2
11         2
12         2
13         2
14         2
15         2
16         2
17         2
18         2
19        85
20         2
21         2
22         2
23         2
24         2
25         2
26         2
27         2
28         2
29         2
        ... 
13908      2
13909      2
13910      2
13911      2
13912      2
13913      2
13914      2
13915      2
13916      2
13917      2
13918      2
13919      2
13920      2
13921      2
13922      2
13923      2
13924      2
13925      2
13926     67
13927      2
13928      2
13929     87
13930      2
13931      2
13932      2
13933     85
13934      2
13935      2
13936    170
13937      2
Name: number_of_images, Length: 13938, dtype: int64

In [6]:
combined11k['images_list']

0        [<img alt="enter image description here" src="...
1                                                       []
2                                                       []
3                                                       []
4                                                       []
5        [<img alt="Output when using GPC. " src="https...
6                                                       []
7                                                       []
8                                                       []
9                                                       []
10                                                      []
11                                                      []
12                                                      []
13                                                      []
14                                                      []
15                                                      []
16                                                      

#### Create the various model files

In [7]:
# create 11k model using answer body + question tags

sttime=time.time()
raw_documents = combined11k['cleaned_body'] + ' ' + combined11k['new_tags']

print("Number of Combined Docs:",len(raw_documents))

    # Tokenizing data
gen_docs = [[w.lower() for w in word_tokenize(text)] 
                for text in raw_documents]

# gen_docs = [word_tokenize(text) 
#                for text in raw_documents]


    # Create dictionary
dictionary = gensim.corpora.Dictionary(gen_docs)

    # Creat Document-Term Matrix
corpus = [dictionary.doc2bow(gen_doc) for gen_doc in gen_docs]

    # Creat TF-IDF Model
tf_idf = gensim.models.TfidfModel(corpus)

    # Creat Similarity Checker
similar_docs = gensim.similarities.Similarity("",tf_idf[corpus],num_features=len(dictionary))

print("11k Answer + Tags Model Processing Completed! Elapsed time:", time.time()-sttime, "seconds")


Number of Combined Docs: 13938
11k Answer + Tags Model Processing Completed! Elapsed time: 36.976619482040405 seconds


In [8]:
with open('data/stackoverflow/tf_idf_model_11k.p', 'wb') as model_file:
    pickle.dump(tf_idf, model_file)

In [9]:
with open('data/stackoverflow/similar_qs_11k.p', 'wb') as similar_docs_file:
    pickle.dump(similar_docs, similar_docs_file)

In [10]:
# create 11k model using answer body + question title + question tags

sttime=time.time()
raw_documents = combined11k['cleaned_body'] + ' ' + combined11k['title_y'] + ' ' + combined11k['new_tags']

print("Number of Combined Docs:",len(raw_documents))

    # Tokenizing data
gen_docs = [[w.lower() for w in word_tokenize(text)] 
                for text in raw_documents]

    # Create dictionary
dictionary = gensim.corpora.Dictionary(gen_docs)

    # Creat Document-Term Matrix
corpus = [dictionary.doc2bow(gen_doc) for gen_doc in gen_docs]

    # Creat TF-IDF Model
tf_idf = gensim.models.TfidfModel(corpus)

    # Creat Similarity Checker
similar_docs = gensim.similarities.Similarity("",tf_idf[corpus],num_features=len(dictionary))

print("11k Answer + Question Title + Tags Model Processing Completed! Elapsed time:", time.time()-sttime, "seconds")


Number of Combined Docs: 13938
11k Answer + Question Title + Tags Model Processing Completed! Elapsed time: 39.27407360076904 seconds


In [11]:
with open('data/stackoverflow/tf_idf_model_11k_ans_ques_title_tags.p', 'wb') as model_file:
    pickle.dump(tf_idf, model_file)

In [12]:
with open('data/stackoverflow/similar_qs_11k_ans_ques_title_tags.p', 'wb') as similar_docs_file:
    pickle.dump(similar_docs, similar_docs_file)

In [13]:
# create 11k model using answer body + question tags using only answers that have images with them

sttime=time.time()

combined11kimg=combined11k[combined11k['number_of_images'] > 2] # keep only answers with images

raw_documents = combined11kimg['cleaned_body'] + ' ' + combined11kimg['new_tags']

print("Number of Combined Docs:",len(raw_documents))

    # Tokenizing data
gen_docs = [[w.lower() for w in word_tokenize(text)] 
                for text in raw_documents]

    # Create dictionary
dictionary = gensim.corpora.Dictionary(gen_docs)

    # Creat Document-Term Matrix
corpus = [dictionary.doc2bow(gen_doc) for gen_doc in gen_docs]

    # Creat TF-IDF Model
tf_idf = gensim.models.TfidfModel(corpus)

    # Creat Similarity Checker
similar_docs = gensim.similarities.Similarity("",tf_idf[corpus],num_features=len(dictionary))

print("11k Answer + Tags, Only Answers with Images Model Processing Completed! Elapsed time:", time.time()-sttime, "seconds")


Number of Combined Docs: 1697
11k Answer + Tags, Only Answers with Images Model Processing Completed! Elapsed time: 5.332214593887329 seconds


In [14]:
with open('data/stackoverflow/tf_idf_model_11k_ans_with_imgs.p', 'wb') as model_file:
    pickle.dump(tf_idf, model_file)

In [15]:
with open('data/stackoverflow/similar_qs_11k_ans_with_imgs.p', 'wb') as similar_docs_file:
    pickle.dump(similar_docs, similar_docs_file)

In [16]:
# create 11k model using question title + question tags

sttime=time.time()

combined11kacc = combined11k[combined11k['id_x'] == pd.to_numeric(combined11k['accepted_answer_id_y'],downcast='integer')] # only include question with accepted answers
raw_documents = combined11kacc['title_y'] + ' ' + combined11kacc['new_tags']

print("Number of Combined Docs:",len(raw_documents))

    #Tokenizing data
gen_docs = [[w.lower() for w in word_tokenize(text)] 
                for text in raw_documents]

    # Create dictionary
dictionary = gensim.corpora.Dictionary(gen_docs)

    # Creat Document-Term Matrix
corpus = [dictionary.doc2bow(gen_doc) for gen_doc in gen_docs]

    # Creat TF-IDF Model
tf_idf = gensim.models.TfidfModel(corpus)

    # Creat Similarity Checker
similar_docs = gensim.similarities.Similarity("",tf_idf[corpus],num_features=len(dictionary))

print("11k Questions with Accepted Answers + Tags Model Processing Completed! Elapsed time:", time.time()-sttime, "seconds")


Number of Combined Docs: 5628
11k Questions with Accepted Answers + Tags Model Processing Completed! Elapsed time: 4.90790581703186 seconds


In [17]:
with open('data/stackoverflow/tf_idf_model_11k_ques.p', 'wb') as model_file:
    pickle.dump(tf_idf, model_file)

In [18]:
with open('data/stackoverflow/similar_qs_11k_ques.p', 'wb') as similar_docs_file:
    pickle.dump(similar_docs, similar_docs_file)

#### Function to retrieve questions and answers with similarity scores above a user-define threshold based on pre-loaded model given a list of input queries


In [19]:
def similar_docs_combined_corpus(query_list,corpus,test_run,threshold,top_num_to_return):
    results = pd.DataFrame()
    for input_query in query_list:
        query_doc = [w.lower() for w in word_tokenize(input_query)]
        query_doc_bow = dictionary.doc2bow(query_doc)
        query_doc_tf_idf = tf_idf[query_doc_bow]
        doc_sim=similar_docs[query_doc_tf_idf]
        sim_threshold=threshold
        # Display similar questions from the past:

        corpus['Similarity']=doc_sim
        cmbdocs=corpus.sort_values('Similarity',ascending=False)
        # combdocs=cmbdocs.loc[:,:][cmbdocs['Similarity']>=sim_threshold]
        combdocs=cmbdocs[cmbdocs['Similarity']>=sim_threshold]
        if len(combdocs['cleaned_body']) < top_num_to_return:
            rslts_len=len(combdocs['cleaned_body'])
        else:
            rslts_len = top_num_to_return
        if rslts_len == 0:
            result = pd.DataFrame()
            result = result.append({'Corpus_Size':len(corpus), \
                                   'Test_Run':test_run, \
                                   'Input_query':input_query, \
                                   'Answer':' ', \
                                   'Related_Question':' ', \
                                   'Similarity_Score':' '}, ignore_index=True)
        else:   
            result = pd.DataFrame({'Corpus_Size':[len(corpus) for x in range(rslts_len)], \
                                   'Test_Run':[test_run for x in range(rslts_len)], \
                                   'Input_query':[input_query for x in range(rslts_len)], \
                                   'Answer':combdocs['cleaned_body'][0:rslts_len].tolist(), \
                                   'Related_Question':combdocs['title_y'][0:rslts_len].tolist(), \
                                   'Similarity_Score':combdocs['Similarity'][0:rslts_len]})
        results=results.append(result,ignore_index=True)
    return results

#### Load each model files and get the list of similar questions and answers.  Add the similar questions and answers to a combined result dataframe

In [20]:
# create combined result data frame to hold the results from all tests below

combrslts=pd.DataFrame()

In [21]:
# get the list of queries to run through each model

samp_ques = pd.read_csv('data/stackoverflow/Sample Questions V 2.csv', header=None,names=['ques'],encoding='utf-8')

In [22]:
Query_List=[x for x in samp_ques['ques']]

In [23]:
Query_List

['Is there a way to visualize the distribution of my data?',
 'How do I show data on a map?',
 'How can I illustrate changes in my data over time?',
 'Is there a way to show a "heatmap" of my data?',
 'How can I plot a comparison of two data sets?',
 'How can I create a chart without coding?',
 'When should I use a bar chart versus a pie chart?',
 'What is the easiest way to create a diagram of a network?',
 'I need help creating a visualization of my data',
 'I need help creating a graph of my data',
 'When should I use a scatter plot?',
 'How do I plot 2 datasets in d3?',
 'How can I animate a bar chart in Python?',
 'I know how to create a line chart with matplotlib, how do I do it in R?',
 'What is the easiest way to create a heat map of the US?',
 'How can I animate a choropleth in Tableau?',
 'How can I animate a choropleth in PowerBI?',
 'How can I animate a choropleth in d3?']

In [24]:
# load the question title + tag models

with open('data/stackoverflow/tf_idf_model_11k_ques.p', 'rb') as model_file:
    tf_idf = pickle.load(model_file)

In [25]:
with open('data/stackoverflow/similar_qs_11k_ques.p', 'rb') as similar_qs_file:
    similar_docs = pickle.load(similar_qs_file)

In [28]:
test_run='TF-IDF on questions plus question tags'

results = similar_docs_combined_corpus(Query_List,combined11kacc,test_run,0.50,2)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  # This is added back by InteractiveShellApp.init_path()
of pandas will change to not sort by default.

To accept the future behavior, pass 'sort=True'.


  sort=sort)


In [29]:
results

Unnamed: 0,Answer,Corpus_Size,Input_query,Related_Question,Similarity_Score,Test_Run
0,,5628.0,Is there a way to visualize the distribution o...,,,TF-IDF on questions plus question tags
1,,5628.0,How do I show data on a map?,,,TF-IDF on questions plus question tags
2,You can change the labels on the x-axis using...,5628.0,How can I illustrate changes in my data over t...,plotting changes over time in python/matplotlib,0.567781,TF-IDF on questions plus question tags
3,,5628.0,"Is there a way to show a ""heatmap"" of my data?",,,TF-IDF on questions plus question tags
4,,5628.0,How can I plot a comparison of two data sets?,,,TF-IDF on questions plus question tags
5,Sure. Just start from an empty plot and then...,5628.0,How can I create a chart without coding?,How can I create a legend without a plot in R?,0.74244,TF-IDF on questions plus question tags
6,,5628.0,When should I use a bar chart versus a pie chart?,,,TF-IDF on questions plus question tags
7,,5628.0,What is the easiest way to create a diagram of...,,,TF-IDF on questions plus question tags
8,From Google it would appear that this approac...,5628.0,I need help creating a visualization of my data,Need help creating a highchart histogram in a ...,0.509949,TF-IDF on questions plus question tags
9,From Google it would appear that this approac...,5628.0,I need help creating a graph of my data,Need help creating a highchart histogram in a ...,0.528085,TF-IDF on questions plus question tags


In [30]:
# add results to combined results dataframe
combrslts=combrslts.append(results,ignore_index=True)

In [31]:
# load the answer body + question tags model

with open('data/stackoverflow/tf_idf_model_11k.p', 'rb') as model_file:
    tf_idf = pickle.load(model_file)

In [32]:
with open('data/stackoverflow/similar_qs_11k.p', 'rb') as similar_qs_file:
    similar_docs = pickle.load(similar_qs_file)

In [33]:
test_run='TF-IDF on answers plus question tags'

results = similar_docs_combined_corpus(Query_List,combined11k,test_run,0.50,2)

In [34]:
results

Unnamed: 0,Answer,Corpus_Size,Input_query,Related_Question,Similarity_Score,Test_Run
0,,13938.0,Is there a way to visualize the distribution o...,,,TF-IDF on answers plus question tags
1,,13938.0,How do I show data on a map?,,,TF-IDF on answers plus question tags
2,,13938.0,How can I illustrate changes in my data over t...,,,TF-IDF on answers plus question tags
3,,13938.0,"Is there a way to show a ""heatmap"" of my data?",,,TF-IDF on answers plus question tags
4,,13938.0,How can I plot a comparison of two data sets?,,,TF-IDF on answers plus question tags
5,,13938.0,How can I create a chart without coding?,,,TF-IDF on answers plus question tags
6,,13938.0,When should I use a bar chart versus a pie chart?,,,TF-IDF on answers plus question tags
7,,13938.0,What is the easiest way to create a diagram of...,,,TF-IDF on answers plus question tags
8,,13938.0,I need help creating a visualization of my data,,,TF-IDF on answers plus question tags
9,,13938.0,I need help creating a graph of my data,,,TF-IDF on answers plus question tags


In [35]:
# add results to combined results dataframe
combrslts=combrslts.append(results,ignore_index=True)

In [36]:
# load the answer body + question title + question tags model

with open('data/stackoverflow/tf_idf_model_11k_ans_ques_title_tags.p', 'rb') as model_file:
    tf_idf = pickle.load(model_file)

In [37]:
with open('data/stackoverflow/similar_qs_11k_ans_ques_title_tags.p', 'rb') as similar_qs_file:
    similar_docs = pickle.load(similar_qs_file)

In [38]:
test_run='TF-IDF on answers plus question titles + question tags'

results = similar_docs_combined_corpus(Query_List,combined11k,test_run,0.50,2)

In [39]:
results

Unnamed: 0,Answer,Corpus_Size,Input_query,Related_Question,Similarity_Score,Test_Run
0,,13938.0,Is there a way to visualize the distribution o...,,,TF-IDF on answers plus question titles + quest...
1,,13938.0,How do I show data on a map?,,,TF-IDF on answers plus question titles + quest...
2,,13938.0,How can I illustrate changes in my data over t...,,,TF-IDF on answers plus question titles + quest...
3,,13938.0,"Is there a way to show a ""heatmap"" of my data?",,,TF-IDF on answers plus question titles + quest...
4,,13938.0,How can I plot a comparison of two data sets?,,,TF-IDF on answers plus question titles + quest...
5,,13938.0,How can I create a chart without coding?,,,TF-IDF on answers plus question titles + quest...
6,,13938.0,When should I use a bar chart versus a pie chart?,,,TF-IDF on answers plus question titles + quest...
7,,13938.0,What is the easiest way to create a diagram of...,,,TF-IDF on answers plus question titles + quest...
8,,13938.0,I need help creating a visualization of my data,,,TF-IDF on answers plus question titles + quest...
9,,13938.0,I need help creating a graph of my data,,,TF-IDF on answers plus question titles + quest...


In [40]:
# add results to combined results dataframe
combrslts=combrslts.append(results,ignore_index=True)

In [41]:
# load the answer body + question tags, only answers with images model

with open('data/stackoverflow/tf_idf_model_11k_ans_with_imgs.p', 'rb') as model_file:
    tf_idf = pickle.load(model_file)

In [42]:
with open('data/stackoverflow/similar_qs_11k_ans_with_imgs.p', 'rb') as similar_qs_file:
    similar_docs = pickle.load(similar_qs_file)

In [43]:
test_run='TF-IDF on answers plus question tags, only answers with images'

results = similar_docs_combined_corpus(Query_List,combined11kimg,test_run,0.50,2)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  # This is added back by InteractiveShellApp.init_path()
of pandas will change to not sort by default.

To accept the future behavior, pass 'sort=True'.


  sort=sort)


In [44]:
results

Unnamed: 0,Answer,Corpus_Size,Input_query,Related_Question,Similarity_Score,Test_Run
0,,1697.0,Is there a way to visualize the distribution o...,,,"TF-IDF on answers plus question tags, only ans..."
1,,1697.0,How do I show data on a map?,,,"TF-IDF on answers plus question tags, only ans..."
2,,1697.0,How can I illustrate changes in my data over t...,,,"TF-IDF on answers plus question tags, only ans..."
3,,1697.0,"Is there a way to show a ""heatmap"" of my data?",,,"TF-IDF on answers plus question tags, only ans..."
4,,1697.0,How can I plot a comparison of two data sets?,,,"TF-IDF on answers plus question tags, only ans..."
5,,1697.0,How can I create a chart without coding?,,,"TF-IDF on answers plus question tags, only ans..."
6,,1697.0,When should I use a bar chart versus a pie chart?,,,"TF-IDF on answers plus question tags, only ans..."
7,,1697.0,What is the easiest way to create a diagram of...,,,"TF-IDF on answers plus question tags, only ans..."
8,,1697.0,I need help creating a visualization of my data,,,"TF-IDF on answers plus question tags, only ans..."
9,,1697.0,I need help creating a graph of my data,,,"TF-IDF on answers plus question tags, only ans..."


In [45]:
# add results to combined results dataframe
combrslts=combrslts.append(results,ignore_index=True)

In [46]:
combrslts

Unnamed: 0,Answer,Corpus_Size,Input_query,Related_Question,Similarity_Score,Test_Run
0,,5628.0,Is there a way to visualize the distribution o...,,,TF-IDF on questions plus question tags
1,,5628.0,How do I show data on a map?,,,TF-IDF on questions plus question tags
2,You can change the labels on the x-axis using...,5628.0,How can I illustrate changes in my data over t...,plotting changes over time in python/matplotlib,0.567781,TF-IDF on questions plus question tags
3,,5628.0,"Is there a way to show a ""heatmap"" of my data?",,,TF-IDF on questions plus question tags
4,,5628.0,How can I plot a comparison of two data sets?,,,TF-IDF on questions plus question tags
5,Sure. Just start from an empty plot and then...,5628.0,How can I create a chart without coding?,How can I create a legend without a plot in R?,0.74244,TF-IDF on questions plus question tags
6,,5628.0,When should I use a bar chart versus a pie chart?,,,TF-IDF on questions plus question tags
7,,5628.0,What is the easiest way to create a diagram of...,,,TF-IDF on questions plus question tags
8,From Google it would appear that this approac...,5628.0,I need help creating a visualization of my data,Need help creating a highchart histogram in a ...,0.509949,TF-IDF on questions plus question tags
9,From Google it would appear that this approac...,5628.0,I need help creating a graph of my data,Need help creating a highchart histogram in a ...,0.528085,TF-IDF on questions plus question tags


In [47]:
# write the combined results file to a csv
combrslts.to_csv('data/stackoverflow/combined_test_run_results_11k.csv', index=False)