In [1]:
import ast
import itertools
import pickle
import re

import matplotlib.pyplot as plt
import nltk
import numpy as np
import pandas as pd
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer
from nltk.tokenize import word_tokenize
from sklearn.feature_extraction.text import TfidfVectorizer, CountVectorizer

%matplotlib inline

In [2]:
# Read the lemmatized question table, decoding the CSV as ISO-8859-1.
questions_lemmatized = pd.read_csv(
    'LemmatizedQuestions.csv',
    encoding="ISO-8859-1",
)

In [3]:
questions_lemmatized.head()

Unnamed: 0,QID,QuestionUserId,QuestionCreateDate,QuestionScore,QuestionTitle,QuestionBody,NumAnswers,QuestionTitleAndBody,CodeText,TagFreeNonCodeText,CodeTextLemmatized,TagFreeNonCodeTextLemmatized
0,469,147.0,2008-08-02T15:11:16Z,21,How can I find the full path to a font from it...,<p>I am using the Photoshop's javascript API t...,4.0,How can I find the full path to a font from it...,,How can I find the full path to a font from it...,[],"['how', 'can', 'i', 'find', 'the', 'full', 'pa..."
1,502,147.0,2008-08-02T17:01:58Z,27,Get a preview JPEG of a PDF on Windows?,<p>I have a cross-platform (Python) applicatio...,3.0,Get a preview JPEG of a PDF on Windows? <p>I h...,,Get a preview JPEG of a PDF on Windows? I have...,[],"['get', 'a', 'preview', 'jpeg', 'of', 'a', 'pd..."
2,535,154.0,2008-08-02T18:43:54Z,40,Continuous Integration System for a Python Cod...,<p>I'm starting work on a hobby project with a...,7.0,Continuous Integration System for a Python Cod...,,Continuous Integration System for a Python Cod...,[],"['continuous', 'integration', 'system', 'for',..."
3,594,116.0,2008-08-03T01:15:08Z,25,cx_Oracle: How do I iterate over a result set?,<p>There are several ways to iterate over a re...,3.0,cx_Oracle: How do I iterate over a result set?...,,cx_Oracle: How do I iterate over a result set?...,[],"['cx_oracle', 'how', 'do', 'i', 'iterate', 'ov..."
4,683,199.0,2008-08-03T13:19:16Z,28,Using 'in' to match an attribute of Python obj...,<p>I don't remember whether I was dreaming or ...,8.0,Using 'in' to match an attribute of Python obj...,"foo in iter_attr(array of python objects, attr...",Using 'in' to match an attribute of Python obj...,"['foo', 'in', 'iter_attrarray', 'of', 'python'...","['using', 'in', 'to', 'match', 'an', 'attribut..."


In [25]:
# Do not truncate long cell contents when displaying DataFrames.
# `None` means "no limit"; the older `-1` sentinel was deprecated in pandas 1.0
# and is rejected by newer releases.
pd.set_option('display.max_colwidth', None)

### Methods

Transforming text to vectors:
- Use tfidf
- Use word2vec:
    - Train non-code on 50D
    - Train code on 50D
    - Concatenate vectors



After getting the vector representation, use similarity metrics to find similar questions. Also, hopefully use some clustering method to get cluster features for the feature-engineering part.

Similarity metrics:
- top 10 most cosine similar

Clustering method:
- Maybe use DBscan
- PCA or t-SNE
- *The clustering method could possibly indicate interesting sub-question types, e.g. one cluster is for non-code questions, another is for debugging, another is for conceptual.*

### Vector Representations

tf-idf

In [97]:
from sklearn.feature_extraction.text import CountVectorizer, TfidfTransformer
from sklearn.metrics.pairwise import cosine_similarity

In [37]:
questions_lemmatized["TagFreeNonCodeTextLemmatized"][0]

"['how', 'can', 'i', 'find', 'the', 'full', 'path', 'to', 'a', 'font', 'from', 'it', 'display', 'name', 'on', 'a', 'mac', 'i', 'am', 'using', 'the', 'photoshops', 'javascript', 'api', 'to', 'find', 'the', 'font', 'in', 'a', 'given', 'psd', 'given', 'a', 'font', 'name', 'returned', 'by', 'the', 'api', 'i', 'want', 'to', 'find', 'the', 'actual', 'physical', 'font', 'file', 'that', 'that', 'font', 'name', 'corresponds', 'to', 'on', 'the', 'disc', 'this', 'is', 'all', 'happening', 'in', 'a', 'python', 'program', 'running', 'on', 'osx', 'so', 'i', 'guess', 'im', 'looking', 'for', 'one', 'of', 'some', 'photoshop', 'javascript', 'a', 'python', 'function', 'an', 'osx', 'api', 'that', 'i', 'can', 'call', 'from', 'python']"

In [82]:
# Non-code text
# Each cell holds the repr of a Python list of tokens (e.g. "['how', 'can']").
# ast.literal_eval parses it back faithfully: "[]" becomes an empty list rather
# than [''], escaped characters are decoded, and tokens containing ", " are not
# split apart. Missing (non-string) cells are treated as empty token lists.
TagFreeNonCodeTextLemmatized = [
    ast.literal_eval(q) if isinstance(q, str) else []
    for q in questions_lemmatized.TagFreeNonCodeTextLemmatized.tolist()
]

In [44]:
# Code text
# Each cell holds the repr of a Python list of tokens; questions without code
# are stored as "[]". ast.literal_eval parses the repr faithfully, so "[]"
# becomes an empty list rather than [''], escaped characters are decoded, and
# tokens containing ", " are not split apart. Missing (non-string) cells are
# treated as empty token lists.
CodeTextLemmatized = [
    ast.literal_eval(q) if isinstance(q, str) else []
    for q in questions_lemmatized.CodeTextLemmatized.tolist()
]

In [86]:
# Combined non-code and code
# One space-joined string per question: the non-code tokens followed by the
# code tokens, with empty-string code tokens dropped.
AllTextLemmatized = [
    " ".join(noncode_tokens + [tok for tok in code_tokens if tok != ''])
    for noncode_tokens, code_tokens in zip(TagFreeNonCodeTextLemmatized, CodeTextLemmatized)
]

In [93]:
AllTextLemmatized[0]

'how can i find the full path to a font from it display name on a mac i am using the photoshops javascript api to find the font in a given psd given a font name returned by the api i want to find the actual physical font file that that font name corresponds to on the disc this is all happening in a python program running on osx so i guess im looking for one of some photoshop javascript a python function an osx api that i can call from python'

In [89]:
# tfidf
# Build the vectorizer first, then fit it on the combined question text and
# keep the resulting document-term matrix.
tfidf_vectorizer = TfidfVectorizer(
    max_features=10000,  # limit vocabulary size to 10,000
    stop_words='english',
)
tfidf_question = tfidf_vectorizer.fit_transform(AllTextLemmatized)

In [90]:
tfidf_question.shape

(8, 187)

In [None]:
# compute cosine similarities of questions
# Similarities must be computed on the tf-idf matrix, not on the raw strings.
# For each question, keep the indices of the TOP_K highest-scoring rows in
# descending order of similarity (the question itself is normally ranked first).
TOP_K = 10
tfidf_question_cosine_similarities = []  # list of indices of top 10 most cosine similar
for i in range(tfidf_question.shape[0]):
    # tfidf_question[i] is a 1 x n_features sparse row, as cosine_similarity requires 2-D input.
    similarities = cosine_similarity(tfidf_question[i], tfidf_question).flatten()
    tfidf_question_cosine_similarities.append(similarities.argsort()[:-(TOP_K + 1):-1])

In [None]:
tfidf_question_cosine_similarities[0]

In [None]:
# save to df
# Wrap every per-question result in a 1-tuple so each one lands in a single
# object cell of the 'indices' column, then pickle the frame.
tfidf_rows = [(top_indices,) for top_indices in tfidf_question_cosine_similarities]
tfidf_df = pd.DataFrame(tfidf_rows, columns=['indices'])
tfidf_df.to_pickle("TfIdfSimilarity")

**word2vec for non-code**

In [4]:
import gensim
import nltk
from nltk import word_tokenize
from nltk import tokenize
from nltk.data import find



In [5]:
# word2vec model for non-code text
# Trains 50-dimensional vectors; tokens seen fewer than 10 times are dropped.
# NOTE(review): `size` is the pre-4.0 gensim keyword (gensim 4 names it
# `vector_size`) — confirm against the installed gensim version.
noncode_model = gensim.models.Word2Vec(
    TagFreeNonCodeTextLemmatized,
    size=50,
    window=5,
    min_count=10,
)

In [6]:
len(noncode_model.wv.vocab)

173066

In [7]:
# Count how many times each token occurs across all non-code token lists.
noncode_unique_words = {}
for token in itertools.chain.from_iterable(TagFreeNonCodeTextLemmatized):
    noncode_unique_words[token] = noncode_unique_words.get(token, 0) + 1

In [8]:
# Materialise the distinct non-code tokens and show how many there are.
words = [*noncode_unique_words]
len(words)

4806582

*Get word vectors*

In [None]:
# Word vectors
# Collect the vector of every counted token the model actually kept. Membership
# in `noncode_model.wv` is the authoritative test (it follows whatever
# `min_count` the model was trained with), and vectors are read through `.wv`
# because indexing the model object directly is deprecated in gensim.
noncode_vector_list = [  ## n by d matrix containing words and their respective vectors
    noncode_model.wv[word]
    for word in noncode_unique_words
    if word in noncode_model.wv
]

In [None]:
len(noncode_vector_list)

In [9]:
# Persist the trained non-code Word2Vec model to the file "noncode_word2vec".
noncode_model.save("noncode_word2vec")

In [None]:
# load model
# `Word2Vec` is never imported on its own in this notebook; reach it through
# the `gensim` package, the same way the training cell does.
noncode_model = gensim.models.Word2Vec.load("noncode_word2vec")

*Sum up word vectors*

In [11]:
# One embedding per question: the sum of the vectors of its in-vocabulary tokens.
# Vocabulary membership replaces the former `count > 10` test, which skipped
# tokens seen exactly 10 times even though the model (min_count=10) has vectors
# for them. Vectors are read through `.wv`; indexing the model is deprecated.
noncode_dim = noncode_model.wv.vector_size  # embedding width, taken from the model itself
noncode_q_embedding = []
for q in TagFreeNonCodeTextLemmatized:
    q_embedding = np.zeros(noncode_dim)
    for word in q:
        if word in noncode_model.wv:
            q_embedding += noncode_model.wv[word]
    noncode_q_embedding.append(q_embedding)



In [None]:
len(noncode_q_embedding)

In [105]:
noncode_q_embedding[0]

array([ 104.7903592 ,  233.98276646, -187.92552194,  218.95368098,
        123.9839341 ,  210.74747539, -125.32897924,  -59.38568768,
        220.99190462,  165.27572969,  309.69482501,   70.60778694,
        -89.19382327,  171.32715937,  105.65731641,  197.60346127,
       -338.3882495 , -124.68667492,   93.64890997, -136.77202329,
          3.47088823, -238.19950062,   -8.11581397,   99.49042188,
        -99.58810435,  -27.32146006,   74.11278299,  -92.9729311 ,
       -134.64065651,  -29.21850541, -146.18743266,  103.05293993,
        112.29461885,   -9.56411882,  153.76655305,  128.6384503 ,
        210.67027523,   77.33066313, -174.92864283,  431.12630757,
       -124.75030518, -179.11365573,  -12.35419412, -166.64131038,
         92.66414439,  371.69936726,   17.02718088,  -12.97852471,
        -22.40040434,  -81.81657255])

In [107]:
# Single-column frame: each row's 'embedding' cell holds one question's vector.
noncode_embedding_df = pd.DataFrame(
    [(vec,) for vec in noncode_q_embedding],
    columns=['embedding'],
)

In [108]:
noncode_embedding_df.head()

Unnamed: 0,embedding
0,"[104.790359199, 233.982766459, -187.925521936, 218.953680977, 123.983934097, 210.747475386, -125.328979243, -59.3856876823, 220.991904616, 165.275729688, 309.694825012, 70.6077869423, -89.1938232686, 171.327159369, 105.657316407, 197.603461273, -338.388249498, -124.686674917, 93.6489099711, -136.772023287, 3.47088822583, -238.19950062, -8.11581396684, 99.4904218763, -99.5881043486, -27.3214600593, 74.1127829906, -92.9729310954, -134.640656512, -29.2185054142, -146.18743266, 103.052939931, 112.294618849, -9.5641188249, 153.766553048, 128.638450302, 210.670275226, 77.3306631297, -174.928642828, 431.126307566, -124.750305183, -179.113655731, -12.3541941196, -166.641310379, 92.6641443931, 371.699367259, 17.0271808766, -12.9785247147, -22.4004043401, -81.816572547]"
1,"[55.3639270877, 133.266654551, -86.4549356252, 111.066199854, 56.2065773308, 81.6096022394, -72.7127401382, 1.670049496, 136.604849696, 95.3309267713, 138.269164908, 36.2933272626, -67.986693738, 98.9401275218, 42.8894385993, 103.230953047, -174.11741139, -62.7397340573, 50.3225486279, -90.0264730081, 4.83511715708, -92.8561457992, -19.9641806073, 72.5665770173, -56.6354180798, -15.1344212517, 36.8711684793, -34.182656683, -69.5725146532, -40.1553565077, -48.4681739286, 66.2455350407, 74.6280928254, -16.4468314983, 72.8434932679, 62.0341959745, 97.4522367325, 35.6087533683, -87.9680896625, 173.25405623, -45.824375689, -102.198631518, -27.1955427527, -99.1857887208, 58.5275302026, 189.914759338, 20.0960416943, -0.826119460166, -20.63310498, -22.029436633]"
2,"[54.3329749852, 285.638077168, -152.193748194, 252.458389223, 135.816546923, 141.300634113, -175.035026833, -114.694900859, 303.978268006, 179.963486716, 325.111237824, 105.57181856, -71.8182222936, 193.591465889, 93.8829704337, 168.555061277, -347.770992223, -153.820742648, 66.3441991098, -142.69999681, 37.9139700402, -212.810971618, -43.178185062, 78.8426956832, -105.175491161, 13.1848163671, 96.8504572809, -30.3039049134, -201.770106144, -25.0391419306, -89.9095067829, 102.289408068, 170.157912962, -117.453764483, 177.206255246, 63.4345086552, 195.50388794, 93.9019377902, -207.720366579, 435.319405917, -111.861734318, -239.765813738, -3.39318139665, -174.95971073, 76.7553634811, 392.404411126, 20.3921948206, -42.5436140969, 21.7911405712, -75.4819956459]"
3,"[4.21130882017, 81.7820248604, -45.8087611794, 43.7592364363, 31.3608980179, 13.3545329273, -64.6297825575, -37.2147189528, 53.1147626042, 62.0836489797, 83.6681051254, 2.68432169635, -21.7842616625, 31.0639320556, 3.15348035097, 63.45482409, -71.2471692837, -7.54581606388, 18.4726314247, -36.0588860922, 23.5224828215, -77.0910546482, -11.2327012978, 12.6795762107, -37.4683555514, -7.52302828431, 52.3168457896, -23.5711653307, -43.2639132887, -11.5316375047, -34.3603463322, 0.451465785503, 25.1488592923, 0.524699300528, 40.1025175452, 43.7749269903, 72.2718237936, -3.71755111217, -73.9268929064, 148.108174264, -26.8722826391, -71.5127802566, -12.7309114337, 10.1754982471, 10.039438128, 87.3739898652, -0.976289086044, -1.06623494253, 17.3094673902, -14.2749106139]"
4,"[-7.63487214409, 119.273091219, -119.881945048, 92.4632230382, 59.3196130246, 56.0633011237, -53.1829489504, -62.8890879974, 138.21860069, 108.050570503, 172.774720511, 21.5010277443, -34.9738803543, 108.472790748, 90.9307340998, 63.5067607388, -173.108571775, -37.7602649368, 60.1650732234, -75.7864731476, 35.4305654073, -143.201133817, -4.63866125105, 29.6932117119, -68.0734032243, -24.0540987961, 101.841362187, -46.643446248, -99.5825744122, -24.724611789, -100.854048193, 13.8610689417, 54.4688177705, -1.80633024871, 74.715627145, 61.1358464062, 102.231919112, 40.1402136988, -88.2506073974, 244.023586661, -58.5301206335, -108.962415516, -10.9627735007, -57.6790872961, 0.268351402134, 158.562287971, -20.9483304489, -72.3811652362, -3.95506547391, -38.4660156369]"


In [109]:
# Pickle the non-code embedding DataFrame to the file "NoncodeTextWordEmbeddings".
noncode_embedding_df.to_pickle("NoncodeTextWordEmbeddings")

**word2vec for code text**

In [None]:
# word2vec model for code text
# Trains 50-dimensional vectors; tokens seen fewer than 10 times are dropped.
# NOTE(review): `size` is the pre-4.0 gensim keyword (gensim 4 names it
# `vector_size`) — confirm against the installed gensim version.
code_model = gensim.models.Word2Vec(
    CodeTextLemmatized,
    size=50,
    window=5,
    min_count=10,
)

In [None]:
len(code_model.wv.vocab)

In [None]:
# get set of unique words and counts
# Maps each code token to its number of occurrences across all questions.
code_unique_words = {}
for token in itertools.chain.from_iterable(CodeTextLemmatized):
    code_unique_words[token] = code_unique_words.get(token, 0) + 1

In [None]:
# Persist the trained code-text Word2Vec model to the file "code_word2vec".
code_model.save("code_word2vec")

In [None]:
# load model
# `Word2Vec` is never imported on its own in this notebook; reach it through
# the `gensim` package, the same way the training cell does.
code_model = gensim.models.Word2Vec.load("code_word2vec")

In [None]:
# sum up word vectors
# One embedding per question: the sum of the vectors of its in-vocabulary code
# tokens. Vocabulary membership replaces the former `count > 10` test, which
# skipped tokens seen exactly 10 times even though the model (min_count=10) has
# vectors for them. Vectors are read through `.wv`; indexing the model is deprecated.
code_dim = code_model.wv.vector_size  # embedding width, taken from the model itself
code_q_embedding = []
for q in CodeTextLemmatized:
    q_embedding = np.zeros(code_dim)
    for word in q:
        if word in code_model.wv:
            q_embedding += code_model.wv[word]
    code_q_embedding.append(q_embedding)

In [None]:
len(code_q_embedding)

In [None]:
# Single-column frame: each row's 'embedding' cell holds one question's code vector.
code_embedding_df = pd.DataFrame(
    [(vec,) for vec in code_q_embedding],
    columns=['embedding'],
)

In [106]:
code_embedding_df.head()

NameError: name 'code_embedding_df' is not defined

In [None]:
# Pickle the code-text embedding DataFrame to the file "CodeTextWordEmbeddings".
code_embedding_df.to_pickle("CodeTextWordEmbeddings")

**Get top 10 most cosine similar questions**

*Concatenate code and non-code embeddings*

In [None]:
# Join each question's non-code and code embeddings end to end.
# np.concatenate takes ONE sequence of arrays; passing the second array as a
# separate positional argument makes NumPy treat it as `axis` and raise.
combined_embedding = [
    np.concatenate((noncode_vec, code_vec))
    for noncode_vec, code_vec in zip(noncode_q_embedding, code_q_embedding)
]

In [None]:
len(combined_embedding)

In [None]:
combined_embedding[0]

*Get top 10 most cosine similar word embeddings*

In [98]:
# For every question, keep the indices of the TOP_K most cosine-similar
# questions in descending order of similarity (the question itself is normally
# ranked first).
# cosine_similarity requires 2-D inputs, so the embeddings are stacked once
# into an (n_questions, dim) matrix and each query is passed as a 1-row slice.
TOP_K = 10
embedding_matrix = np.vstack(combined_embedding)
word2vec_cosine_similarities = []
for i in range(embedding_matrix.shape[0]):
    similarities = cosine_similarity(embedding_matrix[i:i + 1], embedding_matrix).flatten()
    word2vec_cosine_similarities.append(similarities.argsort()[:-(TOP_K + 1):-1])



In [None]:
# save to df
# Wrap every per-question result in a 1-tuple so each one lands in a single
# object cell of the 'indices' column, then pickle the frame.
word2vec_rows = [(entry,) for entry in word2vec_cosine_similarities]
word2vec_similarity_df = pd.DataFrame(word2vec_rows, columns=['indices'])
word2vec_similarity_df.to_pickle("word2vecSimilarity")