In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
%matplotlib inline
import nltk
from nltk.tokenize import word_tokenize
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer
import itertools
import pickle
from sklearn.feature_extraction.text import TfidfVectorizer, CountVectorizer
import re

In [None]:
questions_lemmatized = pd.read_csv('LemmatizedQuestions.csv', encoding="ISO-8859-1")

In [None]:
questions_lemmatized.head()

In [None]:
pd.set_option('display.max_colwidth', -1)

### Methods

Transforming text to vectors:
- Use tfidf
- Use word2vec:
    - Train non-code on 50D
    - Train code on 50D
    - Concatenate vectors



After getting vector representation, use similarity metrics to find similar questions. Also hopefully use some clustering method is get cluster features for the feature engineering part.

Similarity metrics:
- top 10 most cosine similar

Clustering method:
- Maybe use DBscan
- PCA or t-SNE
- *The clustering method could possible indicate interesting sub-question types, e.g. one cluster is for non-code questions, another is for debugging, another is for conceptual.*

### Vector Representations

tf-idf

In [16]:
from sklearn.feature_extraction.text import CountVectorizer, TfidfTransformer
from sklearn.metrics.pairwise import cosine_similarity

In [None]:
questions_lemmatized["TagFreeNonCodeTextLemmatized"][0]

In [None]:
# Non-code text
TagFreeNonCodeTextLemmatized = questions_lemmatized.TagFreeNonCodeTextLemmatized.tolist()
TagFreeNonCodeTextLemmatized = [[w[1:-1] for w in q[1:-1].split(", ")] for q in TagFreeNonCodeTextLemmatized]

In [None]:
# Code text
CodeTextLemmatized = questions_lemmatized.CodeTextLemmatized.tolist()
CodeTextLemmatized = [[w[1:-1] for w in q[1:-1].split(", ")] for q in CodeTextLemmatized]

In [None]:
# Combined non-code and code
AllTextLemmatized = []
for i in range(len(TagFreeNonCodeTextLemmatized)):
    noncode = TagFreeNonCodeTextLemmatized[i][:]
    for w in CodeTextLemmatized[i]:
        if w != '':
            noncode.append(w)
    AllTextLemmatized.append(" ".join(noncode))

In [None]:
AllTextLemmatized[0]

In [None]:
# tfidf
tfidf_vectorizer = TfidfVectorizer(stop_words='english', max_features=10000) # limit vocabulary size to 10,000
tfidf_question = tfidf_vectorizer.fit_transform(AllTextLemmatized)

In [None]:
tfidf_question.shape

In [None]:
# compute cosine similarities of questions
tfidf_question_cosine_similarities = [] # list of indices of top 10 most cosine similar
for i in range(len(questions_lemmatized.index)):
    similarity_indices = cosine_similarity(tfidf_question[i], tfidf_question).flatten()
    tfidf_question_cosine_similarities.append(similarity_indices.argsort()[:-11:-1])

In [None]:
tfidf_question_cosine_similarities[0]

In [None]:
# save to df
tfidf_df = pd.DataFrame((_ for _ in itertools.zip_longest(tfidf_question_cosine_similarities)), columns=['indices'])
tfidf_df.to_pickle("TfIdfSimilarity")

**word2vec for non-code**

In [None]:
import gensim
import nltk
from nltk import word_tokenize
from nltk import tokenize
from nltk.data import find

In [None]:
# word2vec model for non-code text
noncode_model = gensim.models.Word2Vec(TagFreeNonCodeTextLemmatized, min_count=10, size=50, window=5)

In [None]:
len(noncode_model.wv.vocab)

In [None]:
noncode_unique_words = {}
for q in TagFreeNonCodeTextLemmatized:
    for w in q:
        if not noncode_unique_words.get(w):
            noncode_unique_words[w] = 0
        noncode_unique_words[w] += 1

In [None]:
words = list(noncode_unique_words.keys())
len(words)

*Get word vectors*

In [None]:
# Word vectors
noncode_vector_list=[] ## n by d matrix containing words and their respective vectors
for word, cnt in noncode_unique_words.items():
    if cnt >= 10:
        noncode_vector_list.append(noncode_model[word])

In [None]:
len(noncode_vector_list)

In [None]:
# save model
noncode_model.save("noncode_word2vec")

In [None]:
# load model
noncode_model = gensim.models.Word2Vec.load("noncode_word2vec")

*Sum up word vectors*

In [None]:
noncode_q_embedding = []
for q in TagFreeNonCodeTextLemmatized:
    q_embedding = np.zeros(50)
    for word in q:
        if noncode_unique_words[word] > 10:
            q_embedding += noncode_model[word]
    noncode_q_embedding.append(q_embedding)

In [None]:
len(noncode_q_embedding)

In [None]:
noncode_q_embedding[0]

In [None]:
noncode_embedding_df = pd.DataFrame((_ for _ in itertools.zip_longest(noncode_q_embedding)), columns=['embedding'])

In [None]:
noncode_embedding_df.head()

In [None]:
# save df
noncode_embedding_df.to_pickle("NoncodeTextWordEmbeddings")

**word2vec for code text**

In [None]:
# word2vec model for code text
code_model = gensim.models.Word2Vec(CodeTextLemmatized, min_count=10, size=50, window=5)

In [None]:
len(code_model.wv.vocab)

In [None]:
# get set of unique words and counts
code_unique_words = {}
for q in CodeTextLemmatized:
    for w in q:
        if not code_unique_words.get(w):
            code_unique_words[w] = 0
        code_unique_words[w] += 1

In [None]:
# save model
code_model.save("code_word2vec")

In [None]:
# load model
code_model = gensim.models.Word2Vec.load("code_word2vec")

In [None]:
# sum up word vectors
code_q_embedding = []
for q in CodeTextLemmatized:
    q_embedding = np.zeros(50)
    for word in q:
        if code_unique_words[word] > 10:
            q_embedding += code_model[word]
    code_q_embedding.append(q_embedding)

In [None]:
len(code_q_embedding)

In [None]:
code_embedding_df = pd.DataFrame((_ for _ in itertools.zip_longest(code_q_embedding)), columns=['embedding'])

In [None]:
code_embedding_df.head()

In [None]:
# save df
code_embedding_df.to_pickle("CodeTextWordEmbeddings")

**Get top 10 most cosine similar questions**

*Concatenate code and non-code embeddings*

In [9]:
code_embedding_df = pd.read_pickle('CodeTextWordEmbeddings')
noncode_embedding_df = pd.read_pickle("NoncodeTextWordEmbeddings")

In [13]:
combined_embedding = []
for i in range(noncode_embedding_df.shape[0]):
    q_embedding = np.append(noncode_embedding_df.iloc[i,:], code_embedding_df.iloc[i,:])
    combined_embedding.append(q_embedding)

*Get top 10 most cosine similar word embeddings*

In [24]:
combined_embedding[0]

array([array([-2.33485798e+02,  1.34262846e+02, -1.61999403e+02,  2.75851354e+02,
        3.92991432e+01,  1.65180587e+01,  8.67004336e+01, -2.02523277e+02,
        6.78854387e+01,  2.48089458e+01, -1.48255976e+02,  2.34974266e+01,
        1.19403369e+02, -2.44532929e+02, -2.74176350e+02,  1.36803860e+02,
        2.13601728e+02,  2.29295494e+02, -1.56278791e+02,  2.11458459e-01,
        6.56653517e+01, -3.22754556e+02,  2.31872174e+02, -3.21034162e+02,
        1.81830842e-01, -2.67967392e+02, -6.48300108e+01, -6.94139397e+01,
       -1.22944525e+02, -1.67904527e+02, -1.01985767e+02,  7.37741048e+01,
       -6.98878253e+01, -1.52692972e+02,  1.63893139e+02,  5.44709623e+01,
        2.20797490e+02, -1.64560005e+02,  1.35766674e+01, -1.22989319e+02,
       -6.49815582e+01, -4.90088749e+01,  2.48694221e+02,  9.59186933e+01,
        2.46383301e+02,  2.01767801e+02, -3.64020447e+01, -7.09914588e+01,
       -1.31085836e+02, -9.55905345e+01]),
       array([ 0.00583147,  0.00644809, -0.0066126

In [26]:
# compute cosine similarities of questions
# tfidf_question_cosine_similarities = [] # list of indices of top 10 most cosine similar
# for i in range(len(questions_lemmatized.index)):
#     similarity_indices = cosine_similarity(tfidf_question[i], tfidf_question).flatten()
#     tfidf_question_cosine_similarities.append(similarity_indices.argsort()[:-11:-1])


# list of indices of top 10 most cosine similar for each question
word2vec_question_cosine_similarities = []
for i in range(len(combined_embedding)):
    similarity_indices = cosine_similarity(combined_embedding[i], combined_embedding).flatten()
    word2vec_question_cosine_similarities.append(similarity_indices[i].argsort()[:-11:-1])

ValueError: setting an array element with a sequence.

In [18]:
word2vec_question_cosine_similarities

[]

In [None]:
# save to df
word2vec_similarity_df = pd.DataFrame((_ for _ in itertools.zip_longest(word2vec_cosine_similarities)), columns=['indices'])
word2vec_similarity_df.to_pickle("word2vecSimilarity")

In [None]:
word2vec_similarity_df