# RUN THESE CELLS

In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
%matplotlib inline
import nltk
from nltk.tokenize import word_tokenize
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer
import itertools
import pickle
from sklearn.feature_extraction.text import TfidfVectorizer, CountVectorizer
import re
from sklearn.metrics.pairwise import cosine_similarity

In [2]:
# Load the per-question embedding frames produced by the word2vec cells further down.
# Those cells write their output with a ".pkl" suffix, so the file names read here
# must carry the same suffix or the load fails on a fresh run.
# NOTE(review): read_pickle executes pickled code - only load files you created yourself.
noncode_embedding_df = pd.read_pickle("NoncodeTextWordEmbeddings.pkl")
code_embedding_df = pd.read_pickle("CodeTextWordEmbeddings.pkl")

In [3]:
# Concatenate each question's non-code and code embedding into a single vector.
# The embedding column is selected by position with .iloc[:, 0]; indexing a
# label-indexed row Series with the integer 0 relies on a positional fallback
# that recent pandas deprecates.
noncode_vectors = noncode_embedding_df.iloc[:, 0]
code_vectors = code_embedding_df.iloc[:, 0]
if len(noncode_vectors) != len(code_vectors):
    # Row i of both frames must describe the same question.
    raise ValueError(
        "non-code and code embedding frames differ in length: "
        f"{len(noncode_vectors)} vs {len(code_vectors)}"
    )
combined_embedding = [
    np.append(noncode_vec, code_vec)
    for noncode_vec, code_vec in zip(noncode_vectors, code_vectors)
]

In [6]:
# Wrap every combined vector in a 1-tuple so that each vector becomes one cell of a
# single-column frame, then persist the frame.
combined_rows = [(vector,) for vector in combined_embedding]
combined_embedding_df = pd.DataFrame(combined_rows, columns=['indices'])
combined_embedding_df.to_pickle("CombinedWordEmbeddings.pkl")

In [7]:
# list of indices of top 10 most cosine similar for each question
# (a question typically appears in its own list, since its self-similarity is maximal)
word2vec_question_cosine_similarities = []
if len(combined_embedding) > 0:
    # Stack once into an (n_questions, dim) matrix: cosine_similarity only accepts
    # 2-D inputs, so a bare 1-D vector must not be passed as the query.
    embedding_matrix = np.vstack(combined_embedding)
    for i in range(embedding_matrix.shape[0]):
        # The i:i+1 slice keeps the query as a (1, dim) row; one row at a time keeps
        # memory bounded compared with a full n x n similarity matrix.
        similarity_scores = cosine_similarity(embedding_matrix[i:i + 1], embedding_matrix).flatten()
        # argsort is ascending, so the reversed last ten entries are the ten best matches.
        word2vec_question_cosine_similarities.append(similarity_scores.argsort()[:-11:-1])



In [10]:
# save to df
# Each top-10 index array is wrapped in a 1-tuple so it lands in a single cell
# of the one-column frame that gets pickled.
similarity_rows = [(top_indices,) for top_indices in word2vec_question_cosine_similarities]
word2vec_similarity_df = pd.DataFrame(similarity_rows, columns=['indices'])
word2vec_similarity_df.to_pickle("word2vecSimilarity.pkl")

# ONLY RUN CELLS ABOVE

In [None]:
# Load the lemmatized questions table; the CSV is decoded as ISO-8859-1.
questions_lemmatized = pd.read_csv('LemmatizedQuestions.csv', encoding="ISO-8859-1")

In [None]:
# Preview the first rows of the loaded table.
questions_lemmatized.head()

In [None]:
# Show full cell contents instead of truncating long strings.
# None means "no limit"; the legacy -1 sentinel is deprecated and rejected by newer pandas.
pd.set_option('display.max_colwidth', None)

### Methods

Transforming text to vectors:
- Use tfidf
- Use word2vec:
    - Train non-code on 50D
    - Train code on 50D
    - Concatenate vectors



After getting the vector representations, use similarity metrics to find similar questions. Also, hopefully, use some clustering method to get cluster features for the feature engineering part.

Similarity metrics:
- top 10 most cosine similar

Clustering method:
- Maybe use DBSCAN
- PCA or t-SNE
- *The clustering method could possibly indicate interesting sub-question types, e.g. one cluster for non-code questions, another for debugging, another for conceptual questions.*

### Vector Representations

tf-idf

In [None]:
# Inspect the raw (still string-typed) non-code token column for the first question.
questions_lemmatized["TagFreeNonCodeTextLemmatized"][0]

In [None]:
# Non-code text
# Each cell is parsed by hand: drop the first and last character of the cell
# (presumably the list brackets - confirm against the CSV), split on ", ", then
# drop the first and last character of every piece (presumably the quotes).
# A cell such as "[]" therefore yields [''] rather than an empty list.
TagFreeNonCodeTextLemmatized = []
for serialized_tokens in questions_lemmatized.TagFreeNonCodeTextLemmatized.tolist():
    quoted_tokens = serialized_tokens[1:-1].split(", ")
    TagFreeNonCodeTextLemmatized.append([quoted[1:-1] for quoted in quoted_tokens])

In [None]:
# Code text
# Same hand-rolled parsing as for the non-code column: strip the outer pair of
# characters from the cell, split on ", ", strip the outer pair of characters
# from every piece. A cell such as "[]" yields [''] rather than an empty list.
CodeTextLemmatized = []
for serialized_tokens in questions_lemmatized.CodeTextLemmatized.tolist():
    quoted_tokens = serialized_tokens[1:-1].split(", ")
    CodeTextLemmatized.append([quoted[1:-1] for quoted in quoted_tokens])

In [None]:
# Combined non-code and code
# One space-joined document per question: all non-code tokens followed by the
# code tokens, with empty-string code tokens left out.
AllTextLemmatized = [
    " ".join(noncode_tokens + [token for token in code_tokens if token != ''])
    for noncode_tokens, code_tokens in zip(TagFreeNonCodeTextLemmatized, CodeTextLemmatized)
]

In [None]:
# Sanity check: the merged document built for the first question.
AllTextLemmatized[0]

In [None]:
# tfidf
# English stop words are removed and the vocabulary is limited to 10,000 terms.
tfidf_vectorizer = TfidfVectorizer(
    stop_words='english',
    max_features=10000,
)
tfidf_question = tfidf_vectorizer.fit_transform(AllTextLemmatized)

In [None]:
# (number of documents, vocabulary size) of the tf-idf matrix.
tfidf_question.shape

In [None]:
# compute cosine similarities of questions
# For every question, keep the indices of the ten highest-scoring rows of the
# tf-idf matrix, best match first.
n_questions = questions_lemmatized.shape[0]
tfidf_question_cosine_similarities = []
for row in range(n_questions):
    scores = cosine_similarity(tfidf_question[row], tfidf_question).flatten()
    best_first = scores.argsort()[::-1]  # argsort is ascending, so reverse it
    tfidf_question_cosine_similarities.append(best_first[:10])

In [None]:
# Top-10 neighbour indices found for the first question.
tfidf_question_cosine_similarities[0]

In [None]:
# save to df
# One row per question; the single 'indices' cell holds that question's top-10 array.
tfidf_rows = [(top_indices,) for top_indices in tfidf_question_cosine_similarities]
tfidf_df = pd.DataFrame(tfidf_rows, columns=['indices'])
tfidf_df.to_pickle("TfIdfSimilarity.pkl")

In [None]:
# Reload the saved top-10 indices. The file name must match the one written by
# to_pickle ("TfIdfSimilarity.pkl"); without the suffix the file is not found.
tdfidf_df = pd.read_pickle("TfIdfSimilarity.pkl")

In [None]:
# Preview the reloaded tf-idf similarity frame.
tdfidf_df.head()

**word2vec for non-code**

In [None]:
import gensim
import nltk
from nltk import word_tokenize
from nltk import tokenize
from nltk.data import find

In [None]:
# word2vec model for non-code text
# Called with min_count=10, size=50 and window=5 on the tokenized non-code questions.
# NOTE(review): `size` is the gensim 3.x keyword (gensim 4 renamed it to
# `vector_size`) - confirm which gensim version this notebook targets.
noncode_model = gensim.models.Word2Vec(TagFreeNonCodeTextLemmatized, min_count=10, size=50, window=5)

In [None]:
# Number of words in the trained model's vocabulary.
# NOTE(review): `wv.vocab` is the gensim 3.x accessor - confirm the installed version.
len(noncode_model.wv.vocab)

In [None]:
# Occurrence count of every token across all non-code questions.
noncode_unique_words = {}
for question_tokens in TagFreeNonCodeTextLemmatized:
    for token in question_tokens:
        noncode_unique_words[token] = noncode_unique_words.get(token, 0) + 1

In [None]:
# Distinct non-code tokens (the keys of the count dict) and how many there are.
words = list(noncode_unique_words)
len(words)

*Get word vectors*

In [None]:
# Word vectors
# n by d list: one vector for every word that occurs at least 10 times, i.e. the
# same threshold as the model's min_count. Vectors are read through `.wv`;
# indexing the model object directly is deprecated in gensim 3.x and removed in 4.
noncode_vector_list = [
    noncode_model.wv[word]
    for word, cnt in noncode_unique_words.items()
    if cnt >= 10
]

In [None]:
# Number of word vectors collected above.
len(noncode_vector_list)

In [None]:
# save model
# Persists the trained non-code Word2Vec model so it can be reloaded without retraining.
noncode_model.save("noncode_word2vec.bin")

In [None]:
# load model
# Rebinds noncode_model to the model stored in "noncode_word2vec.bin".
noncode_model = gensim.models.Word2Vec.load("noncode_word2vec.bin")

*Sum up word vectors*

In [None]:
# Question embedding = sum of the vectors of all of its in-vocabulary words.
# Membership is checked against the model itself rather than a recomputed count:
# the previous `count > 10` test dropped words seen exactly 10 times even though
# min_count=10 keeps them in the vocabulary. Vectors are read through `.wv`
# because indexing the model object directly is deprecated.
noncode_q_embedding = []
for q in TagFreeNonCodeTextLemmatized:
    q_embedding = np.zeros(50)  # 50 = dimensionality the model was trained with
    for word in q:
        if word in noncode_model.wv:
            q_embedding += noncode_model.wv[word]
    noncode_q_embedding.append(q_embedding)

In [None]:
# One embedding per question is expected here.
len(noncode_q_embedding)

In [None]:
# Inspect the summed vector of the first question.
noncode_q_embedding[0]

In [None]:
# One row per question; the single 'embedding' cell holds that question's vector
# (each vector is wrapped in a 1-tuple so it is not spread across columns).
noncode_rows = [(vector,) for vector in noncode_q_embedding]
noncode_embedding_df = pd.DataFrame(noncode_rows, columns=['embedding'])

In [None]:
# Preview the non-code embedding frame.
noncode_embedding_df.head()

In [None]:
# save df
# Written with a ".pkl" suffix; any read_pickle call must use this exact name.
noncode_embedding_df.to_pickle("NoncodeTextWordEmbeddings.pkl")

**word2vec for code text**

In [None]:
# word2vec model for code text
# Called with the same settings as the non-code model: min_count=10, size=50, window=5.
# NOTE(review): `size` is the gensim 3.x keyword (gensim 4 renamed it to
# `vector_size`) - confirm which gensim version this notebook targets.
code_model = gensim.models.Word2Vec(CodeTextLemmatized, min_count=10, size=50, window=5)

In [None]:
# Number of words in the code model's vocabulary.
# NOTE(review): `wv.vocab` is the gensim 3.x accessor - confirm the installed version.
len(code_model.wv.vocab)

In [None]:
# get set of unique words and counts
# Maps every code token to the number of times it occurs across all questions.
code_unique_words = {}
for question_tokens in CodeTextLemmatized:
    for token in question_tokens:
        code_unique_words[token] = code_unique_words.get(token, 0) + 1

In [None]:
# save model
# Persists the trained code Word2Vec model so it can be reloaded without retraining.
code_model.save("code_word2vec.bin")

In [None]:
# load model
# Rebinds code_model to the model stored in "code_word2vec.bin".
code_model = gensim.models.Word2Vec.load("code_word2vec.bin")

In [None]:
# sum up word vectors
# Question embedding = sum of the vectors of all of its in-vocabulary code tokens.
# Membership is checked against the model itself rather than a recomputed count:
# the previous `count > 10` test dropped tokens seen exactly 10 times even though
# min_count=10 keeps them in the vocabulary. Vectors are read through `.wv`
# because indexing the model object directly is deprecated.
code_q_embedding = []
for q in CodeTextLemmatized:
    q_embedding = np.zeros(50)  # 50 = dimensionality the model was trained with
    for word in q:
        if word in code_model.wv:
            q_embedding += code_model.wv[word]
    code_q_embedding.append(q_embedding)

In [None]:
# One embedding per question is expected here.
len(code_q_embedding)

In [None]:
# One row per question; the single 'embedding' cell holds that question's vector
# (each vector is wrapped in a 1-tuple so it is not spread across columns).
code_rows = [(vector,) for vector in code_q_embedding]
code_embedding_df = pd.DataFrame(code_rows, columns=['embedding'])

In [None]:
# Preview the code embedding frame.
code_embedding_df.head()

In [None]:
# save df
# Written with a ".pkl" suffix; any read_pickle call must use this exact name.
code_embedding_df.to_pickle("CodeTextWordEmbeddings.pkl")

**Get top 10 most cosine similar questions**

*Concatenate code and non-code embeddings*

In [None]:
# Reload the code embeddings; the name must match the file written by to_pickle
# ("CodeTextWordEmbeddings.pkl"), suffix included.
code_embedding_df = pd.read_pickle("CodeTextWordEmbeddings.pkl")

In [None]:
# Reload the non-code embeddings; the name must match the file written by to_pickle
# ("NoncodeTextWordEmbeddings.pkl"), suffix included.
noncode_embedding_df = pd.read_pickle("NoncodeTextWordEmbeddings.pkl")

In [None]:
# First question's non-code embedding, selected purely by position (row 0, column 0).
# A single .iloc lookup avoids integer-key access on a label-indexed Series,
# which recent pandas deprecates.
noncode_embedding_df.iloc[0, 0]

In [None]:
# Debug preview: only the first (up to) 5 questions are combined here.
# Use range(noncode_embedding_df.shape[0]) to process every question.
# NOTE(review): later cells compute similarities from, and pickle, whatever this
# cell produces - do not save results while the preview limit is in place.
preview_rows = min(5, noncode_embedding_df.shape[0], code_embedding_df.shape[0])
combined_embedding = [
    # .iloc[i, 0] selects by position; integer keys on a label-indexed row Series
    # are deprecated in recent pandas.
    np.append(noncode_embedding_df.iloc[i, 0], code_embedding_df.iloc[i, 0])
    for i in range(preview_rows)
]

*Get top 10 most cosine similar word embeddings*

In [None]:
# Display the combined vectors built in the previous cell.
combined_embedding

In [None]:
# compute cosine similarities of questions
# list of indices of top 10 most cosine similar for each question
# (fewer than 10 when fewer questions are available)
word2vec_question_cosine_similarities = []
if len(combined_embedding) > 0:
    # Stack once into an (n_questions, dim) matrix: cosine_similarity only accepts
    # 2-D inputs, so a bare 1-D vector must not be passed as the query.
    embedding_matrix = np.vstack(combined_embedding)
    for i in range(embedding_matrix.shape[0]):
        # The i:i+1 slice keeps the query as a (1, dim) row.
        similarity_scores = cosine_similarity(embedding_matrix[i:i + 1], embedding_matrix).flatten()
        # argsort is ascending, so the reversed last ten entries are the ten best matches.
        word2vec_question_cosine_similarities.append(similarity_scores.argsort()[:-11:-1])

In [None]:
# Display the neighbour-index arrays computed in the previous cell.
word2vec_question_cosine_similarities

In [None]:
# save to df
# One row per question; the single 'indices' cell holds that question's index array.
# This writes the same file name as the save cell near the top of the notebook.
similarity_rows = [(top_indices,) for top_indices in word2vec_question_cosine_similarities]
word2vec_similarity_df = pd.DataFrame(similarity_rows, columns=['indices'])
word2vec_similarity_df.to_pickle("word2vecSimilarity.pkl")

In [None]:
# Preview the saved word2vec similarity frame.
word2vec_similarity_df.head()