In [0]:
# Not all of these are needed, but most are (this cell is an artefact of a previous analysis). 
# If you don't have some of these python libraries, the easiest way to install them is using
# 
# > pip install library_name
# 
# Alternatively, you can do so using Anaconda, using 
# 
# > conda install library_name


import string
import numpy as np
import itertools
import pandas as pd
import re
import json
import sys
stdout = sys.stdout
reload(sys)
sys.setdefaultencoding('utf-8')
sys.stdout = stdout

import nltk
from nltk import word_tokenize
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer

from nltk.stem.porter import PorterStemmer
from nltk.stem.snowball import SnowballStemmer

from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.manifold import TSNE
from sklearn.preprocessing import normalize
from sklearn.decomposition import TruncatedSVD


import jupyternotify
ip = get_ipython()
ip.register_magics(jupyternotify.JupyterNotifyMagics)

<IPython.core.display.Javascript object>

In [0]:
# Loading and cleaning the data structure. For this example, I'm using StackOverflow posts; specifically, 
# posts to the "Interpersonal questions" site on StackExchange. There's a bit of cleaning that's necessary here, 
# Which I take care of in this cell.

df_stack = pd.read_csv("QueryResults.csv")
df_stack_qs = df_stack[df_stack['PostTypeId']==1]
from HTMLParser import HTMLParser

class MLStripper(HTMLParser):
    def __init__(self):
        self.reset()
        self.fed = []
    def handle_data(self, d):
        self.fed.append(d)
    def get_data(self):
        return ''.join(self.fed)

def strip_tags(html):
    s = MLStripper()
    s.feed(html)
    return s.get_data()

df_stack_qs = df_stack_qs.replace(r'\n','', regex=True) 

df_stack_qs['Body']=  df_stack_qs['Body'].apply(lambda x: strip_tags(x)) 

# Here's what the end product looks like
df_stack_qs.head()

Unnamed: 0,Id,PostTypeId,ParentId,Title,Body
1,16538,1,,How to approach a narcissist parent about thei...,TL;DR: My dad is calling my sister an embarras...
5,16545,1,,Roommate Upset I Woke Him Up,I encountered a strange situation with my room...
7,16547,1,,"Grandpa has dementia, how to include him in co...",I'm not sure if this is a question that can be...
12,16556,1,,How to Tell Family Dr I am Content Helping My ...,This has been bothering me for sometime now.My...
16,16560,1,,How/whether to let people know I don't like re...,"My husband and I have never liked cut flowers,..."


In [0]:
# Creating lists of questions + their relevant titles
questions =  df_stack_qs['Body'].tolist()
titles =  df_stack_qs['Title'].tolist()


# Adding title and data to a dictionary
tempDict = {}
for title, question in zip(titles, questions):
    tempDict[title]=question


In [0]:
# Removing punctuation from the text, as well as some misc. irrelevant characters 
wordnet_lemmatizer = WordNetLemmatizer()

def tokenize(text):
    text = text.lower() # lower case
    for e in set(string.punctuation+'\n'+'\t'): # remove punctuation and line breaks/tabs
        text = text.replace(e, ' ')	
    for i in range(0,10):	# remove double spaces
        text = text.replace('  ', ' ')
    text = text.translate(string.punctuation)  # punctuation
    tokens = nltk.word_tokenize(text)
    text = [w for w in tokens if not w in stopwords.words('english')] # stopwords
    stems = []
    for item in tokens: # stem
        stems.append(wordnet_lemmatizer.lemmatize(item))
    return stems

# calculate tfidf 
print "calculating tf-idf"

tfidf = TfidfVectorizer(tokenizer=tokenize, stop_words='english', min_df=0.025, max_df=.5) #NOTE
# this step takes longest & contains lots of important parameters; playing with these and experimenting
# with them is recommended. Starting here::
# https://www.kaggle.com/adamschroeder/countvectorizer-tfidfvectorizer-predict-comments
# and moving on to the official docs here:
# http://scikit-learn.org/stable/modules/generated/sklearn.feature_extraction.text.TfidfVectorizer.html#sklearn.feature_extraction.text.TfidfVectorizer
# is a good idea.

tfs = tfidf.fit_transform(tempDict.values())
print "reducing tf-idf to dimensions"
tfs_reduced = TruncatedSVD(n_components=100, random_state=0).fit_transform(tfs)
print "done"

calculating tf-idf
reducing tf-idf to dimensions
done


In [0]:
model = TSNE(n_components=2, perplexity=100, verbose=2, method='exact').fit_transform(tfs_reduced)

# save to json file
x_axis=model[:,0]
y_axis=model[:,1]
x_norm = (x_axis-np.min(x_axis)) / (np.max(x_axis) - np.min(x_axis))
y_norm = (y_axis-np.min(y_axis)) / (np.max(y_axis) - np.min(y_axis))
data = {"x":x_norm.tolist(), "y":y_norm.tolist(), "names":tempDict.keys()} #output x and y coords in data
with open('test.json', 'w') as outfile:
    json.dump(data, outfile)

[t-SNE] Computing pairwise distances...
[t-SNE] Computed conditional probabilities for sample 1000 / 2619
[t-SNE] Computed conditional probabilities for sample 2000 / 2619
[t-SNE] Computed conditional probabilities for sample 2619 / 2619
[t-SNE] Mean sigma: 0.203230
[t-SNE] Iteration 25: error = 19.8171082, gradient norm = 0.0783719
[t-SNE] Iteration 50: error = 20.6833585, gradient norm = 0.0641763
[t-SNE] Iteration 75: error = 21.0182451, gradient norm = 0.0662294
[t-SNE] Iteration 100: error = 20.7684213, gradient norm = 0.0635221
[t-SNE] Error after 100 iterations with early exaggeration: 20.768421
[t-SNE] Iteration 125: error = 1.9375553, gradient norm = 0.0038172
[t-SNE] Iteration 150: error = 1.8434535, gradient norm = 0.0032058
[t-SNE] Iteration 175: error = 1.8383726, gradient norm = 0.0041525
[t-SNE] Iteration 200: error = 1.8414299, gradient norm = 0.0035076
[t-SNE] Iteration 225: error = 1.8365453, gradient norm = 0.0040166
[t-SNE] Iteration 250: error = 1.8425447, gradient

In [0]:
# Importing json of results, merging it with the original file, and outputting a CSV which will contain
# the X and Y coords of each point, which we'll use to create our tSNE plot
df_xyplot = pd.read_json("test.json")
result = pd.merge(df_stack_qs, df_xyplot, left_on='Title', right_on='names')
result.to_csv("test.csv")