In [1]:
# Not all of these are needed, but most are (this cell is an artefact of a previous analysis). 
# If you don't have some of these python libraries, the easiest way to install them is using
# 
# > pip install library_name
# 
# Alternatively, you can do so using Anaconda, using 
# 
# > conda install library_name


import string
import numpy as np
import itertools
import pandas as pd
import re
import json
import sys
# stdout = sys.stdout
# sys.reload(sys)
# sys.setdefaultencoding('utf-8')
# sys.stdout = stdout
import nltk
from nltk import word_tokenize
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer

from nltk.stem.porter import PorterStemmer
from nltk.stem.snowball import SnowballStemmer

from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.manifold import TSNE
from sklearn.preprocessing import normalize
from sklearn.decomposition import TruncatedSVD


# import jupyternotify
# ip = get_ipython()
# ip.register_magics(jupyternotify.JupyterNotifyMagics)

In [2]:
import nltk
import ssl

try:
    _create_unverified_https_context = ssl._create_unverified_context
except AttributeError:
    pass
else:
    ssl._create_default_https_context = _create_unverified_https_context

nltk.download('punkt')
nltk.download('stopwords')


[nltk_data] Downloading package punkt to
[nltk_data]     /Users/iblinderman/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package stopwords to
[nltk_data]     /Users/iblinderman/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


True

In [3]:
# Loading and cleaning the data structure. For this example, I'm using StackOverflow posts; specifically, 
# posts to the "Interpersonal questions" site on StackExchange. There's a bit of cleaning that's necessary here, 
# Which I take care of in this cell.

df_stack = pd.read_csv("oreo-clues.tsv", sep='\t')
df_stack
# df_stack_qs = df_stack[df_stack['PostTypeId']==1]
# from HTMLParser import HTMLParser

# class MLStripper(HTMLParser):
#     def __init__(self):
#         self.reset()
#         self.fed = []
#     def handle_data(self, d):
#         self.fed.append(d)
#     def get_data(self):
#         return ''.join(self.fed)

# def strip_tags(html):
#     s = MLStripper()
#     s.feed(html)
#     return s.get_data()

# df_stack_qs = df_stack_qs.replace(r'\n','', regex=True) 

# df_stack_qs['Body']=  df_stack_qs['Body'].apply(lambda x: strip_tags(x)) 

# Here's what the end product looks like
# df_stack_qs.head()

Unnamed: 0,clue
0,sandwich cookie
1,black-and-white cookie
2,popular cookie
3,mountain: prefix
4,sweet sandwich
5,creme-filled cookie
6,nabisco cookie
7,cookie favorite
8,mountain: comb form
9,layered cookie


In [4]:
df_stack = df_stack.reset_index()
df_stack.columns = ['title','question']
df_stack.head()

Unnamed: 0,title,question
0,0,sandwich cookie
1,1,black-and-white cookie
2,2,popular cookie
3,3,mountain: prefix
4,4,sweet sandwich


In [43]:
df_stack[df_stack['question'].str.contains('musta')]

Unnamed: 0,title,question
1056,1056,cookie that some people eat with mustard


In [5]:
# Creating lists of questions + their relevant titles
questions =  df_stack['question'].tolist()
titles =  df_stack['title'].tolist()


# Adding title and data to a dictionary
tempDict = {}
for title, question in zip(titles, questions):
    tempDict[title]=question


In [86]:
# Removing punctuation from the text, as well as some misc. irrelevant characters 
wordnet_lemmatizer = WordNetLemmatizer()

def tokenize(text):
    text = text.lower() # lower case
    for e in set(string.punctuation+'\n'+'\t'): # remove punctuation and line breaks/tabs
        text = text.replace(e, ' ')	
    for i in range(0,10):	# remove double spaces
        text = text.replace('  ', ' ')
    text = text.translate(string.punctuation)  # punctuation
    tokens = nltk.word_tokenize(text)
    text = [w for w in tokens if not w in stopwords.words('english')] # stopwords
    stems = []
    for item in tokens: # stem
        stems.append(wordnet_lemmatizer.lemmatize(item))
    return stems

# calculate tfidf 
print("calculating tf-idf")

tfidf = TfidfVectorizer(tokenizer=tokenize, stop_words='english', min_df=0.025, max_df=.5) #NOTE
# this step takes longest & contains lots of important parameters; playing with these and experimenting
# with them is recommended. Starting here::
# https://www.kaggle.com/adamschroeder/countvectorizer-tfidfvectorizer-predict-comments
# and moving on to the official docs here:
# http://scikit-learn.org/stable/modules/generated/sklearn.feature_extraction.text.TfidfVectorizer.html#sklearn.feature_extraction.text.TfidfVectorizer
# is a good idea.

tfs = tfidf.fit_transform(tempDict.values())
print("reducing tf-idf to dimensions")
tfs_reduced = TruncatedSVD(n_components=10, random_state=0).fit_transform(tfs)
print("done")

calculating tf-idf


  'stop_words.' % sorted(inconsistent))


reducing tf-idf to dimensions
done


In [87]:
model = TSNE(n_components=10, perplexity=10, verbose=2, method='exact').fit_transform(tfs_reduced)

# save to json file
x_axis=model[:,0]
y_axis=model[:,1]
x_norm = (x_axis-np.min(x_axis)) / (np.max(x_axis) - np.min(x_axis))
y_norm = (y_axis-np.min(y_axis)) / (np.max(y_axis) - np.min(y_axis))
data = {"x":x_norm.tolist(), "y":y_norm.tolist(), "names":tempDict.keys()} #output x and y coords in data
with open('test.json', 'w') as outfile:
    json.dump(list(data), outfile)

[t-SNE] Computing pairwise distances...
[t-SNE] Computed conditional probabilities for sample 1000 / 1303
[t-SNE] Computed conditional probabilities for sample 1303 / 1303
[t-SNE] Mean sigma: 0.000000
[t-SNE] Iteration 50: error = 46.8079256, gradient norm = 0.0413921 (50 iterations in 5.927s)
[t-SNE] Iteration 100: error = 43.3915136, gradient norm = 0.0066409 (50 iterations in 5.400s)
[t-SNE] Iteration 150: error = 43.6301050, gradient norm = 0.0143428 (50 iterations in 5.877s)
[t-SNE] Iteration 200: error = 43.6371546, gradient norm = 0.0061388 (50 iterations in 9.583s)
[t-SNE] Iteration 250: error = 43.4846728, gradient norm = 0.0048782 (50 iterations in 5.662s)
[t-SNE] KL divergence after 250 iterations with early exaggeration: 43.484673
[t-SNE] Iteration 300: error = 4.4242262, gradient norm = 0.0003310 (50 iterations in 5.385s)
[t-SNE] Iteration 350: error = 4.0997117, gradient norm = 0.0000417 (50 iterations in 6.203s)
[t-SNE] Iteration 400: error = 3.9377756, gradient norm = 0

In [88]:
# Importing json of results, merging it with the original file, and outputting a CSV which will contain
# the X and Y coords of each point, which we'll use to create our tSNE plot
df_xyplot = pd.read_json("test.json")



test_df_coords = pd.DataFrame(data)[['x','y']].reset_index()
test_df_coords.columns = ['names','x','y']

result = pd.merge(df_stack, test_df_coords, left_on='title', right_on='names')
result.to_csv("oreos_tsn_v3_5.csv")

In [10]:
# data
df_xyplot = pd.read_json("test.json")
df_xyplot

Unnamed: 0,0
0,x
1,y
2,names


In [19]:
result

Unnamed: 0,title,question,names,x,y
0,0,sandwich cookie,0,0.579764,1.456600e-01
1,1,black-and-white cookie,1,0.248767,2.973879e-01
2,2,popular cookie,2,0.779867,9.100980e-01
3,3,mountain: prefix,3,0.520758,2.977358e-01
4,4,sweet sandwich,4,0.566940,1.309917e-01
5,5,creme-filled cookie,5,0.632455,4.255381e-01
6,6,nabisco cookie,6,0.659073,2.865407e-01
7,7,cookie favorite,7,0.652972,9.571073e-01
8,8,mountain: comb form,8,0.410280,2.848355e-01
9,9,layered cookie,9,0.754130,1.000000e+00


In [None]:
data = {"x":x_norm.tolist(), "y":y_norm.tolist(), "names":tempDict.keys()}