In [1]:
# Not all of these are needed, but most are (this cell is an artefact of a previous analysis). 
# If you don't have some of these python libraries, the easiest way to install them is using
# 
# > pip install library_name
# 
# Alternatively, you can do so using Anaconda, using 
# 
# > conda install library_name


import string
import numpy as np
import itertools
import pandas as pd
import re
import json
import sys
# stdout = sys.stdout
# sys.reload(sys)
# sys.setdefaultencoding('utf-8')
# sys.stdout = stdout
import nltk
from nltk import word_tokenize
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer

from nltk.stem.porter import PorterStemmer
from nltk.stem.snowball import SnowballStemmer

from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.manifold import TSNE
from sklearn.preprocessing import normalize
from sklearn.decomposition import TruncatedSVD


# import jupyternotify
# ip = get_ipython()
# ip.register_magics(jupyternotify.JupyterNotifyMagics)

In [2]:
import nltk
import ssl

try:
    _create_unverified_https_context = ssl._create_unverified_context
except AttributeError:
    pass
else:
    ssl._create_default_https_context = _create_unverified_https_context

nltk.download('punkt')
nltk.download('stopwords')


[nltk_data] Downloading package punkt to
[nltk_data]     /Users/iblinderman/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package stopwords to
[nltk_data]     /Users/iblinderman/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


True

In [3]:
# Loading and cleaning the data structure. For this example, I'm using StackOverflow posts; specifically, 
# posts to the "Interpersonal questions" site on StackExchange. There's a bit of cleaning that's necessary here, 
# Which I take care of in this cell.

df_stack = pd.read_csv("oreo-clues.tsv", sep='\t')
df_stack
# df_stack_qs = df_stack[df_stack['PostTypeId']==1]
# from HTMLParser import HTMLParser

# class MLStripper(HTMLParser):
#     def __init__(self):
#         self.reset()
#         self.fed = []
#     def handle_data(self, d):
#         self.fed.append(d)
#     def get_data(self):
#         return ''.join(self.fed)

# def strip_tags(html):
#     s = MLStripper()
#     s.feed(html)
#     return s.get_data()

# df_stack_qs = df_stack_qs.replace(r'\n','', regex=True) 

# df_stack_qs['Body']=  df_stack_qs['Body'].apply(lambda x: strip_tags(x)) 

# Here's what the end product looks like
# df_stack_qs.head()

Unnamed: 0,clue
0,sandwich cookie
1,black-and-white cookie
2,popular cookie
3,sweet sandwich
4,creme-filled cookie
5,nabisco cookie
6,cookie favorite
7,layered cookie
8,black-and-white treat
9,two-tone treat


In [4]:
df_stack = df_stack.reset_index()
df_stack.columns = ['title','question']
df_stack.head()

Unnamed: 0,title,question
0,0,sandwich cookie
1,1,black-and-white cookie
2,2,popular cookie
3,3,sweet sandwich
4,4,creme-filled cookie


In [5]:
df_stack[df_stack['question'].str.contains('musta')]

Unnamed: 0,title,question
1052,1052,cookie that some people eat with mustard


In [6]:
# Creating lists of questions + their relevant titles
questions =  df_stack['question'].tolist()
titles =  df_stack['title'].tolist()


# Adding title and data to a dictionary
tempDict = {}
for title, question in zip(titles, questions):
    tempDict[title]=question


In [13]:
# Removing punctuation from the text, as well as some misc. irrelevant characters 
wordnet_lemmatizer = WordNetLemmatizer()

def tokenize(text):
    text = text.lower() # lower case
    for e in set(string.punctuation+'\n'+'\t'): # remove punctuation and line breaks/tabs
        text = text.replace(e, ' ')	
    for i in range(0,10):	# remove double spaces
        text = text.replace('  ', ' ')
    text = text.translate(string.punctuation)  # punctuation
    tokens = nltk.word_tokenize(text)
    text = [w for w in tokens if not w in stopwords.words('english')] # stopwords
    stems = []
    for item in tokens: # stem
        stems.append(wordnet_lemmatizer.lemmatize(item))
    return stems

# calculate tfidf 
print("calculating tf-idf")

tfidf = TfidfVectorizer(tokenizer=tokenize, stop_words='english', min_df=0.025, max_df=.5) #NOTE
# this step takes longest & contains lots of important parameters; playing with these and experimenting
# with them is recommended. Starting here::
# https://www.kaggle.com/adamschroeder/countvectorizer-tfidfvectorizer-predict-comments
# and moving on to the official docs here:
# http://scikit-learn.org/stable/modules/generated/sklearn.feature_extraction.text.TfidfVectorizer.html#sklearn.feature_extraction.text.TfidfVectorizer
# is a good idea.

tfs = tfidf.fit_transform(tempDict.values())
print("reducing tf-idf to dimensions")
tfs_reduced = TruncatedSVD(n_components=5, random_state=0).fit_transform(tfs)
print("done")

calculating tf-idf


  'stop_words.' % sorted(inconsistent))


reducing tf-idf to dimensions
done


In [14]:
model = TSNE(n_components=5, perplexity=5, verbose=2, method='exact').fit_transform(tfs_reduced)

# save to json file
x_axis=model[:,0]
y_axis=model[:,1]
x_norm = (x_axis-np.min(x_axis)) / (np.max(x_axis) - np.min(x_axis))
y_norm = (y_axis-np.min(y_axis)) / (np.max(y_axis) - np.min(y_axis))
data = {"x":x_norm.tolist(), "y":y_norm.tolist(), "names":tempDict.keys()} #output x and y coords in data
with open('test.json', 'w') as outfile:
    json.dump(list(data), outfile)

[t-SNE] Computing pairwise distances...
[t-SNE] Computed conditional probabilities for sample 1000 / 1299
[t-SNE] Computed conditional probabilities for sample 1299 / 1299
[t-SNE] Mean sigma: 0.000000
[t-SNE] Iteration 50: error = 50.4335849, gradient norm = 0.0675093 (50 iterations in 3.316s)
[t-SNE] Iteration 100: error = 46.6442038, gradient norm = 0.0305864 (50 iterations in 3.158s)
[t-SNE] Iteration 150: error = 45.4378952, gradient norm = 0.0155149 (50 iterations in 3.219s)
[t-SNE] Iteration 200: error = 45.2679894, gradient norm = 0.0145035 (50 iterations in 3.197s)
[t-SNE] Iteration 250: error = 45.0659055, gradient norm = 0.0093343 (50 iterations in 3.256s)
[t-SNE] KL divergence after 250 iterations with early exaggeration: 45.065906
[t-SNE] Iteration 300: error = 1.1063509, gradient norm = 0.0017769 (50 iterations in 3.204s)
[t-SNE] Iteration 350: error = 1.1890628, gradient norm = 0.0004607 (50 iterations in 3.240s)
[t-SNE] Iteration 400: error = 1.2215307, gradient norm = 0

In [15]:
# Importing json of results, merging it with the original file, and outputting a CSV which will contain
# the X and Y coords of each point, which we'll use to create our tSNE plot
# df_xyplot = pd.read_json("test.json")



test_df_coords = pd.DataFrame(data)[['x','y']].reset_index()
test_df_coords.columns = ['names','x','y']

result = pd.merge(df_stack, test_df_coords, left_on='title', right_on='names')
result.to_csv("oreos_tsn_reduced_sample_v1_3.csv")

In [20]:
for x in range(1,11):
    tfidf = TfidfVectorizer(tokenizer=tokenize, stop_words='english', min_df=0.025, max_df=.5) #NOTE
    # this step takes longest & contains lots of important parameters; playing with these and experimenting
    # with them is recommended. Starting here::
    # https://www.kaggle.com/adamschroeder/countvectorizer-tfidfvectorizer-predict-comments
    # and moving on to the official docs here:
    # http://scikit-learn.org/stable/modules/generated/sklearn.feature_extraction.text.TfidfVectorizer.html#sklearn.feature_extraction.text.TfidfVectorizer
    # is a good idea.

    tfs = tfidf.fit_transform(tempDict.values())
    print("reducing tf-idf to dimensions")
    tfs_reduced = TruncatedSVD(n_components=10, random_state=0).fit_transform(tfs)
    print("done")
    
    model = TSNE(n_components=10, perplexity=5, verbose=2, method='exact').fit_transform(tfs_reduced)

    # save to json file
    x_axis=model[:,0]
    y_axis=model[:,1]
    x_norm = (x_axis-np.min(x_axis)) / (np.max(x_axis) - np.min(x_axis))
    y_norm = (y_axis-np.min(y_axis)) / (np.max(y_axis) - np.min(y_axis))
    data = {"x":x_norm.tolist(), "y":y_norm.tolist(), "names":tempDict.keys()} #output x and y coords in data
    
    

    test_df_coords = pd.DataFrame(data)[['x','y']].reset_index()
    test_df_coords.columns = ['names','x','y']

    result = pd.merge(df_stack, test_df_coords, left_on='title', right_on='names')
    filename = 'oreos_tsn_reduced_sample_v2_'+str(x)+'.csv'
    result.to_csv(filename)
    
    del tfidf
    del tfs
    del tfs_reduced
    del x_axis
    del y_axis
    del x_norm
    del y_norm
    del data
    del test_df_coords
    del result
    del filename


  'stop_words.' % sorted(inconsistent))


reducing tf-idf to dimensions
done
[t-SNE] Computing pairwise distances...
[t-SNE] Computed conditional probabilities for sample 1000 / 1299
[t-SNE] Computed conditional probabilities for sample 1299 / 1299
[t-SNE] Mean sigma: 0.000000
[t-SNE] Iteration 50: error = 50.6768160, gradient norm = 0.0265493 (50 iterations in 4.169s)
[t-SNE] Iteration 100: error = 48.2324722, gradient norm = 0.0108722 (50 iterations in 4.002s)
[t-SNE] Iteration 150: error = 47.5962654, gradient norm = 0.0102426 (50 iterations in 4.115s)
[t-SNE] Iteration 200: error = 47.2916364, gradient norm = 0.0042732 (50 iterations in 4.064s)
[t-SNE] Iteration 250: error = 47.0882932, gradient norm = 0.0025209 (50 iterations in 4.074s)
[t-SNE] KL divergence after 250 iterations with early exaggeration: 47.088293
[t-SNE] Iteration 300: error = 4.8111344, gradient norm = 0.0003089 (50 iterations in 4.498s)
[t-SNE] Iteration 350: error = 4.4628243, gradient norm = 0.0000407 (50 iterations in 4.150s)
[t-SNE] Iteration 400: e

  'stop_words.' % sorted(inconsistent))


reducing tf-idf to dimensions
done
[t-SNE] Computing pairwise distances...
[t-SNE] Computed conditional probabilities for sample 1000 / 1299
[t-SNE] Computed conditional probabilities for sample 1299 / 1299
[t-SNE] Mean sigma: 0.000000
[t-SNE] Iteration 50: error = 50.1748531, gradient norm = 0.0298623 (50 iterations in 4.065s)
[t-SNE] Iteration 100: error = 47.7006130, gradient norm = 0.0109433 (50 iterations in 3.933s)
[t-SNE] Iteration 150: error = 47.0879828, gradient norm = 0.0046425 (50 iterations in 4.147s)
[t-SNE] Iteration 200: error = 46.7915701, gradient norm = 0.0051323 (50 iterations in 4.269s)
[t-SNE] Iteration 250: error = 46.6032948, gradient norm = 0.0050040 (50 iterations in 3.992s)
[t-SNE] KL divergence after 250 iterations with early exaggeration: 46.603295
[t-SNE] Iteration 300: error = 4.8583763, gradient norm = 0.0002629 (50 iterations in 4.081s)
[t-SNE] Iteration 350: error = 4.4811283, gradient norm = 0.0000408 (50 iterations in 4.628s)
[t-SNE] Iteration 400: e

  'stop_words.' % sorted(inconsistent))


reducing tf-idf to dimensions
done
[t-SNE] Computing pairwise distances...
[t-SNE] Computed conditional probabilities for sample 1000 / 1299
[t-SNE] Computed conditional probabilities for sample 1299 / 1299
[t-SNE] Mean sigma: 0.000000
[t-SNE] Iteration 50: error = 51.8811464, gradient norm = 0.0310574 (50 iterations in 4.305s)
[t-SNE] Iteration 100: error = 50.6216795, gradient norm = 0.0146580 (50 iterations in 4.512s)
[t-SNE] Iteration 150: error = 50.3927565, gradient norm = 0.0064963 (50 iterations in 4.360s)
[t-SNE] Iteration 200: error = 50.3634547, gradient norm = 0.0043554 (50 iterations in 4.430s)
[t-SNE] Iteration 250: error = 50.4651920, gradient norm = 0.0034739 (50 iterations in 4.833s)
[t-SNE] KL divergence after 250 iterations with early exaggeration: 50.465192
[t-SNE] Iteration 300: error = 5.0469498, gradient norm = 0.0003140 (50 iterations in 4.464s)
[t-SNE] Iteration 350: error = 4.6754025, gradient norm = 0.0000393 (50 iterations in 4.461s)
[t-SNE] Iteration 400: e

  'stop_words.' % sorted(inconsistent))


reducing tf-idf to dimensions
done
[t-SNE] Computing pairwise distances...
[t-SNE] Computed conditional probabilities for sample 1000 / 1299
[t-SNE] Computed conditional probabilities for sample 1299 / 1299
[t-SNE] Mean sigma: 0.000000
[t-SNE] Iteration 50: error = 52.5095189, gradient norm = 0.0298421 (50 iterations in 4.198s)
[t-SNE] Iteration 100: error = 51.2429737, gradient norm = 0.0217898 (50 iterations in 4.165s)
[t-SNE] Iteration 150: error = 51.1697712, gradient norm = 0.0069945 (50 iterations in 4.108s)
[t-SNE] Iteration 200: error = 51.3620644, gradient norm = 0.0036903 (50 iterations in 4.181s)
[t-SNE] Iteration 250: error = 51.3661496, gradient norm = 0.0032426 (50 iterations in 4.120s)
[t-SNE] KL divergence after 250 iterations with early exaggeration: 51.366150
[t-SNE] Iteration 300: error = 5.1077832, gradient norm = 0.0002438 (50 iterations in 4.054s)
[t-SNE] Iteration 350: error = 4.7436109, gradient norm = 0.0000383 (50 iterations in 4.133s)
[t-SNE] Iteration 400: e

  'stop_words.' % sorted(inconsistent))


reducing tf-idf to dimensions
done
[t-SNE] Computing pairwise distances...
[t-SNE] Computed conditional probabilities for sample 1000 / 1299
[t-SNE] Computed conditional probabilities for sample 1299 / 1299
[t-SNE] Mean sigma: 0.000000
[t-SNE] Iteration 50: error = 50.4585029, gradient norm = 0.0352106 (50 iterations in 4.327s)
[t-SNE] Iteration 100: error = 48.4710550, gradient norm = 0.0098859 (50 iterations in 4.820s)
[t-SNE] Iteration 150: error = 47.9374773, gradient norm = 0.0081175 (50 iterations in 4.654s)
[t-SNE] Iteration 200: error = 47.6054560, gradient norm = 0.0040622 (50 iterations in 4.545s)
[t-SNE] Iteration 250: error = 47.7072972, gradient norm = 0.0026430 (50 iterations in 4.466s)
[t-SNE] KL divergence after 250 iterations with early exaggeration: 47.707297
[t-SNE] Iteration 300: error = 5.0463543, gradient norm = 0.0002934 (50 iterations in 4.359s)
[t-SNE] Iteration 350: error = 4.6577323, gradient norm = 0.0000404 (50 iterations in 4.321s)
[t-SNE] Iteration 400: e

  'stop_words.' % sorted(inconsistent))


reducing tf-idf to dimensions
done
[t-SNE] Computing pairwise distances...
[t-SNE] Computed conditional probabilities for sample 1000 / 1299
[t-SNE] Computed conditional probabilities for sample 1299 / 1299
[t-SNE] Mean sigma: 0.000000
[t-SNE] Iteration 50: error = 50.2624765, gradient norm = 0.0382067 (50 iterations in 4.350s)
[t-SNE] Iteration 100: error = 47.3543594, gradient norm = 0.0142258 (50 iterations in 4.286s)
[t-SNE] Iteration 150: error = 46.8681367, gradient norm = 0.0054000 (50 iterations in 4.211s)
[t-SNE] Iteration 200: error = 46.7870574, gradient norm = 0.0051133 (50 iterations in 4.276s)
[t-SNE] Iteration 250: error = 46.8985738, gradient norm = 0.0029820 (50 iterations in 4.345s)
[t-SNE] KL divergence after 250 iterations with early exaggeration: 46.898574
[t-SNE] Iteration 300: error = 4.9651349, gradient norm = 0.0002603 (50 iterations in 4.227s)
[t-SNE] Iteration 350: error = 4.6120800, gradient norm = 0.0000400 (50 iterations in 4.137s)
[t-SNE] Iteration 400: e

  'stop_words.' % sorted(inconsistent))


reducing tf-idf to dimensions
done
[t-SNE] Computing pairwise distances...
[t-SNE] Computed conditional probabilities for sample 1000 / 1299
[t-SNE] Computed conditional probabilities for sample 1299 / 1299
[t-SNE] Mean sigma: 0.000000
[t-SNE] Iteration 50: error = 51.0977086, gradient norm = 0.0432464 (50 iterations in 4.201s)
[t-SNE] Iteration 100: error = 48.5515865, gradient norm = 0.0207282 (50 iterations in 4.583s)
[t-SNE] Iteration 150: error = 48.1605306, gradient norm = 0.0161618 (50 iterations in 5.207s)
[t-SNE] Iteration 200: error = 47.8889216, gradient norm = 0.0033309 (50 iterations in 4.653s)
[t-SNE] Iteration 250: error = 47.6986835, gradient norm = 0.0027256 (50 iterations in 4.722s)
[t-SNE] KL divergence after 250 iterations with early exaggeration: 47.698683
[t-SNE] Iteration 300: error = 4.8527009, gradient norm = 0.0003495 (50 iterations in 4.632s)
[t-SNE] Iteration 350: error = 4.4764242, gradient norm = 0.0000404 (50 iterations in 4.865s)
[t-SNE] Iteration 400: e

  'stop_words.' % sorted(inconsistent))


reducing tf-idf to dimensions
done
[t-SNE] Computing pairwise distances...
[t-SNE] Computed conditional probabilities for sample 1000 / 1299
[t-SNE] Computed conditional probabilities for sample 1299 / 1299
[t-SNE] Mean sigma: 0.000000
[t-SNE] Iteration 50: error = 50.4018700, gradient norm = 0.0291540 (50 iterations in 4.453s)
[t-SNE] Iteration 100: error = 47.8521143, gradient norm = 0.0172877 (50 iterations in 4.277s)
[t-SNE] Iteration 150: error = 47.5506960, gradient norm = 0.0083787 (50 iterations in 4.630s)
[t-SNE] Iteration 200: error = 47.2708540, gradient norm = 0.0043176 (50 iterations in 4.747s)
[t-SNE] Iteration 250: error = 47.1051798, gradient norm = 0.0028086 (50 iterations in 4.770s)
[t-SNE] KL divergence after 250 iterations with early exaggeration: 47.105180
[t-SNE] Iteration 300: error = 4.8738531, gradient norm = 0.0002819 (50 iterations in 5.047s)
[t-SNE] Iteration 350: error = 4.4904312, gradient norm = 0.0000397 (50 iterations in 4.689s)
[t-SNE] Iteration 400: e

  'stop_words.' % sorted(inconsistent))


reducing tf-idf to dimensions
done
[t-SNE] Computing pairwise distances...
[t-SNE] Computed conditional probabilities for sample 1000 / 1299
[t-SNE] Computed conditional probabilities for sample 1299 / 1299
[t-SNE] Mean sigma: 0.000000
[t-SNE] Iteration 50: error = 50.2703857, gradient norm = 0.0334724 (50 iterations in 4.212s)
[t-SNE] Iteration 100: error = 47.9990907, gradient norm = 0.0118225 (50 iterations in 4.155s)
[t-SNE] Iteration 150: error = 47.4268128, gradient norm = 0.0041747 (50 iterations in 4.156s)
[t-SNE] Iteration 200: error = 47.1097046, gradient norm = 0.0044691 (50 iterations in 4.261s)
[t-SNE] Iteration 250: error = 47.1863817, gradient norm = 0.0025699 (50 iterations in 4.795s)
[t-SNE] KL divergence after 250 iterations with early exaggeration: 47.186382
[t-SNE] Iteration 300: error = 5.0185799, gradient norm = 0.0002755 (50 iterations in 4.720s)
[t-SNE] Iteration 350: error = 4.6544473, gradient norm = 0.0000401 (50 iterations in 4.688s)
[t-SNE] Iteration 400: e

  'stop_words.' % sorted(inconsistent))


reducing tf-idf to dimensions
done
[t-SNE] Computing pairwise distances...
[t-SNE] Computed conditional probabilities for sample 1000 / 1299
[t-SNE] Computed conditional probabilities for sample 1299 / 1299
[t-SNE] Mean sigma: 0.000000
[t-SNE] Iteration 50: error = 50.7266963, gradient norm = 0.0376141 (50 iterations in 4.139s)
[t-SNE] Iteration 100: error = 48.1241411, gradient norm = 0.0170083 (50 iterations in 4.142s)
[t-SNE] Iteration 150: error = 47.6962859, gradient norm = 0.0065613 (50 iterations in 4.107s)
[t-SNE] Iteration 200: error = 47.3383434, gradient norm = 0.0025313 (50 iterations in 4.110s)
[t-SNE] Iteration 250: error = 47.1295317, gradient norm = 0.0042342 (50 iterations in 4.119s)
[t-SNE] KL divergence after 250 iterations with early exaggeration: 47.129532
[t-SNE] Iteration 300: error = 4.8515285, gradient norm = 0.0002839 (50 iterations in 4.069s)
[t-SNE] Iteration 350: error = 4.4707891, gradient norm = 0.0000399 (50 iterations in 4.172s)
[t-SNE] Iteration 400: e