In [1]:
#Hi! Thanks so much for helping us out with this!

#my main question is: I'm not 100% sure of the arguments passed in the last two cells
#should I do something differently? I'm not sure if this gives the best results

#I used this site as a source: https://dataplatform.cloud.ibm.com/analytics/notebooks/v2/3bee909a-5b69-4a50-9cd3-fdcd98fa5bd1/view?access_token=178b412e00a89ec0bca53f6fa65e9be58e392b23f684cb7377418ead7b8af4c2
#The code from above link was used in this story: https://pudding.cool/2018/11/dearabby/

import pandas as pd
import numpy as np
import re, os, time, json, csv

import nltk
from nltk.stem import WordNetLemmatizer

from sklearn.decomposition import PCA
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.decomposition import TruncatedSVD
from sklearn.manifold import TSNE

from gensim.models import word2vec

In [2]:
#read the master speeches spreadsheet into a DataFrame
speeches_path = r'C:\Users\Vivek Rao\Desktop\IDS\Digital\McRobbie NLP\speeches\master_speeches.xlsx'
df = pd.read_excel(speeches_path)

In [3]:
#show every row whose 'contents' cell is missing (an empty result means no gaps)
missing_contents = df['contents'].isna()
df[missing_contents]

Unnamed: 0,date,venue,title,contents


In [4]:
#write function to clean sentences
#will be used as tokenizer in next cell
df_cleaned = df.copy()

wordnet_lemmatizer = WordNetLemmatizer()

#build the English stop-word set once, up front; looking it up inside the
#list comprehension would reload the word list for every single token
#(needs the NLTK 'stopwords' data: run nltk.download('stopwords') once if it is missing)
english_stop_words = frozenset(nltk.corpus.stopwords.words('english'))

#write function
def clean_text(text):
    """Lowercase, tokenize, remove stop words from, and lemmatize one text.

    Parameters
    ----------
    text : str
        Raw text to clean (one speech).

    Returns
    -------
    list of str
        The remaining tokens, in their original order, each passed through
        the WordNet lemmatizer.
    """
    #lowercase text first
    text = text.lower()
    #turn newlines and tabs into plain spaces before tokenizing
    text = text.replace('\n', ' ').replace('\t', ' ')
    tokens = nltk.word_tokenize(text)
    #drop stop words, then lemmatize what is left
    return [
        wordnet_lemmatizer.lemmatize(token)
        for token in tokens
        if token not in english_stop_words
    ]

In [None]:
#start a timer so the cost of fitting the vectorizer is visible
t0 = time.time()

#first step: configure the TF-IDF vectorizer with clean_text as its tokenizer.
#nothing is fitted here yet.
#min_df / max_df are fractions of documents: terms found in fewer than 2.5%
#or in more than 50% of the speeches are left out of the vocabulary
tfidf = TfidfVectorizer(
    stop_words='english',
    tokenizer=clean_text,
    min_df=0.025,
    max_df=.5,
)

#next, fit on the speech texts, i.e. the 'contents' column of the DataFrame,
#and build the document-term matrix
tfs = tfidf.fit_transform(df['contents'])
elapsed = time.time() - t0
print('Time elapsed: {} seconds'.format(elapsed))

In [None]:
#reduce number of dimensions of text to make analysis easier
#NOTE: this SVD step only compresses the sparse TF-IDF matrix before t-SNE.
#it does NOT decide how many axes the visualization has; the n_components
#argument of TSNE does that. Keeping only 3 SVD components would throw away
#most of the information before t-SNE ever sees it, so keep ~50 instead, as the
#scikit-learn TSNE documentation recommends for high-dimensional sparse input.
#TruncatedSVD cannot return more components than there are features, hence min()
n_svd_components = min(50, tfs.shape[1])
tfs_reduced = TruncatedSVD(n_components=n_svd_components, random_state=0).fit_transform(tfs)
#print the (rows, components) shape instead of dumping the whole array
print(tfs_reduced.shape)

#convert speech titles to a list to append to each x-y-z coordinate,
#so users know what speech they're hovering over
#to see a demo of the visualization, check out tsne3d.html
speech_title = df.title.to_list()
speech = df.contents.to_list()
#map each title to its speech text. Distinct loop names are used so the two
#lists above are not overwritten while iterating.
#the name `dict` shadows the Python built-in; it is kept only because the
#export cell below reads the mapping under this name.
#caveat: speeches that share a title collapse into a single entry here
dict = {title: text for title, text in zip(speech_title, speech)}

#print for my sanity
list(dict.keys())

In [None]:
#run the SVD-reduced TF-IDF matrix (tfs_reduced) through the TSNE model
#n_components is 3 because we want this to be three-dimensional
#function documentation can be found here: https://scikit-learn.org/stable/modules/generated/sklearn.manifold.TSNE.html
#perplexity has to be smaller than the number of samples, so cap it for small corpora
#random_state is fixed so that re-running the cell gives the same layout
n_speeches = len(tfs_reduced)
tsne_perplexity = min(35, n_speeches - 1)
model = TSNE(n_components=3, perplexity=tsne_perplexity, verbose=6, method='exact', random_state=0).fit_transform(tfs_reduced)


def min_max_scale(axis_values):
    """Linearly rescale a 1-D array so its minimum maps to 0 and its maximum to 1."""
    lowest = np.min(axis_values)
    return (axis_values - lowest) / (np.max(axis_values) - lowest)


# save to json
x_axis=model[:,0]
y_axis=model[:,1]
z_axis=model[:,2]

x_norm = min_max_scale(x_axis)
y_norm = min_max_scale(y_axis)
z_norm = min_max_scale(z_axis)

#take the names straight from the DataFrame, one per row and in row order, so
#they always line up with the coordinates. Dictionary keys would be shorter
#than x/y/z whenever two speeches share a title.
names = df.title.to_list()

data = {"x":x_norm.tolist(), "y":y_norm.tolist(), "z":z_norm.tolist(), "names":names}

with open(r'C:\Users\Vivek Rao\Desktop\IDS\Digital\McRobbie NLP\model_120620.json', 'w') as outfile:
    json.dump(data, outfile)