In [1]:
import numpy as np 
import pandas as pd 
import re
import nltk
from nltk.stem import WordNetLemmatizer, snowball
from nltk.stem.porter import *
from nltk import word_tokenize
import gensim
from gensim.utils import simple_preprocess
from gensim.parsing.preprocessing import STOPWORDS
from gensim import corpora, models
from gensim.models import word2vec
import seaborn as sns
from pprint import pprint

from sklearn.manifold import TSNE
import matplotlib.pyplot as plt


nltk.download('wordnet')
%matplotlib inline

## 1.0 Import dataset and explore dataset

In [2]:
df = pd.read_csv('../input/covid19-vaccine-news-reddit-discussions/comments.csv')
df.head()

In [3]:
df.shape

In [4]:
np.sum(pd.isnull(df))

The data has 14 columns with 34768 observations with no null values. The data contains the post that the comment has been left, and the content of the post. Some of the features can be dropped since those features are redundant (i.e. post_id, post_author,comment_parent_id). We'll leave post_id and drop the other features. Also, since the post date for both post and comments are not the key factor in this assignment, therefore will drop those columns.

Moreover, since the data observes the comments in the reddit website, there are parent post data called post_title. We can group comments based on these posts and explore more about it. 

Comment score and post score can be further explored.

In [5]:
# drop post_author,comment_parent_id

df = df.drop(['post_author','comment_parent_id', 'post_date','post_permalink', 'comment_author', 'comment_date'], axis = 1)
df.head()

In [6]:
grouped = df[['post_score','comment_id','comment_score','comment_body']].groupby(df['post_title'])

In [7]:
group_freq = grouped.size().sort_values()
group_freq = group_freq.to_frame()
group_freq.columns = ['comment_freq']
group_freq.head()

In [8]:
group_score = grouped.mean().sort_values(by = 'comment_score')
group_score.head()

In [9]:
numeric_df = group_score.merge(group_freq, how='inner', on='post_title')
numeric_df.head()

In [10]:
print(f"Shape of group_score: {group_score.shape}")
print(f"Shape of group_freq: {group_freq.shape}")
print(f"Shape of merged dataframe: {numeric_df.shape}")

In [11]:
grouped = df[['post_score','comment_id','comment_score','comment_body']].groupby(df['post_id'])

group_freq = grouped.size().sort_values()
group_freq = group_freq.to_frame()
group_freq.columns = ['comment_freq']

group_score = grouped.mean().sort_values(by = 'comment_score')

numeric_df = group_score.merge(group_freq, how='inner', on='post_id')

print(f"Shape of group_score: {group_score.shape}")
print(f"Shape of group_freq: {group_freq.shape}")
print(f"Shape of merged dataframe: {numeric_df.shape}")

Since the post title is a bit lengthy to be a suitable index, we could use post_id as a substitution. Just to be safe, we need to check if one user has posted multiple posts (then the rows of df grouped by 'post_title' and 'post_id' will be different). However, since the shape of the two different dfs are exactly same, we use df grouped by 'post_id'.

## 1.1 Use pre-trained embedding to find similar words

In [12]:
STOP_WORDS = nltk.corpus.stopwords.words()

def lemmatizer(text):
    Lemmatizer = WordNetLemmatizer()
    return Lemmatizer.lemmatize(text)

def clean_sentence(val):
    "remove chars that are not letters or numbers, downcase, then remove stop words"
    regex = re.compile('([^\s\w]|_)')
    sentence = regex.sub('', val).lower()
    sentence = sentence.split(" ")
    
    for word in list(sentence):
        if word in STOP_WORDS:
            sentence.remove(word)
            
    sentence = " ".join(sentence)
    return sentence

def clean_dataframe(data):
    "drop nans, then apply 'clean_sentence' function to question1 and 2"
    data = data.dropna(how="any")
    
    data['comment_body'] = data['comment_body'].apply(clean_sentence)
    
    return data


In [13]:
embedded_df = clean_dataframe(df)
embedded_df.head(5)

In [14]:
def build_corpus(data):
    "Creates a list of lists containing words from each sentence"
    corpus = []
    for sentence in df['comment_body'].iteritems():
        #print(sentence)
        word_list = sentence[1]
        word_list = nltk.word_tokenize(word_list)
        lemmatized_output = ' '.join([lemmatizer(w) for w in word_list])
        corpus.append(lemmatized_output.split(" "))
            
    return corpus

corpus = build_corpus(embedded_df)        
corpus[0:2]

In [15]:
def index_2d(myList, v):
    for i, x in enumerate(myList):
        if v in x:
            return i, x.index(v)

print(index_2d(corpus,'sad'))

In [16]:
model = word2vec.Word2Vec(corpus, window=15, min_count=50, workers=4)

In [17]:
model.wv['vaccine']

In [18]:
model.wv.most_similar('vaccine')[:5]

In [19]:
model.wv.most_similar('sad')[:5]

## 1.2 Visualization using T-SNE

Since the Gensim 4.0.0 model doesn't support vocab function, we use .index_to_key instead

In [20]:
def tsne_plot1(model):
    "Creates and TSNE model and plots it"
    labels = []
    tokens = []
    

    for word in model.wv.index_to_key:
        tokens.append(model.wv[word])
        labels.append(word)
    
    tsne_model = TSNE(perplexity=40, n_components=2, init='pca', n_iter=2500, random_state=4300)
    new_values = tsne_model.fit_transform(tokens)

    x = []
    y = []
    for value in new_values:
        x.append(value[0])
        y.append(value[1])
        
    plt.figure(figsize=(16, 16)) 
    for i in range(len(x)):
        plt.scatter(x[i],y[i])
        plt.annotate(labels[i],
                     xy=(x[i], y[i]),
                     xytext=(5, 2),
                     textcoords='offset points',
                     size = 1,
                     ha='right',
                     va='bottom')
        plt.title("T-SNE Model for all Word2vec Vocabulary",fontsize=15)
    plt.show()

In [21]:
tsne_plot1(model)

We have intentionally reduced the size of the cells and the annotation since it was too messy to see any kind of pattern. However, although we have reduced the size, the t-sne plot is still chaotic, showing no special trend. Now we can do the t-sne plot for most frequent words.

In [22]:
labels = []
tokens = []
    

for word in model.wv.index_to_key:
    tokens.append(model.wv[word])
    labels.append(word)
    
tsne_model = TSNE(perplexity=40, n_components=2, init='pca', n_iter=2500, random_state=4300)
new_values = tsne_model.fit_transform(tokens)



In [23]:
new_corpus = [single for sublist in corpus for single in sublist]

In [24]:
top_1000_freq = pd.value_counts(np.array(new_corpus))[:1000]

In [25]:
def tsne_plot2(model):
    "Creates and TSNE model and plots it"
    labels = []
    tokens = []
        
    for word in top_1000_freq.index:
        tokens.append(model.wv[word])
        labels.append(word)
    
    tsne_model = TSNE(perplexity=40, n_components=2, init='pca', n_iter=2500, random_state=4300)
    new_values = tsne_model.fit_transform(tokens)

    x = []
    y = []
    for value in new_values:
        x.append(value[0])
        y.append(value[1])
        
    plt.figure(figsize=[16, 16]) 
    for i in range(len(x)):
        plt.scatter(x[i],y[i],s=200)
        plt.annotate(labels[i],
                     xy=(x[i], y[i]),
                     xytext=(5, 2),
                     textcoords='offset points',
                     size = 1,
                     ha='right',
                     va='bottom')
        plt.title("T-SNE Model for top 1000 frequent Word2vec Vocabulary",fontsize=15)

    plt.show()

In [26]:
tsne_plot2(model)

The T-SNE plot for top 1000 frequent vocab also shows the random pattern similar to the "T-SNE Model for all Word2vec Vocabulary". Since looking at this individually doesn't show any special observations, we can try comparing the "T-SNE Model for all Word2vec Vocabulary" and "T-SNE Model for top 1000 frequent Word2vec Vocabulary".

In [27]:
def compare_tnse1_tnse2(model):

    fig = plt.figure(figsize = [30,15])
    ax1 = fig.add_subplot(1, 2, 1)
    ax2 = fig.add_subplot(1, 2, 2)
    
    # tnse_plot1
    labels_tnse_plot1 = []
    tokens_tnse_plot1 = []
    
    for word in model.wv.index_to_key:
        tokens_tnse_plot1.append(model.wv[word])
        labels_tnse_plot1.append(word)
    
    tsne_model = TSNE(perplexity=40, n_components=2, init='pca', n_iter=2500, random_state=4300)
    new_values = tsne_model.fit_transform(tokens_tnse_plot1)

    x_tnse_plot1 = []
    y_tnse_plot1 = []
    for value in new_values:
        x_tnse_plot1.append(value[0])
        y_tnse_plot1.append(value[1])
        
    for i in range(len(x_tnse_plot1)):
        ax1.scatter(x_tnse_plot1[i],y_tnse_plot1[i])
        ax1.annotate(labels_tnse_plot1[i],
                     xy=(x_tnse_plot1[i], y_tnse_plot1[i]),
                     xytext=(5, 2),
                     textcoords='offset points',
                     size = 1,
                     ha='right',
                     va='bottom')
        ax1.set_title("T-SNE Model for all Word2vec Vocabulary",fontsize=9)

        
    # tnse_plot2
    labels_tnse_plot2 = []
    tokens_tnse_plot2 = []
        
    for word in top_1000_freq.index:
        tokens_tnse_plot2.append(model.wv[word])
        labels_tnse_plot2.append(word)
    
    tsne_model = TSNE(perplexity=40, n_components=2, init='pca', n_iter=2500, random_state=4300)
    new_values = tsne_model.fit_transform(tokens_tnse_plot2)

    x_tnse_plot2 = []
    y_tnse_plot2 = []
    for value in new_values:
        x_tnse_plot2.append(value[0])
        y_tnse_plot2.append(value[1])
        
    for i in range(len(x_tnse_plot2)):
        ax2.scatter(x_tnse_plot2[i],y_tnse_plot2[i])
        ax2.annotate(labels_tnse_plot2[i],
                     xy=(x_tnse_plot2[i], y_tnse_plot2[i]),
                     xytext=(5, 2),
                     textcoords='offset points',
                     size = 1,
                     ha='right',
                     va='bottom')
        ax2.set_title("T-SNE Model for top 1000 frequent Word2vec Vocabulary",fontsize=9)

    plt.show()

In [28]:
compare_tnse1_tnse2(model)

The comparison between "T-SNE Model for all Word2Vec Vocabulary" and "T-SNE Model for top 1000 frequent Word2vec Vocabulary" seems similar, just the observations shown in the right part of the "T-SNE Model for all Word2Vec Vocabulary" plot has been eliminated. But the interesting part is that since we have eliminated some words which haven't appear that much, the range of the observation has decreased; the range of y axis has decreased from -50 ~ 50 to -30 ~ 35 and x axis has decreased from -70 ~ 50 to -40 ~ 45. Maybe we can try observe more by selecting data with top 500 frequent vocabulary and top 100 frequent vocabulary.

In [29]:
top_100_freq = pd.value_counts(np.array(new_corpus))[:100]
top_500_freq = pd.value_counts(np.array(new_corpus))[:500]

In [30]:
def tsne_plot3(model):
    "Creates and TSNE model and plots it"
    labels = []
    tokens = []

    for word in top_100_freq.index:
        tokens.append(model.wv[word])
        labels.append(word)
    
    tsne_model = TSNE(perplexity=50, n_components=2,learning_rate = 500, init='pca', n_iter=2500, random_state=4300)
    new_values = tsne_model.fit_transform(tokens)

    x = []
    y = []
    for value in new_values:
        x.append(value[0])
        y.append(value[1])
        
    plt.figure(figsize=(16, 16)) 
    for i in range(len(x)):
        plt.scatter(x[i],y[i], s= 100)
        plt.annotate(labels[i],
                     xy=(x[i], y[i]),
                     xytext=(5, 2),
                     textcoords='offset points',
                     size = 15,
                     ha='right',
                     va='bottom')
        plt.legend()
        plt.title("T-SNE Model for top 100 frequent Word2vec Vocabulary",fontsize=9)

    plt.show()

In [31]:
tsne_plot3(model)

Just in case if there are some special patterns on the plot if we have limit the data with 100 most frequent vocabulary, the pattern seems random as it was for "T-SNE Model for all Word2Vec Vocabulary" and "T-SNE Model for top 1000 frequent Word2vec Vocabulary". We now try to compare all 4 models: "T-SNE Model for all Word2Vec Vocabulary", "T-SNE Model for top 1000 frequent Word2vec Vocabulary", "T-SNE Model for top 500 frequent Word2vec Vocabulary" and "T-SNE Model for top 100 frequent Word2vec Vocabulary"

In [32]:
def compare_tnse_models1(model):

    fig = plt.figure(figsize = [30,30])
    ax1 = fig.add_subplot(2, 2, 1)
    ax2 = fig.add_subplot(2, 2, 2)
    ax3 = fig.add_subplot(2, 2, 3)
    ax4 = fig.add_subplot(2, 2, 4)
    
    # tnse_plot on word2vec model vocabs
    labels_tnse_plot1 = []
    tokens_tnse_plot1 = []
    
    for word in model.wv.index_to_key:
        tokens_tnse_plot1.append(model.wv[word])
        labels_tnse_plot1.append(word)
    
    tsne_model = TSNE(perplexity=40, n_components=2, init='pca', n_iter=2500, random_state=4300)
    new_values = tsne_model.fit_transform(tokens_tnse_plot1)

    x_tnse_plot1 = []
    y_tnse_plot1 = []
    for value in new_values:
        x_tnse_plot1.append(value[0])
        y_tnse_plot1.append(value[1])
        
    for i in range(len(x_tnse_plot1)):
        ax1.scatter(x_tnse_plot1[i],y_tnse_plot1[i], s = 100 )
        ax1.annotate(labels_tnse_plot1[i],
                     xy=(x_tnse_plot1[i], y_tnse_plot1[i]),
                     xytext=(5, 2),
                     textcoords='offset points',
                     size = 1,
                     ha='right',
                     va='bottom')
        ax1.legend()
        ax1.set_title("T-SNE Model for all Word2vec Vocabulary",fontsize=9)
    
    # tnse_plot on top 1000 commonly seen words
    labels_tnse_plot2 = []
    tokens_tnse_plot2 = []
        
    for word in top_1000_freq.index:
        tokens_tnse_plot2.append(model.wv[word])
        labels_tnse_plot2.append(word)
    
    tsne_model = TSNE(perplexity=40, n_components=2, init='pca', n_iter=2500, random_state=4300)
    new_values = tsne_model.fit_transform(tokens_tnse_plot2)

    x_tnse_plot2 = []
    y_tnse_plot2 = []
    for value in new_values:
        x_tnse_plot2.append(value[0])
        y_tnse_plot2.append(value[1])
        
    for i in range(len(x_tnse_plot2)):
        ax2.scatter(x_tnse_plot2[i],y_tnse_plot2[i], s = 100 )
        ax2.annotate(labels_tnse_plot2[i],
                     xy=(x_tnse_plot2[i], y_tnse_plot2[i]),
                     xytext=(5, 2),
                     textcoords='offset points',
                     size = 1,
                     ha='right',
                     va='bottom')
        ax2.legend()
        ax2.set_title("T-SNE Model for top 1000 frequent Word2vec Vocabulary",fontsize=9)

    # tnse_plot on top 500 commonly seen words
    labels_tnse_plot3 = []
    tokens_tnse_plot3 = []
        
    for word in top_500_freq.index:
        tokens_tnse_plot3.append(model.wv[word])
        labels_tnse_plot3.append(word)
    
    tsne_model = TSNE(perplexity=40, n_components=2, init='pca', n_iter=2500, random_state=4300)
    new_values = tsne_model.fit_transform(tokens_tnse_plot3)

    x_tnse_plot3 = []
    y_tnse_plot3 = []
    for value in new_values:
        x_tnse_plot3.append(value[0])
        y_tnse_plot3.append(value[1])
        
    for i in range(len(x_tnse_plot3)):
        ax3.scatter(x_tnse_plot3[i],y_tnse_plot3[i], s = 100 )
        ax3.annotate(labels_tnse_plot3[i],
                     xy=(x_tnse_plot3[i], y_tnse_plot3[i]),
                     xytext=(5, 2),
                     textcoords='offset points',
                     size = 1,
                     ha='right',
                     va='bottom')
        ax3.legend()
        ax3.set_title("T-SNE Model for top 100 frequent Word2vec Vocabulary",fontsize=9)
        
    # tnse_plot on top 100 commonly seen words
    labels_tnse_plot4 = []
    tokens_tnse_plot4 = []
        
    for word in top_100_freq.index:
        tokens_tnse_plot4.append(model.wv[word])
        labels_tnse_plot4.append(word)
    
    tsne_model = TSNE(perplexity=40, n_components=2, init='pca', n_iter=2500, random_state=4300)
    new_values = tsne_model.fit_transform(tokens_tnse_plot4)

    x_tnse_plot4 = []
    y_tnse_plot4 = []
    for value in new_values:
        x_tnse_plot4.append(value[0])
        y_tnse_plot4.append(value[1])
        
    for i in range(len(x_tnse_plot4)):
        ax4.scatter(x_tnse_plot4[i],y_tnse_plot4[i], s = 100 )
        ax4.annotate(labels_tnse_plot4[i],
                     xy=(x_tnse_plot4[i], y_tnse_plot4[i]),
                     xytext=(5, 2),
                     textcoords='offset points',
                     size = 10,
                     ha='right',
                     va='bottom')
        ax4.legend()
        ax4.set_title("T-SNE Model for top 100 frequent Word2vec Vocabulary",fontsize=9)

    plt.show()

In [33]:
compare_tnse_models1(model)

We can see that the range of the x axis and y axis in the plot has dramatically shrunken. we can conclude that mostly used word in comments are positioned in the center, compactly gathered together. Just to see how it's significant, we gave a weight based on the frequency for the next plot. 

In [34]:
def compare_tnse_models2(model):

    fig = plt.figure(figsize = [30,30])
    ax1 = fig.add_subplot(2, 2, 1)
    ax2 = fig.add_subplot(2, 2, 2)
    ax3 = fig.add_subplot(2, 2, 3)
    ax4 = fig.add_subplot(2, 2, 4)
    
    # tnse_plot on word2vec model vocabs
    labels_tnse_plot1 = []
    tokens_tnse_plot1 = []
    
    for word in model.wv.index_to_key:
        tokens_tnse_plot1.append(model.wv[word])
        labels_tnse_plot1.append(word)
    
    tsne_model = TSNE(perplexity=40, n_components=2, init='pca', n_iter=2500, random_state=4300)
    new_values = tsne_model.fit_transform(tokens_tnse_plot1)

    x_tnse_plot1 = []
    y_tnse_plot1 = []
    for value in new_values:
        x_tnse_plot1.append(value[0])
        y_tnse_plot1.append(value[1])
        
    for i in range(len(x_tnse_plot1)):
        ax1.scatter(x_tnse_plot1[i],y_tnse_plot1[i], s = 100)
        ax1.annotate(labels_tnse_plot1[i],
                     xy=(x_tnse_plot1[i], y_tnse_plot1[i]),
                     xytext=(5, 2),
                     textcoords='offset points',
                     size = 1,
                     ha='right',
                     va='bottom')
        ax1.legend()
        ax1.set_title("T-SNE Model for all Word2vec Vocabulary",fontsize=9)
    
    # tnse_plot on top 1000 commonly seen words
    labels_tnse_plot2 = []
    tokens_tnse_plot2 = []
        
    for word in top_1000_freq.index:
        tokens_tnse_plot2.append(model.wv[word])
        labels_tnse_plot2.append(word)
    
    tsne_model = TSNE(perplexity=40, n_components=2, init='pca', n_iter=2500, random_state=4300)
    new_values = tsne_model.fit_transform(tokens_tnse_plot2)

    x_tnse_plot2 = []
    y_tnse_plot2 = []
    for value in new_values:
        x_tnse_plot2.append(value[0])
        y_tnse_plot2.append(value[1])
        
    for i in range(len(x_tnse_plot2)):
        ax2.scatter(x_tnse_plot2[i],y_tnse_plot2[i], s = (100 * top_1000_freq[i])/50, alpha=0.5)
        ax2.annotate(labels_tnse_plot2[i],
                     xy=(x_tnse_plot2[i], y_tnse_plot2[i]),
                     xytext=(5, 2),
                     textcoords='offset points',
                     size = 1,
                     ha='right',
                     va='bottom')
        ax2.legend()
        ax2.set_title("T-SNE Model for top 1000 frequent Word2vec Vocabulary",fontsize=9)

    # tnse_plot on top 500 commonly seen words
    labels_tnse_plot3 = []
    tokens_tnse_plot3 = []
        
    for word in top_500_freq.index:
        tokens_tnse_plot3.append(model.wv[word])
        labels_tnse_plot3.append(word)
    
    tsne_model = TSNE(perplexity=40, n_components=2, init='pca', n_iter=2500, random_state=4300)
    new_values = tsne_model.fit_transform(tokens_tnse_plot3)

    x_tnse_plot3 = []
    y_tnse_plot3 = []
    for value in new_values:
        x_tnse_plot3.append(value[0])
        y_tnse_plot3.append(value[1])
        
    for i in range(len(x_tnse_plot3)):
        ax3.scatter(x_tnse_plot3[i],y_tnse_plot3[i], s = (100 * top_500_freq[i])/50, alpha=0.5)
        ax3.annotate(labels_tnse_plot3[i],
                     xy=(x_tnse_plot3[i], y_tnse_plot3[i]),
                     xytext=(5, 2),
                     textcoords='offset points',
                     size = 1,
                     ha='right',
                     va='bottom')
        ax3.legend()
        ax3.set_title("T-SNE Model for top 100 frequent Word2vec Vocabulary",fontsize=9)
        
    # tnse_plot on top 100 commonly seen words
    labels_tnse_plot4 = []
    tokens_tnse_plot4 = []
        
    for word in top_100_freq.index:
        tokens_tnse_plot4.append(model.wv[word])
        labels_tnse_plot4.append(word)
    
    tsne_model = TSNE(perplexity=40, n_components=2, init='pca', n_iter=2500, random_state=4300)
    new_values = tsne_model.fit_transform(tokens_tnse_plot4)

    x_tnse_plot4 = []
    y_tnse_plot4 = []
    for value in new_values:
        x_tnse_plot4.append(value[0])
        y_tnse_plot4.append(value[1])
        
    for i in range(len(x_tnse_plot4)):
        ax4.scatter(x_tnse_plot4[i],y_tnse_plot4[i], s = (100 * top_100_freq[i])/50, alpha=0.5)
        ax4.annotate(labels_tnse_plot4[i],
                     xy=(x_tnse_plot4[i], y_tnse_plot4[i]),
                     xytext=(5, 2),
                     textcoords='offset points',
                     size = 10,
                     ha='right',
                     va='bottom')
        ax4.legend()
        ax4.set_title("T-SNE Model for top 100 frequent Word2vec Vocabulary",fontsize=9)

    plt.show()

In [35]:
compare_tnse_models2(model)

As seen in this bubble graph, most of the bigger bubbles are positioned in the center which indicates mostly used words are positioned in the center. Looking at the frequently used words, they mostly have neutral nuances, which makes the plot showing compactly gathered bubbles. The extreme words, either positive or negative, tend to stay on the edge of the t-sne plot for all Word2Vec vocabulary. This is explainable for example, some people might have negative perspective on vaccine while the others have positive perspective on vaccine, thus the word vaccine is the frequently appeared word, while the words with extreme persepective tends to stay in edge. Just to make sure if our hypothesis is right. We'll now run tsne plot based on closest words. If the points in the plot are scatterd, then the hypothesis of mostly used vocabs are neutral words.

In [36]:
def display_closestwords_tsnescatterplot(model, word, size):
    
    arr = np.empty((0,size), dtype='f')
    word_labels = [word]

    close_words = model.wv.most_similar(word)

    arr = np.append(arr, np.array([model.wv[word]]), axis=0)
    for wrd_score in close_words:
        wrd_vector = model.wv[wrd_score[0]]
        word_labels.append(wrd_score[0])
        arr = np.append(arr, np.array([wrd_vector]), axis=0)
        
    tsne = TSNE(n_components=2, random_state=0)
    np.set_printoptions(suppress=True)
    Y = tsne.fit_transform(arr)

    x_coords = Y[:, 0]
    y_coords = Y[:, 1]
    plt.scatter(x_coords, y_coords)

    for label, x, y in zip(word_labels, x_coords, y_coords):
        plt.annotate(label, xy=(x, y), xytext=(0, 0), textcoords='offset points')
    plt.xlim(x_coords.min()+0.0005, x_coords.max()+0.0005)
    plt.ylim(y_coords.min()+0.0005, y_coords.max()+0.0005)
    plt.show()

In [37]:
display_closestwords_tsnescatterplot(model, 'vaccine', 100)

In [38]:
display_closestwords_tsnescatterplot(model, 'covid', 100)

In [39]:
display_closestwords_tsnescatterplot(model, 'distancing', 100)

In [40]:
display_closestwords_tsnescatterplot(model, 'government', 100)

The plot created by "display_closestwords_tsnescatterplot" function shows that the closest words are scattered all over the plot, thus reinforcing our hypothesis.

**Next part will be covered in different ipynb file since the computation time until this part is already too long to run all the part.**