# 1. Import Packages

In [2]:
import pandas as pd
import matplotlib.pyplot as plt
import numpy as np
import nltk

In [3]:
import re
import string

In [4]:
pd.set_option('display.max_columns', None)
pd.set_option('display.max_rows', None)

In [6]:
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
from nltk.stem import WordNetLemmatizer

In [7]:
import warnings
warnings.filterwarnings('ignore')

In [10]:
from gensim.models import Phrases
from gensim.models.word2vec import LineSentence
import spacy
import itertools as it
import codecs
import os

In [66]:
from gensim.corpora import Dictionary, MmCorpus
from gensim.models.ldamulticore import LdaMulticore

import pyLDAvis
import pyLDAvis.gensim_models
import pickle

In [11]:
# create empty list at beginning of every jupyter notebook
hyperparameter_table  = []

# 2. Load Datasets

In [12]:
# load dataset
news = pd.read_csv('../Data/news_classification_Feb06_24.csv', index_col=0, parse_dates=['date', 'election_day'])
news.head()

Unnamed: 0,title,text,subject,date,label,char_count_text,word_count_text,avg_word_length_text,char_count_title,word_count_title,avg_word_length_title,year,month,day,day_of_week,election_day,days_to_election
22216,Ben Stein Calls Out 9th Circuit Court: Committ...,"21st Century Wire says Ben Stein, reputable pr...",US_News,2017-02-13,fake_news,1028,171,6.011696,89,14,6.357143,2017,2,13,0,2016-11-08,97
27917,Trump drops Steve Bannon from National Securit...,WASHINGTON (Reuters) - U.S. President Donald T...,politicsNews,2017-04-05,real_news,4820,771,6.251621,55,8,6.875,2017,4,5,2,2016-11-08,148
25007,Puerto Rico expects U.S. to lift Jones Act shi...,(Reuters) - Puerto Rico Governor Ricardo Rosse...,politicsNews,2017-09-27,real_news,1848,304,6.078947,64,10,6.4,2017,9,27,2,2016-11-08,323
1377,OOPS: Trump Just Accidentally Confirmed He Le...,"On Monday, Donald Trump once again embarrassed...",News,2017-05-22,fake_news,1244,183,6.797814,89,12,7.416667,2017,5,22,0,2016-11-08,195
32476,Donald Trump heads for Scotland to reopen a go...,"GLASGOW, Scotland (Reuters) - Most U.S. presid...",politicsNews,2016-06-24,real_news,3137,529,5.930057,55,10,5.5,2016,6,24,4,2016-11-08,-137


In [13]:
# check shape, rows and columns
news.shape

(38343, 17)

In [14]:
# check column names
news.columns

Index(['title', 'text', 'subject', 'date', 'label', 'char_count_text',
       'word_count_text', 'avg_word_length_text', 'char_count_title',
       'word_count_title', 'avg_word_length_title', 'year', 'month', 'day',
       'day_of_week', 'election_day', 'days_to_election'],
      dtype='object')

In [15]:
news.info()

<class 'pandas.core.frame.DataFrame'>
Index: 38343 entries, 22216 to 28254
Data columns (total 17 columns):
 #   Column                 Non-Null Count  Dtype         
---  ------                 --------------  -----         
 0   title                  38343 non-null  object        
 1   text                   38343 non-null  object        
 2   subject                38343 non-null  object        
 3   date                   38343 non-null  datetime64[ns]
 4   label                  38343 non-null  object        
 5   char_count_text        38343 non-null  int64         
 6   word_count_text        38343 non-null  int64         
 7   avg_word_length_text   38343 non-null  float64       
 8   char_count_title       38343 non-null  int64         
 9   word_count_title       38343 non-null  int64         
 10  avg_word_length_title  38343 non-null  float64       
 11  year                   38343 non-null  int64         
 12  month                  38343 non-null  int64         
 13  da

In [16]:
news['month'] = news['month'].map({1:'Jan', 2:'Feb', 3:'Mar', 4:'Apr', 5:'May', 6:'Jun',\
                                  7:'Jul', 8:'Aug', 9:'Sept', 10:'Oct', 11:'Nov', 12:'Dec'})
news['day_of_week'] = news['day_of_week'].map({0:'Monday', 1:'Tuesday', 2:'Wednesday', 3:'Thursday', 4:'Friday',\
                                               5:'Saturday', 6:'Sunday'})

In [17]:
news.info()

<class 'pandas.core.frame.DataFrame'>
Index: 38343 entries, 22216 to 28254
Data columns (total 17 columns):
 #   Column                 Non-Null Count  Dtype         
---  ------                 --------------  -----         
 0   title                  38343 non-null  object        
 1   text                   38343 non-null  object        
 2   subject                38343 non-null  object        
 3   date                   38343 non-null  datetime64[ns]
 4   label                  38343 non-null  object        
 5   char_count_text        38343 non-null  int64         
 6   word_count_text        38343 non-null  int64         
 7   avg_word_length_text   38343 non-null  float64       
 8   char_count_title       38343 non-null  int64         
 9   word_count_title       38343 non-null  int64         
 10  avg_word_length_title  38343 non-null  float64       
 11  year                   38343 non-null  int64         
 12  month                  38343 non-null  object        
 13  da

# 3. Data Preprocessing

## 3.1 Data Preparation

In [18]:
def preprocess_text(text):
    
    # remove urls
    text = re.sub(r'https?://[^\s\n\r]+', '', text)
    # remove @blablabla
    text = re.sub(r'@[a-zA-Z0-9_]+', '', text)
    # remove :blablabla
    text = re.sub(r':[a-zA-Z0-9./]+', '', text)
    
    tokens = word_tokenize(text)

    stop_words = set(stopwords.words('english'))
    tokens = [word for word in tokens if word not in stop_words]

    lemmatizer = WordNetLemmatizer()
    tokens = [lemmatizer.lemmatize(word) for word in tokens]

    preprocessed_text = ' '.join(tokens)

    return preprocessed_text

In [26]:
news['clean_text'] = news['text'].apply(preprocess_text)

In [27]:
real_news_text = news[news.label=='real_news']['clean_text'].to_list()
fake_news_text = news[news.label=='fake_news']['clean_text'].to_list()

In [31]:
directory = '..\\Intermediate'
real_news_text_filepath = os.path.join(directory,
                                   'real_news_text.txt')
real_news_text_filepath

'..\\Intermediate\\real_news_text.txt'

In [33]:
if 0 ==0:
    with codecs.open(real_news_text_filepath, 'w', encoding='utf_8') as f:
        for text in real_news_text:
            f.write(text + '\n')
else:
    real_news_text = codecs.open(real_news_text_filepath, encoding='utf_8')

check one example

In [34]:
with codecs.open(real_news_text_filepath, encoding='utf_8') as f:
    sample = list(it.islice(f, 2, 3))[0] 
print(sample)

GLASGOW , Scotland ( Reuters ) - Most U.S. presidential candidate go abroad sharpen foreign policy credential . Donald Trump arrives Scotland Friday reopen golf resort . The presumptive Republican nominee , 70 , visit family ’ ancestral homeland showcase far-flung business empire . His mother born Scotland ’ Isle Lewis . With throng reporter watching , make dramatic arrival helicopter seaside Trump Turnberry resort . He scheduled news conference 9th hole noon ( 7 a.m. ET/1100 GMT ) . His visit Turnberry , followed stop resort Aberdeen Saturday , allow comment outcome Britain ’ vote Thursday whether remain European Union . “ I ’ think opening golf resort get many foreign policy chop , ” said Saul Anuzis , former chairman Michigan Republican Party . “ But since ’ right middle EU vote , may end PR bonanza him. ” The risk real-estate tycoon , yet hold public office rate unfavorably 70 percent Americans opinion poll , make foreign policy misstep time Republican leader urging serious demeano

In [35]:
nlp = spacy.load('en_core_web_sm')

In [36]:
parsed = nlp(sample)

In [None]:
parsed.vector

In [1]:
dir(parsed)

NameError: name 'parsed' is not defined

In [37]:
for num, sentence in enumerate(parsed.sents):
    print ('Sentence {}:'.format(num + 1))
    print (sentence)
    print ('')

Sentence 1:
GLASGOW , Scotland ( Reuters ) - Most U.S. presidential candidate go abroad sharpen foreign policy credential .

Sentence 2:
Donald Trump arrives Scotland Friday reopen golf resort .

Sentence 3:
The presumptive Republican nominee , 70 , visit family ’ ancestral homeland showcase far-flung business empire .

Sentence 4:
His mother born Scotland ’ Isle Lewis .

Sentence 5:
With throng reporter watching , make dramatic arrival helicopter seaside Trump Turnberry resort .

Sentence 6:
He scheduled news conference 9th hole noon ( 7 a.m. ET/1100 GMT ) .

Sentence 7:
His visit Turnberry , followed stop resort Aberdeen Saturday , allow comment outcome Britain ’ vote Thursday whether remain European Union .

Sentence 8:
“ I ’ think opening golf resort get many foreign policy chop , ” said Saul Anuzis , former chairman Michigan Republican Party .

Sentence 9:
“ But since ’ right middle EU vote , may end PR bonanza him. ”

Sentence 10:
The risk real-estate tycoon , yet hold public off

### Unigram Sentences

In [38]:
def punct_space(token):
    
    return token.is_punct or token.is_space

def line(filename):
    
    with codecs.open(filename, encoding='utf_8') as f:
        for line in f:
            yield line
            
def lemmatized_sentence_corpus(filename):
    
    for parsed in nlp.pipe(line(filename),
                                  batch_size=1000):
        for sent in parsed.sents:                        # one sentence per line, be careful
            yield ' '.join([token.lemma_ for token in sent
                             if not punct_space(token)])

In [39]:
real_news_unigram_sentences_filepath = os.path.join(directory,
                                   'real_news_unigram_sentences.txt')
real_news_unigram_sentences_filepath

'..\\Intermediate\\real_news_unigram_sentences.txt'

In [40]:
if 0==0:
    with codecs.open(real_news_unigram_sentences_filepath, 'w', encoding='utf_8') as f:
            for sentence in lemmatized_sentence_corpus(real_news_text_filepath):
                f.write(sentence + '\n')

In [42]:
real_news_unigram_sentences = LineSentence(real_news_unigram_sentences_filepath)

In [45]:
for unigram_sentence in it.islice(real_news_unigram_sentences, 0, 1):
    print (' '.join(unigram_sentence))
    print ('')

WASHINGTON Reuters U.S. President Donald Trump remove chief strategist Steve Bannon National Security Council Wednesday reverse controversial decision early year give political adviser unprecedented role security discussion



### Bigram Sentences

In [46]:
real_news_bigram_model_filepath = os.path.join('..\\Intermediate', 'real_news_bigram_model')
real_news_bigram_model_filepath

'..\\Intermediate\\real_news_bigram_model'

In [47]:
if 0 == 0:
    
    real_news_bigram_model = Phrases(real_news_unigram_sentences)      # use Phrases() build a bigram model first

    real_news_bigram_model.save(real_news_bigram_model_filepath)
else:
    real_news_bigram_model = Phrases.load(real_news_bigram_model_filepath)

In [48]:
real_news_bigram_sentences_filepath = os.path.join(directory,
                                   'real_news_bigram_sentences.txt')
real_news_bigram_sentences_filepath

'..\\Intermediate\\real_news_bigram_sentences.txt'

In [49]:
if 0 == 0:

    with codecs.open('..\\Intermediate\\real_news_bigram_sentences.txt', 'w', encoding='utf_8') as f:
        
        for unigram_sentence in real_news_unigram_sentences:
            
            bigram_sentence = ' '.join(real_news_bigram_model[unigram_sentence])   # use the bigram model to treat unigram_sentence in unigram_sentences.
            
            f.write(bigram_sentence + '\n')

In [50]:
real_news_bigram_sentences = LineSentence(real_news_bigram_sentences_filepath)

In [58]:
for bigram_sentence in it.islice(real_news_bigram_sentences, 0, 1):
    print (' '.join(bigram_sentence))
    print ('')

WASHINGTON_Reuters U.S. President_Donald Trump remove chief_strategist Steve_Bannon National_Security Council Wednesday reverse controversial decision early year give political adviser unprecedented role security discussion



### Trigram Sentences

In [52]:
real_news_trigram_model_filepath = os.path.join('..\\Intermediate', 'real_news_trigram_model')
real_news_trigram_model_filepath

'..\\Intermediate\\real_news_trigram_model'

In [53]:
if 0 == 0:
    
    real_news_trigram_model = Phrases(real_news_bigram_sentences)      # use Phrases() build a bigram model first

    real_news_bigram_model.save(real_news_bigram_model_filepath)
else:
    real_news_bigram_model = Phrases.load(real_news_bigram_model_filepath)

In [54]:
real_news_trigram_sentences_filepath = os.path.join(directory,
                                   'real_news_trigram_sentences.txt')
real_news_trigram_sentences_filepath

'..\\Intermediate\\real_news_trigram_sentences.txt'

In [55]:
if 0 == 0:

    with codecs.open('..\\Intermediate\\real_news_trigram_sentences.txt', 'w', encoding='utf_8') as f:
        
        for bigram_sentence in real_news_bigram_sentences:
            
            trigram_sentence = ' '.join(real_news_trigram_model[bigram_sentence])   # use the bigram model to treat unigram_sentence in unigram_sentences.
            
            f.write(trigram_sentence + '\n')

In [56]:
real_news_trigram_sentences = LineSentence(real_news_trigram_sentences_filepath)

In [59]:
for trigram_sentence in it.islice(real_news_trigram_sentences, 0, 1):
    print (' '.join(trigram_sentence))
    print ('')

WASHINGTON_Reuters_U.S. President_Donald_Trump remove chief_strategist_Steve_Bannon National_Security_Council Wednesday reverse controversial decision early_year give political adviser unprecedented role security discussion



### Trigram_All One News per Line

In [60]:
real_news_trigram_all_filepath = os.path.join(directory,
                                        'real_news_trigram_all.txt')
real_news_trigram_all_filepath

'..\\Intermediate\\real_news_trigram_all.txt'

In [62]:
if 0 == 0:

    with codecs.open(real_news_trigram_all_filepath, 'w', encoding='utf_8') as f:
        
        for parsed in nlp.pipe(line(real_news_text_filepath),
                                      batch_size=1000):                     # this is each review per loop, different with unigram_sentences
            
            # lemmatize the text, removing punctuation and whitespace
            unigram = [token.lemma_ for token in parsed
                              if not punct_space(token)]
            
            # apply the first-order and second-order phrase models
            bigram = real_news_bigram_model[unigram]
            trigram = real_news_trigram_model[bigram]
            
            # remove any remaining stopwords
            trigram = [term for term in trigram
                              if term not in spacy.lang.en.STOP_WORDS]
            
            # write the transformed review as a line in the new file
            trigram = ' '.join(trigram)
            f.write(trigram + '\n')

In [63]:
print ('Original:' + '\n')

for sent in it.islice(line(real_news_text_filepath), 0, 1):
    print (sent)

print ('----' + '\n')
print ('Transformed:' + '\n')

with codecs.open(real_news_trigram_all_filepath, encoding='utf_8') as f:
    for sent in it.islice(f, 0, 1):
        print (sent)

Original:

WASHINGTON ( Reuters ) - U.S. President Donald Trump removed chief strategist Steve Bannon National Security Council Wednesday , reversing controversial decision early year give political adviser unprecedented role security discussion . Trump ’ overhaul NSC , confirmed White House official , also elevated General Joseph Dunford , chairman Joint Chiefs Staff Dan Coats , director National Intelligence head 17 U.S. intelligence agency . The official said change move NSC “ back core function ’ supposed do. ” It also appears mark victory national security adviser H.R . McMaster , told national security expert felt “ battle death ” Bannon others White House staff . Vice President Mike Pence said Bannon would continue play important role policy played shake-up routine . “ This natural evolution ensure National Security Council organized way best serf president resolving making difficult decision , ” Pence said Fox News . Bannon said statement succeeded returning NSC back traditiona

### LDA

### Dictionary

In [64]:
real_news_trigram_dictionary_filepath = os.path.join(directory,
                                           'real_news_trigram_dict.dict')
real_news_trigram_dictionary_filepath

'..\\Intermediate\\real_news_trigram_dict.dict'

In [67]:
if 0 == 0:

    real_news_trigram_all = LineSentence(real_news_trigram_all_filepath)

    # learn the dictionary by iterating over all of the reviews
    real_news_trigram_dictionary = Dictionary(real_news_trigram_all)
    
    # filter tokens that are very rare or too common from
    # the dictionary (filter_extremes) and reassign integer ids (compactify)
    real_news_trigram_dictionary.filter_extremes(no_below=10, no_above=0.4)
    real_news_trigram_dictionary.compactify()

    real_news_trigram_dictionary.save(real_news_trigram_dictionary_filepath)
    
# load the finished dictionary from disk
real_news_trigram_dictionary = Dictionary.load(real_news_trigram_dictionary_filepath)

#### Bags Of Words

In [68]:
real_news_trigram_bow_filepath = os.path.join(directory,
                                    'real_news_trigram_bow.mm')
real_news_trigram_bow_filepath

'..\\Intermediate\\real_news_trigram_bow.mm'

In [69]:
def trigram_bow_generator(dictionary, filepath):
    
    for sent in LineSentence(filepath):
        yield dictionary.doc2bow(sent)

In [70]:
if 0 == 0:

    # generate bag-of-words representations for
    # all reviews and save them as a matrix
    MmCorpus.serialize(real_news_trigram_bow_filepath,
                       trigram_bow_generator(real_news_trigram_dictionary, real_news_trigram_all_filepath))
    
# load the finished bag-of-words corpus from disk
real_news_trigram_bow = MmCorpus(real_news_trigram_bow_filepath)

In [71]:
real_news_lda_model_filepath = os.path.join(directory, 'real_news_lda_model')
real_news_lda_model_filepath

'..\\Intermediate\\real_news_lda_model'

In [72]:
if 0 == 0:

    with warnings.catch_warnings():
        warnings.simplefilter('ignore')
        
        # workers => sets the parallelism, and should be
        # set to your number of physical cores minus one
        lda = LdaMulticore(real_news_trigram_bow,
                           num_topics=50,
                           id2word=real_news_trigram_dictionary,
                           workers=3)
    
    lda.save(real_news_lda_model_filepath)
    
# load the finished LDA model from disk
real_news_lda_model = LdaMulticore.load(real_news_lda_model_filepath)

### LDA Vis

In [76]:
real_news_lda_vis_filepath = os.path.join(directory, 'real_news_lda_vis')
real_news_lda_vis_filepath

'..\\Intermediate\\real_news_lda_vis'

In [78]:
import pyLDAvis
import pyLDAvis.gensim_models

In [79]:
if 0 == 0:

    real_news_lda_vis = pyLDAvis.gensim_models.prepare(real_news_lda_model, real_news_trigram_bow,
                                              real_news_trigram_dictionary)

    with open(real_news_lda_vis_filepath, 'wb') as f:
        pickle.dump(real_news_lda_vis, f)
        
# load the pre-prepared pyLDAvis data from disk
with open(real_news_lda_vis_filepath, 'rb') as f:
    real_news_lda_vis = pickle.load(f)

In [None]:
pyLDAvis.display(real_news_lda_vis)

### Word2Vec

In [None]:
from gensim.models import Word2Vec
real_news_word2vec_filepath = os.path.join(directory, 'real_news_word2vec')

In [90]:
real_news_trigram_sentences

<gensim.models.word2vec.LineSentence at 0x2702013dd10>

In [91]:
if 0 == 1:

    # initiate the model and perform the first training
    real_news_word2vec = Word2Vec(real_news_trigram_sentences, vector_size=100, window=5,
                        min_count=20, sg=1, workers=4)
    
    real_news_word2vec.save(real_news_word2vec_filepath)
real_news_word2vec = Word2Vec.load(real_news_word2vec_filepath)

In [94]:
real_news_word2vec.corpus_count, len(real_news_word2vec.wv.key_to_index)

(340635, 18410)

In [96]:
ordered_vocab = [(term, real_news_word2vec.wv.key_to_index[term], real_news_word2vec.wv.get_vecattr(term, "count"))
                 for term in real_news_word2vec.wv.index_to_key]

# sort by the term counts, so the most common terms appear first
ordered_vocab = sorted(ordered_vocab, key=lambda x: -x[2])

# unzip the terms, integer indices, and counts into separate lists
ordered_terms, term_indices, term_counts = zip(*ordered_vocab)

# create a DataFrame with the food2vec vectors as data,
# and the terms as row labels
real_news_word_vectors = pd.DataFrame(real_news_word2vec.wv.vectors[term_indices, :],
                            index=ordered_terms)

real_news_word_vectors.head()

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,10,11,12,13,14,15,16,17,18,19,20,21,22,23,24,25,26,27,28,29,30,31,32,33,34,35,36,37,38,39,40,41,42,43,44,45,46,47,48,49,50,51,52,53,54,55,56,57,58,59,60,61,62,63,64,65,66,67,68,69,70,71,72,73,74,75,76,77,78,79,80,81,82,83,84,85,86,87,88,89,90,91,92,93,94,95,96,97,98,99
say,0.110145,0.02225,0.055408,0.142933,-0.03949,-0.246861,0.415353,0.344622,-0.216672,0.105172,0.272264,-0.321129,-0.25054,0.153642,-0.178507,-0.328745,0.204094,-0.049332,0.009555,-0.186989,0.47168,0.305057,0.35522,0.148379,0.36324,-0.276479,-0.00906,-0.232297,-0.295443,-0.110312,-0.135962,-0.120934,0.353548,-0.322947,-0.18785,0.149778,0.201058,-0.162788,-0.039673,-0.143629,-0.06052,-0.103554,-0.279324,0.173735,0.227587,0.180856,0.126691,-0.027608,-0.049896,0.060028,-0.003239,-0.192492,-0.002336,0.260607,0.236607,-0.083376,0.40922,0.04086,-0.394696,0.425207,0.165911,0.194524,-0.131225,-0.164808,0.087429,0.085021,0.011671,0.220102,0.040846,0.5215,-0.249261,0.403087,0.013919,0.024781,0.308117,-0.075383,0.130454,0.009785,0.080878,-0.211569,-0.301078,0.088849,0.182327,-0.134489,0.07449,0.137081,0.019823,-0.119592,-0.073802,0.316004,0.05052,0.018615,-0.06126,0.157121,0.668265,0.327907,-0.136216,0.065093,0.334142,-0.121183
the,-0.125943,0.429348,0.13277,0.115367,-0.041841,-0.134927,-0.108476,0.330035,-0.309472,0.182577,-0.037074,-0.499255,0.494982,0.45795,0.258865,-0.109128,0.062675,0.083937,-0.084367,-0.273813,0.276421,0.096095,0.0079,-0.402031,0.170778,0.094794,0.042656,0.284756,-0.286918,0.065139,0.466598,-0.301699,0.277622,-0.320715,0.160685,0.322008,0.780954,0.340576,0.189809,-0.032409,0.253641,-0.339487,-0.20568,-0.138491,-0.25956,0.301743,0.308593,0.053164,-0.075965,0.515956,-0.064779,-0.448316,0.106503,-0.143768,-0.095386,0.508993,0.574223,0.164373,-0.256414,0.245802,-0.151521,0.124532,-0.406753,-0.148186,-0.552024,-0.058343,0.25989,-0.161202,-0.244414,0.585582,-0.142425,-0.344425,0.231574,-0.099862,0.423298,-0.04241,-0.367312,-0.392531,0.038351,-0.468196,-0.591707,0.172718,0.373988,0.569106,0.139431,0.051205,0.179728,0.194714,-0.151443,-0.132362,0.312088,0.145607,0.210489,-0.319813,0.056565,0.171183,0.19446,-0.569328,-0.151574,0.079923
Trump,-0.457619,0.363527,-0.063962,0.037507,0.369757,-0.111666,-0.537992,0.46675,-0.43961,0.108536,-0.173973,-0.007074,0.367864,-0.028437,0.479539,0.063177,0.482697,0.227937,-0.04776,-0.5498,-0.386619,0.208604,0.200864,0.115421,0.237754,0.192053,0.419061,-0.18273,-0.403593,-0.007667,0.258249,-0.520789,-0.092485,-0.330339,-0.226717,0.140558,-0.009435,-0.239304,0.033586,0.150427,0.044613,0.102228,-0.337978,0.671564,0.266134,0.071934,0.177383,-0.352279,0.103319,0.152833,0.110331,-0.200609,-0.445257,-0.361655,-0.370046,-0.175088,0.349046,-0.159559,-0.135702,-0.024931,0.327306,-0.051884,0.095576,0.060507,-0.462314,0.591025,-0.178062,0.554397,-0.309613,0.450108,0.234949,0.360778,0.08992,0.029797,-0.304243,0.207989,0.269013,0.149643,-0.124127,-0.259683,-0.193828,-0.440118,-0.304619,-0.474476,0.006413,-0.267955,0.14894,-0.239152,0.389752,-0.363627,-0.074849,0.07548,-0.108334,0.112652,0.670363,0.008398,-0.232719,-0.472795,0.064881,-0.252219
would,-0.417113,0.208625,0.082007,0.220912,0.270327,-0.412485,0.225899,0.548629,-0.255715,-0.003837,0.098908,-0.140883,-0.089983,0.080333,0.136232,-0.030559,0.189429,-0.4381,-0.037387,-0.311271,0.3905,-0.017768,0.119441,0.134507,-0.271147,-0.101759,0.058344,0.117257,0.110779,0.090422,-0.049041,0.106531,0.051917,-0.224935,-0.104212,0.498313,0.407752,-0.310452,-0.310087,-0.038088,0.218685,-0.064073,-0.090064,0.182948,0.184501,0.118341,-0.046851,-0.367239,-0.288256,-0.10272,0.056133,0.073472,-0.170931,-0.244357,-0.287223,-0.065049,-0.004553,-0.077127,0.078588,-0.037982,0.03655,0.404629,0.078957,0.180622,0.13487,-0.344218,-0.073942,0.290039,-0.184428,0.449531,0.198782,0.107902,0.192531,0.145769,0.346908,-0.298105,-0.216542,-0.069512,0.094792,-0.28545,-0.554842,0.108575,0.116455,-0.190565,-0.109895,-0.162146,0.372308,-0.22931,0.049488,0.1496,-0.166978,0.126189,0.222463,0.195286,0.785563,0.182109,0.052971,-0.252531,-0.305694,-0.074304
U.S.,-0.357539,0.555569,0.025334,-0.695839,-0.053642,-0.166754,0.218581,0.310311,-0.246104,-0.354281,0.015784,-0.309145,-0.091083,0.001104,0.074342,-0.37818,0.26311,0.168644,0.327984,-0.030569,0.23141,-0.048283,-0.126365,-0.174481,-0.135309,0.156365,-0.129408,-0.070077,-0.532682,-0.114979,0.233937,-0.237596,-0.029343,-0.364317,0.367345,-0.228986,-0.381886,-0.302417,0.029292,-0.249661,0.33274,-0.190205,0.04133,0.55031,0.300577,-0.560136,0.054828,-0.140188,-0.218935,-0.144511,0.346553,0.447323,0.349369,-0.3949,-0.094099,-0.45653,-0.000465,-0.259643,-0.128856,-0.50975,0.168275,0.078938,0.0608,-0.069234,-0.284729,0.022667,0.344668,0.607191,-0.540314,0.729984,-0.182655,0.419211,0.105571,-0.077553,-0.096295,0.116728,0.352063,0.136438,0.164897,-0.641014,-0.830788,-0.269426,0.375405,0.107253,0.112597,-0.227222,0.511392,0.194309,0.084119,-0.178225,0.40194,0.010482,-0.009412,-0.289332,0.160756,0.334874,0.134538,-0.109831,0.063723,-0.126012


In [97]:
real_news_word_vectors.shape

(18410, 100)

In [98]:
def get_related_terms(token, topn=10):

    for word, similarity in real_news_word2vec.wv.most_similar(positive=[token], topn=topn):

        print ('{:20} {}'.format(word, round(similarity, 3)))

In [99]:
get_related_terms('China')

Beijing              0.84
South_China_Sea      0.797
chinese              0.775
Taiwan               0.769
Vietnam              0.764
dispute_South_China_Sea 0.681
India                0.674
Xi                   0.672
Chinese              0.669
self_rule_Taiwan     0.668


In [102]:
get_related_terms('Iran')

Tehran               0.842
Islamic_Republic     0.75
iranian              0.747
Hezbollah            0.704
nuclear_deal         0.691
2015_nuclear_deal    0.687
Saudi_Arabia         0.668
government_sweeping_collection 0.655
Riyadh               0.649
support_terrorism    0.648


### Word Vector Visualization with t-SNE

In [107]:
from sklearn.manifold import TSNE

In [113]:
real_news_tsne_df = real_news_word_vectors.drop(spacy.lang.en.STOP_WORDS, errors='ignore')
real_news_tsne_df = real_news_tsne_df.head(5000)

In [114]:
real_news_tsne_df.head()

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,10,11,12,13,14,15,16,17,18,19,20,21,22,23,24,25,26,27,28,29,30,31,32,33,34,35,36,37,38,39,40,41,42,43,44,45,46,47,48,49,50,51,52,53,54,55,56,57,58,59,60,61,62,63,64,65,66,67,68,69,70,71,72,73,74,75,76,77,78,79,80,81,82,83,84,85,86,87,88,89,90,91,92,93,94,95,96,97,98,99
Trump,-0.457619,0.363527,-0.063962,0.037507,0.369757,-0.111666,-0.537992,0.46675,-0.43961,0.108536,-0.173973,-0.007074,0.367864,-0.028437,0.479539,0.063177,0.482697,0.227937,-0.04776,-0.5498,-0.386619,0.208604,0.200864,0.115421,0.237754,0.192053,0.419061,-0.18273,-0.403593,-0.007667,0.258249,-0.520789,-0.092485,-0.330339,-0.226717,0.140558,-0.009435,-0.239304,0.033586,0.150427,0.044613,0.102228,-0.337978,0.671564,0.266134,0.071934,0.177383,-0.352279,0.103319,0.152833,0.110331,-0.200609,-0.445257,-0.361655,-0.370046,-0.175088,0.349046,-0.159559,-0.135702,-0.024931,0.327306,-0.051884,0.095576,0.060507,-0.462314,0.591025,-0.178062,0.554397,-0.309613,0.450108,0.234949,0.360778,0.08992,0.029797,-0.304243,0.207989,0.269013,0.149643,-0.124127,-0.259683,-0.193828,-0.440118,-0.304619,-0.474476,0.006413,-0.267955,0.14894,-0.239152,0.389752,-0.363627,-0.074849,0.07548,-0.108334,0.112652,0.670363,0.008398,-0.232719,-0.472795,0.064881,-0.252219
U.S.,-0.357539,0.555569,0.025334,-0.695839,-0.053642,-0.166754,0.218581,0.310311,-0.246104,-0.354281,0.015784,-0.309145,-0.091083,0.001104,0.074342,-0.37818,0.26311,0.168644,0.327984,-0.030569,0.23141,-0.048283,-0.126365,-0.174481,-0.135309,0.156365,-0.129408,-0.070077,-0.532682,-0.114979,0.233937,-0.237596,-0.029343,-0.364317,0.367345,-0.228986,-0.381886,-0.302417,0.029292,-0.249661,0.33274,-0.190205,0.04133,0.55031,0.300577,-0.560136,0.054828,-0.140188,-0.218935,-0.144511,0.346553,0.447323,0.349369,-0.3949,-0.094099,-0.45653,-0.000465,-0.259643,-0.128856,-0.50975,0.168275,0.078938,0.0608,-0.069234,-0.284729,0.022667,0.344668,0.607191,-0.540314,0.729984,-0.182655,0.419211,0.105571,-0.077553,-0.096295,0.116728,0.352063,0.136438,0.164897,-0.641014,-0.830788,-0.269426,0.375405,0.107253,0.112597,-0.227222,0.511392,0.194309,0.084119,-0.178225,0.40194,0.010482,-0.009412,-0.289332,0.160756,0.334874,0.134538,-0.109831,0.063723,-0.126012
government,0.19457,-0.415184,-0.258677,-0.136898,-0.277751,-0.382263,0.222627,0.192875,-0.231817,0.259647,0.468333,-0.424434,-0.180429,0.116922,0.075645,-0.428767,0.409667,0.07982,0.297554,-0.165696,0.129286,0.081313,0.341184,0.02534,-0.253308,-0.065015,-0.031941,-0.215064,0.108924,-0.177154,0.093365,0.732164,-0.115695,0.112598,-0.164738,-0.236848,0.121379,-0.175765,-0.564788,-0.566401,-0.17287,-0.329744,0.370337,0.146282,-0.06863,0.065397,-0.701678,-0.520004,-0.23542,0.197183,-0.001782,-0.693277,0.117895,-0.105228,-0.388643,-0.131823,0.512784,-0.319088,-0.131763,0.002348,0.006708,0.131652,0.463538,0.404696,-0.152116,-0.084159,-0.617708,-0.207159,-0.138119,0.335952,-0.16979,0.39857,0.311048,0.046723,0.627117,-0.094072,0.251978,-0.24246,-0.221223,-0.313149,-0.067361,-0.210459,0.331315,0.305747,0.431946,-0.027316,0.039212,-0.135369,-0.226628,-0.117484,0.289979,0.165851,0.142013,0.180227,0.285263,0.087772,0.412525,-0.901963,-0.00926,0.371881
state,0.276575,-0.313395,0.681762,0.2319,-0.931349,-0.309266,0.474286,0.269192,-0.305134,-0.382344,-0.355919,-0.127155,-0.010365,0.118978,0.392968,0.143861,0.289159,-0.218833,0.114522,-0.226164,-0.02557,0.783709,-0.015308,-0.375356,-0.33863,-0.181913,-0.572766,0.078931,-0.302087,0.385651,0.24298,0.263023,0.024335,-0.402007,-0.373663,0.460485,-0.094593,0.229506,0.056478,0.131678,0.346556,0.037772,-0.006308,0.35586,0.240583,-0.41525,-0.70717,-0.214748,0.26689,0.186125,0.059906,0.222081,0.307232,-0.577307,-0.119092,0.365852,0.419974,-0.47848,0.263656,0.223976,0.189387,-0.040866,-0.607954,0.116216,0.024666,-0.029956,-0.497739,0.461264,-0.496338,0.092988,-0.112788,0.144778,0.910346,-0.285108,0.068278,0.186745,-0.280264,-0.39501,-0.026921,-0.331851,-0.157012,-0.269375,-0.08427,-0.247667,0.163837,-0.140256,0.235801,0.190437,0.301103,0.281683,-0.01668,0.21875,-0.067786,-0.006265,0.240683,0.716338,-0.025414,-0.206202,0.061642,0.177039
include,-0.203445,-0.255499,0.322242,-0.569754,-0.335734,-0.41082,0.32505,-0.023817,-0.080356,-0.03304,-0.041648,-0.158259,-0.051475,-0.033378,0.061017,-0.318776,0.710663,-0.094388,0.09309,-0.379512,0.062376,-0.231003,0.240118,-0.284572,0.043354,0.201609,0.121879,0.170025,0.073571,0.484795,0.134598,-0.110322,0.564773,-0.31707,0.315326,0.868766,0.097046,-0.139116,-0.359426,-0.344012,0.093776,-0.12307,-0.025307,0.51007,-0.111308,-0.363198,-0.375828,0.045164,-0.09348,0.233825,-0.237195,0.020411,-0.198066,-0.00726,0.072743,0.089027,0.082163,0.051927,0.148667,-0.155989,-0.111034,-0.103929,-0.095993,-0.173322,-0.015196,0.521757,0.082387,0.260704,-0.414951,0.166191,0.127905,0.398275,-0.13073,-0.067541,0.077764,0.156156,-0.122672,-0.248262,-0.338977,-0.144493,-0.108957,0.117237,0.472985,0.022357,0.050015,0.063112,-0.40406,-0.017062,0.151211,0.044581,0.515815,0.310183,-0.469103,-0.15799,0.26814,-0.133713,-0.060234,-0.27617,0.718409,-0.508562


In [115]:
real_news_tsne_filepath = os.path.join(directory,
                             'tsne')

real_news_tsne_vectors_filepath = os.path.join(directory,
                                     'real_news_tsne_vectors.npy')

In [116]:
if 0 == 0:
    
    real_news_tsne = TSNE()
    real_news_tsne_vectors = real_news_tsne.fit_transform(real_news_tsne_df.values)
    
    with open(real_news_tsne_filepath, 'wb') as f:
        pickle.dump(real_news_tsne, f)

    np.save(real_news_tsne_vectors_filepath, real_news_tsne_vectors)
    
with open(real_news_tsne_filepath, 'rb') as f:
    real_news_tsne = pickle.load(f)
    
real_news_tsne_vectors = np.load(real_news_tsne_vectors_filepath)

real_news_tsne_vectors = pd.DataFrame(real_news_tsne_vectors,
                            index=pd.Index(real_news_tsne_df.index),
                            columns=['x_coord', 'y_coord'])

In [121]:
real_news_tsne_vectors['word'] = real_news_tsne_vectors.index

In [123]:
real_news_tsne_vectors.head()

Unnamed: 0,x_coord,y_coord,word
Trump,30.219902,53.837284,Trump
U.S.,47.580402,-36.82328,U.S.
government,-25.584827,-26.750015,government
state,-19.097225,50.023891,state
include,20.938898,-7.178703,include


### Plotting with Bokeh

In [118]:
from bokeh.plotting import figure, show, output_notebook
from bokeh.models import HoverTool, ColumnDataSource
output_notebook()

In [124]:
plot_data = ColumnDataSource(real_news_tsne_vectors)

# create the plot and configure the
# title, dimensions, and tools
real_news_tsne_plot = figure(title='t-SNE Word Embeddings',
                   width = 800,
                   height = 800,
                   tools= ('pan, wheel_zoom, box_zoom,'
                           'box_select, reset'),
                   active_scroll='wheel_zoom')

# add a hover tool to display words on roll-over
real_news_tsne_plot.add_tools( HoverTool(tooltips = '@word') )

# draw the words as circles on the plot
real_news_tsne_plot.circle('x_coord', 'y_coord', source=plot_data,
                 color='blue', line_alpha=0.2, fill_alpha=0.1,
                 size=10, hover_line_color='black')

In [125]:
# configure visual elements of the plot
real_news_tsne_plot.title.text_font_size = '16pt'
real_news_tsne_plot.xaxis.visible = False
real_news_tsne_plot.yaxis.visible = False
real_news_tsne_plot.grid.grid_line_color = None
real_news_tsne_plot.outline_line_color = None

# engage!
show(real_news_tsne_plot);