In [117]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from sklearn.metrics.pairwise import cosine_similarity
from typing import *

class WordVectors:
    def __init__(self, vocab_path: str, vectors_path: str):
        self.vocab = pd.read_csv(vocab_path ,delimiter="\t",header=None)[0].to_list()
        self.vectors = np.genfromtxt(fname=vectors_path, delimiter="\t")
        
    def vector(self, word: str):
        word_idx = self.vocab.index(word)
        print(word_idx)
        return self.vectors[word_idx]
    
    def vocab_length(self):
        return len(self.vocab)
    
    def vector_shape(self):
        return self.vectors.shape
    
    def most_similar(self,word: str, n: int) -> list:
        w2v = self.vector(word)
        sim = cosine_similarity(w2v.reshape(1, -1),self.vectors)
        sim = np.argsort(sim)[0,::-1][1:n+1] #want the highest
        sim_words = [self.vocab[idx] for idx in sim]
        return sim_words


CNN = WordVectors("cnn_vocab.tsv", "cnn_vectors.tsv")
FoxNews = WordVectors("foxnews_vocab.tsv", "foxnews_vectors.tsv")



In [143]:
word = "donald_trump"
CNN.most_similar(word, 30)


843


['president',
 'housing_and_urban_development',
 'vice',
 'office_of_management_and_budget',
 'icml',
 '(cnn)former',
 'swathe',
 'buttigieg',
 'kamala_harris',
 "memo'",
 'corrupt',
 'ronald_reagan',
 'rob_chan',
 'prospects',
 'isolationist',
 'environment_policy_and_social_initiatives',
 'candidacy',
 'raphael_bostic',
 'vows',
 'reject',
 'contradicting',
 'posterior',
 'objected',
 'naacp_parker_county',
 'los_angeles_clippers',
 'pledged',
 'john_verdi',
 'brady_campaign_to_prevent_gun_violence',
 'vaulting',
 'secretaries']

In [144]:
FoxNews.most_similar(word,30)


126


['president',
 'iranian_presidency',
 'alexander_lukashenko',
 'ashraf_ghani',
 'george_w_bush',
 'jerome_adams',
 'ilan_yeshua',
 'waukegan_school_board',
 '(then',
 'nikolai_patrushev',
 'fiirst',
 'afgan',
 'alexei_woltornist',
 'joe_biden',
 'jessica_francos',
 'vice',
 '<quote>_council',
 'briefed',
 'vladimir_putin',
 'international_ice_hockey_federation',
 'carol_l_folt',
 'rejoin',
 'american_kennel_club',
 'neil_gorsuch',
 'roberta_jacobson',
 'chorus',
 "kamala_harris'",
 'pardoned',
 '(uscca)',
 'open_borders']