In [1]:
import numpy as np

In [6]:
# Scratch: small 2x3 array used to check np.argsort behaviour in the next cell.
a = np.array([[1,2,3],[3,4,0]])
a

array([[1, 2, 3],
       [3, 4, 0]])

In [9]:
# With no axis argument, argsort orders each row independently (last axis).
np.argsort(a)

array([[0, 1, 2],
       [2, 0, 1]], dtype=int64)

In [10]:
# Shape of the scratch array as (rows, columns).
a.shape

(2, 3)

In [1]:
import joblib
import pandas as pd
import numpy as np
from datetime import datetime

from nltk.tokenize import word_tokenize
from string import punctuation


from nltk.corpus import stopwords
from gensim.parsing.preprocessing import STOPWORDS
import spacy

from nltk.stem.porter import PorterStemmer
from nltk.stem import WordNetLemmatizer

from sklearn.feature_extraction.text import CountVectorizer,TfidfVectorizer
from sklearn.decomposition import LatentDirichletAllocation
from gensim.parsing.preprocessing import remove_stopwords

In [2]:
# Load the processed articles table; the path is relative to the notebook's working directory.
df_articles = pd.read_csv("../data/interim/articles_processed.csv")

In [3]:
# Convert the publication column to datetimes so it can be compared against
# datetime cut-offs when splitting the data.
# The unit is spelled out ("datetime64[ns]") because casting to the unit-less
# np.datetime64 is rejected by pandas 2.x; on older pandas the result is the same.
# Bracket assignment is used so the existing column is always the target.
df_articles["article_published_on"] = df_articles["article_published_on"].astype(
    "datetime64[ns]"
)

In [4]:
# Chronological hold-out: articles published before 20 Aug 2021 go to the
# training frame, articles published on or after that date go to the test frame.
df_train = df_articles.loc[df_articles["article_published_on"] < datetime(2021, 8, 20)]
df_test = df_articles.loc[df_articles["article_published_on"] >= datetime(2021, 8, 20)]

In [5]:
# (rows, columns) of the training split.
df_train.shape

(17655, 9)

In [6]:
# (rows, columns) of the test split.
df_test.shape

(736, 9)

In [7]:
# Load previously saved artefacts from ../models.
# NOTE(review): presumably the fitted text vectorizer, the fitted LDA model and the
# topic matrix of the training articles from the run tagged "0830_0350" - confirm.
# joblib.load is pickle-based, so only load files from a trusted source.
vectorizer = joblib.load("../models/vectorizer_0830_0350")
model = joblib.load("../models/lda_model_0830_0350")
topic_vectors_train = joblib.load("../models/topic_vector_train_0830_0350")

In [22]:
# Query the model with one randomly chosen test article.
# NOTE: text_pipeline, text_vectorizer, get_topic_vectors, get_similar_articles and
# print_similar_articles are defined in cells further down this notebook, so those
# cells must have been executed before this one.
import random
# Draw one row position from df_test; no seed is set, so the pick changes per run.
test_indices = random.sample(range(df_test.shape[0]), 1)
# test_indices=[0]
# Clean the article body, vectorize it, and project it into topic space.
test_lemmas = text_pipeline(df_test.iloc[test_indices].article_body)
lemma_test_vectors = text_vectorizer(test_lemmas,vectorizer)
topic_vectors_test = get_topic_vectors(lemma_test_vectors,model)
# Rank the training articles against the query and print the top matches.
similarity_scores = get_similar_articles(topic_vectors_test,topic_vectors_train)
print_similar_articles(test_indices,similarity_scores)



Why full FDA approval of Pfizer’s coronavirus vaccine matters


2005     Swing voters, week 1: ‘Election is no slam-dunk’                              
15454    US could see 200,000 Covid cases a day again: ‘Unvaccinated are sitting ducks’
2493     Nation hits 70 percent vaccination goal amid surging Delta variant            
10164    Who Are the Unvaccinated in America? There’s No One Answer.                   
15690    ‘It’s too late’: US doctor says dying patients begging for Covid vaccine      
2877     Biden administration sends more cash to hard-hit areas as Delta variant surges
15821    99.2% of US Covid deaths in June were unvaccinated, says Fauci                
3363     Delta variant said to be far more widespread than federal estimates           
17497    Biden: US 'on track' to have enough vaccines for all adults by May            
9051     Maryland confirms case of Covid-19 variant from South Africa                  
Name: article_heading, dtype: object


In [23]:
# Query the model with a free-text string instead of a row of df_test.
test = "Why full FDA approval of Pfizer’s coronavirus vaccine matters"
test_lemmas = text_pipeline(test)
lemma_test_vectors = text_vectorizer(test_lemmas,vectorizer)
topic_vectors_test = get_topic_vectors(lemma_test_vectors,model)
similarity_scores = get_similar_articles(topic_vectors_test,topic_vectors_train)
# The query is not a row of df_test, so print it directly rather than going through
# print_similar_articles with a `test_indices` left over from another cell (that
# printed an unrelated df_test heading and failed on a fresh kernel).
print("\n")
print(test)
print("\n")
# A single query string yields exactly one column of ranked training positions.
print(
    df_train.iloc[similarity_scores[:, 0]]
    .sort_values(["article_published_on"], ascending=False)
    .article_heading
)




Why full FDA approval of Pfizer’s coronavirus vaccine matters


4091     FDA tells J&J to scrap 60 million vaccine doses made at troubled plant                  
4454     Biden admin probes risk of low-level contamination in vaccines made at troubled plant   
11672    100 Million Vaccine Doses Held Up Over Contamination Concerns, Firm Reveals             
4960     J&J vaccine production could restart in U.S. 'within days,' Emergent executive testifies
6178     FDA orders J&J contractor Emergent to stop vaccine production during inspection         
6544     Pfizer asks to OK Covid vaccine for younger teens                                       
6816     Where will J&J produce millions of vaccines?                                            
6842     FDA allows Moderna to put more coronavirus vaccine doses in each vial                   
6854     Emergent admits to manufacturing issues with J&J vaccine                                
7186     FDA authorizes J&J partner to help with vac

In [17]:
def text_pipeline(X):
    """
    Clean raw text into space-joined, stemmed and lemmatised tokens.

    Applied to every element, in order:
      1. remove every character contained in ``string.punctuation``;
      2. remove stop words in three passes: NLTK's English list, gensim's
         ``remove_stopwords`` plus ``STOPWORDS`` extended with a few extra
         words, and the stop words of spaCy's ``en_core_web_sm`` pipeline;
      3. Porter-stem each token, then WordNet-lemmatise the stems.

    Parameters
    ----------
    X : str, pandas.Series or single-column pandas.DataFrame
        Text to clean. A string is treated as one document. A DataFrame must
        have exactly one column, which is used as the text Series. Missing
        values are treated as empty text.

    Returns
    -------
    pandas.Series
        One cleaned string per input document, with the input's index.

    Raises
    ------
    TypeError
        If ``X`` is not a str, Series or DataFrame.
    ValueError
        If ``X`` is a DataFrame that does not have exactly one column.
    """
    if isinstance(X, str):
        X = pd.Series(X)
    elif isinstance(X, pd.DataFrame):
        # DataFrame.apply works column-wise, so the per-document lambdas below
        # would receive whole columns. Unwrap the single text column instead.
        if X.shape[1] != 1:
            raise ValueError(
                f"A DataFrame input must have exactly one column of text. Received {X.shape[1]} columns"
            )
        X = X.iloc[:, 0]
    elif not isinstance(X, pd.Series):
        raise TypeError(
            f"Input should either be in 'str' format or a 'series' or 'Dataframe' with a column of text. Received an object of type {type(X)}"
        )

    # Missing values (e.g. NaN read from a CSV) are not iterable; treat them as
    # empty documents so one bad row does not abort the whole batch.
    X = X.fillna("").astype(str)

    def _drop_words(texts, vocabulary):
        # Re-tokenise each text and keep only the tokens absent from `vocabulary`.
        # The comparison is case-sensitive: nothing here lower-cases the tokens.
        return texts.apply(
            lambda text: " ".join(
                [token for token in word_tokenize(text) if token not in vocabulary]
            )
        )

    # punctuations
    removed_punctuation = X.apply(
        lambda text: "".join([char for char in text if char not in punctuation])
    )

    # stop words
    # A set gives constant-time membership tests; the NLTK list is scanned per token otherwise.
    nltk_stop_words = set(stopwords.words("english"))
    removed_stop_words = _drop_words(removed_punctuation, nltk_stop_words)

    removed_stop_words = removed_stop_words.apply(remove_stopwords)
    all_stopwords_gensim = STOPWORDS.union(
        set(["the", "say", "said", "get", "it", "in", "like", "new", "year"])
    )
    removed_stop_words = _drop_words(removed_stop_words, all_stopwords_gensim)

    # The spaCy pipeline is loaded only to read its default stop-word set.
    sp = spacy.load('en_core_web_sm')
    all_stopwords = sp.Defaults.stop_words
    removed_stop_words = _drop_words(removed_stop_words, all_stopwords)

    # Stemming and Lemmatizing
    # The lemmatizer runs on the stemmed tokens, not on the original words.
    # This order is kept as is because it determines the tokens produced.
    stemmer = PorterStemmer()
    lemmatizer = WordNetLemmatizer()
    stem = removed_stop_words.apply(
        lambda text: " ".join([stemmer.stem(word) for word in word_tokenize(text)])
    )
    lemma = stem.apply(
        lambda text: " ".join(
            [lemmatizer.lemmatize(word) for word in word_tokenize(text)]
        )
    )

    return lemma

In [11]:
def text_vectorizer(X, vectorizer, fit=False):
    """
    Run ``X`` through ``vectorizer`` and return the result.

    When ``fit`` is truthy the vectorizer's ``fit_transform`` is called,
    otherwise its ``transform`` is called.
    """
    apply_vectorizer = vectorizer.fit_transform if fit else vectorizer.transform
    return apply_vectorizer(X)

In [12]:
def get_topic_vectors(X, model, fit=False):
    """
    Return the output of ``model`` for ``X``.

    By default only ``model.transform`` is called; pass a truthy ``fit`` to
    call ``model.fit_transform`` instead.
    """
    if not fit:
        return model.transform(X)
    return model.fit_transform(X)

In [13]:
from sklearn.metrics.pairwise import cosine_similarity
def get_similar_articles(text_vectors, X, top_n_values=10):
    """
    Rank the rows of ``X`` by cosine similarity to each row of ``text_vectors``.

    Returns an integer array with one column per row of ``text_vectors`` and at
    most ``top_n_values`` rows. Column ``j`` holds row positions of ``X``,
    ordered from the most to the least similar to ``text_vectors[j]``.
    """
    # Rows follow X, columns follow text_vectors.
    scores = cosine_similarity(X, text_vectors, dense_output=True)
    # argsort is ascending, so flip the rows to put the best matches first.
    ascending_positions = np.argsort(scores, axis=0)
    descending_positions = ascending_positions[::-1, :]
    return descending_positions[:top_n_values, :]

In [14]:
# Global pandas display setting: maximum column width used when rendering values.
# NOTE(review): a width of 3 looks unintentionally small - confirm the intended value.
pd.set_option("display.max_colwidth", 3)
def print_similar_articles(test_indices, similarity_array):
    """
    Print each query article's heading followed by the headings of its matches.

    ``similarity_array`` holds one column per query; each column contains row
    positions of the global ``df_train``. ``test_indices[i]`` is the row
    position in the global ``df_test`` of the query behind column ``i``.
    Matches are printed newest first (sorted by ``article_published_on``),
    not in the order they appear in the column.
    """
    for column_number, train_positions in enumerate(similarity_array.T):
        print("\n")
        print(df_test.iloc[test_indices[column_number]].article_heading)
        print("\n")
        matched_articles = df_train.iloc[train_positions]
        newest_first = matched_articles.sort_values(
            ['article_published_on'], ascending=False
        )
        print(newest_first.article_heading)

In [24]:
# Scratch: rebinds `a` to a 2x2 array (it held a 2x3 array earlier in the notebook).
a = np.array([[1,2],[3,4]])

In [25]:
# Display the scratch array.
a

array([[1, 2],
       [3, 4]])

In [34]:
# URL of the training article at row position 1.
df_train["article_url"].iloc[1]

'https://www.foxnews.com/politics/afghanistan-evacuation-us-f-18s-overwatch-flights-kabul'