In [1]:
import pandas as pd
import numpy as np
from datetime import datetime

In [3]:
df_articles = pd.read_csv("../data/interim/articles_processed.csv")

In [4]:
df_articles.article_published_on = df_articles.article_published_on.astype(np.datetime64)

In [5]:
df_train,df_test = df_articles[df_articles.article_published_on<datetime(year=2021,day=20,month=8)],df_articles[df_articles.article_published_on>=datetime(year=2021,day=20,month=8)]

In [6]:
df_train.shape

(17655, 9)

In [7]:
df_test.shape

(736, 9)

# Text preprocessing

In [15]:
from nltk.tokenize import word_tokenize

## Punctuations

In [16]:
from string import punctuation

In [17]:
punctuation

'!"#$%&\'()*+,-./:;<=>?@[\\]^_`{|}~'

In [18]:
removed_punctuation_train = df_train.article_body.apply(lambda x: "".join([c for c in x if c not in punctuation]))
removed_punctuation_test = df_test.article_body.apply(lambda x: "".join([c for c in x if c not in punctuation]))

## Stop words

In [19]:
from nltk.corpus import stopwords

In [20]:
stop_words = stopwords.words('english')

In [21]:
removed_stop_words_train = removed_punctuation_train.apply(lambda x: " ".join([word for word in word_tokenize(x) if word not in stop_words]))
removed_stop_words_test = removed_punctuation_test.apply(lambda x: " ".join([word for word in word_tokenize(x) if word not in stop_words]))

In [22]:
removed_punctuation_train.iloc[0][:60]

'A federal judge on Thursday imposed a preliminary injunction'

In [23]:
from gensim.parsing.preprocessing import remove_stopwords

In [24]:
removed_stop_words_train = removed_stop_words_train.apply(lambda x: remove_stopwords(x))
removed_stop_words_test = removed_stop_words_test.apply(lambda x: remove_stopwords(x))

In [25]:
removed_punctuation_train.iloc[0][:60]

'A federal judge on Thursday imposed a preliminary injunction'

In [26]:
import gensim
all_stopwords = gensim.parsing.preprocessing.STOPWORDS

In [27]:
from gensim.parsing.preprocessing import STOPWORDS

all_stopwords_gensim = STOPWORDS.union(set(['the','say','said','get','it','in','like','new','year']))

In [28]:
removed_stop_words_train = removed_stop_words_train.apply(lambda x: " ".join([word for word in word_tokenize(x) if word not in all_stopwords_gensim]))
removed_stop_words_test = removed_stop_words_test.apply(lambda x: " ".join([word for word in word_tokenize(x) if word not in all_stopwords_gensim]))

In [29]:
import spacy
sp = spacy.load('en_core_web_sm')

all_stopwords = sp.Defaults.stop_words

In [30]:
removed_stop_words_train = removed_stop_words_train.apply(lambda x: " ".join([word for word in word_tokenize(x) if word not in all_stopwords]))
removed_stop_words_test = removed_stop_words_test.apply(lambda x: " ".join([word for word in word_tokenize(x) if word not in all_stopwords]))

## Stem and Lemmatize

In [31]:
from nltk.stem.porter import PorterStemmer

In [32]:
stemmer = PorterStemmer()

In [33]:
stem_train = removed_stop_words_train.apply(lambda x: " ".join([stemmer.stem(word) for word in word_tokenize(x)]))
stem_test = removed_stop_words_test.apply(lambda x: " ".join([stemmer.stem(word) for word in word_tokenize(x)]))

In [34]:
from nltk.stem import WordNetLemmatizer

In [35]:
lemmatizer = WordNetLemmatizer()

In [36]:
lemma_train = stem_train.apply(lambda x: " ".join([lemmatizer.lemmatize(word) for word in word_tokenize(x)]))
lemma_test = stem_test.apply(lambda x: " ".join([lemmatizer.lemmatize(word) for word in word_tokenize(x)]))

# Topic modelling

In [37]:
from sklearn.feature_extraction.text import CountVectorizer,TfidfVectorizer
from sklearn.decomposition import LatentDirichletAllocation


In [38]:
vectorizer = CountVectorizer(stop_words='english')
dtm = vectorizer.fit_transform(lemma_train)

## Old models

In [73]:
lda_30 = LatentDirichletAllocation(n_components=30)

In [74]:
lda_30.fit_transform(dtm)

array([[1.11482720e-04, 3.97984778e-02, 1.11482720e-04, ...,
        1.11482720e-04, 1.11482720e-04, 1.11482720e-04],
       [1.14942529e-04, 1.14942529e-04, 1.14942529e-04, ...,
        1.14942529e-04, 1.14942529e-04, 1.14942529e-04],
       [3.30033003e-04, 3.30033003e-04, 3.30033003e-04, ...,
        3.30033003e-04, 3.30033003e-04, 3.30033003e-04],
       ...,
       [3.00347024e-01, 4.90918017e-05, 3.16059712e-01, ...,
        4.90918017e-05, 4.90918017e-05, 6.61894150e-02],
       [1.29592102e-01, 5.73723465e-05, 2.06598048e-01, ...,
        6.63064510e-02, 5.73723465e-05, 5.73723465e-05],
       [4.39753738e-05, 4.39753738e-05, 4.39753738e-05, ...,
        4.39753738e-05, 4.39753738e-05, 2.18044985e-02]])

In [75]:
sorting = np.argsort(lda_30.components_)[:,::-1]
features = np.array(vectorizer.get_feature_names())

In [76]:
import mglearn
mglearn.tools.print_topics(topics=range(30),feature_names=features,
                           sorting= sorting, topics_per_chunk=10,n_words=10)

topic 0       topic 1       topic 2       topic 3       topic 4       topic 5       topic 6       topic 7       topic 8       topic 9       
--------      --------      --------      --------      --------      --------      --------      --------      --------      --------      
peopl         china         new           capitol       border        tax           citi          florida       peopl         woman         
famili        chine         white         polic         immigr        percent       school        desanti       polit         child         
home          state         polit         offic         biden         trade         new           state         trump         care          
live          world         time          attack        administr     increas       student       ron           medium        famili        
year          global        peopl         violenc       migrant       rate          mayor         miami         social        report        
day          

In [77]:
lda_40 = LatentDirichletAllocation(n_components=40)

In [78]:
lda_40.fit_transform(dtm)

array([[8.36120401e-05, 8.36120401e-05, 8.36120401e-05, ...,
        8.36120401e-05, 8.36120401e-05, 8.36120401e-05],
       [8.62068966e-05, 8.62068966e-05, 8.62068966e-05, ...,
        8.62068966e-05, 9.31281673e-01, 8.62068966e-05],
       [2.47524752e-04, 2.47524752e-04, 2.47524752e-04, ...,
        2.47524752e-04, 9.90346535e-01, 2.47524752e-04],
       ...,
       [3.68188513e-05, 2.38410412e-01, 3.32716223e-02, ...,
        3.68188513e-05, 3.68188513e-05, 3.68188513e-05],
       [4.30292599e-05, 2.80962809e-01, 4.30292599e-05, ...,
        2.59201089e-02, 4.30292599e-05, 4.30292599e-05],
       [3.29815303e-05, 3.29815303e-05, 3.29815303e-05, ...,
        3.29815303e-05, 3.29815303e-05, 3.29815303e-05]])

In [79]:
sorting = np.argsort(lda_40.components_)[:,::-1]
features = np.array(vectorizer.get_feature_names())

In [80]:
import mglearn
mglearn.tools.print_topics(topics=range(40),feature_names=features,
                           sorting= sorting, topics_per_chunk=10,n_words=10)

topic 0       topic 1       topic 2       topic 3       topic 4       topic 5       topic 6       topic 7       topic 8       topic 9       
--------      --------      --------      --------      --------      --------      --------      --------      --------      --------      
cuomo         biden         rep           polit         drug          school        vaccin        hous          florida       texa          
new           presid        hous          woman         fda           student       health        committe      desanti       abbott        
governor      white         state         think         approv        educ          dose          mccarthi      state         state         
york          hous          illinoi       peopl         agenc         state         shot          space         ron           school        
investig      administr     chicago       time          report        cyber         million       republican    miami         critic        
sexual       

In [81]:
lda_50 = LatentDirichletAllocation(n_components=50)

In [None]:
lda_50.fit_transform(dtm)

array([[6.68896321e-05, 6.68896321e-05, 6.68896321e-05, ...,
        6.68896321e-05, 6.68896321e-05, 6.68896321e-05],
       [6.89655172e-05, 6.89655172e-05, 6.89655172e-05, ...,
        6.89655172e-05, 6.89655172e-05, 6.89655172e-05],
       [1.98019802e-04, 1.98019802e-04, 1.98019802e-04, ...,
        1.98019802e-04, 1.98019802e-04, 1.98019802e-04],
       ...,
       [2.94550810e-05, 2.94550810e-05, 2.94550810e-05, ...,
        2.94550810e-05, 2.94550810e-05, 1.02842594e-01],
       [3.44234079e-05, 3.44234079e-05, 3.44234079e-05, ...,
        1.87434262e-02, 3.44234079e-05, 3.44234079e-05],
       [2.63852243e-05, 2.63852243e-05, 2.63852243e-05, ...,
        2.63852243e-05, 2.63852243e-05, 2.63852243e-05]])

In [None]:
sorting = np.argsort(lda_50.components_)[:,::-1]
features = np.array(vectorizer.get_feature_names())

In [None]:
import mglearn
mglearn.tools.print_topics(topics=range(50),feature_names=features,
                           sorting= sorting, topics_per_chunk=10,n_words=10)

topic 0       topic 1       topic 2       topic 3       topic 4       topic 5       topic 6       topic 7       topic 8       topic 9       
--------      --------      --------      --------      --------      --------      --------      --------      --------      --------      
woman         tax           polic         vaccin        polit         state         state         space         biden         compani       
epstein       infrastructur offic         health        peopl         vote          citi          forc          presid        facebook      
sex           democrat      flight        covid19       percent       elect         new           govern        russia        tech          
athlet        biden         spear         peopl         voter         texa          mayor         nation        russian       platform      
girl          senat         video         state         democrat      democrat      offic         war           administr     googl         
sexual       

In [39]:
lda_100 = LatentDirichletAllocation(n_components=100)

In [40]:
lda_100.fit_transform(dtm)

array([[3.34448161e-05, 3.34448161e-05, 3.34448161e-05, ...,
        3.34448161e-05, 3.34448161e-05, 3.34448161e-05],
       [3.44827586e-05, 3.44827586e-05, 3.44827586e-05, ...,
        3.44827586e-05, 3.44827586e-05, 3.44827586e-05],
       [9.90099010e-05, 9.90099010e-05, 9.90099010e-05, ...,
        9.90099010e-05, 9.90099010e-05, 9.90099010e-05],
       ...,
       [1.47275405e-05, 1.78846843e-02, 1.75608717e-02, ...,
        1.47275405e-05, 1.47275405e-05, 1.47275405e-05],
       [1.72117040e-05, 1.72117040e-05, 1.72117040e-05, ...,
        1.72117040e-05, 1.72117040e-05, 1.73442148e-02],
       [1.31926121e-05, 1.31926121e-05, 1.31926121e-05, ...,
        1.31926121e-05, 1.31926121e-05, 3.42146013e-02]])

In [41]:
sorting = np.argsort(lda_100.components_)[:,::-1]
features = np.array(vectorizer.get_feature_names())

In [42]:
import mglearn
mglearn.tools.print_topics(topics=range(100),feature_names=features,
                           sorting= sorting, topics_per_chunk=10,n_words=10)

topic 0       topic 1       topic 2       topic 3       topic 4       topic 5       topic 6       topic 7       topic 8       topic 9       
--------      --------      --------      --------      --------      --------      --------      --------      --------      --------      
epstein       fish          electr        vote          iran          travel        california    korea         virginia      holiday       
sex           state         vehicl        state         israel        border        newsom        north         mcauliff      juneteenth    
victim        tribe         car           elect         isra          princ         state         korean        bennett       repar         
traffick      florida       batteri       right         palestinian   restrict      san           senat         netanyahu     cathol        
maxwel        wildlif       compani       voter         nuclear       canada        la            amend         youngkin      state         
charg        

## 150 topics

In [39]:
lda_150 = LatentDirichletAllocation(n_components=150)

In [40]:
lda_150.fit_transform(dtm)

array([[2.22965440e-05, 2.22965440e-05, 2.22965440e-05, ...,
        2.22965440e-05, 2.22965440e-05, 2.22965440e-05],
       [2.29885057e-05, 2.29885057e-05, 2.29885057e-05, ...,
        2.29885057e-05, 2.29885057e-05, 2.29885057e-05],
       [6.60066007e-05, 6.60066007e-05, 6.60066007e-05, ...,
        6.60066007e-05, 6.60066007e-05, 6.60066007e-05],
       ...,
       [9.81836033e-06, 9.81836033e-06, 9.81836033e-06, ...,
        9.81836033e-06, 9.81836033e-06, 9.81836033e-06],
       [1.14744693e-05, 1.14744693e-05, 1.14744693e-05, ...,
        1.14744693e-05, 1.14744693e-05, 1.14744693e-05],
       [8.79507476e-06, 8.79507476e-06, 8.79507476e-06, ...,
        8.79507476e-06, 2.40544293e-02, 8.79507476e-06]])

In [41]:
sorting = np.argsort(lda_150.components_)[:,::-1]
features = np.array(vectorizer.get_feature_names())

In [42]:
import mglearn
mglearn.tools.print_topics(topics=range(150),feature_names=features,
                           sorting= sorting, topics_per_chunk=10,n_words=10)

topic 0       topic 1       topic 2       topic 3       topic 4       topic 5       topic 6       topic 7       topic 8       topic 9       
--------      --------      --------      --------      --------      --------      --------      --------      --------      --------      
vaccin        report        chicago       player        group         news          saudi         new           rep           bank          
state         carter        illinoi       member        llc           morn          princ         citi          hous          market        
peopl         ufo           state         committe      associ        politico      arabia        state         senat         financi       
shot          haitian       report        athlet        strategi      help          king          york          gop           compani       
health        haiti         tribun        confeder      lobbi         report        crown         offic         sen           percent       
covid19      

In [43]:
topic_train = lda_150.transform(dtm)

In [44]:
dtm_test = vectorizer.transform(lemma_test)
topic_test = lda_150.transform(dtm_test)

In [46]:
# from sklearn.neighbors import NearestNeighbors

In [47]:
# n_neighbors = 5
# KNN = NearestNeighbors(n_neighbors=n_neighbors,p=2)
# KNN.fit(topic_train)

In [48]:
# NNs = KNN.kneighbors(topic_test[0].reshape(1,-1),return_distance=False)

In [49]:
# NNs.reshape(1,-1)[0]

In [50]:
pd.set_option("display.max_colwidth", 3)
similarity_scores = pd.DataFrame(index=lemma_train.index)

In [132]:
def cosime_similarity(X_test,X_train):
    for i,test_row in X_test.iterrows():
        for j,train_row in X_train.iterrows():
            vector1 = test_row
            vector2 = train_row
            return np.dot(vector1, vector2) / (np.sqrt(np.sum(vector1**2)) * np.sqrt(np.sum(vector2**2)))

In [133]:
def get_similar_articles(idx_test,top_n_values=10):
    similarity_scores.loc[:,idx_test] = cosine_similarity([topic_test[idx_test]],topic_train).reshape(-1,1)
    idx_similar = similarity_scores.sort_values([idx_test],ascending=False)[idx_test].head(top_n_values).index.values
    return df_test.iloc[idx_test].article_heading,df_train.loc[idx_similar,:].sort_values(["article_published_on"],ascending=False)["article_heading"]

In [56]:
def print_similar_articles(test_heading,train_results):
    print("Tested for this article".center(50))
    print("\n")
    print(test_heading)
    print("\n")
    print("Recommended Articles".center(50))
    print(train_results)

In [62]:
import joblib
joblib.dump(lda_150,'../models/lda_150')

['../models/lda_150']

In [63]:
del lda_150

In [65]:
 lda_150 = joblib.load("../models/lda_150")

In [67]:
joblib.dump(topic_train,'../models/X_lda_150')

['../models/X_lda_150']

In [96]:
joblib.dump(vectorizer,'../models/text_vectorizer_150')

['../models/text_vectorizer_150']

# Deploy

In [118]:
import joblib
import pandas as pd
import numpy as np
from datetime import datetime

import contractions

from nltk.tokenize import word_tokenize
from string import punctuation


from nltk.corpus import stopwords
from gensim.parsing.preprocessing import STOPWORDS
import spacy

from nltk.stem.porter import PorterStemmer
from nltk.stem import WordNetLemmatizer

from sklearn.feature_extraction.text import CountVectorizer,TfidfVectorizer
from sklearn.decomposition import LatentDirichletAllocation
from gensim.parsing.preprocessing import remove_stopwords

In [141]:
def text_pipeline(X):
    if isinstance(X, str):
        X = pd.Series(X)
    elif isinstance(X, (pd.Series, pd.DataFrame)):
        pass
    else:
        raise Exception(
            f"Input should either be in 'str' format or a 'series' or 'Dataframe' with a column of text. Received an object of type {type(X)}"
        )
    
    expanded_contractions = X.apply(lambda x:
        contractions.fix(x)
    )
    
    lower = expanded_contractions.str.lower()
    
    custom_preprocessor = lower.apply(lambda x: x.replace("-"," ").replace("'s","").replace("’s","").replace("–",""))

    # punctuations
    removed_punctuation = custom_preprocessor.apply(
        lambda x: "".join([c for c in x if c not in punctuation])
    )

    # stop words
    stop_words = stopwords.words("english")
    removed_stop_words = removed_punctuation.apply(
        lambda x: " ".join(
            [word for word in word_tokenize(x) if word not in stop_words]
        )
    )
    removed_stop_words = removed_stop_words.apply(lambda x: remove_stopwords(x))
    all_stopwords_gensim = STOPWORDS.union(
        set(["the", "say", "said", "get", "it", "in", "like", "new", "year"])
    )
    removed_stop_words = removed_stop_words.apply(
        lambda x: " ".join(
            [word for word in word_tokenize(x) if word not in all_stopwords_gensim]
        )
    )
    sp = spacy.load('en_core_web_sm')
    all_stopwords = sp.Defaults.stop_words
    removed_stop_words = removed_stop_words.apply(
        lambda x: " ".join(
            [word for word in word_tokenize(x) if word not in all_stopwords]
        )
    )

    # Stemming and Lematizing
    stemmer = PorterStemmer()
    lemmatizer = WordNetLemmatizer()
    stem = removed_stop_words.apply(
        lambda x: " ".join([stemmer.stem(word) for word in word_tokenize(x)])
    )
    lemma = stem.apply(
        lambda x: " ".join([lemmatizer.lemmatize(word) for word in word_tokenize(x)])
    )

    return lemma   

In [15]:
def text_vectorizer(X,vectorizer,fit=False):
    if fit:
        return vectorizer.fit_transform(X)
    else:
        return vectorizer.transform(X)

In [16]:
def get_topic_vectors(X,model,fit=False):
    if fit:
        return model.fit_transform(X)
    else:
        return model.transform(X)

In [168]:
from sklearn.metrics.pairwise import cosine_similarity
def get_similar_articles(text_vectors, X, top_n_values=10):
    """
    Evalute the cosine similarity between provided 'text_vectors' and trained X (articles trained and stored as a vecotr of topics).
    Return dataframe with index as trained articles and columns as text_vector indices with values as similarity scores
    """
    similarity_scores = cosine_similarity(X,text_vectors,dense_output=True)
    return similarity_scores
#     return np.argsort(similarity_scores, axis=0)[::-1,:][:top_n_values,:]

In [171]:
pd.set_option("display.max_colwidth", 3)
def print_similar_articles(test_indices,similarity_scores):
    values = np.sort(similarity_scores, axis=0)[::-1,:][:5,:]
    similarity_array = np.argsort(similarity_scores, axis=0)[::-1,:][:5,:]
    for i in range(similarity_array.shape[1]):
        indices = similarity_array[:,i]
        print("\n")
        print(df_test.iloc[test_indices[i]].article_heading)
        print("\n")
        print(df_train.iloc[indices].article_heading)
        print("\n")
        print(values)

In [228]:
# vectorizer = joblib.load("../models/text_vectorizer_150")
# lda_150 = joblib.load("../models/lda_150")
# topic_train = joblib.load("../models/X_lda_150")

In [142]:
train_lemmas = text_pipeline(df_train.article_body)

In [145]:
vectorizer = CountVectorizer(stop_words='english')
vectorizer.vocabulary_

AttributeError: 'CountVectorizer' object has no attribute 'vocabulary_'

In [146]:
count_vectors_train = text_vectorizer(train_lemmas,vectorizer,fit=True)
count_vectors_train.shape

(17655, 81444)

In [38]:
vectorizer.vocabulary_

{'feder': 37234,
 'judg': 52914,
 'thursday': 93843,
 'impos': 50046,
 'preliminari': 75876,
 'injunct': 50630,
 'biden': 17785,
 'administr': 10648,
 'rule': 81872,
 'immigr': 49920,
 'custom': 28547,
 'enforc': 34826,
 'ice': 49488,
 'offic': 69639,
 'significantli': 86492,
 'narrow': 66175,
 'categori': 22829,
 'illeg': 49779,
 'target': 92117,
 'arrest': 14398,
 'deport': 30467,
 'mark': 60684,
 'latest': 56523,
 'legal': 57069,
 'blow': 18714,
 'polici': 74695,
 'drew': 32673,
 'tipton': 94173,
 'violat': 99328,
 'congression': 26360,
 'mandat': 60275,
 'louisiana': 58838,
 'texa': 92992,
 'file': 37745,
 'lawsuit': 56721,
 'like': 57848,
 'succeed': 90655,
 'claim': 24811,
 'procedur': 76371,
 'act': 10413,
 'apa': 13730,
 'issu': 51549,
 'guidanc': 43327,
 'februari': 37221,
 'limit': 57908,
 'agent': 11052,
 'focus': 38628,
 'pose': 75150,
 'threat': 93660,
 'nation': 66284,
 'secur': 84376,
 'cross': 28013,
 'border': 19350,
 'nov': 68704,
 'commit': 25959,
 'aggrav': 11068,
 

In [186]:
n_components = [30,60,90,120,150,180,240,300]

In [187]:
# n_components = 200
for components in n_components:
    model = LatentDirichletAllocation(n_components=components)
    topic_vectors_train = get_topic_vectors(count_vectors_train,model,fit=True)
    time_str = datetime.now().strftime("%H%M")
    date_str = datetime.now().strftime("%m%d")
    joblib.dump(topic_vectors_train,f"../models/vectorizer_{date_str}_{time_str}_{components}")
    joblib.dump(vectorizer,f"../models/lda_model_{date_str}_{time_str}_{components}")
    joblib.dump(model,f"../models/topic_vector_train_{date_str}_{time_str}_{components}")

In [189]:
# import random
# test_indices = random.sample(range(df_test.shape[0]), 1)
test_indices = [0,1]
test_lemmas = text_pipeline(df_test.iloc[test_indices].article_body)
lemma_test_vectors = text_vectorizer(test_lemmas,vectorizer)
topic_vectors_test = get_topic_vectors(lemma_test_vectors,model)
similarity_scores = get_similar_articles(topic_vectors_test,topic_vectors_train)
print_similar_articles(test_indices,similarity_scores)



Nikki Haley, other Republicans call for Biden's resignation or impeachment after attack at Kabul airport


18270    Trump says he won’t attend Biden’s inauguration                                                               
18225    Acting US homeland security secretary Chad Wolf resigns – as it happened                                      
18266    Twitter permanently suspends Trump's account, citing risk of 'further incitement of violence' – as it happened
18280    Trump acknowledges 'new administration' – as it happened                                                      
18201    Pence says he won't invoke 25th amendment in letter to Pelosi - as it happened                                
Name: article_heading, dtype: object


[[0.80470097 0.80400173]
 [0.7582203  0.79277045]
 [0.74757207 0.78921244]
 [0.74327022 0.78246625]
 [0.73800849 0.78186185]]


Rep. Mike Doyle tests positive for COVID despite being fully vaccinated


2532     Fauci: 'I don't think we're going to see 

# Old Results

## 100 components

In [229]:
print_similar_articles(*get_similar_articles(0))

             Tested for this article              


Nikki Haley, other Republicans call for Biden's resignation or impeachment after attack at Kabul airport


               Recommended Articles               
article_id
24043    https://www.nytimes.com/2021/08/19/us/politics/trump-biden-afghan-taliban.html                                                   
14490    https://www.foxnews.com/politics/sen-tuberville-slams-squad-afghanistan-protecting-women                                         
24078    https://www.nytimes.com/2021/08/17/us/politics/lawmakers-from-both-parties-express-fury-over-the-withdrawal-from-afghanistan.html
14323    https://www.foxnews.com/politics/biden-trump-trade-accusations-afghanistan                                                       
33554    https://www.theguardian.com/us-news/2021/aug/16/afghanistan-withdrawal-joe-biden-crisis                                          
24084    https://www.nytimes.com/2021/08/16/us/politics/afghanistan-withdrawal-cong

In [210]:
print_similar_articles(*get_similar_articles(60))

             Tested for this article              


Fauci blamed measles outbreak on Hasidic Jews; Orthodox leaders set the record straight


               Recommended Articles               
article_id
24041    Alabama has no more I.C.U. beds available, the state authorities said.                
16621    Nation hits 70 percent vaccination goal amid surging Delta variant                    
16926    Leaders in under-vaccinated areas should 'speak out' amid virus surge, Fauci says     
33813    Delta variant accounts for 83% of new cases in US, CDC director says                  
15566    White House defends door-to-door vaccination push amid backlash                       
25151    Louisiana, lagging in vaccinations, gambles on a lottery.                             
25242    White House Says It Will Narrowly Miss July 4 Vaccination Goal                        
28396    Top U.S. health experts say vaccine supplies and vaccinations will increase by spring.
28757    ‘An incredible sca

In [212]:
print_similar_articles(*get_similar_articles(95))

             Tested for this article              


Cuomo snaps at reporter when confronted about his scandals at hurricane briefing


               Recommended Articles               
article_id
14524    Time's Up president apologizes for backing Cuomo but declines to resign, claiming 'we were used as cover'
14598    NY Gov. Cuomo resigns amid sexual harassment scandal                                                     
14632    Who is Cuomo accuser Brittany Commisso?                                                                  
33669    New York lawmakers’ impeachment inquiry into Cuomo nearing an end                                        
14753    Cuomo under investigation: More district attorneys explore cases against NY governor                     
33697    Andrew Cuomo sexual harassment: the key testimony from the report                                        
14803    Cuomo sexually harassed multiple women in violation of state and federal law, NY AG finds              

In [213]:
print_similar_articles(*get_similar_articles(118))

             Tested for this article              


Gov. Newsom says Republican will lead state over 'COVID cliff', touts Schwarzenegger's input


               Recommended Articles               
article_id
33561    California’s governor recall election is heating up. Here’s what you need to know                   
14388    Newsom ramps up attacks in CA recall, slams Elder as further right than Trump                       
14390    Newsom recall: Larry Elder says he will repeal mask, vaccine mandates if he wins election           
14514    Newsom recall: Democrats ramp up attacks on Larry Elder                                             
14732    California recall candidates slam Newsom, take shots at each other in first debate                  
14961    Elizabeth Warren backs Newsom in recall fight, accuses 'Trump Republicans' of 'coming to grab power'
15685    California Gov. Gavin Newsom recall vote set for Sept. 14                                           
26426    In Californ

In [214]:
print_similar_articles(*get_similar_articles(107))

             Tested for this article              


New Jersey bans jails from contracting with ICE to hold immigration detainees


               Recommended Articles               
article_id
16916    Strange how Kim opponents keep calling him 'not one of us'                          
34023    Ice transfers 30 detainees to unknown location amid hunger strike                   
19489    Murphy's coming ad blitz                                                            
19603    Murphy's poll position                                                              
19702    Brent Spiner has spoken                                                             
21646    Johnson and Huttle and ICE contracts                                                
21680    ‘Our state has forgotten us’: Immigrant groups sour on New Jersey’s liberal governor
22789    Donnelly presses ahead                                                              
22849    The legal weed saga continues               

In [230]:
print_similar_articles(*get_similar_articles(107))

             Tested for this article              


New Jersey bans jails from contracting with ICE to hold immigration detainees


               Recommended Articles               
article_id
16916    https://www.politico.com/newsletters/new-jersey-playbook/2021/07/26/strange-how-kim-opponents-keep-calling-him-not-one-of-us-493720
34023    https://www.theguardian.com/us-news/2021/jun/29/ice-detainees-new-jersey-protests                                                  
19489    https://www.politico.com/newsletters/new-jersey-playbook/2021/05/10/murphys-coming-ad-blitz-492779                                 
19603    https://www.politico.com/newsletters/new-jersey-playbook/2021/05/06/murphys-poll-position-492746                                   
19702    https://www.politico.com/newsletters/new-jersey-playbook/2021/05/04/restrictions-to-be-lifted-sort-of-492707                       
21646    https://www.politico.com/newsletters/new-jersey-playbook/2021/03/17/johnson-and-huttle-and-

## 150 components

In [57]:
print_similar_articles(*get_similar_articles(0))

             Tested for this article              


Nikki Haley, other Republicans call for Biden's resignation or impeachment after attack at Kabul airport


               Recommended Articles               
article_id
24043    Trump’s Deal With the Taliban Draws Fire From His Former Allies                             
14290    Republican senators question Biden's fitness for office amid Afghanistan debacle            
14323    Biden, Trump trade accusations on Afghanistan                                               
14313    Biden says American troops should not be dying in Afghanistan; 18 months since last US death
33554    Defiant Biden stands ‘squarely behind’ decision to withdraw from Afghanistan                
14337    Trump calls on Biden to 'resign in disgrace' over crisis in Afghanistan                     
20495    Biden announces withdrawal from Afghanistan in speech heavy on symbolism                    
22763    POLITICO Playbook PM: Opening moments from the impeachm

In [58]:
print_similar_articles(*get_similar_articles(60))

             Tested for this article              


Fauci blamed measles outbreak on Hasidic Jews; Orthodox leaders set the record straight


               Recommended Articles               
article_id
16675    Biden’s new mask guidance too little, too late for parts of the country, officials say       
16971    Alabama governor says ‘it’s time to start blaming the unvaccinated folks’ as pandemic worsens
33813    Delta variant accounts for 83% of new cases in US, CDC director says                         
33872    Delta variant gains ground in US as outbreaks highlight vaccine divide                       
17492    Delta variant said to be far more widespread than federal estimates                          
25295    With Vaccination Goal in Doubt, Biden Warns of Variant’s Threat                              
19609    ‘Doomsday scenario’: Lagging vaccine rates stir fears of dangerous variants                  
27549    The Imperious Rise and Accelerating Fall of Andrew Cuomo         

In [59]:
print_similar_articles(*get_similar_articles(95))

             Tested for this article              


Cuomo snaps at reporter when confronted about his scandals at hurricane briefing


               Recommended Articles               
article_id
14353    NY lawmakers to continue Cuomo investigation, reversing course after backlash                               
14524    Time's Up president apologizes for backing Cuomo but declines to resign, claiming 'we were used as cover'   
14540    Cuomo successor Hochul distances herself from disgraced gov, says they 'have not been close'                
14778    4 Northeast Dem governors call on Cuomo to resign in sex-harassment scandal                                 
14807    Cuomo sexual harassment investigation: READ THE FULL REPORT                                                 
14948    Cuomo accuser calls for disbarment of lawyer who handled harassment complaint after resignation announcement
15291    Cuomo grilled by AG's lawyers in sexual harassment probe                             

In [60]:
print_similar_articles(*get_similar_articles(118))

             Tested for this article              


Gov. Newsom says Republican will lead state over 'COVID cliff', touts Schwarzenegger's input


               Recommended Articles               
article_id
14502    California recall election of Gov. Newsom gets underway as voters start receiving ballots  
14388    Newsom ramps up attacks in CA recall, slams Elder as further right than Trump              
14390    Newsom recall: Larry Elder says he will repeal mask, vaccine mandates if he wins election  
14658    CA gubernatorial candidate Kevin Faulconer says Gov. Newsom trying to 'partisan-ize' recall
14732    California recall candidates slam Newsom, take shots at each other in first debate         
14716    Newsom says recall is unfair, effects will be felt 'across the country'                    
16868    Poll: Newsom 'in jeopardy' of being recalled if Democrats remain apathetic                 
15019    Recalling Newsom: Likely California voters split on ousting governor in ne

In [61]:
print_similar_articles(*get_similar_articles(107))

             Tested for this article              


New Jersey bans jails from contracting with ICE to hold immigration detainees


               Recommended Articles               
article_id
34023    Ice transfers 30 detainees to unknown location amid hunger strike 
18351    The line wins!                                                    
19489    Murphy's coming ad blitz                                          
19702    Brent Spiner has spoken                                           
20414    Murphy challenges Sweeney on gun control                          
20705    Cannabis commission to meet prematurely                           
21303    Murphy changes cannabis commission pick and the race with NY is on
21646    Johnson and Huttle and ICE contracts                              
22191    Juliano to Huttle: Don't be like Trump                            
22789    Donnelly presses ahead                                            
Name: article_heading, dtype: string


#