In [1]:
import joblib
import pandas as pd
import numpy as np
from datetime import datetime

import contractions

from nltk.tokenize import word_tokenize
from string import punctuation


from nltk.corpus import stopwords
from gensim.parsing.preprocessing import STOPWORDS
import spacy

from nltk.stem.porter import PorterStemmer
from nltk.stem import WordNetLemmatizer

from sklearn.feature_extraction.text import CountVectorizer,TfidfVectorizer
from sklearn.decomposition import LatentDirichletAllocation
from gensim.parsing.preprocessing import remove_stopwords

In [2]:
df_articles = pd.read_csv("../data/interim/articles_processed.csv")

In [3]:
    df_articles.article_published_on = df_articles.article_published_on.astype(np.datetime64)

In [4]:
df_train,df_test = df_articles[df_articles.article_published_on<datetime(year=2021,day=20,month=8)],df_articles[df_articles.article_published_on>=datetime(year=2021,day=20,month=8)]

## Functions

In [5]:
def text_pipeline(X):
    if isinstance(X, str):
        X = pd.Series(X)
    elif isinstance(X, (pd.Series, pd.DataFrame)):
        pass
    else:
        raise Exception(
            f"Input should either be in 'str' format or a 'series' or 'Dataframe' with a column of text. Received an object of type {type(X)}"
        )
    
    expanded_contractions = X.apply(lambda x:
        contractions.fix(x)
    )
    
    lower = expanded_contractions.str.lower()
    
    custom_preprocessor = lower.apply(lambda x: x.replace("-"," ").replace("'s","").replace("’s","").replace("–",""))

    # punctuations
    removed_punctuation = custom_preprocessor.apply(
        lambda x: "".join([c for c in x if c not in punctuation])
    )

    # stop words
    stop_words = stopwords.words("english")
    removed_stop_words = removed_punctuation.apply(
        lambda x: " ".join(
            [word for word in word_tokenize(x) if word not in stop_words]
        )
    )
    removed_stop_words = removed_stop_words.apply(lambda x: remove_stopwords(x))
    all_stopwords_gensim = STOPWORDS.union(
        set(["the", "say", "said", "get", "it", "in", "like", "new", "year"])
    )
    removed_stop_words = removed_stop_words.apply(
        lambda x: " ".join(
            [word for word in word_tokenize(x) if word not in all_stopwords_gensim]
        )
    )
    sp = spacy.load('en_core_web_sm')
    all_stopwords = sp.Defaults.stop_words
    removed_stop_words = removed_stop_words.apply(
        lambda x: " ".join(
            [word for word in word_tokenize(x) if word not in all_stopwords]
        )
    )

    # Stemming and Lematizing
    stemmer = PorterStemmer()
    lemmatizer = WordNetLemmatizer()
    stem = removed_stop_words.apply(
        lambda x: " ".join([stemmer.stem(word) for word in word_tokenize(x)])
    )
    lemma = stem.apply(
        lambda x: " ".join([lemmatizer.lemmatize(word) for word in word_tokenize(x)])
    )

    return lemma   

In [6]:
def text_vectorizer(X,vectorizer,fit=False):
    if fit:
        return vectorizer.fit_transform(X)
    else:
        return vectorizer.transform(X)

In [7]:
def get_topic_vectors(X,model,fit=False):
    if fit:
        return model.fit_transform(X)
    else:
        return model.transform(X)

In [8]:
from sklearn.metrics.pairwise import cosine_similarity
def get_similar_articles(text_vectors, X, top_n_values=10):
    """
    Evalute the cosine similarity between provided 'text_vectors' and trained X (articles trained and stored as a vecotr of topics).
    Return dataframe with index as trained articles and columns as text_vector indices with values as similarity scores
    """
    similarity_scores = cosine_similarity(X,text_vectors,dense_output=True)
    return similarity_scores
#     return np.argsort(similarity_scores, axis=0)[::-1,:][:top_n_values,:]

In [9]:
pd.set_option("display.max_colwidth", 3)
def print_similar_articles(test_indices,similarity_scores):
    values = np.sort(similarity_scores, axis=0)[::-1,:][:5,:]
    similarity_array = np.argsort(similarity_scores, axis=0)[::-1,:][:5,:]
    for i in range(similarity_array.shape[1]):
        indices = similarity_array[:,i]
#         print("\n")
#         print(df_test.iloc[test_indices[i]].article_heading)
#         print("\n")
#         print(df_train.iloc[indices].article_heading)
#         print("\n")
#         print(values[:,i])
        return df_test.iloc[test_indices[i]].article_heading,pd.DataFrame({'article_heading':df_train.iloc[indices].article_heading,'Similarity score':values[:,i],'article_date':df_train.iloc[indices].article_published_on})

In [10]:
def get_saved_models(n_components):
    if n_components==300:
        topic_vectors_train = joblib.load(f"../models/vectorizer_0830_1513_300")
        vectorizer = joblib.load(f"../models/lda_model_0830_1513_300")
        model =joblib.load(f"../models/topic_vector_train_0830_1513_300")
    elif n_components==240:
        topic_vectors_train = joblib.load(f"../models/vectorizer_0830_1406_240")
        vectorizer = joblib.load(f"../models/lda_model_0830_1406_240")
        model =joblib.load(f"../models/topic_vector_train_0830_1406_240")
    elif n_components==180:
        topic_vectors_train = joblib.load(f"../models/vectorizer_0830_1304_180")
        vectorizer = joblib.load(f"../models/lda_model_0830_1304_180")
        model =joblib.load(f"../models/topic_vector_train_0830_1304_180")
    elif n_components==150:
        topic_vectors_train = joblib.load(f"../models/vectorizer_0830_1205_150")
        vectorizer = joblib.load(f"../models/lda_model_0830_1205_150")
        model =joblib.load(f"../models/topic_vector_train_0830_1205_150")
    elif n_components==120:
        topic_vectors_train = joblib.load(f"../models/vectorizer_0830_1109_120")
        vectorizer = joblib.load(f"../models/lda_model_0830_1109_120")
        model =joblib.load(f"../models/topic_vector_train_0830_1109_120")
    elif n_components==90:
        topic_vectors_train = joblib.load(f"../models/vectorizer_0830_1015_90")
        vectorizer = joblib.load(f"../models/lda_model_0830_1015_90")
        model =joblib.load(f"../models/topic_vector_train_0830_1015_90")
    elif n_components==60:
        topic_vectors_train = joblib.load(f"../models/vectorizer_0830_0925_60")
        vectorizer = joblib.load(f"../models/lda_model_0830_0925_60")
        model =joblib.load(f"../models/topic_vector_train_0830_0925_60")
    elif n_components==30:
        topic_vectors_train = joblib.load(f"../models/vectorizer_0830_0838_30")
        vectorizer = joblib.load(f"../models/lda_model_0830_0838_30")
        model =joblib.load(f"../models/topic_vector_train_0830_0838_30")
    return topic_vectors_train,vectorizer,model

In [11]:
components_saved = [300,240,180,150,120,90,60,30]

# Biden's resignation related news

In [12]:
from IPython.display import display

In [13]:
for component in components_saved:
    print("\n")
    print(component)
    print("\n")
    topic_vectors_train,vectorizer,model = get_saved_models(component)
    test_indices = [0]
    test_lemmas = text_pipeline(df_test.iloc[test_indices].article_body)
    lemma_test_vectors = text_vectorizer(test_lemmas,vectorizer)
    topic_vectors_test = get_topic_vectors(lemma_test_vectors,model)
    similarity_scores = get_similar_articles(topic_vectors_test,topic_vectors_train)
    heading, result = print_similar_articles(test_indices,similarity_scores)
    print(heading)
    display(result)



300


Nikki Haley, other Republicans call for Biden's resignation or impeachment after attack at Kabul airport


Unnamed: 0,article_heading,Similarity score,article_date
18270,Trump says he won’t attend Biden’s inauguration,0.804701,2021-01-08 19:02:27
18225,Acting US homeland security secretary Chad Wolf resigns – as it happened,0.75822,2021-01-12 01:24:10
18266,"Twitter permanently suspends Trump's account, citing risk of 'further incitement of violence' – as it happened",0.747572,2021-01-09 02:44:30
18280,Trump acknowledges 'new administration' – as it happened,0.74327,2021-01-08 06:49:33
18201,Pence says he won't invoke 25th amendment in letter to Pelosi - as it happened,0.738008,2021-01-13 02:57:59




240


Nikki Haley, other Republicans call for Biden's resignation or impeachment after attack at Kabul airport


Unnamed: 0,article_heading,Similarity score,article_date
18270,Trump says he won’t attend Biden’s inauguration,0.691643,2021-01-08 19:02:27
14572,Washington Breathes an Uneasy Sigh of Relief,0.681281,2021-01-21 01:01:23
7140,Biden says he ‘can’t picture’ U.S. troops in Afghanistan next year,0.681009,2021-03-25 14:36:00
18054,Joe Biden sworn in as 46th president amid turmoil and loss in US,0.678106,2021-01-20 16:48:03
15080,"As the White House slips into deeper crisis, Trump says he will not go to Biden’s inauguration.",0.677046,2021-01-08 15:48:40




180


Nikki Haley, other Republicans call for Biden's resignation or impeachment after attack at Kabul airport


Unnamed: 0,article_heading,Similarity score,article_date
16899,Biden outlines Afghanistan withdrawal: ‘It’s time for American troops to come home’ – as it happened,0.748787,2021-04-15 00:08:56.000000
388,Biden hammered for spending time at Camp David while Taliban take over Afghanistan,0.695903,2021-08-14 23:59:59.701004
200,"Biden, Trump trade accusations on Afghanistan",0.692594,2021-08-16 23:59:59.983031
216,Biden points fingers for Afghanistan debacle after promising he wouldn't 'blame others',0.678736,2021-08-15 23:59:59.982000
185,Republicans demand resignations over Biden's Afghanistan debacle,0.671901,2021-08-16 23:59:59.983031




150


Nikki Haley, other Republicans call for Biden's resignation or impeachment after attack at Kabul airport


Unnamed: 0,article_heading,Similarity score,article_date
12496,"Read excerpts from Biden’s prepared remarks on Afghanistan, to be delivered at the White House this afternoon.",0.803796,2021-04-14 15:32:18
11476,Biden recognizes the sacrifice of service members in a speech ahead of Memorial Day.,0.774038,2021-05-28 20:32:28
9867,"For Biden, Images of Defeat He Wanted to Avoid",0.758318,2021-08-15 23:27:05
1951,Taliban takeover leaves Afghan diplomats in Washington limbo,0.739022,2021-08-17 14:02:00
6361,Biden announces withdrawal from Afghanistan in speech heavy on symbolism,0.709257,2021-04-14 15:29:00




120


Nikki Haley, other Republicans call for Biden's resignation or impeachment after attack at Kabul airport


Unnamed: 0,article_heading,Similarity score,article_date
10568,"It’s Situation Normal for U.S. Diplomats in Kabul, Despite Taliban Gains",0.791096,2021-07-12 20:45:22.000000
6361,Biden announces withdrawal from Afghanistan in speech heavy on symbolism,0.760508,2021-04-14 15:29:00.000000
12496,"Read excerpts from Biden’s prepared remarks on Afghanistan, to be delivered at the White House this afternoon.",0.745555,2021-04-14 15:32:18.000000
163,Lindsey Graham says heads should roll at State Department over Afghanistan debacle,0.739434,2021-08-17 23:59:59.983002
5927,Harris says she had key role in Biden's Afghanistan withdrawal decision,0.735206,2021-04-25 10:50:00.000000




90


Nikki Haley, other Republicans call for Biden's resignation or impeachment after attack at Kabul airport


Unnamed: 0,article_heading,Similarity score,article_date
14612,"Trump Departs Vowing, ‘We Will Be Back in Some Form’",0.801028,2021-01-20 22:37:58.000000
6361,Biden announces withdrawal from Afghanistan in speech heavy on symbolism,0.792973,2021-04-14 15:29:00.000000
18055,Trump tells Americans 'have a good life' as he leaves White House for final time,0.791276,2021-01-20 15:53:05.000000
18064,'We did what we came here to do': Trump fails to mention Biden in farewell address,0.785036,2021-01-19 23:43:33.000000
246,Jen Psaki 'out of the office' as Biden remains silent on Taliban takeover of Afghanistan,0.784945,2021-08-14 23:59:59.983003




60


Nikki Haley, other Republicans call for Biden's resignation or impeachment after attack at Kabul airport


Unnamed: 0,article_heading,Similarity score,article_date
167,Republican senators question Biden's fitness for office amid Afghanistan debacle,0.740246,2021-08-17 23:59:59.983036
6361,Biden announces withdrawal from Afghanistan in speech heavy on symbolism,0.727123,2021-04-14 15:29:00.000000
367,Sen. Tuberville slams 'Squad' for not hearing 'one word' on protecting women in Afghanistan,0.71783,2021-08-17 23:59:59.950001
142,John McCain's past remarks on Blinken being 'dangerous to America' resurface amid Afghanistan debacle,0.704015,2021-08-18 23:59:59.983000
14327,Marjorie Taylor Greene’s Controversies Are Piling Up. Republicans Are Quiet.,0.697866,2021-01-30 00:36:46.000000




30


Nikki Haley, other Republicans call for Biden's resignation or impeachment after attack at Kabul airport


Unnamed: 0,article_heading,Similarity score,article_date
14613,A Call for Unity to a Nation Facing a Pandemic and Division,0.95305,2021-01-20 21:43:09
14577,"Michael Ellis, a Trump appointee at the N.S.A. who was sworn in on Tuesday, has been placed on leave.",0.951212,2021-01-21 01:52:20
15103,Trump is said to have discussed pardoning himself.,0.951015,2021-01-07 20:41:44
14621,Trump was not the first president to snub an inauguration.,0.950968,2021-01-20 09:51:20
14703,"Sharpshooters, Protesters, a Secret Train Trip",0.950925,2021-01-19 00:19:32


# Fauci Covid

In [14]:
for component in components_saved:
    print("\n")
    print(component)
    print("\n")
    topic_vectors_train,vectorizer,model = get_saved_models(component)
    test_indices = [60]
    test_lemmas = text_pipeline(df_test.iloc[test_indices].article_body)
    lemma_test_vectors = text_vectorizer(test_lemmas,vectorizer)
    topic_vectors_test = get_topic_vectors(lemma_test_vectors,model)
    similarity_scores = get_similar_articles(topic_vectors_test,topic_vectors_train)
    result = print_similar_articles(test_indices,similarity_scores)
    display(result)



300




('Fauci blamed measles outbreak on Hasidic Jews; Orthodox leaders set the record straight',
                                                                        article_heading  \
 2480   White House back-channeling with networks on Covid coverage                       
 15681  Republican governor says ‘time to start blaming unvaccinated’ for rise in cases   
 2005   Swing voters, week 1: ‘Election is no slam-dunk’                                  
 2283   Fauci urges more testing to track breakthrough Covid cases                        
 15690  ‘It’s too late’: US doctor says dying patients begging for Covid vaccine          
 
        Similarity score        article_date  
 2480   0.882582         2021-08-03 04:30:00  
 15681  0.877112         2021-07-23 14:13:19  
 2005   0.860398         2021-08-16 11:24:00  
 2283   0.837508         2021-08-08 12:19:00  
 15690  0.836280         2021-07-22 16:19:14  )



240




('Fauci blamed measles outbreak on Hasidic Jews; Orthodox leaders set the record straight',
                                                                          article_heading  \
 15589  New York City to require vaccination for indoor dining, concerts and gyms           
 715    NYC to recommend, not mandate masks for vaccinated people indoors                   
 17380  Will I have to wear a mask after getting the Covid vaccine? The science explained   
 15703  Delta variant accounts for 83% of new cases in US, CDC director says                
 5477   ‘Doomsday scenario’: Lagging vaccine rates stir fears of dangerous variants         
 
        Similarity score               article_date  
 15589  0.929702         2021-08-03 16:26:53.000000  
 715    0.927695         2021-08-01 23:59:59.980207  
 17380  0.918070         2021-03-12 19:54:15.000000  
 15703  0.912664         2021-07-21 12:48:08.000000  
 5477   0.912347         2021-05-06 04:30:00.000000  )



180




('Fauci blamed measles outbreak on Hasidic Jews; Orthodox leaders set the record straight',
                                                                                                      article_heading  \
 14487  ‘An incredible scale of tragedy’: The U.S. records 25 million virus cases.                                      
 13655  D.C.’s Mayor Mourned Covid’s Unequal Toll. Her Sister Is the Latest Victim.                                     
 15589  New York City to require vaccination for indoor dining, concerts and gyms                                       
 9914   Fauci calls booster shots ‘likely,’ not now but in future, citing early signs that vaccine immunity may wane.   
 2283   Fauci urges more testing to track breakthrough Covid cases                                                      
 
        Similarity score        article_date  
 14487  0.920223         2021-01-23 21:30:33  
 13655  0.919705         2021-02-26 00:47:28  
 15589  0.911439         2021-08-03 16:



150




('Fauci blamed measles outbreak on Hasidic Jews; Orthodox leaders set the record straight',
                                                                         article_heading  \
 15568  Millennials hit with biggest increase in California Covid cases                    
 10549  Top Tennessee Vaccine Official Says She Was Fired Over Shots for Teens             
 17220  Michigan Covid surge raises fears it’s an early sign of things to come across US   
 15703  Delta variant accounts for 83% of new cases in US, CDC director says               
 15288  The Los Angeles area is hit with a Christmas coronavirus surge.                    
 
        Similarity score        article_date  
 15568  0.902537         2021-08-04 20:06:22  
 10549  0.897042         2021-07-13 17:14:43  
 17220  0.893785         2021-03-24 09:00:35  
 15703  0.890622         2021-07-21 12:48:08  
 15288  0.888298         2021-01-02 22:52:54  )



120




('Fauci blamed measles outbreak on Hasidic Jews; Orthodox leaders set the record straight',
                                                                        article_heading  \
 12244  Some mass vaccination sites in U.S. close as demand begins to fall.               
 15703  Delta variant accounts for 83% of new cases in US, CDC director says              
 16629  Decline in US Covid vaccinations presents new problem: how to shrink operations   
 8287   Policy hackathon: How to fix vaccination                                          
 5625   Feds rethink vaccination strategy as slowing demand reveals stark divide          
 
        Similarity score        article_date  
 12244  0.827969         2021-04-23 10:56:52  
 15703  0.791402         2021-07-21 12:48:08  
 16629  0.775511         2021-05-02 06:00:25  
 8287   0.768793         2021-02-18 04:30:00  
 5625   0.768463         2021-05-03 04:30:00  )



90




('Fauci blamed measles outbreak on Hasidic Jews; Orthodox leaders set the record straight',
                                                                              article_heading  \
 10906  Louisiana, lagging in vaccinations, gambles on a lottery.                               
 10549  Top Tennessee Vaccine Official Says She Was Fired Over Shots for Teens                  
 9928   Inside a San Antonio pediatric hospital, children battle for breath. Parents cry.       
 932    White House says COVID vaccinations 'picking back up' amid concern over delta variant   
 11081  Could the U.S. Have Saved More Lives? 5 Alternate Scenarios for the Vaccine Rollout     
 
        Similarity score            article_date  
 10906  0.943304         2021-06-25 09:01:14.000  
 10549  0.942904         2021-07-13 17:14:43.000  
 9928   0.940909         2021-08-12 20:51:40.000  
 932    0.940344         2021-07-25 23:59:59.984  
 11081  0.939825         2021-06-17 17:46:12.000  )



60




('Fauci blamed measles outbreak on Hasidic Jews; Orthodox leaders set the record straight',
                                                                                article_heading  \
 15896  Threat to vulnerable Americans rises as Delta variant spreads                             
 3363   Delta variant said to be far more widespread than federal estimates                       
 17265  Spread of Covid variants fuels concern in Michigan as state reopens                       
 1907   Biden will require vaccines for staff at federally funded nursing homes                   
 8006   CDC's draft guidelines for vaccinated Americans call for small steps toward normal life   
 
        Similarity score        article_date  
 15896  0.905311         2021-07-01 10:00:05  
 3363   0.888557         2021-07-08 08:59:00  
 17265  0.883953         2021-03-20 20:32:37  
 1907   0.883746         2021-08-18 15:41:00  
 8006   0.883403         2021-03-02 10:37:00  )



30




('Fauci blamed measles outbreak on Hasidic Jews; Orthodox leaders set the record straight',
                                                                article_heading  \
 16139  Seattle becomes most vaccinated city in US, mayor says                    
 3065   Poll: Unvaccinated Americans least concerned about Delta variant          
 2495   Poll: Slim majority in U.S. backs return of masking                       
 6212   Half of U.S. adults now have received at least 1 Covid shot               
 15275  As the Virus Spikes, Vaccine Distribution Is One More Hurdle for States   
 
        Similarity score        article_date  
 16139  0.989582         2021-06-10 15:31:10  
 3065   0.954764         2021-07-18 12:43:00  
 2495   0.953928         2021-08-02 13:26:00  
 6212   0.936165         2021-04-18 15:30:00  
 15275  0.930603         2021-01-03 23:16:21  )

# New Jersey ban on ICE

In [15]:
for component in components_saved:
    print("\n")
    print(component)
    print("\n")
    topic_vectors_train,vectorizer,model = get_saved_models(component)
    test_indices = [107]
    test_lemmas = text_pipeline(df_test.iloc[test_indices].article_body)
    lemma_test_vectors = text_vectorizer(test_lemmas,vectorizer)
    topic_vectors_test = get_topic_vectors(lemma_test_vectors,model)
    similarity_scores = get_similar_articles(topic_vectors_test,topic_vectors_train)
    result = print_similar_articles(test_indices,similarity_scores)
    display(result)



300




('New Jersey bans jails from contracting with ICE to hold immigration detainees',
                                                                                 article_heading  \
 2069  'I feel like I did the right thing,' Cuomo tells mag in first interview since resignation   
 3332  Murphy now no. 2 at NGA                                                                     
 5570  Brent Spiner has spoken                                                                     
 5693  New Jersey grew because of pizza                                                            
 3479  It feels like 2013                                                                          
 
       Similarity score        article_date  
 2069  0.815930         2021-08-13 20:15:00  
 3332  0.812244         2021-07-09 06:55:00  
 5570  0.812039         2021-05-04 07:00:00  
 5693  0.811083         2021-04-30 06:55:00  
 3479  0.807662         2021-06-28 07:30:00  )



240




('New Jersey bans jails from contracting with ICE to hold immigration detainees',
                                                    article_heading  \
 9356  The Trump implosion auction, brought to you by a Republican?   
 5570  Brent Spiner has spoken                                        
 7397  The details of the Johnson allegation                          
 6933  Should reporters be a protected class in N.J.?                 
 6763  A peek at the Christie Institute's finances                    
 
       Similarity score        article_date  
 9356  0.860522         2021-01-22 06:55:00  
 5570  0.857629         2021-05-04 07:00:00  
 7397  0.854525         2021-03-19 06:55:00  
 6933  0.852794         2021-03-31 06:55:00  
 6763  0.849693         2021-04-05 07:30:00  )



180




('New Jersey bans jails from contracting with ICE to hold immigration detainees',
                                           article_heading  Similarity score  \
 6280  Murphy challenges Sweeney on gun control             0.886096           
 8960  Senate Republican Leader Kean won't seek reelection  0.870437           
 5078  It's just another masked Monday                      0.869855           
 6441  Lisa McCormick is the new Kanye West                 0.868756           
 1928  Another tough choice for Rabner                      0.867678           
 
             article_date  
 6280 2021-04-16 06:58:00  
 8960 2021-02-02 07:30:00  
 5078 2021-05-17 07:00:00  
 6441 2021-04-13 07:00:00  
 1928 2021-08-18 07:00:00  )



150




('New Jersey bans jails from contracting with ICE to hold immigration detainees',
                                   article_heading  Similarity score  \
 4885  Senate moves bill to ban ICE jail contracts  0.830091           
 3332  Murphy now no. 2 at NGA                      0.827526           
 8713  The legal weed saga continues                0.821075           
 7690  More problems in District 37                 0.821040           
 6280  Murphy challenges Sweeney on gun control     0.820780           
 
             article_date  
 4885 2021-05-21 06:55:00  
 3332 2021-07-09 06:55:00  
 8713 2021-02-08 07:30:00  
 7690 2021-03-12 06:55:00  
 6280 2021-04-16 06:58:00  )



120




('New Jersey bans jails from contracting with ICE to hold immigration detainees',
                                   article_heading  Similarity score  \
 4885  Senate moves bill to ban ICE jail contracts  0.901565           
 5693  New Jersey grew because of pizza             0.901258           
 7510  Johnson and Huttle and ICE contracts         0.900855           
 6280  Murphy challenges Sweeney on gun control     0.900101           
 1928  Another tough choice for Rabner              0.899309           
 
             article_date  
 4885 2021-05-21 06:55:00  
 5693 2021-04-30 06:55:00  
 7510 2021-03-17 07:01:00  
 6280 2021-04-16 06:58:00  
 1928 2021-08-18 07:00:00  )



90




('New Jersey bans jails from contracting with ICE to hold immigration detainees',
                                                                            article_heading  \
 7544  ‘Our state has forgotten us’: Immigrant groups sour on New Jersey’s liberal governor   
 7510  Johnson and Huttle and ICE contracts                                                   
 6280  Murphy challenges Sweeney on gun control                                               
 7690  More problems in District 37                                                           
 5693  New Jersey grew because of pizza                                                       
 
       Similarity score        article_date  
 7544  0.897390         2021-03-16 13:41:00  
 7510  0.881991         2021-03-17 07:01:00  
 6280  0.877478         2021-04-16 06:58:00  
 7690  0.875522         2021-03-12 06:55:00  
 5693  0.875320         2021-04-30 06:55:00  )



60




('New Jersey bans jails from contracting with ICE to hold immigration detainees',
                                                                                article_heading  \
 15913  Ice transfers 30 detainees to unknown location amid hunger strike                         
 7544   ‘Our state has forgotten us’: Immigrant groups sour on New Jersey’s liberal governor      
 1125   Biden stands by nominee linked to tree-spiking plot despite new revelations, Psaki says   
 1188   McConnell calls for Biden to withdraw nominee linked to eco-terrorist plot                
 261    New Jersey illegal immigrant charged with murder of 18-year-old high school soccer star   
 
        Similarity score               article_date  
 15913  0.901773         2021-06-29 20:34:32.000000  
 7544   0.829778         2021-03-16 13:41:00.000000  
 1125   0.775712         2021-07-18 23:59:59.983972  
 1188   0.773981         2021-07-15 23:59:59.982002  
 261    0.767860         2021-08-13 23:59:59.982999 



30




('New Jersey bans jails from contracting with ICE to hold immigration detainees',
                                                                         article_heading  \
 12459  Florida Man Who Posed as Immigration Lawyer Gets 20-Year Sentence                  
 9968   Biden Administration Violating Decree on Migrant Children, Court Filing Says       
 9194   Biden administration replaces top immigration court official                       
 18002  Biden official involved in removal of DoJ lawyer concerned by family separations   
 2046   Taliban seize power amid chaos in Afghanistan                                      
 
        Similarity score        article_date  
 12459  0.990432         2021-04-15 21:05:03  
 9968   0.990313         2021-08-10 01:31:21  
 9194   0.983978         2021-01-27 11:06:00  
 18002  0.983542         2021-01-23 02:31:07  
 2046   0.976060         2021-08-15 10:33:00  )

# Cuomo Sexual harrasement allegations resign

In [16]:
for component in components_saved:
    print("\n")
    print(component)
    print("\n")
    topic_vectors_train,vectorizer,model = get_saved_models(component)
    test_indices = [95]
    test_lemmas = text_pipeline(df_test.iloc[test_indices].article_body)
    lemma_test_vectors = text_vectorizer(test_lemmas,vectorizer)
    topic_vectors_test = get_topic_vectors(lemma_test_vectors,model)
    similarity_scores = get_similar_articles(topic_vectors_test,topic_vectors_train)
    result = print_similar_articles(test_indices,similarity_scores)
    display(result)



300




('Cuomo snaps at reporter when confronted about his scandals at hurricane briefing',
                                                                                       article_heading  \
 2449   Gov. Andrew Cuomo denies report findings that he sexually harassed women                         
 15587  Andrew Cuomo sexual harassment: the key testimony from the report                                
 679    Cuomo defiantly declines to resign, denies wrongdoing after bombshell sexual harassment report   
 17281  Cuomo accuser recalls toxic workplace culture 'especially for women'                             
 2459   AG: Cuomo sexually harassed multiple women, broke state and federal law                          
 
        Similarity score            article_date  
 2449   0.990925         2021-08-03 13:35:00.000  
 15587  0.990371         2021-08-03 20:18:42.000  
 679    0.989831         2021-08-02 23:59:59.981  
 17281  0.989769         2021-03-19 22:21:00.000  
 2459   0.985904      



240




('Cuomo snaps at reporter when confronted about his scandals at hurricane briefing',
                                                                            article_heading  \
 15588  Cuomo faces calls to quit after inquiry finds he sexually harassed 11 women           
 2449   Gov. Andrew Cuomo denies report findings that he sexually harassed women              
 2265   ‘What he did to me was a crime’: Cuomo accuser goes public with groping allegations   
 7529   More women reaching out with Cuomo harassment allegations, accuser’s attorney says    
 568    DeBlasio calls on 'narcissist' Cuomo to 'get the hell out of the way' and resign      
 
        Similarity score            article_date  
 15588  0.982332         2021-08-03 19:27:03.000  
 2449   0.980721         2021-08-03 13:35:00.000  
 2265   0.978419         2021-08-09 08:24:00.000  
 7529   0.977373         2021-03-16 19:05:00.000  
 568    0.977216         2021-08-04 23:59:59.983  )



180




('Cuomo snaps at reporter when confronted about his scandals at hurricane briefing',
                                                                                           article_heading  \
 679    Cuomo defiantly declines to resign, denies wrongdoing after bombshell sexual harassment report       
 2449   Gov. Andrew Cuomo denies report findings that he sexually harassed women                             
 15587  Andrew Cuomo sexual harassment: the key testimony from the report                                    
 508    Cuomo executive assistant Brittany Commisso goes public, describes alleged misconduct in interview   
 547    Cuomo lawyers blast AG report, saying evidence was left out: 'Doesn't pass muster'                   
 
        Similarity score               article_date  
 679    0.992912         2021-08-02 23:59:59.981000  
 2449   0.991405         2021-08-03 13:35:00.000000  
 15587  0.990869         2021-08-03 20:18:42.000000  
 508    0.990610         2021-08-08 2



150




('Cuomo snaps at reporter when confronted about his scandals at hurricane briefing',
                                                                      article_heading  \
 509    Who is Cuomo accuser Brittany Commisso?                                         
 518    Cuomo executive assistant comes forward, alleges NY governor 'broke the law'    
 1167   Cuomo grilled by AG's lawyers in sexual harassment probe                        
 2459   AG: Cuomo sexually harassed multiple women, broke state and federal law         
 17529  New York attorney general seeks to investigate Cuomo sexual harassment claims   
 
        Similarity score               article_date  
 509    0.994832         2021-08-08 23:59:59.983000  
 518    0.994751         2021-08-07 23:59:59.981001  
 1167   0.994134         2021-07-16 23:59:59.949000  
 2459   0.993738         2021-08-03 11:26:00.000000  
 17529  0.993366         2021-02-28 22:55:54.000000  )



120




('Cuomo snaps at reporter when confronted about his scandals at hurricane briefing',
                                                                                                  article_heading  \
 470   Cuomo detractors say resignation shouldn't be end of scandal, governor should be 'prosecuted and arrested'   
 7865  Cuomo accuser says governor’s national profile emboldened him                                                
 505   Time's Up chairwoman resigns amid backlash for effort to aid Cuomo, discredit one of his accusers            
 568   DeBlasio calls on 'narcissist' Cuomo to 'get the hell out of the way' and resign                             
 622   Plane flies 'Remove Cuomo' banner above New York state Capitol                                               
 
       Similarity score               article_date  
 470   0.988576         2021-08-09 23:59:59.972002  
 7865  0.985963         2021-03-04 20:05:00.000000  
 505   0.985339         2021-08-08 23:59:59.982566  



90




('Cuomo snaps at reporter when confronted about his scandals at hurricane briefing',
                                                                            article_heading  \
 7865   Cuomo accuser says governor’s national profile emboldened him                         
 15587  Andrew Cuomo sexual harassment: the key testimony from the report                     
 8267   New York assemblymember: Cuomo 'berated me,' asked me to lie about alleged cover-up   
 475    NY Gov. Cuomo resigns amid sexual harassment scandal                                  
 2265   ‘What he did to me was a crime’: Cuomo accuser goes public with groping allegations   
 
        Similarity score               article_date  
 7865   0.995420         2021-03-04 20:05:00.000000  
 15587  0.995420         2021-08-03 20:18:42.000000  
 8267   0.995327         2021-02-18 08:48:00.000000  
 475    0.995193         2021-08-09 23:59:59.984001  
 2265   0.995124         2021-08-09 08:24:00.000000  )



60




('Cuomo snaps at reporter when confronted about his scandals at hurricane briefing',
                                                                                           article_heading  \
 7865   Cuomo accuser says governor’s national profile emboldened him                                        
 17138  Sherry Vill is latest to accuse Andrew Cuomo of sexual misconduct                                    
 675    NY Gov. Cuomo was recorded singing ‘Do You Love Me?’ to accuser Charlotte Bennett                    
 2265   ‘What he did to me was a crime’: Cuomo accuser goes public with groping allegations                  
 508    Cuomo executive assistant Brittany Commisso goes public, describes alleged misconduct in interview   
 
        Similarity score               article_date  
 7865   0.994899         2021-03-04 20:05:00.000000  
 17138  0.994357         2021-03-29 22:06:54.000000  
 675    0.993358         2021-08-02 23:59:59.981015  
 2265   0.993221         2021-08-09 0



30




('Cuomo snaps at reporter when confronted about his scandals at hurricane briefing',
                                                                              article_heading  \
 7361   Current staffer publicly accuses Cuomo of sexual harassment                             
 13319  Why Al Franken Hovers Over the Debate About Cuomo’s Future                              
 9944   Railing at Enemies and Pleading for Time: Inside Cuomo’s Final Days                     
 17138  Sherry Vill is latest to accuse Andrew Cuomo of sexual misconduct                       
 17302  Cuomo scandal: sexual harassment rife in New York state capitol, female reporters say   
 
        Similarity score        article_date  
 7361   0.995708         2021-03-19 17:47:00  
 13319  0.995579         2021-03-12 21:33:58  
 9944   0.995506         2021-08-11 00:25:15  
 17138  0.995452         2021-03-29 22:06:54  
 17302  0.995371         2021-03-18 17:37:28  )

# Random

In [17]:
import random
test_indices = random.sample(range(df_test.shape[0]), 1)
for component in components_saved:
    print("\n")
    print(component)
    print("\n")
    topic_vectors_train,vectorizer,model = get_saved_models(component)
    test_lemmas = text_pipeline(df_test.iloc[test_indices].article_body)
    lemma_test_vectors = text_vectorizer(test_lemmas,vectorizer)
    topic_vectors_test = get_topic_vectors(lemma_test_vectors,model)
    similarity_scores = get_similar_articles(topic_vectors_test,topic_vectors_train)
    result = print_similar_articles(test_indices,similarity_scores)
    display(result)



300




('Holes in reporting of breakthrough Covid cases hamper CDC response',
                                                        article_heading  \
 3479  It feels like 2013                                                 
 6664  Why get a farmland tax assessment when you can have a parsonage?   
 2850  Guadag-no longer a Republican                                      
 6617  A real election in Camden?                                         
 7624  The soon-to-be Notorious RWA                                       
 
       Similarity score        article_date  
 3479  0.982843         2021-06-28 07:30:00  
 6664  0.982386         2021-04-07 06:55:00  
 2850  0.982358         2021-07-23 06:55:00  
 6617  0.981856         2021-04-08 06:58:00  
 7624  0.981621         2021-03-15 07:30:00  )



240




('Holes in reporting of breakthrough Covid cases hamper CDC response',
                                                    article_heading  \
 5254  Massive Hindu temple raided                                    
 7397  The details of the Johnson allegation                          
 7265  The Asbury Park Pressed                                        
 8335  Cannabis gets stuck in the weeds, again                        
 2590  Sorry, chief justice. It's not looking good on redistricting   
 
       Similarity score        article_date  
 5254  0.980855         2021-05-12 06:55:00  
 7397  0.979490         2021-03-19 06:55:00  
 7265  0.979266         2021-03-23 07:30:00  
 8335  0.976212         2021-02-17 06:55:00  
 2590  0.976205         2021-07-30 06:55:00  )



180




('Holes in reporting of breakthrough Covid cases hamper CDC response',
                                      article_heading  Similarity score  \
 3003  Republican senator joins Murphy administration  0.985538           
 7265  The Asbury Park Pressed                         0.981996           
 9152  Murphy sticks by Hicks amid Edna Mahan scandal  0.980113           
 6441  Lisa McCormick is the new Kanye West            0.977843           
 5078  It's just another masked Monday                 0.977801           
 
             article_date  
 3003 2021-07-20 07:00:00  
 7265 2021-03-23 07:30:00  
 9152 2021-01-28 06:58:00  
 6441 2021-04-13 07:00:00  
 5078 2021-05-17 07:00:00  )



150




('Holes in reporting of breakthrough Covid cases hamper CDC response',
                                                                       article_heading  \
 7751  Will a state Senate race have echos of Trumpism?                                  
 7453  Murphy goes in for a Gordon                                                       
 2138  POLITICO Playbook: Pelosi’s summer squeeze                                        
 1994  'I stand squarely behind my decision': Biden holds firm on Afghanistan drawdown   
 6441  Lisa McCormick is the new Kanye West                                              
 
       Similarity score        article_date  
 7751  0.978549         2021-03-11 06:58:00  
 7453  0.978100         2021-03-18 06:58:00  
 2138  0.977505         2021-08-12 06:03:00  
 1994  0.977505         2021-08-16 16:33:00  
 6441  0.977417         2021-04-13 07:00:00  )



120




('Holes in reporting of breakthrough Covid cases hamper CDC response',
                                           article_heading  Similarity score  \
 5693  New Jersey grew because of pizza                     0.981479           
 8477  A new idea on mandatory minimum sentences            0.980913           
 2955  Ciattarelli asks the right for 'wiggle room'         0.980640           
 7453  Murphy goes in for a Gordon                          0.979702           
 8960  Senate Republican Leader Kean won't seek reelection  0.979146           
 
             article_date  
 5693 2021-04-30 06:55:00  
 8477 2021-02-12 06:55:00  
 2955 2021-07-21 06:55:00  
 7453 2021-03-18 06:58:00  
 8960 2021-02-02 07:30:00  )



90




('Holes in reporting of breakthrough Covid cases hamper CDC response',
                                         article_heading  Similarity score  \
 7110  NJ vs. GA on voting laws                           0.986152           
 7690  More problems in District 37                       0.985765           
 5254  Massive Hindu temple raided                        0.985660           
 6836  Ciattarelli and Christie: Still not cool together  0.985141           
 6088  Rizzo gets another shot at matching funds          0.984033           
 
             article_date  
 7110 2021-03-26 06:55:00  
 7690 2021-03-12 06:55:00  
 5254 2021-05-12 06:55:00  
 6836 2021-04-02 06:55:00  
 6088 2021-04-21 06:55:00  )



60




('Holes in reporting of breakthrough Covid cases hamper CDC response',
                                                                       article_heading  \
 5153  O'Toole's Port Authority                                                          
 3432  Murphy to sign rushed budget... 5 days later                                      
 1994  'I stand squarely behind my decision': Biden holds firm on Afghanistan drawdown   
 2138  POLITICO Playbook: Pelosi’s summer squeeze                                        
 8653  Donnelly presses ahead                                                            
 
       Similarity score        article_date  
 5153  0.982976         2021-05-14 06:55:00  
 3432  0.982654         2021-06-29 07:00:00  
 1994  0.982300         2021-08-16 16:33:00  
 2138  0.982300         2021-08-12 06:03:00  
 8653  0.980818         2021-02-09 07:30:00  )



30




('Holes in reporting of breakthrough Covid cases hamper CDC response',
                                        article_heading  Similarity score  \
 7110  NJ vs. GA on voting laws                          0.980599           
 5078  It's just another masked Monday                   0.979449           
 7751  Will a state Senate race have echos of Trumpism?  0.979217           
 8170  Is legal weed finally upon us?                    0.978516           
 6710  George Gilmore's latest move                      0.975206           
 
             article_date  
 7110 2021-03-26 06:55:00  
 5078 2021-05-17 07:00:00  
 7751 2021-03-11 06:58:00  
 8170 2021-02-22 07:30:00  
 6710 2021-04-06 07:00:00  )

## Article heading in the weightage

In [18]:
def get_similar_articles(test_indices,similarity_scores):
    values = np.sort(similarity_scores, axis=0)[::-1,:][:5,:]
    similarity_array = np.argsort(similarity_scores, axis=0)[::-1,:][:5,:]
    for i in range(similarity_array.shape[1]):
        indices = similarity_array[:,i]
        return df_test.iloc[test_indices[i]].article_heading,pd.DataFrame({'article_heading':df_train.iloc[indices].article_heading,'Similarity score':values[:,i],'article_date':df_train.iloc[indices].article_published_on})

In [19]:
def get_similarity_score(text_vectors, X, top_n_values=10):
    """
    Evalute the cosine similarity between provided 'text_vectors' and trained X (articles trained and stored as a vecotr of topics).
    Return dataframe with index as trained articles and columns as text_vector indices with values as similarity scores
    """
    similarity_scores = cosine_similarity(X,text_vectors,dense_output=True)
    return similarity_scores
#     return np.argsort(similarity_scores, axis=0)[::-1,:][:top_n_values,:]

In [20]:
def process_news_article(component,include_headings=False,heading_weightage=0.6,test_indices=None):
    topic_vectors_train,vectorizer,model = get_saved_models(component)
    test_lemmas = text_pipeline(df_test.iloc[test_indices].article_body)
    lemma_test_vectors = text_vectorizer(test_lemmas,vectorizer)
    topic_vectors_test = get_topic_vectors(lemma_test_vectors,model)
    similarity_scores = get_similarity_score(topic_vectors_test,topic_vectors_train)
    heading, result = get_similar_articles(test_indices,similarity_scores)
    if include_headings:
        test_lemmas = text_pipeline(df_test.iloc[test_indices].article_heading)
        lemma_test_vectors = text_vectorizer(test_lemmas,vectorizer)
        topic_vectors_test = get_topic_vectors(lemma_test_vectors,model)
        similarity_scores = heading_weightage*(get_similarity_score(topic_vectors_test,topic_vectors_train))+(1-heading_weightage)*similarity_scores
        _, result = get_similar_articles(test_indices,similarity_scores)
    return heading, result

In [21]:
import random
# test_indices = random.sample(range(df_test.shape[0]), 1)
test_indices = [107]
for component in components_saved:
    print("\n")
    print(component)
    print("\n")
    heading, result = process_news_article(component,include_headings=True,heading_weightage=0.75,test_indices=test_indices)
    print(heading)
    display(result)



300


New Jersey bans jails from contracting with ICE to hold immigration detainees


Unnamed: 0,article_heading,Similarity score,article_date
15913,Ice transfers 30 detainees to unknown location amid hunger strike,0.88785,2021-06-29 20:34:32
17579,California lawmakers push to stop deportations and end jail transfers to Ice,0.798793,2021-02-24 22:07:16
16290,California governor pardons formerly incarcerated firefighters,0.789293,2021-05-29 01:45:18
14961,Manhunt Underway for Six Escaped California Inmates,0.786895,2021-01-11 20:52:27
14899,One of Six Inmates Who Escaped From California Jail Is Recaptured,0.786895,2021-01-13 04:17:33




240


New Jersey bans jails from contracting with ICE to hold immigration detainees


Unnamed: 0,article_heading,Similarity score,article_date
15913,Ice transfers 30 detainees to unknown location amid hunger strike,0.875368,2021-06-29 20:34:32
9356,"The Trump implosion auction, brought to you by a Republican?",0.692033,2021-01-22 06:55:00
5570,Brent Spiner has spoken,0.686218,2021-05-04 07:00:00
7397,The details of the Johnson allegation,0.684969,2021-03-19 06:55:00
6933,Should reporters be a protected class in N.J.?,0.684514,2021-03-31 06:55:00




180


New Jersey bans jails from contracting with ICE to hold immigration detainees


Unnamed: 0,article_heading,Similarity score,article_date
15913,Ice transfers 30 detainees to unknown location amid hunger strike,0.780847,2021-06-29 20:34:32
5570,Brent Spiner has spoken,0.728121,2021-05-04 07:00:00
6280,Murphy challenges Sweeney on gun control,0.723129,2021-04-16 06:58:00
4885,Senate moves bill to ban ICE jail contracts,0.72048,2021-05-21 06:55:00
6441,Lisa McCormick is the new Kanye West,0.71997,2021-04-13 07:00:00




150


New Jersey bans jails from contracting with ICE to hold immigration detainees


Unnamed: 0,article_heading,Similarity score,article_date
15913,Ice transfers 30 detainees to unknown location amid hunger strike,0.899883,2021-06-29 20:34:32
17880,New claims of migrant abuse as Ice defies Biden to continue deportations,0.856205,2021-02-02 23:50:11
18138,Ice flies African asylum seekers to Nairobi in last-minute deportation push,0.855822,2021-01-16 10:00:28
17856,Exclusive: Ice cancels deportation flight to Africa after claims of brutality,0.853252,2021-02-04 21:06:33
11934,South Carolina is set to become the 4th state to allow firing squads to carry out capital punishment.,0.838575,2021-05-07 14:30:03




120


New Jersey bans jails from contracting with ICE to hold immigration detainees


Unnamed: 0,article_heading,Similarity score,article_date
15913,Ice transfers 30 detainees to unknown location amid hunger strike,0.881812,2021-06-29 20:34:32.000000
14417,"In the first blow to Biden’s immigration agenda, a federal judge blocks a 100-day pause on deportations.",0.702081,2021-01-26 22:10:01.000000
13149,Democrats Confront a Surge at the Border,0.701934,2021-03-19 23:00:04.000000
1180,Federal judge orders Biden administration to end new DACA applications,0.701287,2021-07-15 23:59:59.982002
11153,Biden extends temporary work permissions for some undocumented immigrants who are victims of crime.,0.701254,2021-06-14 21:49:22.000000




90


New Jersey bans jails from contracting with ICE to hold immigration detainees


Unnamed: 0,article_heading,Similarity score,article_date
15913,Ice transfers 30 detainees to unknown location amid hunger strike,0.948649,2021-06-29 20:34:32
7544,‘Our state has forgotten us’: Immigrant groups sour on New Jersey’s liberal governor,0.783868,2021-03-16 13:41:00
7510,Johnson and Huttle and ICE contracts,0.763587,2021-03-17 07:01:00
6280,Murphy challenges Sweeney on gun control,0.760162,2021-04-16 06:58:00
5693,New Jersey grew because of pizza,0.755846,2021-04-30 06:55:00




60


New Jersey bans jails from contracting with ICE to hold immigration detainees


Unnamed: 0,article_heading,Similarity score,article_date
15913,Ice transfers 30 detainees to unknown location amid hunger strike,0.941161,2021-06-29 20:34:32.000000
17769,Authorities investigate reports of 80 migrants trapped in truck in Texas,0.832672,2021-02-11 18:33:02.000000
17351,Omar urges end to prison contracts to fix 'abuse-ridden' immigration detention system,0.783777,2021-03-15 18:26:22.000000
7544,‘Our state has forgotten us’: Immigrant groups sour on New Jersey’s liberal governor,0.783665,2021-03-16 13:41:00.000000
1361,"Border agents in Del Rio Sector nab migrants with convictions for child molestation, attempted murder",0.772428,2021-07-09 23:59:59.983001




30


New Jersey bans jails from contracting with ICE to hold immigration detainees


Unnamed: 0,article_heading,Similarity score,article_date
11285,El Chapo’s Wife Set to Plead Guilty to Helping Run Drug Empire,0.982683,2021-06-08 21:00:06.000000
11085,U.S. Ends Trump Policy Limiting Asylum for Gang and Domestic Violence Survivors,0.9794,2021-06-16 23:19:48.000000
443,ICE emails show how agency chief halted deportation after 'Abolish ICE' activist's request,0.974774,2021-08-09 23:59:59.982002
5998,Trump group teams with Texas in challenge to Biden immigration policies,0.970573,2021-04-22 18:15:00.000000
1523,Texas sheriffs sue Biden administration over immigration policy barring deportations,0.970311,2021-07-01 23:59:59.983000


# weightage to recent articles

In [22]:
import math

In [23]:
#exponential deay of importance: day_diff/100
math.exp(-0.3)

0.7408182206817179

In [24]:
import math
def get_article_importance_day_wise(growth=1000):
    diff_from_max_date = (df_train.article_published_on-(df_train.article_published_on.max())).dt.days
    return np.exp(diff_from_max_date/growth)

In [25]:
def get_similarity_score(text_vectors, X, top_n_values=10,factor=None):
    """
    Evalute the cosine similarity between provided 'text_vectors' and trained X (articles trained and stored as a vecotr of topics).
    Return dataframe with index as trained articles and columns as text_vector indices with values as similarity scores
    """
    similarity_scores = cosine_similarity(X,text_vectors,dense_output=True)
    return similarity_scores*factor
#     return np.argsort(similarity_scores, axis=0)[::-1,:][:top_n_values,:]

In [26]:
def process_news_article(component,include_headings=False,heading_weightage=0.6,test_indices=None,factor=np.ones((df_train.shape[0])).reshape(-1,1)):
    topic_vectors_train,vectorizer,model = get_saved_models(component)
    test_lemmas = text_pipeline(df_test.iloc[test_indices].article_body)
    lemma_test_vectors = text_vectorizer(test_lemmas,vectorizer)
    topic_vectors_test = get_topic_vectors(lemma_test_vectors,model)
    similarity_scores = get_similarity_score(topic_vectors_test,topic_vectors_train,factor=factor)
    heading, result = get_similar_articles(test_indices,similarity_scores)
    if include_headings:
        test_lemmas = text_pipeline(df_test.iloc[test_indices].article_heading)
        lemma_test_vectors = text_vectorizer(test_lemmas,vectorizer)
        topic_vectors_test = get_topic_vectors(lemma_test_vectors,model)
        similarity_scores = heading_weightage*(get_similarity_score(topic_vectors_test,topic_vectors_train,factor=factor))+(1-heading_weightage)*similarity_scores
        _, result = get_similar_articles(test_indices,similarity_scores)
    return heading, result

In [27]:
b=np.array([1,2,3])
b.reshape(-1,1)

array([[1],
       [2],
       [3]])

In [28]:
import random
# test_indices = random.sample(range(df_test.shape[0]), 1)
test_indices = [107]
for component in components_saved:
    print("\n")
    print(component)
    print("\n")
    factor = get_article_importance_day_wise(growth=1000).values.reshape(-1,1)
    heading, result = process_news_article(component,include_headings=True,heading_weightage=0.75,test_indices=test_indices,factor=factor)
    print(heading)
    display(result)



300


New Jersey bans jails from contracting with ICE to hold immigration detainees


Unnamed: 0,article_heading,Similarity score,article_date
15913,Ice transfers 30 detainees to unknown location amid hunger strike,0.842861,2021-06-29 20:34:32
15721,Survivors of California’s forced sterilizations: ‘It’s like my life wasn’t worth anything’,0.743036,2021-07-19 10:00:55
10426,Biden Administration Transfers Its First Detainee From Guantánamo Bay,0.740073,2021-07-19 09:02:45
16290,California governor pardons formerly incarcerated firefighters,0.726427,2021-05-29 01:45:18
10474,"A Pause in Federal Executions, but Uncertainty About What’s Next",0.724096,2021-07-16 19:59:07




240


New Jersey bans jails from contracting with ICE to hold immigration detainees


Unnamed: 0,article_heading,Similarity score,article_date
15913,Ice transfers 30 detainees to unknown location amid hunger strike,0.831012,2021-06-29 20:34:32.000000
1994,'I stand squarely behind my decision': Biden holds firm on Afghanistan drawdown,0.673443,2021-08-16 16:33:00.000000
2138,POLITICO Playbook: Pelosi’s summer squeeze,0.670755,2021-08-12 06:03:00.000000
2223,The governor's race has left the state,0.667445,2021-08-10 06:56:00.000000
269,New Jersey Dem governor jets off to Italy for vacation as coronavirus spikes in state,0.662628,2021-08-13 23:59:59.980998




180


New Jersey bans jails from contracting with ICE to hold immigration detainees


Unnamed: 0,article_heading,Similarity score,article_date
15913,Ice transfers 30 detainees to unknown location amid hunger strike,0.74128,2021-06-29 20:34:32.000000
1928,Another tough choice for Rabner,0.717575,2021-08-18 07:00:00.000000
2331,Redistricting about to get started,0.703368,2021-08-06 06:55:00.000000
269,New Jersey Dem governor jets off to Italy for vacation as coronavirus spikes in state,0.694654,2021-08-13 23:59:59.980998
177,Chris Christie slams Phil Murphy over NJ Dem's Italy vacation,0.69349,2021-08-17 23:59:59.976001




150


New Jersey bans jails from contracting with ICE to hold immigration detainees


Unnamed: 0,article_heading,Similarity score,article_date
15913,Ice transfers 30 detainees to unknown location amid hunger strike,0.854285,2021-06-29 20:34:32
15486,Cameroonian asylum-seeker sues US for alleged assault by Ice officers,0.782426,2021-08-12 10:00:14
16014,Leader behind bleach ‘miracle cure’ claims Trump consumed his product,0.780865,2021-06-22 09:00:01
3582,Biden bans solar panel material from Chinese firm over forced labor,0.764931,2021-06-24 13:21:00
11945,"After a Decade Without Executions, South Carolina’s Solution: Bring Out the Firing Squad",0.75499,2021-05-07 07:00:06




120


New Jersey bans jails from contracting with ICE to hold immigration detainees


Unnamed: 0,article_heading,Similarity score,article_date
15913,Ice transfers 30 detainees to unknown location amid hunger strike,0.837129,2021-06-29 20:34:32.000000
998,Harris tells DACA recipients that administration will take 'action' after program is ruled unlawful,0.679593,2021-07-21 23:59:59.981999
1180,Federal judge orders Biden administration to end new DACA applications,0.677166,2021-07-15 23:59:59.982002
1172,"Biden calls DACA ruling 'deeply disappointing,' urges Congress to pass citizenship path",0.676993,2021-07-16 23:59:59.968998
15729,"US justice department to appeal Daca court decision, says Biden",0.664803,2021-07-17 15:25:51.000000




90


New Jersey bans jails from contracting with ICE to hold immigration detainees


Unnamed: 0,article_heading,Similarity score,article_date
15913,Ice transfers 30 detainees to unknown location amid hunger strike,0.90058,2021-06-29 20:34:32
1928,Another tough choice for Rabner,0.751137,2021-08-18 07:00:00
2223,The governor's race has left the state,0.739743,2021-08-10 06:56:00
1886,It's not 2009,0.739618,2021-08-19 06:58:00
2069,"'I feel like I did the right thing,' Cuomo tells mag in first interview since resignation",0.739536,2021-08-13 20:15:00




60


New Jersey bans jails from contracting with ICE to hold immigration detainees


Unnamed: 0,article_heading,Similarity score,article_date
15913,Ice transfers 30 detainees to unknown location amid hunger strike,0.893471,2021-06-29 20:34:32.000000
533,"Border agents in Texas pick up MS-13 gang members, sex offender",0.755731,2021-08-06 23:59:59.978002
443,ICE emails show how agency chief halted deportation after 'Abolish ICE' activist's request,0.741623,2021-08-09 23:59:59.982002
1361,"Border agents in Del Rio Sector nab migrants with convictions for child molestation, attempted murder",0.741399,2021-07-09 23:59:59.983001
10506,Biden’s nominees to lead ICE and the Border Patrol are a sharp departure from the Trump era.,0.740953,2021-07-15 09:00:15.000000




30


New Jersey bans jails from contracting with ICE to hold immigration detainees


Unnamed: 0,article_heading,Similarity score,article_date
443,ICE emails show how agency chief halted deportation after 'Abolish ICE' activist's request,0.965075,2021-08-09 23:59:59.982002
9968,"Biden Administration Violating Decree on Migrant Children, Court Filing Says",0.935759,2021-08-10 01:31:21.000000
1980,Can America Still Help Afghanistan? 8 Former Officials on What’s Next.,0.935002,2021-08-17 04:30:00.000000
2226,"Biden railed against Trump’s immigration policies, now defends them in courts",0.927516,2021-08-10 04:31:00.000000
2124,"In Africa, a second effort to fight a third Covid wave",0.926837,2021-08-12 10:00:00.000000


## Ensemble

In [68]:
def get_similar_articles(test_indices,similarity_scores, top_n_values = 5):
    values = np.sort(similarity_scores, axis=0)[::-1,:][:top_n_values,:]
    similarity_array = np.argsort(similarity_scores, axis=0)[::-1,:][:top_n_values,:]
    for i in range(similarity_array.shape[1]):
        indices = similarity_array[:,i]
    return df_test.iloc[test_indices[i]].article_heading,pd.DataFrame({'article_heading':df_train.iloc[indices].article_heading,'Similarity score':values[:,i],'article_date':df_train.iloc[indices].article_published_on})

In [69]:
import math
def get_article_importance_day_wise(growth=1000):
    diff_from_max_date = (df_train.article_published_on-(df_train.article_published_on.max())).dt.days
    return np.exp(diff_from_max_date/growth)

In [70]:
def get_similarity_score(text_vectors, X, factor=None):
    """
    Evalute the cosine similarity between provided 'text_vectors' and trained X (articles trained and stored as a vecotr of topics).
    Return dataframe with index as trained articles and columns as text_vector indices with values as similarity scores
    """
    similarity_scores = cosine_similarity(X,text_vectors,dense_output=True)
    return similarity_scores*factor
#     return np.argsort(similarity_scores, axis=0)[::-1,:][:top_n_values,:]

In [71]:
def process_news_article(component,include_headings=False,heading_weightage=0.6,test_indices=None,factor=np.ones((df_train.shape[0])).reshape(-1,1)):
    topic_vectors_train,vectorizer,model = get_saved_models(component)
    test_lemmas = text_pipeline(df_test.iloc[test_indices].article_body)
    lemma_test_vectors = text_vectorizer(test_lemmas,vectorizer)
    topic_vectors_test = get_topic_vectors(lemma_test_vectors,model)
    similarity_scores = get_similarity_score(topic_vectors_test,topic_vectors_train,factor=factor)
    heading, result = get_similar_articles(test_indices,similarity_scores)
    if include_headings:
        test_lemmas = text_pipeline(df_test.iloc[test_indices].article_heading)
        lemma_test_vectors = text_vectorizer(test_lemmas,vectorizer)
        topic_vectors_test = get_topic_vectors(lemma_test_vectors,model)
        similarity_scores = heading_weightage*(get_similarity_score(topic_vectors_test,topic_vectors_train,factor=factor))+(1-heading_weightage)*similarity_scores
        _, result = get_similar_articles(test_indices,similarity_scores)
    return heading, result

In [72]:
def ensemble_similarity_scores(components,include_headings=False,heading_weightage=0.6,test_indices=None,factor=np.ones((df_train.shape[0])).reshape(-1,1)):
    component_similarity_scores = []
    for component in components:
        topic_vectors_train,vectorizer,model = get_saved_models(component)
        test_lemmas = text_pipeline(df_test.iloc[test_indices].article_body)
        lemma_test_vectors = text_vectorizer(test_lemmas,vectorizer)
        topic_vectors_test = get_topic_vectors(lemma_test_vectors,model)
        similarity_scores = get_similarity_score(topic_vectors_test,topic_vectors_train,factor=factor)
#         heading, result = get_similar_articles(test_indices,similarity_scores)
        if include_headings:
            test_lemmas = text_pipeline(df_test.iloc[test_indices].article_heading)
            lemma_test_vectors = text_vectorizer(test_lemmas,vectorizer)
            topic_vectors_test = get_topic_vectors(lemma_test_vectors,model)
            similarity_scores = heading_weightage*(get_similarity_score(topic_vectors_test,topic_vectors_train,factor=factor))+(1-heading_weightage)*similarity_scores
#             _, result = get_similar_articles(test_indices,similarity_scores)
        component_similarity_scores.append(similarity_scores)
    return component_similarity_scores

In [73]:
import random
test_indices = random.sample(range(df_test.shape[0]), 1)
# test_indices = [0]
factor=np.ones((df_train.shape[0])).reshape(-1,1)
# weights = np.random.dirichlet(np.ones(8),size=1).reshape(8,)
# factor = get_article_importance_day_wise(growth=1000).values.reshape(-1,1)
component_similarity_scores = ensemble_similarity_scores(components=components_saved,include_headings=True,heading_weightage=0.75,test_indices=test_indices,factor=factor)
similarity_scores = np.average(np.array(component_similarity_scores),axis=0,weights=weights)
heading, result = get_similar_articles(test_indices,similarity_scores,top_n_values=10)
print(heading)
display(result)

Biden Nominates Burns and Emanuel to Be His Ambassadors to China and Japan


Unnamed: 0,article_heading,Similarity score,article_date
2109,Ervin Graves adds top EPA aide,0.660873,2021-08-12 17:11:00
1965,The Sierra Club spat,0.656117,2021-08-17 10:00:00
8921,Cruz delays vote on U.N. nominee Thomas-Greenfield as impeachment trial looms,0.653987,2021-02-02 20:45:00
13724,Senate confirms Linda Thomas-Greenfield to be U.N. ambassador and Tom Vilsack to be agriculture secretary.,0.646475,2021-02-23 17:15:31
3631,Biden nominates Cindy McCain for U.N. food agency ambassadorship,0.62859,2021-06-23 18:34:00
13225,Katherine Tai is confirmed as U.S. trade representative.,0.627484,2021-03-17 16:22:48
3095,Former Sen. Tom Udall is Biden's pick as ambassador to New Zealand and Samoa,0.627106,2021-07-16 15:36:00
14428,"Biden’s pick for commerce secretary, Gina Raimondo, has a confirmation hearing.",0.595459,2021-01-26 13:32:47
9439,Biden names his acting Cabinet,0.595289,2021-01-20 19:00:00
13564,Gina Raimondo is confirmed as commerce secretary.,0.593168,2021-03-02 20:08:21


In [35]:
len(components_saved)

8

In [58]:
old_weights = weights

In [74]:
a = pd.DataFrame({'A':[1,2,3],'B':[10,11,2]})

In [75]:
a

Unnamed: 0,A,B
0,1,10
1,2,11
2,3,2


In [80]:
result=[]
for _,row in a.iterrows():
    result.append(row.to_dict())
result

[{'A': 1, 'B': 10}, {'A': 2, 'B': 11}, {'A': 3, 'B': 2}]

In [81]:
test_indices

[634]

In [83]:
df_test.iloc[634].article_heading

'Biden Nominates Burns and Emanuel to Be His Ambassadors to China and Japan'

In [84]:
df_test.iloc[634].article_body

'WASHINGTON — President Biden on Friday nominated R. Nicholas Burns, a veteran Foreign Service officer and a former ambassador to NATO, as ambassador to China and Rahm Emanuel, the former mayor of Chicago and former President Barack Obama’s first chief of staff, as ambassador to Japan.'

In [None]:
WASHINGTON — President Biden on Friday nominated R. Nicholas Burns, a veteran Foreign Service officer and a former ambassador to NATO, as ambassador to China and Rahm Emanuel, the former mayor of Chicago and former President Barack Obama’s first chief of staff, as ambassador to Japan.

In [93]:
df_test.iloc[test_indices].article_heading.values[0]

'Biden Nominates Burns and Emanuel to Be His Ambassadors to China and Japan'