In [1]:
import pandas as pd
import numpy as np
import matplotlib as plt
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import cosine_similarity
import ipywidgets as widgets
from __future__ import print_function
from ipywidgets import interact, interactive, fixed, interact_manual
from ipywidgets import HBox, VBox
from IPython.display import display
%matplotlib inline
# Never truncate long cell values (titles, links) when DataFrames/Series are displayed.
pd.options.display.max_colwidth = None

In [2]:
# Load the news articles; the path is resolved relative to the notebook's working directory.
df = pd.read_csv(filepath_or_buffer='newsData.csv')

### Text cleaning (strip quote and square-bracket characters; note the replacement runs on every column, not just keywords)

In [3]:
# Delete every apostrophe from all string cells of the frame (regex substitution with an empty string).
df = df.replace({"'": ""}, regex=True)

In [4]:
# Delete every closing square bracket from all string cells of the frame.
# Raw string: `\]` is not a valid Python string escape, so a plain literal triggers an
# invalid-escape warning; the regex pattern itself is unchanged.
df = df.replace({r'\]': ''}, regex=True)

In [5]:
# Delete every opening square bracket from all string cells of the frame.
# Raw string: `\[` is not a valid Python string escape, so a plain literal triggers an
# invalid-escape warning; the regex pattern itself is unchanged.
df = df.replace({r'\[': ''}, regex=True)

In [6]:
# Keep only the four columns used below, then show (rows, columns) as the cell output.
df = df.loc[:, ['title', 'text', 'keywords', 'link']]
df.shape

(1720, 4)

### convert keywords to string array

In [7]:
# Split each comma-separated keyword string into a list of keywords.
# NOTE(review): the following cell rebinds `keywords` from df['keywords'] again,
# so this list-valued result is never consumed.
keywords = df['keywords'].str.split(pat=',')

In [8]:
# One keyword string per article: missing values become "" and everything is cast to str
# so the vectorizer receives plain text for every row.
keywords = df['keywords'].fillna(value="").astype(str)

### create tfidf matrix using sklearn TfidfVectorizer

In [9]:
# Word-level TF-IDF over unigrams and bigrams, with English stop words removed.
# min_df=1 keeps every term that occurs in at least one document. This filters exactly
# like the previous integer 0 (no vocabulary term has a document frequency below 1),
# but 0 is outside the range scikit-learn's parameter validation accepts for an integer
# min_df, so fitting would raise on current releases.
tf = TfidfVectorizer(analyzer='word', ngram_range=(1, 2), min_df=1, stop_words='english')

In [10]:
# Learn the vocabulary from the keyword strings and encode each article as a TF-IDF row.
matrix = tf.fit_transform(raw_documents=keywords)
# Cell output: (number of articles, number of distinct unigram/bigram terms).
matrix.shape

(1720, 30314)

### find cosine similarity using sklearn cosine_similarity

In [11]:
# Pairwise cosine similarity between all article rows; with a single argument the
# matrix is compared against itself, giving a square articles-by-articles array.
cosSim = cosine_similarity(matrix)
# Cell output: top-left 10x10 corner as a quick sanity check.
cosSim[:10, :10]

array([[1.        , 0.02245185, 0.01624453, 0.01527454, 0.01451524,
        0.03182148, 0.02852832, 0.01732251, 0.02998594, 0.03484808],
       [0.02245185, 1.        , 0.02796744, 0.0379288 , 0.01288564,
        0.02812955, 0.01284527, 0.01537774, 0.02508307, 0.01732719],
       [0.01624453, 0.02796744, 1.        , 0.02930386, 0.01435876,
        0.01596608, 0.01431377, 0.01713577, 0.01504512, 0.00559133],
       [0.01527454, 0.0379288 , 0.02930386, 1.        , 0.01350138,
        0.02370218, 0.07858818, 0.08524931, 0.05140995, 0.04586832],
       [0.01451524, 0.01288564, 0.01435876, 0.01350138, 1.        ,
        0.01426644, 0.01279002, 0.01531161, 0.01344351, 0.00499611],
       [0.03182148, 0.02812955, 0.01596608, 0.02370218, 0.01426644,
        1.        , 0.03627095, 0.0268801 , 0.04775603, 0.03018799],
       [0.02852832, 0.01284527, 0.01431377, 0.07858818, 0.01279002,
        0.03627095, 1.        , 0.02409832, 0.05022627, 0.04507248],
       [0.01732251, 0.01537774, 0.0171357

In [12]:
def recommender(title):
    """Return the 20 articles most similar to the article called ``title``.

    Reads the module-level ``df`` (columns ``title`` and ``link``) and the
    module-level ``cosSim`` matrix, where row ``i`` of ``cosSim`` holds the
    similarity scores of the ``i``-th row of ``df`` against every row.

    Parameters
    ----------
    title : str
        Exact value from ``df['title']``. If several rows carry the same
        title, the first one is used as the query article.

    Returns
    -------
    tuple of (pandas.Series, pandas.Series)
        Titles and links of up to 20 recommended articles, most similar
        first. The query article itself is never included.

    Raises
    ------
    KeyError
        If no row of ``df`` has this title.
    """
    top_n = 20
    titles = df['title']
    links = df['link']

    # Find the query article by row *position*: positions are what index cosSim,
    # and this also copes with duplicated titles and non-default frame indexes.
    matches = np.flatnonzero((titles == title).to_numpy())
    if matches.size == 0:
        raise KeyError(f"no article with title {title!r}")
    pos = int(matches[0])

    # Stable sort on the negated scores = descending similarity with ties kept in
    # row order.
    ranked = np.argsort(-cosSim[pos], kind='stable')
    # Drop the query article explicitly rather than assuming it sorted first:
    # another article with identical keywords also scores 1.0 and may precede it.
    ranked = ranked[ranked != pos][:top_n]

    return titles.iloc[ranked], links.iloc[ranked]

## test recommender

In [13]:
# Spot check: recommendations for a central-bank stimulus story.
recommender(title='Bank of England gears up for next stimulus push')

(1684                      Australia central bank policy measures working as expected - official
 1633                   UPDATE 1-Australia central bank holds rates, expands bank funding scheme
 1631                     Australia central bank expands low-cost funding as dire GDP data looms
 1519                        UPDATE 1-Indonesia c.bank keeps rates on hold, eases car loan rules
 1681                                            Stimulus deal unlikely until after the election
 1600                           Bank of England policymakers warn of bigger risks for UK economy
 1620                           Bank of England policymakers warn UK economy facing bigger risks
 1560    Indian government consumption key to growth in economy amid pandemic, central bank says
 1707        Taiwan central bank likely to stand pat as economy weathers pandemic - Reuters poll
 1374                                 Sensex, Nifty end lower after broader selloff; banks weigh
 296                      UPDA

In [14]:
# Spot check: recommendations for a Japan economy/stimulus story.
recommender(title='Top Japan government spokesman signals push to re-open economy, boost stimulus')

(1706                       Japans economy minister instructed by new PM to take steps without hesitation
 1659                     Japans worst postwar economic downturn could force new leader to boost stimulus
 256                     Suga seen top contender in Japan PM race as ruling party plans slimmed-down vote
 1597                                Japan shares rise on hopes for U.S. economy, new prime minister eyed
 1678                               Yoshihide Suga picked by Japans governing party to succeed Shinzo Abe
 273     How Shinzo Abes exit could threaten regional stability and Japans alliance with the US (Opinion)
 1664                                   Japans Suga says strong economy necessary to pursue fiscal reform
 1574                   Newsmaker: Japans Shinzo Abe sought to revive economy, fulfil conservative agenda
 272                                            Shinzo Abe: Revisionist nationalist or pragmatic realist?
 1626         UPDATE 1-Thailand announces $2.2

*Notice that related titles appear at similar ranks in each other's lists (a good sign).*

 ## Get recommendations based on keywords
 ### enter a title to view top 20 recommendations

In [15]:
# Manual-trigger variant of interact: the function only runs when the button is pressed.
# It gets its own name so the `interact` imported at the top of the notebook is not shadowed.
find_recommendations = interact_manual.options(manual_name="find recommendations")
# The call renders the widget itself and hands back `recommender` with the widget
# attached as `.widget`, so no further display() is needed (displaying the return
# value would only print the function's repr).
im = find_recommendations(recommender, title='Brazils Guedes finds influence waning as Bolsonaro takes up spending reins')
# Relabel the text box that feeds the `title` argument.
im.widget.children[0].description = 'enter title'

interactive(children=(Text(value='Brazils Guedes finds influence waning as Bolsonaro takes up spending reins',…

<function __main__.recommender(title)>