In [1]:
import pandas as pd
import numpy as np
import matplotlib as plt
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import cosine_similarity
import ipywidgets as widgets
from __future__ import print_function
from ipywidgets import interact, interactive, fixed, interact_manual
from ipywidgets import HBox, VBox
from IPython.display import display
from nltk.corpus import stopwords
from nltk.tokenize import RegexpTokenizer
import re
import string
import random
import nbinteract as nbi
%matplotlib inline
# Do not truncate long strings when DataFrames/Series are displayed
# (article titles and links are shown in full in the recommender output).
pd.set_option('display.max_colwidth', None)

In [2]:
# Load the news articles; later cells read the 'text', 'title' and 'link' columns.
df = pd.read_csv('newsData.csv')

### Text cleaning (remove special characters)

In [3]:
# Lower-casing helper for the text cleaning step
def make_lower_case(text):
    """Return ``text`` with all cased characters converted to lower case."""
    lowered = text.lower()
    return lowered


# Function for removing punctuation
def remove_punctuation(text):
    """Drop punctuation by keeping only runs of word characters.

    Every maximal run matching ``\\w+`` (letters, digits, underscore) is kept
    as a token and the tokens are re-joined with single spaces, so punctuation
    and repeated whitespace both collapse to one space.

    Parameters
    ----------
    text : str
        Text to clean.

    Returns
    -------
    str
        Space-separated word tokens; an empty string when nothing matches.
    """
    # Same token pattern as the previous per-call RegexpTokenizer(r'\w+'), but
    # matched with the stdlib ``re`` module: this function runs once per row,
    # and rebuilding a tokenizer object on every call was wasted work.
    return " ".join(re.findall(r'\w+', text))

# Helper that strips html tags
def remove_html(text):
    """Delete every ``<...>`` span from ``text``.

    The match is non-greedy and ``.`` does not cross newlines, so only tags
    that open and close on the same line are removed.
    """
    return re.sub('<.*?>', r'', text)

# Build the "cleaned" column from the raw article text.
# HTML tags are stripped FIRST: remove_punctuation discards "<" and ">", so if
# it ran before remove_html the tags could no longer be recognised and their
# names/attributes would end up in the TF-IDF vocabulary as ordinary words.
df['cleaned'] = (
    df['text']
    .apply(remove_html)
    .apply(make_lower_case)
    .apply(remove_punctuation)
)

### Create the TF-IDF matrix using sklearn TfidfVectorizer
max_df = 0.8 (ignores terms that appear in more than 80% of documents, since overused words are not helpful), n-gram range 1-3, min document frequency = 0

In [4]:
# Baseline vectorizer: uni- to tri-grams, English stop words removed, terms in
# more than 80% of documents dropped, and no lower bound on document frequency.
# min_df is written as the float 0.0 (a proportion, like the 0.01/0.02/0.1
# variants below): newer scikit-learn releases validate that an *integer*
# min_df is >= 1 and reject the integer 0, while 0.0 keeps the same meaning.
tf = TfidfVectorizer(analyzer='word', ngram_range=(1, 3), max_df=0.8, min_df=0.0, stop_words='english')

In [5]:
# Fit the vectorizer on the cleaned text and build the TF-IDF matrix
matrix = tf.fit_transform(df['cleaned'])
# Show matrix dimensions
matrix.shape

(1720, 1023226)

min document freq = 0.01

In [6]:
# Same settings as tf, except min_df=0.01: a float min_df is a proportion of
# documents, so terms found in fewer than 1% of articles are dropped.
tf2 = TfidfVectorizer(analyzer='word',ngram_range=(1, 3),max_df=0.8, min_df=0.01, stop_words='english')

In [7]:
# TF-IDF matrix for the min_df=0.01 vectorizer
matrix2 = tf2.fit_transform(df['cleaned'])
# Show matrix dimensions
matrix2.shape

(1720, 4705)

min document freq = 0.02

In [8]:
# Same settings as tf, except min_df=0.02: terms found in fewer than 2% of
# articles are dropped.
tf3 = TfidfVectorizer(analyzer='word',ngram_range=(1, 3),max_df=0.8, min_df=0.02, stop_words='english')

In [9]:
# TF-IDF matrix for the min_df=0.02 vectorizer
matrix3 = tf3.fit_transform(df['cleaned'])
# Show matrix dimensions
matrix3.shape

(1720, 2546)

min document freq = 0.1

In [10]:
# Same settings as tf, except min_df=0.1: terms found in fewer than 10% of
# articles are dropped.
tf4 = TfidfVectorizer(analyzer='word',ngram_range=(1, 3),max_df=0.8, min_df=0.1, stop_words='english')

In [11]:
# TF-IDF matrix for the min_df=0.1 vectorizer.
# NOTE(review): none of the cells visible here builds a similarity matrix or a
# recommender from matrix4 — it appears to be computed only to compare shapes.
matrix4 = tf4.fit_transform(df['cleaned'])
# Show matrix dimensions
matrix4.shape

(1720, 321)

### find cosine similarity using sklearn cosine_similarity

min document freq = 0.0

In [12]:
# Pairwise cosine similarity between all article rows of `matrix`
# (the min_df = 0 TF-IDF features); entry [i, j] compares article i with j.
cosSim= cosine_similarity(matrix, matrix)
# Show the top-left 10x10 corner of the similarity array
cosSim[:10, :10]

array([[1.        , 0.06880661, 0.01339282, 0.03519993, 0.01280378,
        0.03945928, 0.02100663, 0.01491427, 0.02682117, 0.0299798 ],
       [0.06880661, 1.        , 0.02662433, 0.07078183, 0.02578568,
        0.13277196, 0.04641155, 0.03960896, 0.05256381, 0.06091471],
       [0.01339282, 0.02662433, 1.        , 0.02027867, 0.00837577,
        0.01316197, 0.01436022, 0.00857093, 0.01848081, 0.01110879],
       [0.03519993, 0.07078183, 0.02027867, 1.        , 0.01456893,
        0.03787778, 0.04374385, 0.05828577, 0.05494052, 0.05130178],
       [0.01280378, 0.02578568, 0.00837577, 0.01456893, 1.        ,
        0.02145459, 0.01095427, 0.0140804 , 0.01076846, 0.01222504],
       [0.03945928, 0.13277196, 0.01316197, 0.03787778, 0.02145459,
        1.        , 0.02894496, 0.03477358, 0.06758499, 0.0645916 ],
       [0.02100663, 0.04641155, 0.01436022, 0.04374385, 0.01095427,
        0.02894496, 1.        , 0.02564241, 0.03241848, 0.03557593],
       [0.01491427, 0.03960896, 0.0085709

min document freq = 0.01

In [13]:
# Pairwise cosine similarity between all article rows of `matrix2`
# (the min_df = 0.01 TF-IDF features).
cosSim2= cosine_similarity(matrix2, matrix2)
# Show the top-left 10x10 corner of the similarity array
cosSim2[:10, :10]

array([[1.        , 0.30505936, 0.0656685 , 0.24799205, 0.09311692,
        0.18504195, 0.15462122, 0.12583555, 0.16716745, 0.16579324],
       [0.30505936, 1.        , 0.09965509, 0.36721666, 0.15070892,
        0.39200468, 0.24345122, 0.20887752, 0.25975741, 0.26401425],
       [0.0656685 , 0.09965509, 1.        , 0.0936003 , 0.0451685 ,
        0.04876283, 0.07628341, 0.05362285, 0.08322409, 0.05170567],
       [0.24799205, 0.36721666, 0.0936003 , 1.        , 0.11288847,
        0.19516325, 0.28900596, 0.32078615, 0.34083782, 0.31967299],
       [0.09311692, 0.15070892, 0.0451685 , 0.11288847, 1.        ,
        0.11768203, 0.08867926, 0.13065983, 0.07567915, 0.09472805],
       [0.18504195, 0.39200468, 0.04876283, 0.19516325, 0.11768203,
        1.        , 0.16849977, 0.15129127, 0.19802937, 0.20436147],
       [0.15462122, 0.24345122, 0.07628341, 0.28900596, 0.08867926,
        0.16849977, 1.        , 0.22489324, 0.22782226, 0.25869016],
       [0.12583555, 0.20887752, 0.0536228

min document freq = 0.02

In [14]:
# Pairwise cosine similarity between all article rows of `matrix3`
# (the min_df = 0.02 TF-IDF features).
cosSim3= cosine_similarity(matrix3, matrix3)
# Show the top-left 10x10 corner of the similarity array
cosSim3[:10, :10]

array([[1.        , 0.35747672, 0.08055533, 0.31276858, 0.13564879,
        0.25902813, 0.2031346 , 0.15672031, 0.20438778, 0.22315506],
       [0.35747672, 1.        , 0.10597623, 0.40097926, 0.18447785,
        0.47995665, 0.26980388, 0.22590872, 0.28734823, 0.29643917],
       [0.08055533, 0.10597623, 1.        , 0.09659528, 0.05244029,
        0.05904306, 0.08390779, 0.05981697, 0.0798675 , 0.05826872],
       [0.31276858, 0.40097926, 0.09659528, 1.        , 0.14136134,
        0.23845456, 0.32022135, 0.34650285, 0.38680462, 0.36008027],
       [0.13564879, 0.18447785, 0.05244029, 0.14136134, 1.        ,
        0.16476722, 0.11279112, 0.16135931, 0.09826678, 0.12344018],
       [0.25902813, 0.47995665, 0.05904306, 0.23845456, 0.16476722,
        1.        , 0.19210005, 0.1937333 , 0.24009762, 0.25208536],
       [0.2031346 , 0.26980388, 0.08390779, 0.32022135, 0.11279112,
        0.19210005, 1.        , 0.25370296, 0.26678052, 0.29621404],
       [0.15672031, 0.22590872, 0.0598169

In [15]:
# Columns returned to the user by the recommender functions
titles, link = df['title'], df['link']
# Reverse lookup: article title -> row label in df
indices = pd.Series(df.index, index=titles)
indices

title
Indian telecom giant Vodafone Idea rebrands as ‘Vi’ – TechCrunch                                                      0
Facebook addresses political controversy in India, monetization opportunities, startup investments – TechCrunch       1
YouTube launches its TikTok rival, YouTube Shorts, initially in India – TechCrunch                                    2
Groww, an investment app for millennials in India, raises $30M led by YC Continuity – TechCrunch                      3
LanzaTech is developing a small-scale waste biomass gasifier for ethanol production in India – TechCrunch             4
                                                                                                                   ... 
Europe's electricity could be 80% fossil fuel-free by 2030: industry group                                         1715
Brazil's Guedes finds influence waning as Bolsonaro takes up spending reins                                        1716
Bank of England gears up for next 

In [16]:
def recommender(title):
    """Return the 20 articles most similar to ``title`` using ``cosSim``.

    Parameters
    ----------
    title : str
        Article title, exactly as it appears in ``indices``.

    Returns
    -------
    list
        ``[newsTitle, newsLink]`` - two Series holding the titles and links of
        the recommended articles, most similar first.

    Raises
    ------
    KeyError
        If ``title`` is not present in ``indices``.
    """
    idx = indices[title]
    # A title that occurs more than once makes the lookup return a Series of
    # row numbers; use the first occurrence so the row lookup stays 1-D.
    if isinstance(idx, pd.Series):
        idx = idx.iloc[0]
    # assumes df has the default 0..n-1 index, so the label stored in
    # `indices` is also the row position in the similarity matrix - TODO confirm
    scores = cosSim[idx]
    # Stable sort on the negated scores: descending similarity, ties kept in
    # original row order.
    ranked = np.argsort(-scores, kind='stable')
    # Exclude the query article by position instead of assuming it is ranked
    # first: another article with identical text also scores 1.0 and may
    # precede it, in which case slicing off the first hit would return the
    # query itself and drop a genuine neighbour.
    news_indices = ranked[ranked != idx][:20]
    newsTitle = titles.iloc[news_indices]
    newsLink = link.iloc[news_indices]
    return [newsTitle, newsLink]

In [17]:
def recommender2(title):
    """Return the 20 articles most similar to ``title`` using ``cosSim2``.

    Parameters
    ----------
    title : str
        Article title, exactly as it appears in ``indices``.

    Returns
    -------
    list
        ``[newsTitle, newsLink]`` - two Series holding the titles and links of
        the recommended articles, most similar first.

    Raises
    ------
    KeyError
        If ``title`` is not present in ``indices``.
    """
    idx = indices[title]
    # A title that occurs more than once makes the lookup return a Series of
    # row numbers; use the first occurrence so the row lookup stays 1-D.
    if isinstance(idx, pd.Series):
        idx = idx.iloc[0]
    # assumes df has the default 0..n-1 index, so the label stored in
    # `indices` is also the row position in the similarity matrix - TODO confirm
    scores = cosSim2[idx]
    # Stable sort on the negated scores: descending similarity, ties kept in
    # original row order.
    ranked = np.argsort(-scores, kind='stable')
    # Exclude the query article by position instead of assuming it is ranked
    # first: another article with identical text also scores 1.0 and may
    # precede it, in which case slicing off the first hit would return the
    # query itself and drop a genuine neighbour.
    news_indices = ranked[ranked != idx][:20]
    newsTitle = titles.iloc[news_indices]
    newsLink = link.iloc[news_indices]
    return [newsTitle, newsLink]

In [18]:
def recommender3(title):
    """Return the 20 articles most similar to ``title`` using ``cosSim3``.

    Parameters
    ----------
    title : str
        Article title, exactly as it appears in ``indices``.

    Returns
    -------
    list
        ``[newsTitle, newsLink]`` - two Series holding the titles and links of
        the recommended articles, most similar first.

    Raises
    ------
    KeyError
        If ``title`` is not present in ``indices``.
    """
    idx = indices[title]
    # A title that occurs more than once makes the lookup return a Series of
    # row numbers; use the first occurrence so the row lookup stays 1-D.
    if isinstance(idx, pd.Series):
        idx = idx.iloc[0]
    # assumes df has the default 0..n-1 index, so the label stored in
    # `indices` is also the row position in the similarity matrix - TODO confirm
    scores = cosSim3[idx]
    # Stable sort on the negated scores: descending similarity, ties kept in
    # original row order.
    ranked = np.argsort(-scores, kind='stable')
    # Exclude the query article by position instead of assuming it is ranked
    # first: another article with identical text also scores 1.0 and may
    # precede it, in which case slicing off the first hit would return the
    # query itself and drop a genuine neighbour.
    news_indices = ranked[ranked != idx][:20]
    newsTitle = titles.iloc[news_indices]
    newsLink = link.iloc[news_indices]
    return [newsTitle, newsLink]

### test recommenders

In [23]:
# Neighbours from the min_df = 0 similarity matrix (cosSim)
recommender('Bank of England gears up for next stimulus push')

SyntaxError: invalid syntax (<ipython-input-23-fc09e599a65c>, line 1)

In [20]:
# Neighbours from the min_df = 0.01 similarity matrix (cosSim2)
recommender2('Bank of England gears up for next stimulus push')

[1600                            Bank of England policymakers warn of bigger risks for UK economy
 1619                         UK economy might take years to recover from COVID hit-BoE's Vlieghe
 1620                            Bank of England policymakers warn UK economy facing bigger risks
 1654                            UK economy extends recovery from COVID crash, growth seen fading
 1652                                              ECB sees "strong rebound" signs, monitoring FX
 1552                       French central banker says any 2020 GDP forecast revision would be up
 1701                    BOJ holds fire, offers brighter view of economy as pandemic impact eases
 1604              Australia's central bank has limited options as economy sinks into steep slump
 1633                    UPDATE 1-Australia central bank holds rates, expands bank funding scheme
 1658                                         ECB must keep up support for the economy - Villeroy
 1684               

In [21]:
# Neighbours from the min_df = 0.02 similarity matrix (cosSim3)
recommender3('Bank of England gears up for next stimulus push')

[1600                            Bank of England policymakers warn of bigger risks for UK economy
 1652                                              ECB sees "strong rebound" signs, monitoring FX
 1620                            Bank of England policymakers warn UK economy facing bigger risks
 1619                         UK economy might take years to recover from COVID hit-BoE's Vlieghe
 1654                            UK economy extends recovery from COVID crash, growth seen fading
 1552                       French central banker says any 2020 GDP forecast revision would be up
 1701                    BOJ holds fire, offers brighter view of economy as pandemic impact eases
 1604              Australia's central bank has limited options as economy sinks into steep slump
 1707         Taiwan central bank likely to stand pat as economy weathers pandemic - Reuters poll
 1633                    UPDATE 1-Australia central bank holds rates, expands bank funding scheme
 1658               

It is not immediately clear which recommender is best (all three appear to return relevant articles). Future work would require user feedback to understand which version users consider most relevant.

## Get recommendations based on text similarity
### Enter an article title (exactly as it appears in the dataset) to view the top 20 recommendations

In [25]:
# interact_manual variant whose run button is labelled "find recommendations".
# It gets its own name so the `interact` imported from ipywidgets is not
# overwritten for the rest of the session.
find_recommendations = interact_manual.options(manual_name="find recommendations")
# Calling it renders the widget right away and returns `recommender` itself,
# with the widget attached as `.widget`. No extra display() is needed: it
# would only print the function's repr underneath the widget.
im = find_recommendations(recommender, title='Bank of England policymakers warn of bigger risks for UK economy')
# The first child is the text box generated for the `title` argument.
im.widget.children[0].description = 'enter title'

interactive(children=(Text(value='Bank of England policymakers warn of bigger risks for UK economy', descripti…

<function __main__.recommender(title)>