In [1]:
# uncomment the below line to install the requirements
# pip install -r requirements.txt


In [2]:
import json
import re
from sklearn.feature_extraction.text import TfidfVectorizer
import numpy as np     
import random

## Import the data
 
We have five JSON files to parse. Each file is a batch of articles and their associated metadata from The Guardian's Content API (CAPI). With a free developer key, we are limited to 200 articles per request, so five downloads give us a total of 1000 articles to work with. If you'd like to query the API yourself, this repo contains an example query in a .txt file. You can sign up for your own developer key at https://open-platform.theguardian.com/

The JSON files contain many fields that could be useful for the purposes of machine learning. However, we will focus on the following fields for now:

- trailText
- headline
- body (this is the data we will use for machine learning, the above two merely provide context)

We can experiment with the metadata later (such as tags, author etc.). Let's initialise our lists and append each article's relevant fields to them.

In [16]:
files = ["search_0.json", "search_1.json", "search_2.json", "search_3.json", "search_4.json"]

# Parallel lists: index i in each list refers to the same article.
headline = []
trailText = []
body = []

for file in files:
    # Use a context manager so each file handle is closed as soon as it has
    # been parsed (a bare open() leaks one handle per file). JSON is UTF-8 by
    # specification, so don't rely on the platform's default encoding.
    with open("articles/" + file, encoding="utf-8") as f:
        # json.load returns the parsed document as a dictionary
        data = json.load(f)

    # this is the data that we actually care about
    articles = data['response']['results']

    # Collect the three fields of interest from every article in the batch.
    for article in articles:
        fields = article["fields"]
        headline.append(fields["headline"])
        trailText.append(fields["trailText"])
        body.append(fields["body"])



Sanity check the lengths

In [4]:
# All three lists should contain one entry per loaded article.
tuple(len(column) for column in (headline, trailText, body))


(1000, 1000, 1000)

## Preprocessing the data

We should now clean the data as some of the fields contain HTML (which we don't want). 

In [17]:
# body[1]

In [18]:
# Compile once at module level (as per recommendation from @freylis) rather
# than on every call. re.DOTALL lets '.' match newlines, so a tag whose
# attributes are wrapped across several lines is still removed as a whole.
htmlRemover = re.compile('<.*?>', re.DOTALL)
newlineRemover = '\n'


def cleanhtml(raw_html):
    """Strip HTML tags and newline characters from a string.

    Parameters
    ----------
    raw_html : str
        Text that may contain HTML markup.

    Returns
    -------
    str
        ``raw_html`` with every ``<...>`` span and every newline removed.
        Nothing is inserted in their place, so the text on either side of a
        removed tag or newline ends up directly adjacent.
    """
    # Remove the markup first, using the precompiled pattern directly.
    cleantext = htmlRemover.sub('', raw_html)
    # newlineRemover is a literal string, so a plain replace is sufficient.
    return cleantext.replace(newlineRemover, '')

  

In [19]:
# Run every trail text and body through cleanhtml; the i-th entry of each
# cleaned list corresponds to the i-th entry of the raw lists.
article_count = len(body)
cleanTrailText = [cleanhtml(trailText[idx]) for idx in range(article_count)]
cleanBody = [cleanhtml(body[idx]) for idx in range(article_count)]

In [21]:
# cleanBody[1]

## Transform the articles into a sparse matrix based on similarity

But first, of course, we need to perform a tf-idf transformation.

In [22]:
# Turn each cleaned article body into a TF-IDF row vector, ignoring English
# stop words. min_df=1 keeps every term that appears in at least one document.
vect = TfidfVectorizer(min_df=1, stop_words="english")
tfidf = vect.fit_transform(cleanBody)

# Article-by-article similarity: entry (a, b) is the dot product of the
# TF-IDF rows of articles a and b. Use the explicit matrix-product operator:
# `*` only means matrix multiplication for the legacy scipy.sparse *matrix*
# classes and is element-wise for sparse *arrays*, whereas `@` is the matrix
# product for both.
pairwise_similarity = tfidf @ tfidf.T

In [23]:
# Display the sparse matrix summary (shape, dtype, number of stored elements).
tfidf

<1000x40735 sparse matrix of type '<class 'numpy.float64'>'
	with 339334 stored elements in Compressed Sparse Row format>

In [24]:
#necessary for ensuring we don't just return the same article
arr = pairwise_similarity.toarray()     
np.fill_diagonal(arr, np.nan)     

## Did it work? Let's find out!

Let's draw 10 random articles and find the most similar for each one.

In [25]:
# Draw 10 random articles and print the most similar *other* article for each.
for x in range(1, 11):

    # randrange excludes the upper bound, so i is always a valid index.
    # (random.randint(0, 1000) is inclusive on both ends and could return
    # 1000, which raises IndexError on a 1000-item list.)
    i = random.randrange(len(cleanBody))

    # Use the drawn index directly. Looking the text up again with
    # list.index() is an O(n) scan that returns the first article with
    # identical text, which is not necessarily the one that was drawn.
    input_idx = i

    # Row input_idx holds this article's similarity to every article; the
    # diagonal was set to NaN, so nanargmax never picks the article itself.
    result_idx = np.nanargmax(arr[input_idx])
    print("{}:".format(x))
    print("SAVE FOR LATER ARTICLE: {}".format(headline[i]))
    print("{}".format(trailText[i]))

    print("\n")

    print("RECOMMENDED ARTICLE: {}".format(headline[result_idx]))
    print("{}".format(trailText[result_idx]))
    similarity_score = round(arr[input_idx, result_idx], 3)
    print("Similarity score: {}".format(similarity_score))
    print("___________")



1:
SAVE FOR LATER ARTICLE: Dave Brailsford recruited to Andrew Strauss’s review of English cricket
Andrew Strauss has recruited Sir Dave Brailsford, the former performance director of British Cycling and current director of sport at Ineos, to his high performance review of English cricket


RECOMMENDED ARTICLE: Millions of dollars are flowing into US cricket. But is there a market for the sport?
America is the largest sports market in the world. But can cricket really establish a foothold where others have failed?
Similarity score: 0.227
___________
2:
SAVE FOR LATER ARTICLE: Zelenskiy urges ‘maximum sanctions’ against Russia in Davos speech
Ukrainian president tells business leaders they need to decide whether ‘brute force’ should rule the world


RECOMMENDED ARTICLE: Davos day one: Zelenskiy calls for maximum sanctions against Russia; recession fears on the rise – business live
Rolling coverage of the first day of the World Economic Forum in Davos
Similarity score: 0.529
___________
