Order for scripts:

1. download.py
2. import_transform.py
3. matrix.py
4. run.py

Extra scripts useful for AWS and to be experimented with

1. _import_and_merge.py
2. _join.py
3. _split.py
4. _tests.py

In [20]:
import requests
import json
import re
from sklearn.feature_extraction.text import TfidfVectorizer
import numpy as np   
import random

## download.py

In [15]:
print("Running")

for page_no in range(1,11): #gets 10 pages worth of CAPI data
    print(f'Fetching response {page_no}')
    
    url = f'https://content.guardianapis.com/search?page-size=200&page={page_no}&show-fields=trailText,headline,body,byline&api-key=f2652ec5-7f11-4682-bb81-16c0a5e6c850'
          
    r = requests.get(url, allow_redirects=True)
          
    print('Writing to disk...')
    open(f'articledata/articles_{page_no}.json', 'wb').write(r.content)
    print(f'Saved articledata/articles_{page_no}.json')
    
print("Done")

Running
Fetching response 1
Writing to disk...
Saved articledata/articles_1.json
Fetching response 2
Writing to disk...
Saved articledata/articles_2.json
Fetching response 3
Writing to disk...
Saved articledata/articles_3.json
Fetching response 4
Writing to disk...
Saved articledata/articles_4.json
Fetching response 5
Writing to disk...
Saved articledata/articles_5.json
Fetching response 6
Writing to disk...
Saved articledata/articles_6.json
Fetching response 7
Writing to disk...
Saved articledata/articles_7.json
Fetching response 8
Writing to disk...
Saved articledata/articles_8.json
Fetching response 9
Writing to disk...
Saved articledata/articles_9.json
Fetching response 10
Writing to disk...
Saved articledata/articles_10.json
Done


## import_transform.py

In [18]:
files = ["articles_1.json", "articles_2.json", "articles_3.json", "articles_4.json", "articles_5.json", "articles_6.json", "articles_7.json", "articles_8.json", "articles_9.json", "articles_10.json"]

results = []

for file in files:
    
    print(f'Opening {file}')
        
    # Opening JSON file
    f = open("articledata/" + file)
    
    # returns JSON object as 
    # a dictionary
    # print(data)
    data = json.load(f)
    
    # print((data["response"]["results"][0]))
    
    data = data["response"]["results"]
    
    print("Appending items to array")
    for item in data:
        results.append(item)
    
    f.close()
    
myDict = {"all_articles": results}

data = myDict["all_articles"]

headline = []
trailText = []
body = []
ids = []

print("Appending items to array")
for article in data:

    headline.append(article["fields"]["headline"])
    trailText.append(article["fields"]["trailText"])
    body.append(article["fields"]["body"])
    ids.append(article["id"])
    

print("Removing html")
htmlRemover = re.compile('<.*?>') 
newlineRemover = '\n'

# as per recommendation from @freylis, compile once only

def cleanhtml(raw_html):
    cleantext = re.sub(htmlRemover, '', raw_html)
    cleantext = re.sub(newlineRemover, '', cleantext)
    return cleantext

cleanTrailText = []
cleanBody = []

for i in range(len(body)):
    cleanTrailText.append(cleanhtml(trailText[i]))
    cleanBody.append(cleanhtml(body[i]))
    

print("Performing vectorization")

vect = TfidfVectorizer(min_df=1, stop_words="english")
tfidf = vect.fit_transform(cleanBody)                                                                                       
pairwise_similarity = tfidf * tfidf.T 

arr = pairwise_similarity.toarray()     
np.fill_diagonal(arr, np.nan)     
arr = arr.astype('float16') #compress to float 16
print("Saving data")

###before you save, split the data and put it back together in the run.py

# np.split(arr, 2)

# file = 1
# for split in (np.split(arr, 2)):
#     print(f"Saving file {file}")
#     np.save(f"data/sparseMatrix{file}.npy", split)
#     file += 1

np.save("data/sparseMatrix.npy", arr)

textfile = open("data/article_ids.txt", "w")
for element in ids:
    textfile.write(element + "\n")
textfile.close()
    
print("Finished")




Opening articles_1.json
Appending items to array
Opening articles_2.json
Appending items to array
Opening articles_3.json
Appending items to array
Opening articles_4.json
Appending items to array
Opening articles_5.json
Appending items to array
Opening articles_6.json
Appending items to array
Opening articles_7.json
Appending items to array
Opening articles_8.json
Appending items to array
Opening articles_9.json
Appending items to array
Opening articles_10.json
Appending items to array
Appending items to array
Removing html
Performing vectorization
Saving data
Finished


# run.py

In [19]:
### load data

txt_file = open("data/article_ids.txt", "r")
file_content = txt_file.read()

ids = file_content.split("\n")
txt_file.close()

arr = np.load('data/sparseMatrix.npy')

### return id

#example ID
ex = ids[random.randint(0, len(ids))] #random number generator

#get index positoin of article
input_idx = ids.index(ex)                    
result_idx = np.nanargmax(arr[input_idx]) 

print("SAVE FOR LATER ARTICLE: {}".format(ids[input_idx]))
print("{}".format(ids[input_idx]))

print("\n")

print("RECOMMENDED ARTICLE: {}".format(ids[result_idx]))
print("{}".format(ids[result_idx]))
similarity_score = round(arr[input_idx,result_idx], 3)
print("Similarity score: {}".format(similarity_score))



SAVE FOR LATER ARTICLE: money/2022/aug/23/six-ways-to-reduce-gas-consumption-uk-energy-bills
money/2022/aug/23/six-ways-to-reduce-gas-consumption-uk-energy-bills


RECOMMENDED ARTICLE: environment/2022/aug/23/energy-use-is-a-decision-for-individuals-insist-no-10-and-truss-allies
environment/2022/aug/23/energy-use-is-a-decision-for-individuals-insist-no-10-and-truss-allies
Similarity score: 0.427001953125
