In [16]:
from dateutil import parser
from datetime import datetime
import logging
import json
import os
import pytz
import requests
from typing import Optional
import pandas as pd
from nltk.sentiment import SentimentIntensityAnalyzer

In [36]:
# Guardian content search endpoint queried by this notebook.
BASE_URL = 'https://content.guardianapis.com/search'
# Query-string parameters sent with the search request.
params = {
    # Prefer a key supplied through the environment so it is not hard-coded;
    # falls back to the 'test' key when GUARDIAN_API_KEY is not set.
    'api-key': os.environ.get('GUARDIAN_API_KEY', 'test'),
    'order-by': 'newest',
    'page-size': 10,
    'page': 1,
    #'q': '',
}
# Without a timeout requests can wait forever on an unresponsive server.
response = requests.get(BASE_URL, params=params, timeout=10)
# Fail with a clear HTTPError on 4xx/5xx instead of a KeyError further down.
response.raise_for_status()
# The list of posts sits under response -> results in the JSON payload.
guardian_posts = response.json()['response']['results']

In [37]:
guardian_posts

[{'id': 'world/live/2024/jun/10/israel-gaza-war-live-benny-gantz-benjamin-netanyahu-latest-updates',
  'type': 'liveblog',
  'sectionId': 'world',
  'sectionName': 'World news',
  'webPublicationDate': '2024-06-10T12:40:57Z',
  'webTitle': 'Israel-Gaza war live: Netanyahu to decide on future of war cabinet after Gantz resigns',
  'webUrl': 'https://www.theguardian.com/world/live/2024/jun/10/israel-gaza-war-live-benny-gantz-benjamin-netanyahu-latest-updates',
  'apiUrl': 'https://content.guardianapis.com/world/live/2024/jun/10/israel-gaza-war-live-benny-gantz-benjamin-netanyahu-latest-updates',
  'isHosted': False,
  'pillarId': 'pillar/news',
  'pillarName': 'News'},
 {'id': 'world/article/2024/jun/10/us-un-security-council-joe-biden-gaza-peace-deal',
  'type': 'article',
  'sectionId': 'world',
  'sectionName': 'World news',
  'webPublicationDate': '2024-06-10T12:38:47Z',
  'webTitle': 'US to ask UN security council to back Joe Biden’s Gaza peace deal',
  'webUrl': 'https://www.thegua

In [38]:
# Build the module-level sentiment analyzer; when the vader lexicon is not
# installed nltk raises LookupError, so fetch the lexicon and try again.
try:
    sia = SentimentIntensityAnalyzer()
except LookupError:
    import nltk
    # register /tmp as a place where nltk searches for its data;
    # in lambda /tmp is the only writable folder
    nltk.data.path.append("/tmp")
    # download the lexicon the analyzer needs into that folder
    nltk.download('vader_lexicon', download_dir='/tmp')
    sia = SentimentIntensityAnalyzer()


def _time_parser(publication_time: str) -> datetime:
    '''
    Turn a publication time string as delivered by the Guardian api,
    e.g. '2024-06-10T10:46:19Z', into a datetime object.
    '''
    parsed_time = parser.parse(publication_time)
    return parsed_time


def is_recent(guardian_post: dict,
              max_time_interval_minutes: int = 5) -> bool:
    '''
    Tell whether a post was published within the last
    `max_time_interval_minutes` minutes.

    guardian_post: raw post dictionary; its 'webPublicationDate' value is
        parsed with _time_parser.
    max_time_interval_minutes: maximum age in minutes for a post to count
        as recent.

    Returns True when the post's age is at most the given number of minutes.
    A publication time that lies ahead of the local clock gives a negative
    age and therefore also counts as recent.
    '''
    time_created = _time_parser(guardian_post['webPublicationDate'])
    now = datetime.now(tz=pytz.UTC)
    # total_seconds() measures the whole interval; the `.seconds` attribute
    # only holds the remainder within a day, so a post that is N days and
    # 2 minutes old would wrongly look 2 minutes old.
    minutes_diff = (now - time_created).total_seconds() / 60
    return minutes_diff <= max_time_interval_minutes


def extract_fields(guardian_post: dict) -> dict:
    '''
    Reduce a raw post to the fields that get saved later:
    'timestamp' (the parsed 'webPublicationDate') and
    'text' (the 'webTitle').
    '''
    #TODO: removed the author field as it is not present in the guardian api
    return {
        'timestamp': _time_parser(guardian_post['webPublicationDate']),
        'text': guardian_post['webTitle'],
    }


def _get_sentiment(string: str) -> float:
    '''
    Collapse the analyzer's scores for `string` into a single number:
    the 'pos' score minus the 'neg' score, so negative text yields a
    negative value and positive text a positive one.
    '''
    # sia is the module-level SentimentIntensityAnalyzer object
    polarity = sia.polarity_scores(string)
    negative, positive = polarity['neg'], polarity['pos']
    # flip the sign of the negative part and add the positive part
    return negative * -1 + positive

def add_sentiment_score(guardian_post: dict) -> dict:
    '''
    Score the post's 'text' and store the result under 'sentiment_score'.
    The given dictionary is modified in place and returned.
    '''
    score = _get_sentiment(guardian_post['text'])
    guardian_post['sentiment_score'] = score
    return guardian_post

def convert_timestamp_to_int(guardian_post: dict) ->dict:
    '''Return a shallow copy of the post whose 'timestamp' datetime is
    replaced by its unix timestamp, because datetime objects
    are not serializable for json. The input post is left untouched.'''
    return {
        **guardian_post,
        'timestamp': guardian_post['timestamp'].timestamp(),
    }


In [39]:
# keep only the recent posts, reduce each one to the saved fields
# and attach its sentiment score
recent_guardian_posts = [
    add_sentiment_score(extract_fields(post))
    for post in guardian_posts
    if is_recent(post)
]

# the file is named after the current utc time;
# in lambda files need to be dumped into /tmp folder
now_str = datetime.now(tz=pytz.UTC).strftime('%d-%m-%Y-%H:%M:%S')
filename = f'{now_str}.json'
output_path_file = f'/tmp/{filename}'

# copies of the posts with the timestamp converted for json
posts_to_save = list(map(convert_timestamp_to_int, recent_guardian_posts))

# table of the posts, built from the ones that still carry datetime timestamps
guardian_post_df = pd.DataFrame(recent_guardian_posts)

In [40]:
guardian_post_df

Unnamed: 0,timestamp,text,sentiment_score
0,2024-06-10 12:40:57+00:00,Israel-Gaza war live: Netanyahu to decide on f...,-0.479
1,2024-06-10 12:38:47+00:00,US to ask UN security council to back Joe Bide...,0.349
2,2024-06-10 12:37:16+00:00,Russia-Ukraine war live: Jets outside Ukraine ...,-0.178


In [41]:
posts_to_save

[{'timestamp': 1718023257.0,
  'text': 'Israel-Gaza war live: Netanyahu to decide on future of war cabinet after Gantz resigns',
  'sentiment_score': -0.479},
 {'timestamp': 1718023127.0,
  'text': 'US to ask UN security council to back Joe Biden’s Gaza peace deal',
  'sentiment_score': 0.349},
 {'timestamp': 1718023036.0,
  'text': 'Russia-Ukraine war live: Jets outside Ukraine could become targets says Russian politician; Russia claims to have taken Donetsk village',
  'sentiment_score': -0.178}]