In [1]:
# Install packages
# %pip install requests
# %pip install pandas
# %pip install bs4 
# %pip install numpy

In [6]:
# Import libraries
import datetime
import os
from datetime import datetime as dt

import numpy as np
import pandas as pd
import requests
from bs4 import BeautifulSoup


# API Key
# The key is read from the NEWSAPI_KEY environment variable so that the
# credential is not stored in the notebook itself.
# NOTE(review): the key that was previously committed here should be treated
# as exposed and rotated on newsapi.org.
secret = os.environ.get('NEWSAPI_KEY', '')
if not secret:
    print('Warning: environment variable NEWSAPI_KEY is not set; an empty API key will be sent to newsapi.org.')
# Define the endpoint
url = 'https://newsapi.org/v2/everything'

# Get and update articles per player per language with date using newsapi.org and Beautifulsoup

## Intro
The goal of this file is to get all articles from newsapi.org that include the given playername. Per article we store the information for which player it was retrieved and the information from the API of when the article was published and in which language. 

The free version of the API limits us in three ways: 
* we are only allowed to make 100 requests per day,
* per request we will only get maximum 100 articles and
* we will only be able to retrieve the data 1 month back.

We nevertheless decided on this API, as the other APIs that we looked at had even more restrictions.

Therefore we created an initial csv with the following steps: 
1. Get data for one player for one language:                get_articlesplayer
2. Get data for multiple players and multiple languages:    get_players_df        using get_articlesplayer
And then we updated this csv, at least every month, with the following steps:
3. Get the Date where a file was last updated:              get_date_of_file
4. Update a given file:                                     updateplayerdata      using get_players_df
Duplicates will be filtered out in the preprocessing step.

## Function get_articlesplayer 
The function takes a player, a language and a date (`date_to`) as input; the other end of the date range is always the current date. 
The function then calls the API and stores all article URLs in a list called urls and all publication dates in a list called dates.
BeautifulSoup is then used to parse the HTML response for each URL in urls and to concatenate all paragraphs into one string. The resulting strings are stored (each wrapped in a dict under the key `content`) in a list called articles. 
The function returns this list and the dates list.

In [7]:
def get_articlesplayer(player, language, date_to):
    """Query newsapi.org for articles about a player and scrape each article's text.

    Parameters
    ----------
    player : str
        Query phrase, sent to the API as ``q``.
    language : str
        Language code, sent to the API as ``language``.
    date_to : str
        Date string, sent to the API as ``to``. The ``from`` parameter is
        always today's date.

    Returns
    -------
    tuple
        ``(articles, dates)``. ``articles`` has one entry per URL returned by
        the API: ``{'content': text}``, where ``text`` is the newline-joined
        text of all ``<p>`` tags of the page, or the string ``'-1'`` when the
        page could not be fetched. ``dates`` holds the matching ``publishedAt``
        values. Both lists are empty when the API request itself fails.
    """
    # Endpoint of the newsapi.org search
    api_url = 'https://newsapi.org/v2/everything'

    # Seconds to wait for a server before giving up, so a hanging host
    # cannot block the whole run
    timeout_seconds = 30

    # date_from is the current date
    date_from = datetime.date.today().strftime("%Y-%m-%d")

    # Specify the query and number of returns
    # NOTE(review): 'from' is today while the callers in this notebook pass an
    # earlier date as 'to' -- confirm against the newsapi.org documentation
    # that this ordering (and the caller's date format) selects the intended range.
    parameters = {
        'q': player,  # query phrase
        'pageSize': 100,  # maximum is 100
        'apiKey': secret,  # your own API key
        'sortBy': 'publishedAt',
        'from': date_from,
        'to': date_to,
        'language': language
    }

    # Get the response
    response = requests.get(api_url, params=parameters, timeout=timeout_seconds)

    # Without a successful API response there is nothing to scrape; return
    # empty lists instead of failing later on undefined variables
    if not response.ok:
        print('Error: Request failed with status', response.status_code)
        return ([], [])

    # Parse the JSON response and extract article URLs and publication dates
    found_articles = response.json().get('articles', [])
    urls = [article['url'] for article in found_articles]
    dates = [article['publishedAt'] for article in found_articles]

    # Scrape the full article content for each URL; every URL yields exactly
    # one entry so that articles stays aligned with dates
    articles = []
    for article_url in urls:
        # Make a request to the article URL; a host that cannot be reached is
        # recorded as a failed article instead of aborting the whole run
        try:
            page = requests.get(article_url, timeout=timeout_seconds)
        except requests.RequestException as error:
            articles.append('-1')
            print('Error: Request failed with exception', error)
            continue

        # Parse the HTML response and extract the article content
        if page.ok:
            soup = BeautifulSoup(page.content, 'html.parser')
            content = '\n'.join([p.text.strip() for p in soup.find_all('p')])
            articles.append({'content': content})
        else:
            # '-1' marks an article that could not be retrieved
            articles.append('-1')
            print('Error: Request failed with status', page.status_code)

    return (articles, dates)

## Get articles for multiple players and multiple Languages
The get_players_df function takes a list of players, a list of languages and a date until which the articles should be collected as input. 
A dataframe df_all_players will be created with columns for the article content, the player name, the given language and the date the article was published. 
The function then loops over the list of players, and within that over the list of languages, and calls the get_articlesplayer function per language and per player. Each time, the article content, player name, language and publication date are stored in df_one_lang, which is then added to df_all_players.
The function returns df_all_players.

In [8]:
def get_players_df(playerlist, languagelist, date_to):
    """Collect the articles for every player/language combination in one dataframe.

    Parameters
    ----------
    playerlist : list of str
        Player names; each is passed to get_articlesplayer as the query.
    languagelist : list of str
        Language codes; each is passed to get_articlesplayer.
    date_to : str
        Date string passed through unchanged to get_articlesplayer.

    Returns
    -------
    pandas.DataFrame
        Columns 'data', 'player', 'language' and 'publishedAt', one row per
        article. Rows whose 'data' is the failure marker '-1' are removed.
        The row index of each player/language batch is kept as created
        (it restarts at 0 for every batch).
    """
    columns = ['data', 'player', 'language', 'publishedAt']

    # Collect one dataframe per player/language pair and concatenate once at
    # the end instead of re-copying the growing result in every iteration
    frames = []

    # Loop over all players in the player list
    for player in playerlist:

        # Loop over all languages in the languages list
        for language in languagelist:

            # Call get_articlesplayer to get the articles for a given player and language
            articles_arr, dates_arr = get_articlesplayer(player, language, date_to)

            # Repeat the player name and language once per article
            player_arr = [player] * len(articles_arr)
            language_arr = [language] * len(articles_arr)

            # Create a dataframe with the articles, player name, language and date
            df_one_lang = pd.DataFrame(
                {
                    'data': articles_arr,
                    'player': player_arr,
                    'language': language_arr,
                    'publishedAt': dates_arr,
                },
                columns=columns,
            )
            frames.append(df_one_lang)

    # Nothing was requested: return an empty dataframe with the expected columns
    if not frames:
        return pd.DataFrame(columns=columns)

    df_all_players = pd.concat(frames)

    # Drop the articles that could not be retrieved
    df_all_players = df_all_players[df_all_players["data"] != "-1"]

    return df_all_players

### Save initial dataframe as csv

In [None]:
#ACTION: Can someone please check this (@Gala/Kevin) so that we don't need to run it.
# # Create the list of players which should be selected
# playerlist = ['Mitchel Bakker', 'Jeremie Frimpong', 'Moussa Diaby', 'Jonathan Tah', 'Piero Hincapie', 'Piero Hincapié', 'Exequiel Palacios', 'Mykhaylo Mudryk'] 

# # Create a list of languages 
# languagelist = ['en', 'de', 'es']

# # Create the date_to
# today = datetime.date.today()
# thirty_days_ago = today - datetime.timedelta(days=30)
# date_to = thirty_days_ago.strftime('%Y-%m-%d')


# # Apply the function get_players_df
# df_all_players = get_players_df(playerlist, languagelist, date_to)


# # Define the folder path
# folder_path = "data_files"

# # Define the file path
# file_path = os.path.join(folder_path, "all_data.csv")

# # Save the DataFrame as a CSV file
# df_all_players.to_csv(file_path, header=file_path.tell()==0, index=False) #KEVIN: do we need the header argument?

## Function get_date_of_file
The function takes a file as an input and delivers the last updated time of the file as a string

In [9]:
def get_date_of_file(file):
    """Return the date on which a file was last modified.

    Parameters
    ----------
    file : str
        Path of the file to inspect.

    Returns
    -------
    str
        Last-modification date in local time, formatted as 'MM-DD-YYYY'.

    Raises
    ------
    OSError
        If the file cannot be inspected (for example because it does not exist).
    """
    # st_mtime is the time of the last modification in seconds since the epoch
    last_modified_time = os.stat(file).st_mtime

    # Only the date is formatted, so the timestamp is converted directly;
    # sub-second precision does not need to be reconstructed
    return dt.fromtimestamp(last_modified_time).strftime('%m-%d-%Y')

## Function updateplayerdata
The function takes the file that needs to be updated as input. 
It collects all player data since the day the file was last updated and then writes this data to the existing csv in append mode.
The output is df_new, which contains only the newly collected rows; after the call the csv contains the data that was in it before as well as the new data. 

In [10]:
def updateplayerdata(file, playerlist=None, languagelist=None):
    """Collect new articles and append them to an existing csv file.

    Parameters
    ----------
    file : str
        Path of the csv file to update. Its last-modification date is passed
        to get_players_df as the date argument.
    playerlist : list of str, optional
        Players to collect articles for. Defaults to the project's player list.
    languagelist : list of str, optional
        Languages to collect articles in. Defaults to ['en', 'de', 'es'].

    Returns
    -------
    pandas.DataFrame
        The newly collected rows (the rows that were appended to the file).
    """
    # Default list of players which should be selected
    if playerlist is None:
        playerlist = ['Mitchel Bakker', 'Jeremie Frimpong', 'Moussa Diaby', 'Jonathan Tah', 'Piero Hincapie', 'Piero Hincapié', 'Exequiel Palacios', 'Mykhaylo Mudryk']
    # Default list of languages
    if languagelist is None:
        languagelist = ['en', 'de', 'es']

    # Call get_players_df on the players, languages and the last time the file was updated.
    # The modification date must be read before the file is opened for
    # appending, because writing to it changes that date.
    df_new = get_players_df(playerlist, languagelist, get_date_of_file(file))

    # Open the CSV file in append mode and write the DataFrame. The encoding
    # is set explicitly because the data contains non-ASCII characters and the
    # platform default encoding differs between systems.
    with open(file, 'a', newline='', encoding='utf-8') as f:
        # Write the header only when the file is still empty
        df_new.to_csv(f, header=f.tell() == 0, index=False)

    return df_new

### Update existing csv

In [11]:
# Path of the csv to update, relative to the repository root (the folder that
# contains data_files), so the cell does not depend on one user's machine
file = os.path.join('data_files', 'all_data_v3.csv')

updateplayerdata(file)

Error: Request failed with status 404
Error: Request failed with status 404
Error: Request failed with status 404
Error: Request failed with status 404
Error: Request failed with status 404
Error: Request failed with status 403
Error: Request failed with status 403
Error: Request failed with status 404
Error: Request failed with status 404
Error: Request failed with status 404
Error: Request failed with status 404
Error: Request failed with status 404
Error: Request failed with status 404
Error: Request failed with status 403
Error: Request failed with status 403
Error: Request failed with status 403
Error: Request failed with status 403
Error: Request failed with status 403
Error: Request failed with status 403
Error: Request failed with status 403
Error: Request failed with status 403
Error: Request failed with status 404
Error: Request failed with status 404
Error: Request failed with status 404
Error: Request failed with status 404
Error: Request failed with status 404
Error: Reque

Unnamed: 0,data,player,language,publishedAt
0,{'content': 'Play Now Men's Brackets Play Now ...,Mitchel Bakker,en,2023-05-17T17:55:40Z
1,{'content': 'Bayer Leverkusen has signed Aleja...,Mitchel Bakker,en,2023-05-17T00:00:19Z
0,{'content': 'Einzeltest Dauertest Fahrbericht ...,Mitchel Bakker,de,2023-05-23T13:37:00Z
1,{'content': 'Nutze kicker auf seinen digitalen...,Mitchel Bakker,de,2023-05-23T10:24:42Z
2,{'content': 'Nutze kicker auf seinen digitalen...,Mitchel Bakker,de,2023-05-22T08:22:38Z
...,...,...,...,...
32,{'content': 'Real Madrid and Manchester City p...,Mykhaylo Mudryk,en,2023-05-09T19:45:09Z
33,{'content': 'West Ham United are the sole Prem...,Mykhaylo Mudryk,en,2023-05-09T14:30:39Z
34,{'content': 'Inter Milan beat AC Milan 2-0 in ...,Mykhaylo Mudryk,en,2023-05-09T14:17:43Z
35,{'content': 'Chelsea transfer news has been in...,Mykhaylo Mudryk,en,2023-05-09T13:30:38Z


# Summary
This file is used to get and update the article data that we scrape from the article URLs returned by the newsapi.org API. 
The output is the csv all_data_v3.csv, which is stored in the data_files folder.

# Next steps for Bayer04 Leverkusen
To further improve the data gathering Bayer04 could change to the paid version of the API, which would allow them to gather more data, more recent (real-time) data and data which is up to 5 years old. For further information: https://newsapi.org/pricing 