In [1]:
# Parameters
# Free-text message describing this run. It is only assigned here; no other
# cell visible in this notebook reads it.
# NOTE(review): the "# Parameters" marker and the message text suggest this cell
# is injected by the scheduler at execution time (papermill-style) — confirm.
msgs = "Ran from Airflow at 2022-07-09T04:00:00+00:00!"


In [None]:
import time
import os
import requests
import urllib.request
import urllib.parse
import json
import re
import collections
import pandas as pd
import bs4
from bs4 import BeautifulSoup
import dateparser
from datetime import datetime
import numpy as np

In [None]:
# Identifier of this scraper; it is embedded in the DATA / ERRLOGS / CP JSON
# file names built by saveArticle, saveErrors, checkPoint and initCP.
ID = '006'
# Site label stored in the "SITE" field of every record built by getArticle.
SITE = "pressafrik.com"

In [None]:
# Section slugs to crawl. Each one is interpolated into
# https://www.pressafrik.com/{category}.html?start={index} by getLinks and is
# also used as the per-category key by checkPoint / initCP.
categories = [
    "Monde_r14",
    "Afrique_r22",
    "Opinion_r12",
    "Societe_r23",
    "Medias_r25",
    "Portrait_r24",
    "Reportage_r11",
    "Economie_r9",
    "Politique_r1",
    "Editorial_r13",
]

In [None]:
def parseDate(date):
    """Parse a scraped date string, trying French first and then any language.

    Parameters
    ----------
    date : str
        Raw date text as extracted from the article page.

    Returns
    -------
    datetime.datetime or None
        The parsed date, or ``None`` when neither attempt recognises the text.

    Raises
    ------
    Exception
        Whatever the language-agnostic ``dateparser.parse(date)`` call raises
        (for example on a non-string input); only the French attempt is guarded.
    """
    try:
        parsed = dateparser.parse(date, languages=['fr'])
    except Exception:
        # Narrowed from a bare ``except`` so KeyboardInterrupt / SystemExit
        # still propagate; a parser error simply falls through to the retry.
        parsed = None
    # dateparser.parse signals "could not parse" by returning None rather than
    # raising, so the generic retry must also be triggered on a None result.
    if parsed is None:
        parsed = dateparser.parse(date)
    return parsed

In [None]:
def getWebPage(URL, timeout=30):
    """Download ``URL`` and return it parsed as a ``BeautifulSoup`` document.

    Parameters
    ----------
    URL : str
        Address of the page to fetch.
    timeout : float or tuple, optional
        Forwarded to ``requests.get``. Without a timeout a stalled server
        would block the whole run indefinitely; defaults to 30 seconds.

    Returns
    -------
    bs4.BeautifulSoup
        The response body parsed with the built-in ``"html.parser"``.

    Raises
    ------
    requests.exceptions.RequestException
        On connection problems or when the timeout expires.
    """
    page = requests.get(URL, timeout=timeout)
    return BeautifulSoup(page.text, "html.parser")

In [None]:
def getArticleTitle(HTML):
    """Return the article headline as a single trimmed line.

    Takes the text of the first ``<h1 class="access">`` element found in
    ``HTML``, removes every newline, tab and carriage-return character and
    strips the surrounding whitespace.

    Raises ``IndexError`` when the page contains no such element.
    """
    headings = HTML.find_all("h1", {"class": "access"})
    headline = str(headings[0].text)
    for control_char in ("\n", "\t", "\r"):
        headline = headline.replace(control_char, "")
    return headline.strip()

In [None]:
def getArticleContent(HTML):
    """Return the article body text with line-control characters removed.

    Uses the first ``<div class="access firstletter">`` element of ``HTML``;
    newlines, tabs and carriage returns are deleted, other whitespace
    (including leading/trailing spaces) is kept as is.

    Raises ``IndexError`` when the page contains no such element.
    """
    body_blocks = HTML.find_all("div", {"class": "access firstletter"})
    body_text = str(body_blocks[0].text)
    # One pass that deletes exactly the three characters of interest.
    return body_text.translate(str.maketrans("", "", "\n\t\r"))

In [None]:
def getArticleAuthor(HTML):
    """Return the author line of an article, or ``""`` when it is unavailable.

    Reads the text of the first ``<div class="real-auteur auteur">`` element of
    ``HTML`` and removes newline, tab and carriage-return characters.

    The lookup is best-effort: any ordinary error (element missing, ``HTML``
    not a parsed document, ...) yields an empty string.
    """
    try:
        h = HTML.find_all("div", {"class": "real-auteur auteur"})
        return str(h[0].text).replace("\n", "").replace("\t", "").replace("\r", "")
    except Exception:
        # Was a bare ``except``: that also swallowed KeyboardInterrupt and
        # SystemExit, making a running scrape impossible to interrupt cleanly.
        return ""

In [None]:
def getArticleDate(HTML):
    """Return the raw publication-date text, or ``""`` when it is unavailable.

    Reads the text of the first ``<div id="date">`` element of ``HTML`` and
    removes newline, tab and carriage-return characters. The string is returned
    unparsed.

    The lookup is best-effort: any ordinary error yields an empty string.
    """
    try:
        h = HTML.find_all("div", {"id": "date"})
        return str(h[0].text).replace("\n", "").replace("\t", "").replace("\r", "")
    except Exception:
        # Was a bare ``except``: that also swallowed KeyboardInterrupt and
        # SystemExit; only ordinary errors should degrade to "".
        return ""

In [None]:
def getArticlesList(KEYWORD):
    """Walk the listing pages of one section and collect the article URLs.

    ``KEYWORD`` is the section slug interpolated into
    ``https://www.pressafrik.com/{KEYWORD}.html?start={index}``; ``index`` grows
    by ``step`` (5) per iteration.

    Returns a list of lists: one inner list of absolute article URLs per
    listing page fetched (each page is appended as a whole, not flattened).

    NOTE(review): ``heartbeat`` and ``parseTime`` are not defined anywhere in
    the visible notebook, so as written this function raises ``NameError`` on
    its first iteration unless they are provided elsewhere. ``parseTime`` may
    be meant to be ``parseDate`` — confirm before relying on this function.
    Nothing in the visible notebook calls ``getArticlesList``.
    """
    listArt_links = []
    URL = "https://www.pressafrik.com"
    step = 5
    index = 0
    # Counts how many pages ended with an article dated 2019 or earlier.
    s = 0
    while True:
        heartbeat(ID)
        html = getWebPage(f"https://www.pressafrik.com/{KEYWORD}.html?start={index}")
        # Every <h3> inside the first <div id="z_col1"> is expected to hold one article link.
        listArt =[URL+i.find("a",href=True)['href'] for i in html.find_all("div",{"id":"z_col1"})[0].find_all("h3")]
        # Stop when this page ends with the same link as the previous page.
        if index>0 and listArt[-1]==listArt_links[-1][-1]:break
        listArt_links.append(listArt)
        # Fetches the last article of the page just to read its date; the second
        # page whose last article is from 2019 or earlier ends the walk.
        if parseTime(getArticleDate(getWebPage(listArt_links[-1][-1]))).year <= 2019:
            if s>0: return listArt_links
            s+=1
        index+=step
    return listArt_links

In [None]:
def getData(listArt_links):
    """Scrape every article URL in ``listArt_links`` into a record dict.

    Parameters
    ----------
    listArt_links : iterable of str
        Article URLs to download.

    Returns
    -------
    (list of dict, list of str)
        ``data`` holds one record per successfully scraped article (keys
        ``DATE_SCRAPING``, ``DATE_ARTICLE``, ``URL``, ``AUTEUR``, ``TITRE``,
        ``SITE``, ``CONTENU``, ``COMMENTAIRES``); ``err_list`` holds the URLs
        whose download or extraction failed.

    NOTE(review): ``heartbeat`` is not defined in the visible notebook; if it
    is missing at run time every URL ends up in ``err_list``.
    """
    data = []
    err_list = []
    for link in listArt_links:
        try:
            heartbeat(ID)
            p = getWebPage(link)
            record = {
                # ``datetime`` is the class (``from datetime import datetime``),
                # so the former ``datetime.datetime.today()`` always raised
                # AttributeError and sent every link to ``err_list``.
                'DATE_SCRAPING': str(datetime.today()),
                'DATE_ARTICLE': getArticleDate(p),
                'URL': link,
                "AUTEUR": getArticleAuthor(p),
                "TITRE": getArticleTitle(p),
                "SITE": "pressafrik.com",
                "CONTENU": getArticleContent(p),
                "COMMENTAIRES": "",
            }
        except Exception:
            # Best-effort per article; narrowed from a bare ``except`` so the
            # loop can still be interrupted.
            err_list.append(link)
        else:
            data.append(record)
    return data, err_list

In [None]:
def getLinks(category, index):
    """Return the absolute article URLs shown on one listing page.

    The page fetched is
    ``https://www.pressafrik.com/{category}.html?start={index}``; one URL is
    produced per ``<h3>`` found inside the first ``<div id="z_col1">``, in page
    order. An empty list is returned when that container holds no ``<h3>``.

    Raises ``IndexError`` when the page has no ``z_col1`` container.
    """
    site_root = "https://www.pressafrik.com"
    listing = getWebPage(f"https://www.pressafrik.com/{category}.html?start={index}")
    main_column = listing.find_all("div", {"id": "z_col1"})[0]
    article_urls = []
    for heading in main_column.find_all("h3"):
        anchor = heading.find("a", href=True)
        article_urls.append(site_root + anchor['href'])
    return article_urls


def getNewLinks(category, checkPoint):
    """Collect the article URLs published since the last checkpoint.

    Listing pages of ``category`` are read from the newest (``start=0``)
    onwards, 5 entries per step, until a checkpoint URL is met.

    Parameters
    ----------
    category : str
        Section slug passed to ``getLinks``.
    checkPoint : sequence of str
        ``[cp1, cp2]``: the most recently saved URL and the one saved before
        it. ``cp2`` is the fallback when ``cp1`` no longer appears in the
        listing. (The parameter shadows the ``checkPoint`` function inside
        this body; the name is kept for callers.)

    Returns
    -------
    list of str
        Every URL listed before the first checkpoint hit, newest first. If no
        checkpoint is ever met, everything gathered until the listing runs out
        is returned.
    """
    links = []
    index = 0
    step = 5
    previous_page = None
    while True:
        _links = getLinks(category, index)
        if not _links:
            return links
        # Prefer the newest checkpoint, then fall back to the older one. The
        # old fallback looked up checkPoint[0] a second time and therefore
        # raised ValueError whenever only checkPoint[1] was on the page.
        for marker in checkPoint[:2]:
            if marker in _links:
                # The index is relative to *this* page, so only this page is
                # cut; links gathered from earlier pages are all kept.
                links.extend(_links[:_links.index(marker)])
                return links
        # getArticlesList detects the end of a section by a page repeating the
        # previous one, which suggests the site does this past the last page;
        # stop here too instead of looping forever when no checkpoint matches.
        if _links == previous_page:
            return links
        links.extend(_links)
        previous_page = _links
        index += step


def getArticle(link,category):
    """Download one article and build its storage record.

    Parameters
    ----------
    link : str
        Absolute URL of the article.
    category : str
        Section slug the link was found under; stored as ``CATEGORY``.

    Returns
    -------
    list
        ``[1, record]`` on success, where ``record`` is a dict with the keys
        ``DATE_SCRAPING``, ``DATE``, ``DATE_ARTICLE``, ``CATEGORY``, ``URL``,
        ``AUTEUR``, ``TITRE``, ``SITE``, ``CONTENU`` and ``COMMENTAIRES``;
        ``[0, link]`` when the page could not be fetched or parsed, or when
        its date could not be interpreted.
    """
    try:
        p = getWebPage(link)
        _title = getArticleTitle(p)
        _article = getArticleContent(p)
        _date = getArticleDate(p)
        _author = getArticleAuthor(p)
        _parsed_date = parseDate(_date)
    except Exception:
        # Best-effort per article; narrowed from a bare ``except`` so that
        # KeyboardInterrupt / SystemExit are not reported as scrape failures.
        return [0, link]
    if _parsed_date is None:
        # saveArticle files records under DATE.year / DATE.month, so a record
        # without a usable date would crash the run there; report it as a
        # failed link instead.
        return [0, link]
    _commments = ""
    return [1, {'DATE_SCRAPING': str(datetime.today()),
                'DATE': _parsed_date,
                'DATE_ARTICLE': _date,
                'CATEGORY': category,
                'URL': link,
                "AUTEUR": _author,
                "TITRE": _title,
                "SITE": SITE,
                "CONTENU": _article,
                "COMMENTAIRES": _commments}]



def updateJson(path, key, value):
    """Append ``value`` to the list stored under ``key`` in a JSON file.

    The file at ``path`` holds one JSON object mapping keys to lists. It is
    created when missing, and ``key`` is added when the file does not have it
    yet. Values that are not JSON-serialisable are written via ``str``.

    Parameters
    ----------
    path : str
        Location of the JSON file.
    key : str
        Name of the list to append to.
    value : object
        Item to append.

    Raises
    ------
    OSError, ValueError
        When an existing file cannot be read or does not contain valid JSON.
        The file is left untouched in that case.
    """
    if not os.path.exists(path):
        _data = {}
    else:
        try:
            with open(path, "r", encoding='utf-8') as jsonFile:
                _data = json.load(jsonFile)
        except (OSError, ValueError):
            # Previously the error was only printed and the function then died
            # on the unbound ``_data``; surface the real cause instead.
            print("error UpdateJson")
            raise
    _data.setdefault(key, []).append(value)
    # Serialise before opening for writing: "w+" truncates the file, so a
    # failure inside json.dumps must not happen while it is already emptied.
    payload = json.dumps(_data, default=str)
    with open(path, "w+", encoding='utf-8') as jsonFile:
        jsonFile.write(payload)
    
        
def saveArticle(document, base_dir="/data/notebooks/DB/DATA"):
    """Append an article record to the JSON file of its publication month.

    The record goes to ``{base_dir}/{year}/{month}/DATA{ID}.json`` under the
    ``"DATA"`` key, where year and month come from ``document["DATE"]``.

    Parameters
    ----------
    document : dict
        Article record; ``document["DATE"]`` must expose ``.year`` and
        ``.month`` (e.g. a ``datetime``).
    base_dir : str, optional
        Root of the data tree. Defaults to the production location.

    Returns
    -------
    Whatever ``updateJson`` returns.
    """
    Y = document["DATE"].year
    M = document["DATE"].month
    month_dir = f"{base_dir}/{Y}/{M}"
    # makedirs creates missing parents and tolerates an existing directory,
    # unlike the exists()/mkdir() pairs it replaces (which failed when the
    # root was absent and could race with a concurrent run).
    os.makedirs(month_dir, exist_ok=True)
    return updateJson(f"{month_dir}/DATA{ID}.json", "DATA", document)


def saveErrors(errList, path=None):
    """Append one run's list of failed URLs to the error log.

    The log is a JSON object whose keys are ``f"{year}{month}"`` of the current
    date and whose values are lists with one entry (``errList``) per run.

    Parameters
    ----------
    errList : list
        URLs that could not be scraped during this run (may be empty).
    path : str, optional
        Log file location. Defaults to
        ``/data/notebooks/DB/ERRORLOGS/ERRLOGS{ID}.json``.

    Raises
    ------
    OSError, ValueError
        When an existing log cannot be read or is not valid JSON. The file is
        left untouched in that case.
    """
    if path is None:
        path = f"/data/notebooks/DB/ERRORLOGS/ERRLOGS{ID}.json"
    # Read the clock once so year and month always belong to the same instant.
    now = datetime.today()
    bucket = f"{now.year}{now.month}"
    if os.path.exists(path):
        try:
            with open(path, "r", encoding='utf-8') as jsonFile:
                _data = json.load(jsonFile)
        except (OSError, ValueError):
            print("error in saveErrors")
            raise
    else:
        # First run: there is no log yet. Previously this path only printed a
        # message and then crashed on the unbound ``_data``.
        _data = {}
    _data.setdefault(bucket, []).append(errList)
    # Serialise before the truncating open so a failure cannot empty the log.
    payload = json.dumps(_data, default=str)
    with open(path, "w+", encoding='utf-8') as jsonFile:
        jsonFile.write(payload)



def checkPoint(category, link=None, path=None):
    """Read or advance the per-category scraping checkpoint.

    The checkpoint file maps each category to
    ``{"cp1": <last saved URL>, "cp2": <URL saved before it>, "time": ...}``.

    Parameters
    ----------
    category : str
        Section slug whose checkpoint is read or updated.
    link : str, optional
        When omitted, the current checkpoint is returned. When given, it
        becomes the new ``cp1``, the old ``cp1`` moves to ``cp2`` and ``time``
        is set to now.
    path : str, optional
        Checkpoint file location. Defaults to
        ``/data/notebooks/DB/CHECKPOINT/CP{ID}.json``.

    Returns
    -------
    list of str or None
        ``[cp1, cp2]`` in read mode (empty strings for a category with no
        checkpoint yet); ``None`` in update mode.

    Raises
    ------
    OSError, ValueError
        When the checkpoint file is missing, unreadable or not valid JSON.
    """
    if path is None:
        path = f"/data/notebooks/DB/CHECKPOINT/CP{ID}.json"
    try:
        with open(path, "r", encoding='utf-8') as jsonFile:
            _data = json.load(jsonFile)
    except (OSError, ValueError):
        # Previously the error was only printed and the function then died on
        # the unbound ``_data``; surface the real cause instead.
        print("ERR")
        raise
    entry = _data.get(category)
    if not isinstance(entry, dict):
        # initCP has written a bare "" per category; treat that (and a
        # category absent from the file) as "no checkpoint yet".
        entry = {}
    if link is None:
        return [entry.get("cp1", ""), entry.get("cp2", "")]
    entry["cp2"] = entry.get("cp1", "")
    entry["cp1"] = link
    entry["time"] = str(datetime.today())
    _data[category] = entry
    # Serialise before the truncating open so a failure cannot empty the file.
    payload = json.dumps(_data)
    with open(path, "w+", encoding='utf-8') as jsonFile:
        jsonFile.write(payload)
    

    
def update():
    """Run one incremental scrape over every configured category.

    For each entry of ``categories``: read its checkpoint, list the links
    published since then, and process them oldest first. A successfully
    scraped article is saved and immediately becomes the new checkpoint; a
    failed one is only remembered. All failures of the run are handed to
    ``saveErrors`` once, at the very end.
    """
    failed = []
    for category in categories:
        marker = checkPoint(category)
        fresh_links = getNewLinks(category, marker)
        # getNewLinks yields newest first; walk backwards so the checkpoint
        # always advances chronologically.
        for link in reversed(fresh_links):
            status, document = getArticle(link, category)
            if status == 0:
                # On failure ``document`` is the link itself.
                failed.append(document)
            else:
                saveArticle(document)
                checkPoint(category, link)
    saveErrors(failed)


def initCP(path=None):
    """Create (or reset) the checkpoint file with an empty entry per category.

    Every category in ``categories`` gets
    ``{"cp1": "", "cp2": "", "time": ""}``, which is the shape ``checkPoint``
    indexes (``_data[category]["cp1"]`` / ``["cp2"]``). The former bare ``""``
    per category made the first ``checkPoint`` read fail with ``TypeError``.

    Parameters
    ----------
    path : str, optional
        Checkpoint file location. Defaults to
        ``/data/notebooks/DB/CHECKPOINT/CP{ID}.json``. An existing file is
        overwritten.
    """
    if path is None:
        path = f"/data/notebooks/DB/CHECKPOINT/CP{ID}.json"
    _data = {c: {"cp1": "", "cp2": "", "time": ""} for c in categories}
    payload = json.dumps(_data)
    with open(path, "w+", encoding='utf-8') as jsonFile:
        jsonFile.write(payload)

In [None]:
# Entry point of the notebook: scrape, store and checkpoint the new articles
# of every category, then write this run's failed links to the error log.
update()