In [1]:
# Parameters
# Free-text run message. The value recorded here says the run was launched from
# Airflow (presumably injected by the scheduler at execution time - confirm).
# None of the cells visible in this notebook read it.
msgs = "Ran from Airflow at 2022-07-09T04:00:00+00:00!"


In [2]:
import time
import os
import requests
import urllib.request
import urllib.parse
import json
import re
import collections
import pandas as pd
import bs4
from bs4 import BeautifulSoup
import dateparser
from datetime import datetime
import numpy as np

In [3]:
# Scraper identifier; it is embedded in the names of the data, error-log and
# checkpoint JSON files written by the functions below.
ID = '007'
# Site label stored in the "SITE" field of every scraped article.
SITE = 'senego.com'

In [4]:
# Category slugs to scrape. Each one is interpolated into the listing URL built
# by getLinks and is used as a key in the checkpoint file.
categories = ["politique","sport","divertissement-2"]

In [5]:
def parseDate(date):
    """Convert a scraped date value into a ``datetime``.

    The fast path handles the exact ``YYYY-MM-DDTHH:MM:SS+00:00`` layout and
    returns a naive ``datetime``. Anything else is handed to
    ``dateparser.parse`` as a free-form string.

    Args:
        date: Date text (or any object whose ``str()`` is date text).

    Returns:
        A ``datetime``, or ``None`` when the input is missing/blank or
        dateparser cannot interpret it.
    """
    # A missing or blank date can never be parsed; answer None directly
    # instead of letting dateparser raise TypeError on non-string input.
    if date is None:
        return None
    text = str(date)
    if not text.strip():
        return None
    try:
        return datetime.strptime(text, '%Y-%m-%dT%H:%M:%S+00:00')
    except ValueError:
        # Only a format mismatch should trigger the fallback; other errors
        # (including KeyboardInterrupt) must not be swallowed.
        return dateparser.parse(text)

In [6]:
def getWebPage(URL, timeout=30):
    """Download ``URL`` and return it parsed with BeautifulSoup.

    Args:
        URL: Address of the page to fetch.
        timeout: Seconds to wait for the server before ``requests`` gives up.

    Returns:
        A ``BeautifulSoup`` tree built from the response text with the
        ``html.parser`` backend. The HTTP status code is not checked.

    Raises:
        requests.exceptions.RequestException: On connection errors or timeout.
    """
    headers = {'User-Agent': 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_10_1) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/39.0.2171.95 Safari/537.36'}
    # The second positional argument of requests.get is ``params``; the headers
    # must be passed by keyword or they end up in the query string instead.
    page = requests.get(URL, headers=headers, timeout=timeout)
    return BeautifulSoup(page.text, "html.parser")

In [7]:
def getArticleTitle(HTML):
    """Return the text of the first ``h1`` element, cleaned up.

    Newlines, tabs and carriage returns are removed from the heading text and
    the result is stripped of surrounding whitespace. Raises ``IndexError``
    when the page has no ``h1``.
    """
    headings = HTML.find_all("h1")
    title = str(headings[0].text)
    # Remove line breaks and tabs wherever they occur, not only at the edges.
    for unwanted in ("\n", "\t", "\r"):
        title = title.replace(unwanted, "")
    return title.strip()

In [8]:
def getArticleContent(HTML):
    """Return the article body as one string.

    Takes the first ``div`` with class ``entry-content`` and joins the text of
    its ``p`` elements with single spaces. Raises ``IndexError`` when no such
    ``div`` exists.
    """
    containers = HTML.find_all("div", {"class": "entry-content"})
    paragraphs = containers[0].find_all("p")
    return " ".join(paragraph.text for paragraph in paragraphs)

In [9]:
def getArticleAuthor(HTML):
    """Return the text of the first ``div`` with class ``author_name``.

    The lookup is best-effort: if the element is absent or cannot be read, an
    empty string is returned instead of raising.
    """
    try:
        authors = HTML.find_all("div", {"class": "author_name"})
        return authors[0].text
    except Exception:
        # Catch ordinary lookup failures only, so KeyboardInterrupt and
        # SystemExit still stop the run.
        return ""

In [10]:
def getArticleDate(HTML):
    """Return the ``datetime`` attribute of the article's publication tag.

    Looks up the first ``time`` element with class ``single-post-time publie``.
    The lookup is best-effort: if the element or its attribute is missing, an
    empty string is returned instead of raising.
    """
    try:
        stamp = HTML.find_all("time", {"class": "single-post-time publie"})[0]
        return stamp['datetime']
    except Exception:
        # Catch ordinary lookup failures only, so KeyboardInterrupt and
        # SystemExit still stop the run.
        return ""

In [11]:
def getLinks(category, index):
    """Return the article URLs listed on one page of a category.

    Args:
        category: Category slug inserted into the listing URL.
        index: Page number of the listing.

    Returns:
        The ``href`` of the first anchor of every ``article`` element, in page
        order. Articles that contain no anchor are skipped. An empty list means
        the page lists no articles.
    """
    page = getWebPage(f'https://senego.com/rubrique/{category}/page/{index}/')
    links = []
    for article in page.find_all('article'):
        anchor = article.find('a', href=True)
        # find() returns None for an article without a link; indexing it
        # would raise TypeError and abort the whole scrape.
        if anchor is not None:
            links.append(anchor['href'])
    return links



def getNewLinks(category, checkPoint):
    """Collect the links that appear before the saved checkpoint.

    Listing pages are walked from page 1 upwards. Collection stops at the first
    page that contains a checkpoint link, or at the first empty page.

    Args:
        category: Category slug passed on to ``getLinks``.
        checkPoint: ``[cp1, cp2]``. ``cp1`` is looked for first; ``cp2`` is the
            fallback when ``cp1`` is not on the page. Empty values are ignored.

    Returns:
        Every link that precedes the checkpoint, in listing order across all
        visited pages. If no checkpoint link is ever found, every link of
        every non-empty page is returned.
    """
    links = []
    index = 1
    while True:
        pageLinks = getLinks(category, index)
        if not pageLinks:
            return links
        for marker in checkPoint[:2]:
            if marker and marker in pageLinks:
                # The position is relative to this page, so only the slice of
                # this page is added to what earlier pages contributed.
                links.extend(pageLinks[:pageLinks.index(marker)])
                return links
        links.extend(pageLinks)
        index += 1



def getArticle(link, category):
    """Download one article and package its fields for storage.

    Args:
        link: URL of the article page.
        category: Category slug the link was found under.

    Returns:
        ``[1, document]`` on success, where ``document`` is a dict holding the
        scrape time, parsed and raw publication date, category, URL, author,
        title, site, content and an empty comments field.
        ``[0, link]`` when fetching or extracting fails.
    """
    try:
        page = getWebPage(link)
        _title = getArticleTitle(page)
        _article = getArticleContent(page)
        _date = getArticleDate(page)
        _author = getArticleAuthor(page)
        _comments = ""
        return [1, {'DATE_SCRAPING': str(datetime.today()),
                    'DATE': parseDate(_date),
                    'DATE_ARTICLE': _date,
                    'CATEGORY': category,
                    'URL': link,
                    "AUTEUR": _author,
                    "TITRE": _title,
                    "SITE": SITE,
                    "CONTENU": _article,
                    "COMMENTAIRES": _comments}]
    except Exception:
        # Any ordinary failure marks the link as failed; KeyboardInterrupt and
        # SystemExit are deliberately not caught here.
        return [0, link]




def updateJson(path, key, value):
    """Append ``value`` to the list stored under ``key`` in a JSON file.

    Args:
        path: JSON file to update; it is created when it does not exist.
        key: Top-level key whose list receives the value. The list is created
            when the key is not present yet.
        value: Item to append. Objects the JSON encoder cannot handle are
            written as ``str(obj)``.

    Raises:
        OSError: The existing file cannot be read or the file cannot be written.
        ValueError: The existing file is not valid JSON. The file is left
            untouched in that case.
    """
    if os.path.exists(path):
        try:
            with open(path, "r", encoding='utf-8') as jsonFile:
                _data = json.load(jsonFile)
        except (OSError, ValueError):
            # Re-raise the real cause; continuing would only fail later on an
            # unbound variable and hide what went wrong.
            print("error UpdateJson")
            raise
    else:
        _data = {}
    _data.setdefault(key, []).append(value)
    # Serialize before opening for writing so that an encoding failure cannot
    # leave a truncated file behind.
    payload = json.dumps(_data, default=str)
    with open(path, "w", encoding='utf-8') as jsonFile:
        jsonFile.write(payload)



def saveArticle(document, baseDir="/data/notebooks/DB/DATA"):
    """Store an article in the JSON file for its publication year and month.

    The target is ``{baseDir}/{year}/{month}/DATA{ID}.json``; missing
    directories are created.

    Args:
        document: Article dict whose ``"DATE"`` value exposes ``year`` and
            ``month`` attributes.
        baseDir: Root directory of the data store.

    Returns:
        Whatever ``updateJson`` returns.

    Raises:
        ValueError: ``document["DATE"]`` has no ``year``/``month`` (for
            example it is ``None``), so no target folder can be derived.
    """
    date = document["DATE"]
    try:
        Y = date.year
        M = date.month
    except AttributeError:
        raise ValueError(f"cannot file article: unusable DATE {date!r}") from None
    monthDir = f"{baseDir}/{Y}/{M}"
    # makedirs builds missing parents and tolerates an existing directory.
    os.makedirs(monthDir, exist_ok=True)
    return updateJson(f"{monthDir}/DATA{ID}.json", "DATA", document)


def saveErrors(errList, path=None):
    """Append one run's failures to the error log.

    The log is a JSON object keyed by ``f"{year}{month}"`` of the current
    date; each key holds a list with one entry per call.

    Args:
        errList: Failures collected during the run; stored as a single entry.
        path: Log file location. Defaults to the ``ERRLOGS{ID}.json`` file
            under ``/data/notebooks/DB/ERRORLOGS``.

    Raises:
        OSError: The log cannot be read (other than not existing) or written.
        ValueError: The existing log is not valid JSON.
    """
    if path is None:
        path = f"/data/notebooks/DB/ERRORLOGS/ERRLOGS{ID}.json"
    # Read the clock once so year and month come from the same instant.
    now = datetime.today()
    bucket = f"{now.year}{now.month}"
    try:
        with open(path, "r", encoding='utf-8') as jsonFile:
            _data = json.load(jsonFile)
    except FileNotFoundError:
        # No log yet: start a new one.
        _data = {}
    except (OSError, ValueError):
        print("error in saveErrors")
        raise
    _data.setdefault(bucket, []).append(errList)
    # Serialize before opening for writing so a failure cannot truncate the log.
    payload = json.dumps(_data, default=str)
    with open(path, "w", encoding='utf-8') as jsonFile:
        jsonFile.write(payload)



def checkPoint(category, link=None, path=None):
    """Read or advance the saved checkpoint of a category.

    The checkpoint file is a JSON object keyed by category; each entry holds
    ``cp1`` (latest link), ``cp2`` (the link before it) and ``time``.

    Args:
        category: Category slug whose entry is read or updated.
        link: When ``None`` the checkpoint is only read. Otherwise the link
            becomes ``cp1``, the previous ``cp1`` becomes ``cp2`` and ``time``
            is set to the current time.
        path: Checkpoint file location. Defaults to the ``CP{ID}.json`` file
            under ``/data/notebooks/DB/CHECKPOINT``.

    Returns:
        ``[cp1, cp2]`` when ``link`` is ``None`` (empty strings for values
        that are not recorded yet), otherwise ``None``.

    Raises:
        OSError: The file cannot be read (other than not existing) or written.
        ValueError: The existing file is not valid JSON.
    """
    if path is None:
        path = f"/data/notebooks/DB/CHECKPOINT/CP{ID}.json"
    try:
        with open(path, "r", encoding='utf-8') as jsonFile:
            _data = json.load(jsonFile)
    except FileNotFoundError:
        # No checkpoint file yet: behave as if nothing was ever recorded.
        _data = {}
    except (OSError, ValueError):
        print("ERR")
        raise
    entry = _data.get(category)
    # An unknown category, or a non-dict placeholder such as "", counts as an
    # empty checkpoint instead of raising.
    if not isinstance(entry, dict):
        entry = {}
    if link is None:
        return [entry.get("cp1", ""), entry.get("cp2", "")]
    entry["cp2"] = entry.get("cp1", "")
    entry["cp1"] = link
    entry["time"] = str(datetime.today())
    _data[category] = entry
    payload = json.dumps(_data)
    with open(path, "w", encoding='utf-8') as jsonFile:
        jsonFile.write(payload)


    
def update():
    """Scrape every configured category and store the new articles.

    For each category the saved checkpoint is read, the links ahead of it are
    collected, and they are processed from last to first in the list. Each
    stored article advances the checkpoint. Links that could not be fetched or
    stored are collected and passed to ``saveErrors`` once at the end.
    """
    error = []
    for category in categories:
        _checkPoint = checkPoint(category)
        links = getNewLinks(category, _checkPoint)
        for link in reversed(links):
            status, document = getArticle(link, category)
            if status == 0:
                # On failure getArticle hands back the link itself.
                error.append(document)
                continue
            try:
                saveArticle(document)
            except Exception:
                # One article that cannot be stored must not abort the run and
                # lose the errors gathered so far; log it and move on.
                error.append(link)
                continue
            checkPoint(category, link)
    saveErrors(error)


def initCP(path=None):
    """Create (or reset) the checkpoint file with an empty entry per category.

    Every category in ``categories`` gets ``{"cp1": "", "cp2": "", "time": ""}``,
    the shape that ``checkPoint`` reads and updates. Any existing file content
    is overwritten.

    Args:
        path: Checkpoint file location. Defaults to the ``CP{ID}.json`` file
            under ``/data/notebooks/DB/CHECKPOINT``.
    """
    if path is None:
        path = f"/data/notebooks/DB/CHECKPOINT/CP{ID}.json"
    # A bare "" per category cannot be indexed with ["cp1"]/["cp2"], so the
    # full entry shape is written up front.
    _data = {c: {"cp1": "", "cp2": "", "time": ""} for c in categories}
    payload = json.dumps(_data)
    with open(path, "w", encoding='utf-8') as jsonFile:
        jsonFile.write(payload)

In [12]:
# Run the scraper: process every entry of `categories` and write the run's
# failures to the error log.
update()