# Collect the backlinks listed in a Google Search Console report,
# crawl each referring page, look for the link pointing to our site and its anchor,
# and output a report with the anchor distribution.
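# Expected input (assumption): the GSC links export, a CSV whose first column holds
# the URL of the linking page; the first row is a header and is skipped below.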
import csv
import requests
import argparse
from bs4 import BeautifulSoup
from urllib.parse import urlparse, urljoin
from collections import namedtuple
from concurrent.futures import ThreadPoolExecutor
# First, read the backlink URLs from the CSV file
def csvtolist(gsc_csv_file):
    with open(gsc_csv_file) as f:
        f_csv = csv.reader(f)
        next(f_csv)  # skip the header row
        links = [line[0] for line in f_csv]  # the backlinks sit in the first column
    return links
# Nofollow check: does the <a> tag carry rel="nofollow"?
def isNofollow(link):
    return 'nofollow' in str(link).lower()
# Internal-link check: same host as the page being crawled
def is_internal(url, start_url):
    return urlparse(url).netloc == urlparse(start_url).netloc
# Write the results to an output CSV
def out_csv(url_property_list):
    with open('out.csv', 'w', newline='') as f:
        f_writer = csv.writer(f)
        header = 'domain,link,anchor,is_no_follow,internal_outlinks,external_outlinks'
        f_writer.writerow(header.split(','))  # the header string is comma-separated
        for url_property in url_property_list:
            f_writer.writerow(url_property)
# The crawl logic, turned from a function into a class
class myGscCrawler(object):
    def __init__(self, linklist, domain):
        self.linklist = linklist
        self.domain = domain
        self.count_timeout = 0
        self.count_connect_error = 0
        self.result = []
        self.Url_property = namedtuple(
            'Url_property',
            'domain, link, anchor, is_no_follow, internal_outlinks, external_outlinks')
    def check_link(self, url):
        # Fetch one referring page and inspect its outgoing links
        try:
            print('URL to crawl:', url)
            # explicit timeout so the Timeout handler below can actually fire
            r = requests.get(url, timeout=10, verify=False)
        except requests.exceptions.Timeout:
            print('Timeout on', url)
            self.count_timeout += 1
            return
        except requests.exceptions.ConnectionError:
            print('Connection error on', url)
            self.count_connect_error += 1
            return
        if r.status_code != 200:
            # skip dead pages
            return
        soup = BeautifulSoup(r.text, 'lxml')
        if soup.body is None:
            return
        ndd = urlparse(url).netloc
        internalLinks = 0
        externalLinks = 0
        list_links_ok = []
        # First pass: count internal vs external outlinks on the page
        for l in soup.body.find_all('a'):
            if not l.has_attr('href'):
                continue
            u = urljoin(url, l['href'])
            if is_internal(u, url):
                internalLinks += 1
            else:
                externalLinks += 1
            list_links_ok.append(l)
        # Second pass: keep the links that point to our domain
        for l in list_links_ok:
            u_parse = urlparse(urljoin(url, l['href']))
            if self.domain in u_parse.netloc:
                print(url, l['href'])
                self.result.append(self.Url_property(
                    ndd, url, l.string, isNofollow(l), internalLinks, externalLinks))
    def check_all(self):
        # Run the checks through a thread pool
        with ThreadPoolExecutor(max_workers=128) as executor:
            for url in self.linklist:
                executor.submit(self.check_link, url)
        # the with-block waits for all workers, so the counters are final here
        print('timeouts:', self.count_timeout, 'connection errors:', self.count_connect_error)
        return self.result
if __name__ == "__main__":
    # Crawl the backlinks listed in GSC with requests and bs4
    parser = argparse.ArgumentParser()
    parser.add_argument('-d', '--domain', required=True,
                        help="the domain being analysed")
    parser.add_argument('-l', '--list', required=True,
                        help="the file exported from GSC")
    args = parser.parse_args()
    domain = args.domain
    links = csvtolist(args.list)
    test = myGscCrawler(links, domain)
    gsclinks = test.check_all()
    out_csv(gsclinks)
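# Example invocation (the script filename is hypothetical):
#   python gsc_anchor_report.py -d example.com -l gsc_links.csv
# It crawls every URL listed in gsc_links.csv and writes out.csv with one row per
# link found pointing at example.com (anchor text, nofollow flag, outlink counts).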