# Pull the links from the Google Search Console report,
# parse the backlink list, look for the link to our site on each page and grab its anchor.
# Output: a report with the anchor-text distribution.
import csv
import requests
import argparse
from bs4 import BeautifulSoup
from urllib.parse import urlparse, urljoin
from collections import namedtuple, defaultdict
from concurrent.futures import ThreadPoolExecutor
# Start by pulling the URLs out of the CSV export
def csvtolist(gsc_csv_file):
    with open(gsc_csv_file) as f:
        f_csv = csv.reader(f)
        next(f_csv)  # drop the header row
        links = [line[0] for line in f_csv]  # list of backlink URLs
    return links
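# Assumed input shape (not checked against a real GSC export): a header row,
# then one linking-page URL per row in the first column, e.g.
#   Linking page,Target page
#   https://blog.example.org/some-post,https://www.oursite.example/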
# Unused stub; the crawl itself is handled by myGscCrawler below
def linktocrawl(liste_url):
    c = 0
# NoFollow check on the raw tag markup (case-insensitive)
def isNofollow(link):
    return 'nofollow' in str(link).lower()
# Internal-link check: same host as the start URL
def is_internal(url, start_url):
    return urlparse(url).netloc == urlparse(start_url).netloc
# Write the results to an output CSV
def out_csv(url_property_list):
    with open('out.csv', 'w', newline='') as f:
        f_writer = csv.writer(f)
        header = 'domain,link,anchor,is_no_follow,internal_outlinks,external_outlinks'
        f_writer.writerow(header.split(','))  # the header string is comma-separated
        for url_property in url_property_list:
            f_writer.writerow(url_property)
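# Illustrative out.csv content (made-up values):
#   domain,link,anchor,is_no_follow,internal_outlinks,external_outlinks
#   blog.example.org,https://blog.example.org/some-post,our anchor text,False,12,3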
# The crawl logic, wrapped in a class
class myGscCrawler(object):
    def __init__(self, linklist, domain):
        self.linklist = linklist
        self.domain = domain
        self.count_timeout = 0
        self.count_connect_error = 0
        self.result = []
        self.Url_property = namedtuple('Url_property', 'domain, link, anchor, is_no_follow, internal_outlinks, external_outlinks')
    def check_link(self, url):
        # URL check logic: fetch the page, bail out on network errors
        try:
            print('URL to crawl:', url)
            # without an explicit timeout, requests never raises Timeout
            r = requests.get(url, timeout=30, verify=False)
        except requests.exceptions.Timeout:
            print('Timeout issue')
            self.count_timeout += 1
            return
        except requests.exceptions.ConnectionError:
            print('Connection error')
            self.count_connect_error += 1
            return
        if r.status_code != 200:
            # Skip dead pages
            return
        soup = BeautifulSoup(r.text, 'lxml')
        ndd = urlparse(url).netloc
        internalLinks = 0
        externalLinks = 0
        list_links_ok = []
        # First pass: count internal and external outlinks on the page
        for l in soup.body.find_all('a'):
            if not l.has_attr('href'):
                continue
            u = urljoin(url, l['href'])
            if is_internal(u, url):
                internalLinks += 1
            else:
                externalLinks += 1
            list_links_ok.append(l)
        # Second pass: keep only the links pointing at the analysed domain
        for l in list_links_ok:
            u = urljoin(url, l['href'])
            u_parse = urlparse(u)
            if self.domain in u_parse.netloc:
                print(url, l['href'])
                self.result.append(self.Url_property(ndd, url, l.string, isNofollow(l), internalLinks, externalLinks))
    def check_all(self):
        # Run the checks in a thread pool; the with-block waits for every future to finish
        with ThreadPoolExecutor(128) as executor:
            jobs = [executor.submit(self.check_link, url) for url in self.linklist]
        print('Timeouts:', self.count_timeout, 'Connection errors:', self.count_connect_error)
        return self.result
if __name__ == "__main__":
    # Crawl the links exported from GSC with requests and bs4
    parser = argparse.ArgumentParser()
    parser.add_argument('-d', '--domain', required=True,
                        help="The domain being analysed")
    parser.add_argument('-l', '--list', required=True,
                        help="The CSV file exported from GSC")
    args = parser.parse_args()
    domain = args.domain
    links = csvtolist(args.list)
    test = myGscCrawler(links, domain)
    gsclinks = test.check_all()
    out_csv(gsclinks)
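# Example invocation (hypothetical file names):
#   python gsc_backlink_crawler.py -d oursite.example -l gsc_links.csv
# The anchor distribution ends up in out.csv next to the script.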