In [1]:
from pathlib import Path
import pandas as pd
import numpy as np

In [2]:
# Root directory of the crawled dataset (WSL mount of the D: drive).
rootdir = Path("/mnt/d/Laurent/nlptextdoc-data-201909")
# Semicolon-separated catalogue of the websites, stored under "_nlptextdoc".
websites_csv = rootdir / "_nlptextdoc" / "websites.csv"
websitesdf = pd.read_csv(websites_csv, sep=';')

In [3]:
def getdataframes(websitedir):
    """Load the URLs and text dataframes stored for one website.

    Both feather files are looked up in ``websitedir / "_nlptextdoc"``.
    Only the presence of the text dataframe file is checked before reading.

    Returns:
        A ``(urlsdf, textdf)`` tuple, or ``(None, None)`` when the text
        dataframe file does not exist.
    """
    datadir = websitedir / "_nlptextdoc"
    text_path = datadir / "nlptextdocs.dataframe.feather"
    if not text_path.exists():
        return (None, None)
    urls_path = datadir / "urls.dataframe.feather"
    return (pd.read_feather(urls_path), pd.read_feather(text_path))

In [43]:
# Windows location of the sample website, rewritten as its /mnt/d equivalent.
windows_dir = r"D:\Laurent\nlptextdoc-data-201909\www.service-public.fr"
websitedir = Path(windows_dir.replace("\\", "/").replace("D:", "/mnt/d"))
urlsdf, textdf = getdataframes(websitedir)

In [44]:
# Row counts of the URLs dataframe and of the text dataframe.
len(urlsdf),len(textdf)

(5702, 4650146)

In [46]:
from fasttext import FastText

# fastText language-identification model, loaded from a path relative to the
# notebook's working directory.
langmodel = FastText.load_model("models/lid.176.bin")
langlabels = langmodel.get_labels()
# Labels have the form "__label__<code>". Strip the prefix rather than keeping
# the last two characters: the model also emits three-letter codes
# (e.g. "__label__ceb"), which val[-2:] silently truncated to "eb".
langcodes = {val: val.replace("__label__", "") for val in langlabels}

def getlanguage(text):
    """Predict the language of ``text`` with the loaded fastText model.

    Returns:
        A ``(code, score)`` pair: the ``langcodes`` entry of the first
        predicted label and the first value of the scores returned with it.
    """
    prediction = langmodel.predict(text)
    best_label = prediction[0][0]
    best_score = prediction[1][0]
    return (langcodes[best_label], best_score)




In [47]:
import spacy

# French spaCy pipeline, loaded with the tagger, NER and parser components disabled.
nlp = spacy.load("fr_core_news_sm",disable=["tagger","ner","parser"])

In [48]:
def add_lang_words_to_dataframes(websitedir):
    """Add "Lang" and "Words" columns to the text dataframe of one website.

    The dataframe is read from
    ``websitedir / "_nlptextdoc" / "nlptextdocs.dataframe.feather"``.
    A row is analysed when its "Text" is not None, its "DocEltCmd" is not
    "End", and it is either not a "Document" element or a "Title" command.
    For such rows:

    * "Words" is the length of the spaCy document built from the text;
    * "Lang" is the code returned by ``getlanguage`` (newlines replaced by
      spaces first) when its score is above 0.6, and "?" otherwise.

    All other rows get ``Words = 0`` and ``Lang = None``. The columns are
    cast to ``category`` / ``uint16`` and the dataframe is written back to
    the same feather file.

    Returns:
        The enhanced dataframe, the unchanged dataframe when it already has
        a "Lang" column, or None when the feather file does not exist.
    """
    textdffile = websitedir / "_nlptextdoc" / "nlptextdocs.dataframe.feather"
    if not textdffile.exists():
        return None

    textdf = pd.read_feather(textdffile)
    if "Lang" in textdf.columns:
        # Already enhanced by a previous run.
        return textdf

    languages = []
    wordcounts = []
    columns = zip(textdf["DocEltType"], textdf["DocEltCmd"], textdf["Text"])
    for elttype, eltcmd, text in columns:
        carries_text = (
            text is not None
            and eltcmd != "End"
            and (elttype != "Document" or eltcmd == "Title")
        )
        if not carries_text:
            wordcounts.append(0)
            languages.append(None)
            continue
        wordcounts.append(len(nlp(text)))
        # The caller removes newlines before handing the text to getlanguage.
        lang, prob = getlanguage(text.replace('\n', ' '))
        languages.append(lang if prob > 0.6 else "?")

    textdf["Lang"] = languages
    textdf["Words"] = wordcounts
    textdf = textdf.astype({"Lang": "category", "Words": np.uint16}, copy=False)
    textdf.to_feather(textdffile)
    return textdf

In [49]:
# Enhance the sample website's text dataframe with the "Lang" and "Words" columns.
textdf = add_lang_words_to_dataframes(websitedir)

In [38]:
# Preview the first rows.
# NOTE(review): the saved output shows 10meilleuresbanques.fr rows and carries an
# earlier execution count than the cell assigning websitedir (www.service-public.fr),
# so it looks stale - re-run to refresh.
textdf.head(20)

Unnamed: 0,DocId,DocEltType,DocEltCmd,NestingLevel,Text,Lang,Words
0,1,Document,Start,1,1,,0
1,1,Document,Title,1,À propos de nous | 10MeilleuresBanques.fr,fr,6
2,1,Document,Uri,1,https://www.10meilleuresbanques.fr/about,,0
3,1,List,Start,1,,,0
4,1,ListItem,Text,2,Classement,fr,1
5,1,ListItem,Text,2,Évaluation,fr,1
6,1,ListItem,Text,2,Plus d'infos,fr,3
7,1,List,End,1,,,0
8,1,TextBlock,Text,1,Home > About,?,3
9,1,Section,Start,1,À propos de nous,fr,4


In [None]:
# Add the "Lang" and "Words" columns to the text dataframe of every website.
# Requires the "Path" and "Url" columns of websites.csv.
for idx, row in websitesdf.iterrows():
    # Label-based access: integer keys on a label-indexed Series are positional
    # lookups that recent pandas versions deprecate.
    websitedir = Path(row["Path"].replace("\\", "/").replace("D:", "/mnt/d"))
    websiteurl = row["Url"]
    if add_lang_words_to_dataframes(websitedir) is None:
        # None means the text dataframe file was not found: do not report success.
        print(f"Skipped {websiteurl} (no text dataframe found)")
        continue
    print(f"Enhanced {websiteurl}")

Enhanced https://www.10meilleuresbanques.fr/
Enhanced https://www.abcbourse.com/
Enhanced https://acpr.banque-france.fr/
Enhanced https://www.afer.fr/
Enhanced https://www.ag2rlamondiale.fr/
Enhanced https://www.agpm.fr/
Enhanced https://www.amaguiz.com/
Enhanced https://www.arkea.com/
Enhanced https://www.assurland.com/
Enhanced https://www.aviva.fr/
Enhanced https://www.axa.fr/
Enhanced https://www.banque-edel.fr/
Enhanced https://www.banque-france.fr/
Enhanced http://www.banque-info.com/
Enhanced https://www.banque.fr/
Enhanced https://www.banquepopulaire.fr/
Enhanced https://www.banquesenligne.org/
Enhanced https://www.bforbank.com/
Enhanced https://www.boursedeparis.fr/
Enhanced http://www.boursedirect.fr/
Enhanced https://www.boursier.com/
Enhanced https://www.boursorama-banque.com/
Enhanced https://www.boursorama.com/
Enhanced https://www.bred.fr/
Enhanced https://www.ca-alsace-vosges.fr/
Enhanced http://www.capitaine-epargne.com/
Enhanced http://cercledelepargne.com/
Enhanced h

In [6]:
from hashlib import md5

def add_hash_to_dataframes(websitedir):
    """Add a "Hash" column (MD5 digest of the text) to one website's text dataframe.

    The dataframe is read from
    ``websitedir / "_nlptextdoc" / "nlptextdocs.dataframe.feather"``.
    A row is hashed when its "Text" is not None, its "DocEltCmd" is not "End",
    and it is either not a "Document" element or a "Title" command; the hash
    is the raw MD5 digest (bytes) of the encoded text. Other rows get None.
    The dataframe is then written back to the same feather file.

    Returns:
        The enhanced dataframe, the unchanged dataframe when it already has
        a "Hash" column, or None when the feather file does not exist.
    """
    textdffile = websitedir / "_nlptextdoc" / "nlptextdocs.dataframe.feather"
    if not textdffile.exists():
        return None

    textdf = pd.read_feather(textdffile)
    if "Hash" in textdf.columns:
        return textdf

    def digest_or_none(elttype, eltcmd, text):
        # Rows without usable text are left without a hash.
        if text is None or eltcmd == "End":
            return None
        if elttype == "Document" and eltcmd != "Title":
            return None
        return md5(text.encode()).digest()

    columns = zip(textdf["DocEltType"], textdf["DocEltCmd"], textdf["Text"])
    textdf["Hash"] = [
        digest_or_none(elttype, eltcmd, text) for elttype, eltcmd, text in columns
    ]
    textdf.to_feather(textdffile)
    return textdf

In [11]:
# Add the "Hash" column to the text dataframe found under websitedir.
textdf = add_hash_to_dataframes(websitedir)

In [12]:
# Preview the first rows, including the new "Hash" column.
# NOTE(review): the saved output shows 10meilleuresbanques.fr rows and an execution
# count lower than the cell assigning websitedir, so it looks stale - re-run to refresh.
textdf.head(20)

Unnamed: 0,DocId,DocEltType,DocEltCmd,NestingLevel,Text,Lang,Words,Hash
0,1,Document,Start,1,1,,0,
1,1,Document,Title,1,À propos de nous | 10MeilleuresBanques.fr,fr,6,b'\x8e\xf7\x00hN\xc2+ \x9f\xe7F\x91\xdc\xcd\xb...
2,1,Document,Uri,1,https://www.10meilleuresbanques.fr/about,,0,
3,1,List,Start,1,,,0,
4,1,ListItem,Text,2,Classement,fr,1,b'\x8b\xf7\xf9\x1d\x96\xf4\xc0t\xe2s\x95\x18\x...
5,1,ListItem,Text,2,Évaluation,fr,1,b'R\xc2W.\x01\\\xeb\xe3v\x04(\x16!\xad>\xca'
6,1,ListItem,Text,2,Plus d'infos,fr,3,b'\x7f\xef\xdd\x90\x88xP!eHE\xd4\xf5u\x8a\xa3'
7,1,List,End,1,,,0,
8,1,TextBlock,Text,1,Home > About,?,3,b'P\xb5TM\x99\xd2\x8b\xc1\xac#Kg\x17U\x02\\'
9,1,Section,Start,1,À propos de nous,fr,4,b'#\xcf\x81\x8f\xf5g\xdd\x84\xbb\x8e\x95\x19\x...


In [50]:
import pandas as pd
from hashlib import md5

def add_unique_to_dataframes(websitedir):
    """Add a "Unique" column flagging the first occurrence of each text.

    The dataframe is read from
    ``websitedir / "_nlptextdoc" / "nlptextdocs.dataframe.feather"``.
    A row is considered when its "Text" is not None, its "DocEltCmd" is not
    "End", and it is either not a "Document" element or a "Title" command.
    Such a row gets True the first time the MD5 digest of its text is seen
    (in row order) and False afterwards; other rows get None. The dataframe
    is then written back to the same feather file.

    Returns:
        The enhanced dataframe, the unchanged dataframe when it already has
        a "Unique" column, or None when the feather file does not exist or
        when the "Lang" column is missing (a message is printed in that case).
    """
    textdffile = websitedir / "_nlptextdoc" / "nlptextdocs.dataframe.feather"
    if not textdffile.exists():
        return None

    textdf = pd.read_feather(textdffile)
    if "Unique" in textdf.columns:
        return textdf
    if "Lang" not in textdf.columns:
        print("Dataframe not ready with Lang and Words")
        return None

    seen_digests = set()
    flags = []
    columns = zip(textdf["DocEltType"], textdf["DocEltCmd"], textdf["Text"])
    for elttype, eltcmd, text in columns:
        skipped = (
            text is None
            or eltcmd == "End"
            or (elttype == "Document" and eltcmd != "Title")
        )
        if skipped:
            flags.append(None)
            continue
        digest = md5(text.encode()).digest()
        first_occurrence = digest not in seen_digests
        # Adding an already known digest leaves the set unchanged.
        seen_digests.add(digest)
        flags.append(first_occurrence)

    textdf["Unique"] = flags
    textdf.to_feather(textdffile)
    return textdf

In [51]:
# Add the "Unique" column to the sample website's text dataframe, then preview it.
textdf = add_unique_to_dataframes(websitedir)
textdf.head(50)

Unnamed: 0,DocId,DocEltType,DocEltCmd,NestingLevel,Text,Lang,Words,Unique
0,1,Document,Start,1,1,,0,
1,1,Document,Title,1,Accueil Associations | service-public.fr,?,4,True
2,1,Document,Uri,1,https://www.service-public.fr/associations,,0,
3,1,NavigationList,Start,1,,,0,
4,1,ListItem,Text,2,Aller au contenu,fr,3,True
5,1,ListItem,Text,2,Aller à la recherche,fr,4,True
6,1,ListItem,Text,2,Aller au menu de Associations,fr,5,True
7,1,NavigationList,End,1,,,0,
8,1,NavigationList,Start,1,,,0,
9,1,ListItem,Text,2,menu,?,1,True


In [11]:
# Add the "Unique" column to the text dataframe of every website.
# Requires the "Path" and "Url" columns of websites.csv.
for idx, row in websitesdf.iterrows():
    # Label-based access: integer keys on a label-indexed Series are positional
    # lookups that recent pandas versions deprecate.
    websitedir = Path(row["Path"].replace("\\", "/").replace("D:", "/mnt/d"))
    websiteurl = row["Url"]
    if add_unique_to_dataframes(websitedir) is None:
        # None means the feather file is missing or the "Lang" column has not
        # been computed yet: do not report success.
        print(f"Skipped {websiteurl} (text dataframe missing or not ready)")
        continue
    print(f"Enhanced {websiteurl}")

Enhanced https://www.10meilleuresbanques.fr/
Enhanced https://www.abcbourse.com/
Enhanced https://acpr.banque-france.fr/
Enhanced https://www.afer.fr/
Enhanced https://www.ag2rlamondiale.fr/
Enhanced https://www.agpm.fr/
Enhanced https://www.amaguiz.com/
Enhanced https://www.arkea.com/
Enhanced https://www.assurland.com/
Enhanced https://www.aviva.fr/
Enhanced https://www.axa.fr/
Enhanced https://www.banque-edel.fr/
Enhanced https://www.banque-france.fr/
Enhanced http://www.banque-info.com/
Enhanced https://www.banque.fr/
Enhanced https://www.banquepopulaire.fr/
Enhanced https://www.banquesenligne.org/
Enhanced https://www.bforbank.com/
Enhanced https://www.boursedeparis.fr/
Enhanced http://www.boursedirect.fr/
Enhanced https://www.boursier.com/
Enhanced https://www.boursorama-banque.com/
Enhanced https://www.boursorama.com/
Enhanced https://www.bred.fr/
Enhanced https://www.ca-alsace-vosges.fr/
Enhanced http://www.capitaine-epargne.com/
Enhanced http://cercledelepargne.com/
Enhanced h

In [30]:
from urllib.parse import urlparse
from collections import Counter

# Derive a short unique "Name" and the dataset "Subdir" for every website.
# Requires the "Path" and "Url" columns of websites.csv.

# Hand-picked replacements for generated names that are ambiguous or mangled.
NAME_OVERRIDES = {
    "banque": "banque-fr",
    "banque-2": "banque-org",
    "boursorama-2": "boursorama-forum",
    # Presumably produced because ".com" is removed anywhere in the host name,
    # including at the start of ".commentcamarche".
    "droit-financesmentcamarche": "droit-finances-commentcamarche",
    "cbanque-2": "cbanque-forum",
    "linternaute": "linternaute-argent",
}

names = []
subdirs = []
counter = Counter()
for idx, row in websitesdf.iterrows():
    # Label-based access: integer keys on a label-indexed Series are positional
    # lookups that recent pandas versions deprecate.
    # The sub-directory is the last component of the Windows-style path.
    subdir = row["Path"].split("\\")[-1]
    subdirs.append(subdir)
    websiteurl = row["Url"]
    o = urlparse(websiteurl)
    # Same removals, in the same order, as before so that generated names
    # stay identical to the ones already in use.
    domain = o.netloc
    for fragment in ("www.", ".fr", ".org", ".com", ".coop"):
        domain = domain.replace(fragment, "")
    domain = domain.replace(".", "-")
    # Number repeated domains: the second occurrence becomes "<domain>-2", etc.
    counter[domain] += 1
    name = domain if counter[domain] == 1 else (domain + "-" + str(counter[domain]))
    name = NAME_OVERRIDES.get(name, name)
    names.append(name)
websitesdf["Name"] = names
websitesdf["Subdir"] = subdirs
websitesdf

Unnamed: 0,Dataset,Url,Scope,Path,Name,Subdir
0,Comparateur,https://www.10meilleuresbanques.fr/,domain,D:\Laurent\nlptextdoc-data-201909\10meilleures...,10meilleuresbanques,10meilleuresbanques.fr
1,Bourse,https://www.abcbourse.com/,domain,D:\Laurent\nlptextdoc-data-201909\abcbourse.com,abcbourse,abcbourse.com
2,Institution,https://acpr.banque-france.fr/,subdomain,D:\Laurent\nlptextdoc-data-201909\acpr.banque-...,acpr-banque-france,acpr.banque-france.fr
3,Assurance,https://www.afer.fr/,domain,D:\Laurent\nlptextdoc-data-201909\afer.fr,afer,afer.fr
4,Assurance,https://www.ag2rlamondiale.fr/,domain,D:\Laurent\nlptextdoc-data-201909\ag2rlamondia...,ag2rlamondiale,ag2rlamondiale.fr
...,...,...,...,...,...,...
140,Presse,https://www.ouest-france.fr/economie/banques-f...,subdomain,D:\Laurent\nlptextdoc-data-201909\www.ouest-fr...,ouest-france,www.ouest-france.fr
141,Banque,https://www.revolut.com/fr-FR/,path,D:\Laurent\nlptextdoc-data-201909\www.revolut....,revolut,www.revolut.com_fr-FR_
142,Institution,https://www.service-public.fr/particuliers/vos...,subdomain,D:\Laurent\nlptextdoc-data-201909\www.service-...,service-public,www.service-public.fr
143,Crédit,https://www.younited-credit.com/,domain,D:\Laurent\nlptextdoc-data-201909\younited-cre...,younited-credit,younited-credit.com


In [33]:
# Remove the Windows-specific "Path" column.
# Rebinding with errors="ignore" (instead of inplace=True) keeps this cell
# re-runnable: executing it a second time no longer raises KeyError.
websitesdf = websitesdf.drop(columns="Path", errors="ignore")
websitesdf

Unnamed: 0,Dataset,Url,Scope,Name,Subdir
0,Comparateur,https://www.10meilleuresbanques.fr/,domain,10meilleuresbanques,10meilleuresbanques.fr
1,Bourse,https://www.abcbourse.com/,domain,abcbourse,abcbourse.com
2,Institution,https://acpr.banque-france.fr/,subdomain,acpr-banque-france,acpr.banque-france.fr
3,Assurance,https://www.afer.fr/,domain,afer,afer.fr
4,Assurance,https://www.ag2rlamondiale.fr/,domain,ag2rlamondiale,ag2rlamondiale.fr
...,...,...,...,...,...
140,Presse,https://www.ouest-france.fr/economie/banques-f...,subdomain,ouest-france,www.ouest-france.fr
141,Banque,https://www.revolut.com/fr-FR/,path,revolut,www.revolut.com_fr-FR_
142,Institution,https://www.service-public.fr/particuliers/vos...,subdomain,service-public,www.service-public.fr
143,Crédit,https://www.younited-credit.com/,domain,younited-credit,younited-credit.com


In [35]:
# Overwrite the websites catalogue with the updated columns.
# index=False: the file is read back with a default index, so writing the index
# would add an unnamed leading column and shift every column position.
websitesdf.to_csv(rootdir / "_nlptextdoc" / "websites.csv", sep=';', index=False)

In [36]:
# Destination folder for the renamed per-website dataframe files.
targetdir = Path("/mnt/d/Users/Laurent/OneDrive/Dev/Python/nlptextdoc/dataset 092019/extraction")

In [54]:
import shutil

# Copy each website's URLs dataframe to targetdir, renamed after the website.
for idx, row in websitesdf.iterrows():
    # Label-based access: integer keys on a label-indexed Series are positional
    # lookups that recent pandas versions deprecate.
    name = row["Name"]
    subdir = row["Subdir"]
    source1 = rootdir / subdir / "_nlptextdoc" / "nlptextdocs.dataframe.feather"
    source2 = rootdir / subdir / "_nlptextdoc" / "urls.dataframe.feather"
    # Check both files: source2 is the one actually copied below.
    missing = [source for source in (source1, source2) if not source.exists()]
    if missing:
        for source in missing:
            # f-string formatting: "ERROR: " + Path object raises TypeError.
            print(f"ERROR: {source}")
        continue
    # Copy of the text dataframe is currently disabled.
    #shutil.copyfile(source1, targetdir / (name + ".nlptextdocs.dataframe.feather"))
    shutil.copyfile(source2, targetdir / (name + ".urls.dataframe.feather"))
    print(name + " copy OK")

10meilleuresbanques copy OK
abcbourse copy OK
acpr-banque-france copy OK
afer copy OK
ag2rlamondiale copy OK
agpm copy OK
amaguiz copy OK
arkea copy OK
assurland copy OK
aviva copy OK
axa copy OK
banque-edel copy OK
banque-france copy OK
banque-info copy OK
banque-fr copy OK
banquepopulaire copy OK
banquesenligne copy OK
bforbank copy OK
boursedeparis copy OK
boursedirect copy OK
boursier copy OK
boursorama-banque copy OK
boursorama copy OK
bred copy OK
ca-alsace-vosges copy OK
capitaine-epargne copy OK
cercledelepargne copy OK
cetelem copy OK
cic copy OK
cnp copy OK
cofidis copy OK
cofinoga copy OK
comparabanques copy OK
comparalivrets copy OK
compte-nickel copy OK
credit-cooperatif copy OK
credit-du-nord copy OK
creditmutuel copy OK
culturebanque copy OK
diac copy OK
direct-assurance copy OK
droit-finances-commentcamarche copy OK
eko-by-ca copy OK
empruntis copy OK
en-bourse copy OK
eurofil copy OK
ffa-assurance copy OK
fortuneo copy OK
forum-doctissimo copy OK
forum-doctissimo-2 cop