# Extraction à partir de MongoDB

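Ce notebook extrait de la base MongoDB `udata` les collections nécessaires à la compilation du jeu de données (jeux de données, ressources, réutilisations, organisations, discussions) et les exporte au format CSV dans le dossier `data/`.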

Cette procédure nécessite l'instanciation d'une base de données locale. Nous recommandons plutôt d'utiliser le jeu de données déjà compilé et disponible sur data.gouv.fr : [inventaire du catalogue]()

In [3]:
from pathlib import Path

import pandas as pd
from pymongo import MongoClient

In [4]:
# Connect with pymongo's default settings (no host or port is passed) and
# select the `udata` database; every query below goes through `db`.
client = MongoClient()
db = client['udata']

In [5]:
# Names of all collections in the `udata` database (a list of strings, despite
# the singular variable name).
collection = db.list_collection_names()

# Bare last expression so the notebook displays the list.
collection

['community_resource',
 'geo_level',
 'transfer',
 'discussion',
 'user',
 'dataset',
 'migrations',
 'organization',
 'oauth2_token',
 'oauth2_grant',
 'reuse',
 'role',
 'system.indexes',
 'topic',
 'issue',
 'post',
 'slug_follow',
 'oauth2_client',
 'tag',
 'dataset_metric_keys',
 'geo_zone',
 'organization_metric_keys',
 'harvest_job',
 'harvester',
 'zones_with_slashes',
 'datasets_tags',
 'site',
 'harvest_source',
 'metrics',
 'activity',
 'reuse_metric_keys',
 'user_metric_keys',
 'schedules',
 'license',
 'follow',
 'reuses_tags']

In [4]:
# Total number of documents in the `dataset` collection (empty filter).
db['dataset'].count_documents({})

39677

In [5]:
# Number of datasets whose `private` field is False.
db['dataset'].count_documents({'private': False})

30143

In [6]:
# Number of datasets whose `resources` array compares greater than an empty
# array (presumably: datasets with at least one resource — confirm against
# MongoDB's array comparison rules).
db['dataset'].count_documents({'resources': {'$gt': []}})

37751

## Compilation des jeux de données

In [7]:
# An empty filter selects every document of the `dataset` collection.
query = {}

# Materialise the cursor into a list of documents, then build one DataFrame
# row per dataset.
datasets = pd.DataFrame(
    data=list(db['dataset'].find(query)),
)

In [8]:
# Non-null count per column (axis=0 is the default, spelled out here).
datasets.count(axis=0)

_cls                 22505
_id                  39677
acronym                215
badges               31131
created_at           39677
deleted                  2
description          39671
ext                  39677
extras               39677
featured             35050
frequency            35193
frequency_date        2578
last_modified        39677
license              37824
metrics              39677
organization         37688
owner                 1551
private              30823
resources            39217
slug                 39677
spatial              26162
tags                 39671
temporal_coverage     3804
title                39677
dtype: int64

In [9]:
# Make sure the output directory exists: `to_csv` does not create missing
# parent directories, so the export would fail on a fresh checkout.
Path('data').mkdir(parents=True, exist_ok=True)

# Export every dataset document, using ';' as the field separator.
datasets.to_csv('data/datasets.csv', sep=";")

## Compilation des ressources

In [10]:
# Keep only the datasets whose `resources` array compares greater than an
# empty array, then emit one document per embedded resource.
pipeline = [
    {'$match': {'resources': {'$gt': []}}},
    {'$unwind': '$resources'},
]

# One row per resource: the resource's own fields plus the parent dataset's
# `_id` stored under the 'dataset.id' key. The aggregation cursor is consumed
# directly instead of being copied into an intermediate list first, which
# avoids holding every unwound document in memory twice.
resources = pd.DataFrame([
    {**doc['resources'], 'dataset.id': doc['_id']}
    for doc in db.dataset.aggregate(pipeline)
])

In [11]:
# Non-null count per column (axis=0 is the default, spelled out here).
resources.count(axis=0)

_id            267273
checksum        13284
created_at     267273
dataset.id     267274
description    201777
extras         259036
filesize        13494
filetype       267210
format         266804
metrics        267274
mime            56856
modified       267271
published      267271
title          267273
type           199512
url            267273
urlhash        267198
dtype: int64

In [12]:
# Preview the first five resource rows (5 is the default for `head`).
resources.head(n=5)

Unnamed: 0,_id,checksum,created_at,dataset.id,description,extras,filesize,filetype,format,metrics,mime,modified,published,title,type,url,urlhash
0,a0e4ea4e-9f86-43cf-90f7-a842d3314fea,,2014-05-07 03:38:17.029,53698e89a3a729239d2034c0,,"{'check:available': True, 'check:count-availab...",,file,csv,"{'nb_uniq_visitors': 0, 'nb_visits': 0, 'nb_hi...",,2014-01-06 10:39:47.676,2014-09-04 05:50:49.114,,,http://static.data.gouv.fr/b1/b3cfd71c4f756c86...,de99ab56743c9d42aa756eb899f8c1c8f0450861
1,1a7c68bf-6730-4a32-8858-bff8666a6deb,,2014-05-07 04:16:00.245,53699760a3a729239d204c97,Plan TCL édition Février 2014. ©TCL,"{'check:available': True, 'check:count-availab...",,remote,,"{'nb_uniq_visitors': 0, 'nb_visits': 0, 'nb_hi...",,2014-04-18 08:03:22.584,2014-09-04 06:45:43.692,Plan TCL,,http://m.site.tcl.fr/var/tcl/storage/original/...,4443a111b7e04116f55d3d0147fa7bdd94efff27
2,c20a9b52-fc5d-463c-875f-37b2437862ce,,2014-05-07 03:45:38.826,53699042a3a729239d20396d,Edition septembre 2013,,,file,xls,"{'nb_uniq_visitors': 0, 'nb_visits': 0, 'nb_hi...",,2014-02-26 16:07:56.956,2014-09-04 06:02:45.557,Catalogue des sites publics ouverts aux événem...,,https://www.data.gouv.fr/storage/f/2014-02-26T...,89888a28038dac186e4935d591c79aff637f3c02
3,d8d29b76-0972-4378-9ed4-b55ec289933e,,2014-05-07 04:16:26.482,5369977aa3a729239d204cdb,,,,file,csv,"{'nb_uniq_visitors': 0, 'nb_visits': 0, 'nb_hi...",,2014-01-07 14:54:04.981,2014-09-04 06:46:25.526,jeux-de-donnees-mairie-de-toulouse-tm.txt,,https://www.data.gouv.fr/storage/f/2013-11-20T...,294de60a9164ac5806346831563fdb4891e63ef7
4,1c4d1524-1145-4b15-9d88-ddbe06d31427,,2014-05-07 03:39:25.985,53698ecda3a729239d203583,données de gonflemnt des argiles,"{'check:available': True, 'check:count-availab...",,remote,mif−mid,"{'nb_uniq_visitors': 0, 'nb_visits': 0, 'nb_hi...",,2014-03-20 16:26:57.324,2014-09-04 05:52:15.675,,,http://www.argiles.fr/donneesDownload.asp,d46eb17075c988e410eb1c07b469851ee8d226f2


In [13]:
# Make sure the output directory exists: `to_csv` does not create missing
# parent directories, so the export would fail on a fresh checkout.
Path('data').mkdir(parents=True, exist_ok=True)

# Export one row per resource, using ';' as the field separator.
resources.to_csv('data/resources.csv', sep=";")

## Compilation des réutilisations

In [14]:
# An empty filter selects every document of the `reuse` collection.
query = {}

# Materialise the cursor into a list of documents, then build one DataFrame
# row per reuse.
reuses = pd.DataFrame(
    data=list(db['reuse'].find(query)),
)

In [15]:
# Non-null count per column (axis=0 is the default, spelled out here).
reuses.count(axis=0)

_cls             1759
_id              2355
badges           1178
created_at       2355
datasets         2350
description      2355
ext              2355
extras           1587
featured          960
image            2140
image_url         866
last_modified    2355
metrics          2355
organization      631
owner            1767
private          1552
slug             2355
tags             2355
title            2355
type             2355
url              2355
urlhash          2355
dtype: int64

In [16]:
# Make sure the output directory exists: `to_csv` does not create missing
# parent directories, so the export would fail on a fresh checkout.
Path('data').mkdir(parents=True, exist_ok=True)

# Export every reuse document, using ';' as the field separator.
reuses.to_csv('data/reuses.csv', sep=";")

## Compilation des organisations

In [17]:
# An empty filter selects every document of the `organization` collection.
query = {}

# Materialise the cursor into a list of documents, then build one DataFrame
# row per organization.
organizations = pd.DataFrame(
    data=list(db['organization'].find(query)),
)

In [18]:
# Non-null count per column (axis=0 is the default, spelled out here).
organizations.count(axis=0)

_cls             1142
_id              2021
acronym          1002
badges           1775
created_at       2021
description      2021
ext              2021
extras           2021
image_url         311
last_modified    2021
logo             1767
members          2001
metrics          2021
name             2021
requests         1869
slug             2021
teams            2021
url              1475
zone                2
dtype: int64

In [19]:
# Make sure the output directory exists: `to_csv` does not create missing
# parent directories, so the export would fail on a fresh checkout.
Path('data').mkdir(parents=True, exist_ok=True)

# Export every organization document, using ';' as the field separator.
organizations.to_csv('data/organizations.csv', sep=";")

## Compilation des discussions

In [6]:
# An empty filter selects every document of the `discussion` collection.
query = {}

# Materialise the cursor into a list of documents, then build one DataFrame
# row per discussion.
discussions = pd.DataFrame(
    data=list(db['discussion'].find(query)),
)

In [7]:
# Non-null count per column (axis=0 is the default, spelled out here).
discussions.count(axis=0)

_id           3111
closed         544
closed_by      544
created       3111
discussion    3111
extras        1464
subject       3111
title         3111
user          3111
dtype: int64

In [8]:
# Make sure the output directory exists: `to_csv` does not create missing
# parent directories, so the export would fail on a fresh checkout.
Path('data').mkdir(parents=True, exist_ok=True)

# Export every discussion document, using ';' as the field separator.
discussions.to_csv('data/discussions.csv', sep=";")

In [9]:
# Preview the first five discussion rows (5 is the default for `head`).
discussions.head(n=5)

Unnamed: 0,_id,closed,closed_by,created,discussion,extras,subject,title,user
0,5566b28754b314fff698bbc2,NaT,,2014-09-17 15:15:44.174,"[{'content': 'téléchargement impossible', 'pos...",,"{'_cls': 'Dataset', '_ref': DBRef('dataset', O...",téléchargement impossible,5419884dc751df2afa6125af
1,5566b28754b314fff698bbc3,NaT,,2014-09-18 15:53:41.626,"[{'content': 'lien 404', 'posted_on': 2014-09-...",,"{'_cls': 'Dataset', '_ref': DBRef('dataset', O...",lien 404,534fff48a3a7292c64a77b50
2,5566b28754b314fff698bbc4,NaT,,2014-09-19 09:08:07.066,[{'content': 'par rapport au site du ministère...,,"{'_cls': 'Dataset', '_ref': DBRef('dataset', O...",par rapport au site du ministère (et au doc pd...,534fff42a3a7292c64a7783c
3,5566b28754b314fff698bbc8,NaT,,2013-12-18 18:17:31.356,"[{'content': 'Erreur d'URL sur le document ""Pr...",,"{'_cls': 'Dataset', '_ref': DBRef('dataset', O...","Erreur d'URL sur le document ""Principaux indic...",534fff40a3a7292c64a7767f
4,5566b28754b314fff698bbc9,NaT,,2013-12-18 22:26:47.529,[{'content': 'Le lien pour télécharger les don...,,"{'_cls': 'Dataset', '_ref': DBRef('dataset', O...",Le lien pour télécharger les données ne foncti...,534fff3fa3a7292c64a77653
