# Les jeux de données sur data.gouv.fr

In [None]:
%run "librairies.ipynb"

In [None]:
# Path of the dataset catalogue CSV read by the next cell. The commented-out
# lines are alternative inputs kept for reference: the data.gouv.fr export URL
# and a date-stamped local file.
#source = "https://www.data.gouv.fr/fr/datasets.csv?"
#source = "data/datasets-2019-01-02-15-31.csv"
source = "data/datasets.csv"

In [None]:
# Load the semicolon-separated catalogue and parse both timestamp columns so
# that the `.dt` accessors and date comparisons used later work on datetimes.
df = pd.read_csv(source, sep=";", parse_dates=["created_at", "last_modified"])
# Name the axis explicitly: the positional form `drop('resources', 1)` was
# deprecated in pandas 1.3 and removed in pandas 2.0.
df = df.drop(columns="resources")
df.head()

In [None]:
# NOTE(review): `datetime` and `ObjectId` look unused, but the `extras` strings
# are evaluated below and presumably contain `datetime.datetime(...)` and
# `ObjectId(...)` reprs that need these names in scope -- confirm against data.
import datetime
from bson import ObjectId


def _parse_extras(raw):
    """Evaluate one `extras` cell and wrap the result in a Series."""
    # SECURITY: eval() executes whatever code the CSV cell contains. Only run
    # this on an export you trust; a safe parser would be preferable if the
    # format allows it.
    return pd.Series(eval(raw))


# Expand every `extras` value into columns (one column per key encountered).
df_ext = df['extras'].apply(_parse_extras)

# Append only the harvest domain to the main frame.
df = pd.concat([df, df_ext['harvest:domain']], axis=1)

In [None]:
# Number of non-null values in each column.
df.count()

## date de création

In [None]:
# Non-null counts of every column, grouped by the calendar year of creation.
created_year = df.groupby(df['created_at'].dt.year).count()

created_year

### nouveaux jeux de données par année

In [None]:
# Bar chart: number of datasets created per year.
alt.Chart(df).mark_bar().encode(
    x="year:T",
    # `count()` is the aggregate shorthand used by the other charts in this
    # notebook; the legacy `count(*)` spelling is replaced for consistency.
    y="count():Q"
).transform_timeunit(
    # Derive the `year` field plotted on the x axis from the creation date.
    year='year(created_at)'
)

In [None]:
# Area chart: running count of datasets, ordered by creation date.
alt.Chart(df).transform_window(
    sort=[{'field': "created_at"}],
    window=[{"op": "count", "field": "count", "as": "cumulative_count"}],
).mark_area().encode(
    alt.X("created_at:T"),
    alt.Y("cumulative_count:Q"),
)

In [None]:
# Same running count, restricted to rows that have no harvest domain.
alt.Chart(df.loc[df['harvest:domain'].isna()]).transform_window(
    sort=[{'field': "created_at"}],
    window=[{"op": "count", "field": "count", "as": "cumulative_count"}],
).mark_area().encode(
    alt.X("created_at:T"),
    alt.Y("cumulative_count:Q"),
)

In [None]:
# Line chart: datasets created per calendar month, one line per creation year.
# (A faceted variant would use row="year:O" instead of the colour channel.)
alt.Chart(df).transform_timeunit(
    year='year(created_at)',
    month='month(created_at)',
).mark_line().encode(
    x=alt.X("month:T", axis=alt.Axis(format="%b")),
    y=alt.Y("count():Q"),
    color=alt.Color("year:O"),
)

### 2018

In [None]:
# Monthly creation counts for 2018. The lower bound is inclusive: the baseline
# below keeps rows `< '2018-01-01'`, so with a strict `>` a dataset stamped
# exactly 2018-01-01 00:00:00 would be missing from both charts.
main = alt.Chart(df[(df.created_at >= '2018-01-01') & (df.created_at < '2019-01-01')]).mark_line().encode(
    alt.X(
        "month:T",
        axis=alt.Axis(format="%b")
    ),
    y="count():Q"
).transform_timeunit(
    year='year(created_at)',
    month='month(created_at)'
)

# Baseline from the years before 2018: count datasets per (year, month), then
# plot the mean of those counts for each calendar month.
mean = alt.Chart(df[df.created_at < '2018-01-01']).mark_line(opacity=0.8, color="grey").encode(
    alt.X(
        "month:T",
        axis=alt.Axis(format="%b")
    ),
    alt.Y(
        "mean(count):Q"
    )
).transform_timeunit(
    year='year(created_at)',
    month='month(created_at)'
).transform_aggregate(
    groupby=["year", "month"],
    count="count()"
)

# Layer the grey baseline underneath the 2018 line.
mean + main

In [None]:
# Line chart: datasets created per day during July 2018.
alt.Chart(df[(df.created_at >= '2018-07-01') & (df.created_at < '2018-08-01')]).mark_line().encode(
    alt.X(
        "day:T",
        axis=alt.Axis(format="%d")
    ),
    y="count():Q",
    color="year:O"
).transform_timeunit(
    # The colour channel references `year`, so it has to be derived here;
    # previously only `day` was computed and `year` was an undefined field.
    year='year(created_at)',
    day='date(created_at)',
)

In [None]:
# Datasets created on 2018-07-16, counted per organization.
df.loc[
    (df["created_at"] >= "2018-07-16") & (df["created_at"] < "2018-07-17"),
    "organization",
].value_counts()

In [None]:
# Datasets created on 2018-07-18, counted per organization.
df.loc[
    (df["created_at"] >= "2018-07-18") & (df["created_at"] < "2018-07-19"),
    "organization",
].value_counts()

In [None]:
# Datasets created on 2018-07-31, counted per organization.
df.loc[
    (df["created_at"] >= "2018-07-31") & (df["created_at"] < "2018-08-01"),
    "organization",
].value_counts()

## mises à jour

In [None]:
# Bar chart: number of datasets per year of last modification.
alt.Chart(df).transform_timeunit(
    year='year(last_modified)',
).mark_bar().encode(
    alt.X("year:T"),
    alt.Y("count():Q"),
)

In [None]:
# Bar chart: number of datasets per year-month of last modification.
alt.Chart(df).transform_timeunit(
    yearmonth='yearmonth(last_modified)',
).transform_aggregate(
    count="count()",
    groupby=["yearmonth"],
).mark_bar().encode(
    x=alt.X("yearmonth:T", axis=alt.Axis(format="%y-%m")),
    y=alt.Y("count:Q"),
)

### Durée de vie des jeux de données

Nombre de jours entre la dernière mise à jour et la date de création

In [None]:
# Whole days between creation and last modification, stored as a new column.
df['lifespan'] = (df['last_modified'] - df['created_at']).dt.days

In [None]:
# Sanity check: per-column counts for rows whose lifespan is negative,
# i.e. whose last modification predates their creation.
df.loc[df['lifespan'] < 0].count()

In [None]:
# Binned distribution of strictly positive lifespans. Rows are first counted
# per distinct lifespan value, then the x axis bins those values.
alt.Chart(df.loc[df['lifespan'] > 0]).transform_aggregate(
    count="count()",
    groupby=["lifespan"],
).mark_bar().encode(
    alt.X("lifespan:Q", bin=True),
    alt.Y("count:Q"),
)

In [None]:
# Year x month grid of datasets created before 2019: circle size is the number
# of datasets, colour is their mean lifespan.
alt.Chart(df.loc[df['created_at'] < "2019-01-01"]).mark_circle().encode(
    alt.X("month(created_at):O"),
    alt.Y("year(created_at):O"),
    alt.Size("count():Q"),
    alt.Color("mean(lifespan):Q"),
)

### Fraîcheur des jeux de données

Nombre de jours depuis la dernière mise à jour

In [None]:
# Whole days between the last modification and the fixed reference date
# 2019-01-10, stored as a new column.
# NOTE(review): the reference date is hard-coded, presumably the date of the
# CSV snapshot -- update it when the input file changes.
df['freshness'] = (pd.to_datetime('2019-01-10') - df['last_modified']).dt.days

In [None]:
# Sanity check: per-column counts for rows whose freshness is negative,
# i.e. rows modified after the reference date.
df.loc[df['freshness'] < 0].count()

In [None]:
# Binned distribution of strictly positive freshness values. Rows are first
# counted per distinct freshness value, then the x axis bins those values.
alt.Chart(df.loc[df['freshness'] > 0]).transform_aggregate(
    count="count()",
    groupby=["freshness"],
).mark_bar().encode(
    alt.X("freshness:Q", bin=True),
    alt.Y("count:Q"),
)

In [None]:
# Year x month grid of datasets created before 2019: circle size is the number
# of datasets, colour is their mean freshness.
alt.Chart(df.loc[df['created_at'] < "2019-01-01"]).mark_circle().encode(
    alt.X("month(created_at):O"),
    alt.Y("year(created_at):O"),
    alt.Size("count():Q"),
    alt.Color("mean(freshness):Q"),
)

In [None]:
# Binned freshness distribution, one row of the facet per creation year,
# for datasets created before 2019 with strictly positive freshness.
alt.Chart(df[ (df['freshness'] > 0) & (df['created_at'] < '2019-01-01')]).mark_bar().encode(
    x=alt.X("freshness:Q", bin=True),
    # `count()` is the aggregate shorthand used by the other charts in this
    # notebook; the legacy `count(*)` spelling is replaced for consistency.
    y=alt.Y("count():Q"),
    row="year(created_at):O"
)

### 2018

In [None]:
# Bar chart: datasets per year-month of last modification, restricted to 2018.
# The lower bound is inclusive so that a row stamped exactly
# 2018-01-01 00:00:00 is not dropped from the year.
alt.Chart(df[(df.last_modified >= '2018-01-01') & (df.last_modified < '2019-01-01')]).mark_bar().encode(
    x="yearmonth:T",
    y="count:Q"
).transform_timeunit(
    yearmonth='yearmonth(last_modified)'
).transform_aggregate(
    # Pre-aggregate to one row per year-month; `count` is what the y axis plots.
    groupby=["yearmonth"],
    count="count()"
)

## moissonneurs

In [None]:
# Preview of the null mask on the harvest domain column (first rows only).
df['harvest:domain'].isnull().head()

In [None]:
# Binned freshness distribution for rows with strictly positive freshness and
# no harvest domain.
alt.Chart(
    df.loc[(df['freshness'] > 0) & (df['harvest:domain'].isna())]
).transform_aggregate(
    count="count()",
    groupby=["freshness"],
).mark_bar().encode(
    alt.X("freshness:Q", bin=True),
    alt.Y("count:Q"),
)

In [None]:
# Binned freshness distribution, one row of the facet per creation year, for
# rows created before 2019 with strictly positive freshness and no harvest
# domain.
alt.Chart(df[ (df['freshness'] > 0) & (df['created_at'] < '2019-01-01') & (df['harvest:domain'].isnull())]).mark_bar().encode(
    x=alt.X("freshness:Q", bin=True),
    # `count()` is the aggregate shorthand used by the other charts in this
    # notebook; the legacy `count(*)` spelling is replaced for consistency.
    y=alt.Y("count():Q"),
    row="year(created_at):O"
)

## jeux de données sans producteur

In [None]:
# Number of rows that have neither an organization nor an owner.
len(df.loc[df['organization'].isna() & df['owner'].isna()])

In [None]:
# Write the rows that have neither an organization nor an owner to a CSV file.
df.loc[
    df['organization'].isna() & df['owner'].isna()
].to_csv('data/datasets-noorg-nouser.csv')