In [121]:
import re
from pymongo import MongoClient

client = MongoClient()
logsdb = client.get_database('management')
metadatadb = client.get_database('metadata')
logscol = logsdb.data
metacol = metadatadb.data

def get_metadata(ids):
    metapipe = { 'DATASET-PID' : {'$in' : ids} } 
    result = metacol.find( metapipe )
    meta = {}
    for x in result:
        try:
            meta[x['DATASET-PID']] = x['EMD:title']
#            print x['EMD:identifier']
        except:
            skip = x['DATASET-PID']
    return meta

def top_downloaded_files(limit):
#    pipe = [{'$match': { 'action': 'DOWNLOAD_FILE_REQUEST'}} , {'$group': {'_id': '$dataset(DATASET_ID', 'count' : { '$sum' : 1 }}}, { '$sort':{'count':-1} },  { '$limit' : limit }]
    pipe = [{'$match': { 'action': 'DOWNLOAD_FILE_REQUEST'}} , {'$group': {'_id': '$file(FILE_NAME(0)', 'count' : { '$sum' : 1 }}}, { '$sort':{'count':-1} },  { '$limit' : limit }]
    resultdata = logscol.aggregate(pipeline=pipe)
    return list(resultdata)

def deposited_files_by_user(limit):
    pipe = [{'$match': { 'action': 'FILE_DEPOSIT'}} , {'$group': {'_id': '$user', 'count' : { '$sum' : 1 }}}, { '$sort':{'count':-1} },  { '$limit': limit }]
    resultdata = logscol.aggregate(pipeline=pipe)
    return list(resultdata)

def most_downloaded_datasets(limit):
    pipe = [{'$group': {'_id': '$dataset(DATASET_ID', 'count' : { '$sum' : 1 }}}, { '$sort':{'count':-1} },  { '$limit' : limit }]
    resultdata = logscol.aggregate(pipeline=pipe)
    datasets = {}
    datalist = []
    copy = resultdata
    for dataset in resultdata:
        datalist.append(dataset['_id'])
        datasets[dataset['_id']] = dataset
    metadata = get_metadata(datalist)
    
    for dataset in datalist:
        try:
            datainfo = datasets[dataset]
            datainfo['title'] = metadata[dataset]
            datasets[dataset] = datainfo
        except:
            skip = 'yes'
    return datasets

#### Top downloaded datasets
Easy2 2Aoud: UC 15 (*) 
For the DANS website: Titles of the most downloaded datasets from the entire collection

A download a dataset is defined as a download from a user from one computer one on one dataset one day or several times one or more files download. 

This definition is identical to the definition of a 'download' in EASY1.
Files that are downloaded by a user more than once in a day, are counted more than eemaal.
Downloads by users with archivist and / or admin role, are not counted.

In [122]:
result = most_downloaded_datasets(10)
for line in result:
    print result[line]
#ids = ["easy-dataset:58245", "easy-dataset:44426"]
#get_metadata(ids)
#downloaded_files(5)

{u'count': 413, u'_id': u'easy-dataset:33895', 'title': u'Verzamelpagina Archeologie'}
{u'count': 549, u'_id': u'easy-dataset:51548', 'title': u"Thematische collectie: Project 'Long shadow of Sobibor' - beschrijving en alle interviews"}
{u'count': 359, u'_id': u'easy-dataset:64220', 'title': u'WoON2015: release 1.0 - WoonOnderzoek Nederland 2015'}
{u'count': 351, u'_id': u'easy-dataset:33871', 'title': u'Voyages of the WIC, 1674-1740'}
{u'count': 1451, u'_id': u'easy-dataset:50480', 'title': u"Sobibor Interviews, interview 02, Alexander 'Sasja' Petsjerski"}
{u'count': 804, u'_id': u'easy-dataset:39351', 'title': u'Dutch slave trade'}
{u'count': 404, u'_id': u'easy-dataset:52125', 'title': u'Zandbanenkaart 2012'}
{u'count': 1080, u'_id': u'easy-dataset:44426', 'title': u'NLGis shapefiles'}
{u'count': 49621, u'_id': None}
{u'count': 1986, u'_id': u'easy-dataset:57847', 'title': u'14035252 BUR.WRL.ARC Eindrapportage archeologisch onderzoek natuurvriendelijke oever NBW Lienden (locaties Va

#### Top downloaded files
A download a file is defined as a download from a user from one computer one on one dataset one day or several times one or more files download. 

Files that are downloaded by a user more than once in a day, are counted more than eemaal.
Downloads by users with archivist and / or admin role, are not counted.

In [123]:
top_downloaded_files(10)

[{u'_id': u'original/W. Klooster, Ships with cacao, Curacao-Netherlands 1701-1755.csv',
  u'count': 31},
 {u'_id': u'original/Merged_ACDS_1971-2006_20100217.sav', u'count': 22},
 {u'_id': u'Methodologie ROA schoolverlatersonderzoeken.pdf', u'count': 20},
 {u'_id': u'2003-05_10.pdf', u'count': 15},
 {u'_id': u'original/Schoorldam Westfriesedijk naast 29-BOORdefinief.pdf',
  u'count': 13},
 {u'_id': u'original/S120307 IVO-P Einderweg 11 te Uddel DEFINITIEF.pdf',
  u'count': 13},
 {u'_id': u'Documentatie_HBO2014.pdf', u'count': 13},
 {u'_id': u'HOP11-Bijlage_3_2_afbeeldingen-A1.pdf', u'count': 12},
 {u'_id': u'original/ETM Data - Study 1.csv', u'count': 12},
 {u'_id': u'original/Merged_ACDS_1971-2006_20100217.por', u'count': 12}]

#### Deposited files by user

In [124]:
deposited_files_by_user(10)

[{u'_id': u'baac', u'count': 516},
 {u'_id': u'KasperOArch', u'count': 73},
 {u'_id': u'mtump', u'count': 54},
 {u'_id': u'jcichy', u'count': 54},
 {u'_id': u'nvdsijs', u'count': 32},
 {u'_id': u'MauriceLipsch', u'count': 24},
 {u'_id': u'valentijng', u'count': 23},
 {u'_id': u'SuperE', u'count': 16},
 {u'_id': u'emiliealsarchivaris', u'count': 15},
 {u'_id': u'suemoerman', u'count': 15}]