# MongoDB utilities

Development notebook for two sets of utilities:

1. mongo_utils.py: general purpose
2. we1s_mongo_utils: project-specific

Below are examples of reporting queries for inspecting the WE1S mongodb collections.

## mongo_utils.py

In [9]:
"""mongo_utils.py
General purpose tools for working with pymongo and mongodb.
"""

from collections import Counter
from datetime import datetime
import json
import os
from pymongo import MongoClient 
from pymongo.errors import DuplicateKeyError, InvalidDocument
import pprint
pp = pprint.PrettyPrinter(indent=2, compact=False)

# import sys
# sys.path.insert(0, '/home/jovyan/utils/preprocessing/')
# from libs.fuzzyhasher.fuzzyhasher import FuzzyHasher
# from libs.zipeditor.zipeditor import ZipEditor, zip_scanner, zip_scanner_excludedirs, ZipProcessor
# from we1s_utils.ziputils import BatchJSONUploader

# filter = {"name": {"$regex": r"^(?!system\.)"}}

# def _tabulate_row(*row):
#     return ''.join(str(word).ljust(12) for word in row)



##############################
##  mongodb doc generators  ##
##############################
## these two (client_colls and db_colls)
## and their dependents might be refactored
## -- as it turns out, the database and client
## are retrievable from the collection object,
## so interfaces to collection level function
## could be simplified.
##
## client = MongoClient('mongodb://mongo/')
## db = client['we1s']
## coll = db['deletes_humanities']
##
## print(coll)
## print(coll.name)
## print(coll.database)
## print(coll.database.name)
## print(coll.database.client)
##
## OUTPUT:
## Collection(Database(MongoClient(host=['mongo:27017'], document_class=dict, tz_aware=False, connect=True), 'we1s'), 'deletes_humanities')
## deletes_humanities
## Database(MongoClient(host=['mongo:27017'], document_class=dict, tz_aware=False, connect=True), 'we1s')
## we1s
## MongoClient(host=['mongo:27017'], document_class=dict, tz_aware=False, connect=True)
## Database(MongoClient(host=['mongo:27017'], document_class=dict, tz_aware=False, connect=True), 'name')

def client_colls(client, filter=None):
    """All collections across all databases--generator."""
    d = dict((db, [collection for collection in client[db].list_collection_names(filter=filter)])
             for db in client.list_database_names())
    for db in d:
        for coll in d[db]:
            yield client, db, coll
            # yield coll
        
def db_colls(client, db, filter=None):
    """All collections in a database--generator."""
    db_coll_list = [collection for collection in client[db].list_collection_names(filter=filter)]
    for coll in db_coll_list:
        yield client, db, coll
        # yield coll



def mdbkey_decode(key):
    """Decode encoded mongodb key to recover original string."""
    return key.replace("\\u002e", ".").replace("\\u0024", "\$").replace("\\\\", "\\")

def mdbkey_encode(key):
    """Encode key field with . or $ to be valid mongodb key."""
    return key.replace("\\", "\\\\").replace(".", "\\u002e").replace("\$", "\\u0024")

def mdbkey_strip(key):
    """Strip . or $ to enforce valid mongodb key."""
    return key.replace(".", "").replace("$", "")

def print_doc(doc, pop_list=None, trim_dict=None):
    """Cleans up verbose mongodb JSON documents for preview
    using a list of top-level fields to pop or trim.
    """
    if pop_list:
        for key in pop_list:
            if key in doc:
                doc.pop(key)
    if trim_dict:
        for key, value in trim_dict.items():
            if key in doc:
                doc[key] = doc[key][0:value]
    pp.pprint(doc)


def print_query(query, style='pp'):
    """Convenience method for displaying pymongo
    queries in readable formats. Three styles:
    PrettyPrinter, json.dumps(), or print().
    
    Examples:
        >>> query = {'$or': [{'name': {'$regex': '.*liberal.*'}}, {'name': {'$regex': '.*humanities.*'}}]}
        >>> print_query(query)
        {   '$or': [   {'name': {'$regex': '.*liberal.*'}},
                       {'name': {'$regex': '.*humanities.*'}}]}

        >>> print_query(query, style=None)
        {'$or': [{'name': {'$regex': '.*liberal.*'}}, {'name': {'$regex': '.*humanities.*'}}]}

        >>> print_query(query, style='json')
        {
            "$or": [
                {
                    "name": {
                        "$regex": ".*liberal.*"
                    }
                },
                {
                    "name": {
                        "$regex": ".*humanities.*"
                    }
                }
            ]
        }
    """
    if style=='pp':
        # pprint adds linebreaks and indents, yet stays compact
        pp.pprint(query)
    elif style=='json':
        # json can also be used to print nested dict/lists
        # in a more articulated indented outline form
        print(json.dumps(query, sort_keys=True, indent=4))
    else:
        # pymongo queries are python dicts of dicts (or lists of dicts)
        # however, these nested lines may be hard to read        
        print(query)

def _test_print_query():
    """Test three output formats"""
    print('Display a mongodb query in three different print formats.')
    query = {'$or': [{'name': {'$regex': '.*liberal.*'}}, {'name': {'$regex': '.*humanities.*'}}]}
    print('----------')
    print_query(query, style='None')
    print('----------')
    print_query(query)
    print('----------')
    print_query(query, style='json')

def print_table(rows):
    for row in rows:
        print(''.join(str(word).ljust(12) for word in row))

def print_SON(son):
    if son:
        print(index.to_dict())

def report_aggregate(coll_list, pipeline):
    """
    pipeline_pub = [
        {'$group' : {'_id' : '$pub', 'count' : {'$sum' : 1}}},
        { '$sort' : {'count' : -1} }
    ]
    pipeline_term = [
        {'$group' : {'_id' : '$term', 'count' : {'$sum' : 1}}},
        { '$sort' : {'count' : -1} }
    ]
    """
    report = []
    for client, db, coll in coll_list:
        result = client[db].command('aggregate', coll, pipeline=pipeline, explain=False)
        for row in result:
            if row:
                report.append(json.dumps(row, sort_keys=True, indent=2))
    return report

def report_collstats(coll_list, key='count', header=True):
    """Return a stat (e.g. document count) for each mongodb
    collection list.
    May be passed a generator such as client_colls or db_colls.
    Pretty-print with print_table().

    Examples:
        >>> report_collstats((client, 'we1s', 'reddit'))
        count       db          coll      
        12345       we1s        reddit    

        >>> report_collstats(db_colls(client, 'we1s'), key='avgObjSize')
        avgObjSize  db          coll      
        19788       we1s        reddit    
        105391      we1s        humanities_keywords_no_exact

    Args:
        coll_list (tuple): (client, db_name, coll_name)
        key (str): the stat to report.
        header (bool): Include column headers in output

    Returns:
        A list of the key value for each db.collection:
        [(key, db, coll),
         (key, db, coll),
         (key, db, coll)]
    """

    report = []
    for client, db, coll in coll_list:
        if key:
            value = client[db].command("collstats", coll)[key]
        else:
            value = client[db].command("collstats", coll)
        report.append((value, db, coll))
    if report:
        if header:
            report.insert(0, (key, 'db', 'coll'))
        return report

def report_indexes(coll_list):
    """Return all indexes for each mongodb collection in a list.
    May be passed a generator such as client_colls or db_colls.
    Pretty-print with print_SON().
    
    Args:
        coll_list (tuple): (client, db_name, coll_name)

    Returns:
        A list of indexes (SON objects).
        [SON, SON, SON]
    """
    result = []
    for client, db, coll in coll_list:
        for index in client[db][coll].list_indexes():
            if index:
                result.append(index)
    return result




class MongoCounter:
    """For a mongo server or collection, build a list of FieldCounters"""

    def __init__(self, client=None, progress=None):
        self.results = []
        self.client = client
        self.progress = progress

    def _client_dbs_command(self, client, command='collstats'):
        """Loop over all dbs and all collections,
        returning command result for each collection.
        """
        d = dict((db, [collection for collection in client[db].list_collection_names()])
                     for db in client.list_database_names())
        for db in d:
            for coll in d[db]:
                # print(db, coll)
                yield client[db].command(command, coll), db, coll

    def _tabulate(self, *row):
        return ''.join(str(word).ljust(10) for word in row)

#     # DEPRECATED
#     def count_docs(self, client=None):
#         """Display document counts for all collections in all dbs.
#         e.g.
#         79 local startup_log
#         752243 we1s reddit
#         418302 we1s humanities_keywords
#         """
#         if not client: client = self.client
#         counts = self._client_dbs_command(client, command='collstats')
#         for collstats, db, coll in counts:
#             yield collstats['count'], db, coll

#     def count_docs_report(self, client=None):
#         if not client: client = self.client
#         counts = self.count_docs(client)
#         for collstats, db, coll in counts:
#             print(self._tabulate(collstats, db + '.' + coll))
            
    def count_fields_all(self,  client=None, progress=None, show_complete=True):
        if not client: client = self.client
        for db in client.list_database_names():
            self.count_fields_db(db, client, progress, show_complete)
        
    def count_fields_db(self, db, client=None, progress=None, show_complete=True):
        if not client: client = self.client
        db_colls = [collection for collection in client[db].list_collection_names()]
        for coll in db_colls:
            self.count_fields_collection(coll, db, client, progress, show_complete)
    
    def count_fields_collection(self, coll, db, client=None, progress=None, show_complete=True):
        if not client: client = self.client
        if progress==None and self.progress:
            progress=self.progress
        coll_count = client[db].command("collstats", coll)['count']
        fcounter = FieldCounter(name=db+'.'+coll + '(' + str(coll_count) + ' docs)', show_complete=show_complete)
        cursor = client[db][coll].find({})
        print(fcounter)
        for doc in cursor:
            if progress and fcounter.total !=0 and fcounter.total % progress == 0:
                print(fcounter)
            fcounter.count_fields(doc)
        # print('\n\n', '[FINAL]')
        print(fcounter.report())
        self.results.append(fcounter)

#     # DEPRECATED
#     def list_indexes(self, client=None):
#         """Loop over all dbs and all collections,
#         returning command result for each collection.
#         """
#         if not client: client = self.client
#         d = dict((db, [collection for collection in client[db].list_collection_names()])
#                      for db in client.list_database_names())
#         for db in d:
#             for coll in d[db]:
#                 for index in client[db][coll].list_indexes():
#                     print(index, db, coll)

    def __str__(self):
        return '\n'.join([result for result in self.results])


class FieldCounter:
    """Build counters of dictionary fields.
    Useful for surveying mongodb collections for presence/absence of keys across documents.
    Keeps two fields counters:
    
    -  counter: present fields with value
    -  counter_empties: present fields that are false (None, 0, '', false, etc.)
    ...and:
    -  total: number of documents counted
    
    Supports includes and excludes -- lists of fields to include or ignore.
    If includes are defined then only includes will be counted -- unless they
    are subsequently filtered by the excludes list.
    """

    def __init__(self, name='', show_complete=True, includes=None, excludes=None):
        self.name = name
        self.show_complete = show_complete
        self.includes = includes
        self.excludes = excludes
        self.clear()

    def _tabulate(self, *row):
        return ''.join(str(word).ljust(10) for word in row)
    
    def clear(self):
        self.counter = Counter()
        self.counter_empties = Counter()
        self.total = 0

    def count_collection(self, docs, includes=None, excludes=None):
        if not includes: includes = self.includes
        if not excludes: excludes = self.excludes
        for doc in docs:
            yield self.count_fields(doc, includes, excludes)

    def count_fields(self, doc, includes=None, excludes=None):
        """"""
        if not includes: includes = self.includes
        if not excludes: excludes = self.excludes
        for key, value in doc.items():
            if (not includes or key in includes) and (not excludes or key not in excludes):
                if value:
                    self.counter[key] += 1
                else:
                    self.counter_empties[key] += 1
        self.total+=1
        return doc, self.total

    def report(self, header=False):
        result = ''
        if header:
            result = self.__str__()
        if self.total > 0:
            colnames = self._tabulate('found', 'empties', 'missing', 'key')
            emptycount = 0
            entries = ''
            for key, value in sorted(self.counter.items()):
                if value != self.total:
                    emptycount = self.counter_empties.get(key, 0)
                if self.show_complete or value != self.total:
                    entries += self._tabulate(value, emptycount, self.total-value, key) + '\n'
            if entries:
                result = '\n'.join([result, colnames, entries])
                result += self._tabulate(str(self.total), 'counted') + '\n'
            else:
                result += '[no fields empty/missing]\n'
        else:
            result += '[no docs]\n'
        return result
    
    def __str__(self):
        return self.name


In [10]:
# STRIP FIELD WHITESPACE


def mongo_coll_field_update(coll, field, func):
    """Given a collection, updates each doc with the field,
    changing it to using the lambda function."""
    hits = 0
    for doc in coll.find({field: {'$exists': True, '$ne': []}}, {field:1}):
        if mongo_field_update(coll, doc, field, func):
            hits += 1
    return hits

def mongo_field_update(coll, doc, field, func):
    """Given a doc, changes a field and updates it in mongodb."""
    if field_update(doc, field, func):
        # print('*', end='')
        coll.update_one({'_id': doc['_id']},
                        {'$set': {field: doc[field]} },
                        upsert=False)
        return True
    return False

def field_update(doc, field, func):
    """Takes a doc, field name, and lambda function for the field.
    Changes the field in place and returns True if updated.
    """
    if field in doc and doc[field]:
        val = func(doc[field])
        if doc[field] != val:
            doc[field] = val
            return True
    return False

def func_strip(x):
    """Many fields have leading and trailing spaces,
    leading to e.g. four different "The New York Times".
    for a collection, iterate over all pub fields and,
    if there are leading or trailing spaces, strip.
    this should really be part of a pre-import validator.
    """
    return x.strip()
    # this could also be passed without a function as:
    #   lambda x: x.strip()

    
#
#  DEPRECATED 
#
#  def strip_field(db, collection, field):
#     """Many fields have leading and trailing spaces,
#     leading to e.g. four different "The New York Times".
#     for a collection, iterate over all pub fields and,
#     if there are leading or trailing spaces, strip.
#     this should really be part of a pre-import validator.
#     """
#     test_collect = client['we1s']['deletes_humanities']
#     hits = 0
#     fixed = 0
#     for doc in test_collect.find({'pub': {'$exists': True, '$ne': []}}, {'pub':1}):
#         hits += 1
#         if 'pub' in doc and doc['pub']:
#             if doc['pub'] != doc['pub'].strip():
#                 fixed += 1
#                 test_collect.update_one({'_id': doc['_id']},
#                                     {'$set': {'pub': doc['pub'].strip()} },
#                                     upsert=False)
#     print('hits:  ', hits)
#     print('fixed: ', fixed)
# 
# strip_field(1, 2, 3)

In [None]:
print('Strip whitespace from field in all articles in a collection.')
coll = client['we1s']['deletes_humanities']
hits = mongo_coll_field_update(coll, 'term', lambda x: x.strip())
print(hits)

## we1s_mongo_utils.py

In [51]:
"""we1s_mongo_utils.py
WE1S-specific importing and updating tools for pymongo and mongodb.
"""

import copy
import csv
import json
import os
import pymongo
import pprint

pp = pprint.PrettyPrinter(indent=2, compact=False)

def doc_preview(doc, pop_list=None, trim_dict=None, trim_mark='...', object=False, width=None):
    preview = copy.deepcopy(doc)
    if pop_list:
        for key in pop_list:
            if key in preview:
                preview.pop(key)
    if trim_dict:
        for key, value in trim_dict.items():
            if key in preview:
                if len(key) <= value and len(key) > len(trim_mark):
                    preview[key] = ''.join([preview[key][0:value-len(trim_mark)], trim_mark])
    if object:
        return preview
    if width: 
        return pprint.pformat(preview, width=width)
    return pp.pformat(preview)

class SourcesProcessor:
    """Documents the pipeline for populating mongodb source docs
    and aliases from a csv spreadsheet.
    
    Use:
        csv_to_mongo(client, filepath)
        
    Internal class methods are stages in the pipeline.
    """

    def __init__(self,
                 client=MongoClient('mongodb://mongo/'),
                 file_path='../sources_master.csv',
                 source_path=['Sources','Sources'],
                 aliases_path=['Sources','config','source_aliases']):
        self.client = client
        self.file_path = file_path
        self.source_path = source_path
        self.aliases_path = aliases_path
        self.clear()

    def clear(self):
        self.aliases = {}
        self.source_docs = {}
    
    def get_csv_put_mongo(self):
        self.get_csv()
        self.put_mongo_aliases()
        self.put_mongo_source_docs()

    def get_csv(self):
        """Given a DictReader, returns sources docs and an aliases lookup as data structures.
        
        The data format is:
        title,name,canonical_title,tags1,tags2,tags3,tags4,tags5,tags6,country,language
        
        mockup of a Source manifest:
        {
            "name": "advance-titan-university-of-wisconsin-oshkosh",
            "metapath": "Sources",
            "namespace": "we1s2.0",
            "title": "Advance-Titan: University of Wisconsin - Oshkosh",
            "country": "US",
            "language": "en",
            "tags": ["region/US/Midwest", "education/funding/US public college"],
            "collection_identifiers": ["Advance", "Advance-Titan", "Advance-Titan: University of Wisconsin - Oshkosh"]
        }
        """

        self.aliases = {} # alias-to-name lookup entries
        self.source_docs = {}  # sources docs
        self.csv_parse_log = []
        name_prev = ''
        csvfile = open(self.file_path, 'r')
        reader = csv.DictReader(csvfile)
        for row in reader:
            
            # build source_name_aliases
            akey = mdbkey_encode(row['alias'])
            dkey = row['name'].strip()
            if akey and akey not in self.aliases:
                self.aliases[akey] = dkey
            else:
                self.csv_parse_log.append("duplicate key '{0}' found: '{1}'".format(akey, row['name'].strip()))
                # raise ValueError("duplicate key '{0}' found".format(akey))
    
            # build source_docs
            if dkey in self.source_docs:
                # if name is a repeat, add an alias and continue
                self.source_docs[dkey]['aliases'].append(row['alias'].strip())
            else:
                # if source name is new, create
                # this is an explicit add -- new named columns in the sheet won't be imported
                self.source_docs[dkey] = {}
                self.source_docs[dkey]['_id'] = dkey
                self.source_docs[dkey]['name'] = dkey
                self.source_docs[dkey]['country'] = row['country']
                self.source_docs[dkey]['language'] = row['language']
                self.source_docs[dkey]['metapath'] = 'Sources'
                self.source_docs[dkey]['namespace'] = 'we1s2.0'
                self.source_docs[dkey]['title'] = row['canonical_title'].strip()
                tags = [row['tags1'],row['tags2'],row['tags3'],row['tags4'],row['tags5'],row['tags6']]
                self.source_docs[dkey]['tags'] = [x for x in tags if x]
                self.source_docs[dkey]['aliases'] = [row['alias'].strip()]
    
    def get_mongo_aliases(self):
        """get aliases from db"""
        ap = self.aliases_path
        self.aliases = self.client[ap[0]][ap[1]].find_one({'_id' : ap[2]})['aliases']

    def get_mongo_source_docs(self):
        sp = self.source_path
        results = self.client[sp[0]][sp[1]].find({})
        self.source_docs = {}
        for result in results:
            self.source_docs[result['_id']] = result        

    def put_mongo_aliases(self):
        """insert aliases into db as doc"""
        doc = {}
        doc['_id'] = self.aliases_path[2]
        doc['aliases'] = self.aliases
        ap = self.aliases_path
        self.client[ap[0]][ap[1]].replace_one({'_id' : ap[2]}, doc, upsert=True)

    def put_mongo_source_docs(self):
        """insert source docs into db"""
        sp = self.source_path
        for key, source_doc in self.source_docs.items():
            client[sp[0]][sp[1]].replace_one({'_id':source_doc['_id']}, source_doc, upsert=True)

    def __repr__(self):
        return f'{self.__class__.__name__}(client={self.client}, file_path={self.file_path}, source_path={self.source_path}, aliases_path={self.aliases_path})'


class ArticleProcessor:
    """Dynamic rewriting of article data fields, partricularly
    the source field and api fields.
    Relies on data from source aliases -- uploaded / retrieved with SourcesProcessor
    """
    
    def __init__(self, sources_processor):
        self.sp = sources_processor
        self.sp.get_mongo_aliases()
        self.aliases = sp.aliases
        self.source_docs = sp.source_docs

    def display_ipython_table(self, data):
        from IPython.display import display, HTML
        # css_str = '<style>body{background-color:#000000}; table{width:600px !important}; td{width:200 !important};</style>'
        css_str = '<style>td{border: 1px solid black} td{text-align:left; vertical-align:top}</style>'
        display(HTML(
            css_str + '<table><tr style>{}</tr></table>'.format(
                '</tr><tr>'.join(
                '<td><pre>{}</pre></td>'.format('</pre></td><td><pre>'.join(str(_) for _ in row)) for row in data)
            )), metadata=dict(isolated=True))

    def json_add_api_fields(self, json_data, name_hint):
        """Set json api... fields from name hint and optional
        database field. Hint may be generated by
        json_add_api_fields_guess checking the name field.
        """
        translations = {
            'LexisNexis':['we1s-collector', 'LexisNexis'],
            'LexisNexis UniversityWire':['we1s-collector', 'LexisNexis', 'UniversityWire'],
            'chomp':['chomp','chomp'],
            # 'chomp':['chomp','google.com'],
            # 'chomp':['chomp','wordpress.com'],
            'ProQuest':['ProQuest Global Newsstream','ProQuest'],
            'Global Newsstream':['ProQuest Global Newsstream','ProQuest'],
            'Global Newsstrea m':['ProQuest Global Newsstream','ProQuest'],
            'Globa l Newsstream':['ProQuest Global Newsstream','ProQuest'],
            'Global Newss tream':['ProQuest Global Newsstream','ProQuest'],
            'Ethnic NewsWatch':['ProQuest Global Newsstream', 'ProQuest','Ethnic NewsWatch'],
            'Ethnic NewsWatc h':['ProQuest Global Newsstream','ProQuest','Ethnic NewsWatch'],
            'Ethnic N ewsWatch':['ProQuest Global Newsstream','ProQuest','Ethnic NewsWatch'],
            'Ethnic NewsWatch; GenderWatch':['ProQuest Global Newsstream','ProQuest','Ethnic NewsWatch; GenderWatch'],
            'GenderWatch':['ProQuest Global Newsstream','ProQuest','GenderWatch'],
            'reddit':['reddit','reddit'],
            'Twitter':['Twitter','Twitter']
        }
        # manual dictionary
        if 'database' in json_data:
            # lookups and 
            labels = translations[json_data['database']]
        else:
            # fall back to name label
            labels = translations[name_hint]
    
        # set the fields
        if 'api_software' not in json_data:
            json_data['api_software'] = labels[0]
        if 'api_data_provider' not in json_data:
            json_data['api_data_provider'] = labels[1]
        if(len(labels)>2):
            if 'api_data_provider_channel' not in json_data:
                json_data['api_data_provider_channel'] = labels[2]
    
    def json_add_api_fields_guess(self, json_data):
        """Deduce api fields hint from name, try lookup
        and add api_fields to json.
        """
        if 'name' in json_data:
            if 'chomp' in json_data['name'].lower() :
                self.json_add_api_fields(json_data, 'chomp')
            elif 'reddit' in json_data['name'].lower():
                self.json_add_api_fields(json_data, 'reddit')
            elif 'proquest' in json_data['name'].lower():
                self.json_add_api_fields(json_data, 'ProQuest')
            elif 'twitter' in json_data['name'].lower():
                self.json_add_api_fields(json_data, 'Twitter')
            elif 'universitywire' in json_data['name'].lower():
                self.json_add_api_fields(json_data, 'LexisNexis UniversityWire')
            else:
                self.json_add_api_fields(json_data, 'LexisNexis')
    
    def json_add_source(self, json_data, aliases=None):
        """use source name aliases dict to lookup and save canonical source name.
        """
        if not aliases: aliases = self.aliases
        lookup_name = ''
        if 'name' in json_data:
            if 'chomp' in json_data['name'].lower() :
                # chomp zip:  chomp_vox_humanities_2000-01-01_2020-01-01.zip
                # chomp json: chomp_vox_humanities_2000-01-01_2020-01-01_0.json
                lookup_name = json_data['name'].split('_')[1]
            elif 'reddit' in json_data['name'].lower():
                # reddit zip:  reddit-all-the-arts-2006-2018-264.zip
                # reddit json: Reddit-The-Arts-All-2006-2018_180.json
                lookup_name = 'Reddit'
            elif 'proquest' in json_data['name'].lower():
                # proquest has no standard format
                # proquest  zip: proquest_thewallstreetjournal_humanities_1984_1989.zip
                # proquest json: proquest_thewallstreetjournal_humanities_1984_1989_001_.json
                if 'thewallstreetjournal' in json_data['name'].lower():
                    # proquest-wallstreet exception
                    lookup_name = 'thewallstreetjournal'
                else:
                    lookup_name = json_data['pub']
            elif 'universitywire' in json_data['name'].lower():
                # LN University Wire
                # zip: 172244_universitywire_bodypluralhumanitiesorhleadpluralhumanities_2014-01-01_2014-12-31.zip
                # json: 172244_172244_universitywire_bodypluralhumanitiesorhleadpluralhumanities_2014-01-01_2014-12-31_16_0_0.json
                lookup_name = json_data['pub']
            else:
                # LexisNexis default
                # zip: 8006_thelatimes_bodypluralhumanitiesorhleadpluralhumanities_2011-01-01_2011-12-31.zip
                # json: 8006_8006_thelatimes_bodypluralhumanitiesorhleadpluralhumanities_2011-01-01_2011-12-31_55_0_0.json
                lookup_name = json_data['name'].split('_')[2]
        source_name = aliases[mdbkey_encode(lookup_name)]
        # print('cn:', canonical_name, type(canonical_name))
        if 'sources' in json_data:
            json_data.pop('sources')
        json_data['source'] = source_name

    def json_update(self, doc, aliases=None):
        """Given an in-memory doc, do an in-memory rewrite
        based on:
            add_source
            add_api_fields_guess
        This does not add/update the document to a database.
        """
        if not aliases: aliases = self.aliases
        # print('rewrite:', doc['_id'])
        self.json_add_source(doc, aliases)
        self.json_add_api_fields_guess(doc)

    def json_update_previews(self, doc, pop_list=None, trim_dict=None, width=None):
        """Changes the in-memory doc with json_update,
        returns two doc_previews with pretty printing: before and after.
        """
        before_prev = doc_preview(doc, pop_list=pop_list, trim_dict=trim_dict, width=width)
        self.json_update(doc)
        after_prev = doc_preview(doc, pop_list=pop_list, trim_dict=trim_dict, width=width)
        return before_prev, after_prev

    def mongo_replace_docs(self, docs, collection):
        for doc in docs:
            self.json_update(doc)
            collection.replace_one({'_id': doc['_id']}, doc, upsert=False)
    
    def mongo_update_docs(self, docs, collection):
        for doc in docs:
            self.mongo_update_doc(doc, collection)
    
    def mongo_update_doc(self, doc, collection):
        self.json_update(doc)
        update_command = {'$set': {'api_data_provider': doc['api_data_provider'].strip(),
                                   'api_software': doc['api_software'].strip(),
                                   'source': doc['source'].strip()
                                  },
                          '$unset': { 'sources' : "", 'length' : "" }}
        if 'api_data_provider_channel' in doc:
            update_command.setdefault('$set', {})
            update_command['$set']['api_data_provider_channel'] = doc['api_data_provider_channel']            
        if 'api_data_provider' in doc and 'LexisNexis' in doc['api_data_provider']:
            update_command.setdefault('$rename', {})
            update_command['$rename']['doc_id'] = 'ln_doc_id'
            update_command['$rename']['attachment_id'] = 'ln_attachment_id'
        
        updated = collection.find_one_and_update({'_id': doc['_id']},
                                                 update_command,
                                                 return_document=pymongo.ReturnDocument.AFTER,
                                                 upsert=False)
        return updated


In [14]:
## API NOTES

# PROQUEST
# ADD
#   'api_name': 'ProQuest'
#   'api_notes' : '2018'
# RENAME
#   content-hash-ssdeep -> content_hash_ssdeep
#   doc_id -> proquest_doc_id
#   word_count -> content_word_count
# DELETE
#   'attachment-id': '' --> [delete]
#   length -> [delete]
# ??:
# 'namespace': 'we1sv2.0',
# 'ppversion': '0.1',

# 'name': 'BrazzilLosAngeles_ProQuestDocuments-Humanities-2019-05-08',
# 'name': '323722_323722_bournemouthecho_bodypluralartsorhleadpluralarts_2014-01-01_2014-12-31_52_5_0',
  
# LEXISNEXIS
# 'api': 'LexisNexis'
# 'attachment-id': '' --> [delete]

# 'api': 'Chomp:'
# 'api': 'ProQuest'
# 'api': 'Reddit'
# 'api': 'Twitter'

## we1s_mongo_import.py

In [62]:
"""we1s_mongo_import.py
Combine imported ZipProcessor and a custom BatchJSONUploader2
(prev version was added to ziputils)
"""

import json
import os
from pymongo import MongoClient 
from pymongo.errors import DuplicateKeyError, InvalidDocument
import sys

sys.path.insert(0, '/home/jovyan/utils/preprocessing/')
from libs.fuzzyhasher.fuzzyhasher import FuzzyHasher
from libs.zipeditor.zipeditor import ZipEditor, zip_scanner, zip_scanner_excludedirs, ZipProcessor
from we1s_utils.ziputils import BatchJSONUploader

class BatchJSONUploader2:
    """Processor takes a file path, iterates over JSON files,
    and uploads to a mongodb database.
    If a criteria matches, such as being listed in deletes
    or a name matching cant or must, then a document is either
    inserted (if a target collection is provided) or skipped.
    If no rules match, then documents will be uploaded to the
    default collection (if provided).
    """

    def __init__(self,
        default_collection,             # 'humanities-keywords'
        deletes_collection='deletes',   # 'deletes-humanities'
        deletes_file = '_deletes.txt',  # '_deletes.txt'
        filter_collection='filter',     #
        filter_name_cant='',            # 'no-exact-match'
        filter_name_must=''
        ):         # 'humanities-keywords-no-exact-match'

        self.default_collection = default_collection
        self.deletes_collection = deletes_collection
        self.deletes_file = deletes_file
        self.filter_collection = filter_collection
        self.filter_name_cant = filter_name_cant
        self.filter_name_must = filter_name_must

    def get_json(json_path):
        json_data = None
        with open(json_path, 'r+') as f:
            json_data = json.load(f)
            json_data.pop('bag_of_words', None)
        return json_data
        
    def do(self, files_path):
        # create delete list
        try:
            with open(os.path.join(files_path, self.deletes_file), 'r') as f:
                self.deletes_list = f.read().splitlines()
        except OSError:
            self.deletes_list = []
        self.json_paths = [os.path.join(r, file) for r, d, f in os.walk(files_path) for file in f if file.endswith('.json') and not file.startswith('._')]
        for json_path in self.json_paths:
            try:
                json_basename = os.path.split(json_path)[1]
                if json_basename in self.deletes_list:
                    if self.deletes_collection:
                        self.deletes_collection.insert_one(get_json(json_path))
                elif self.filter_name_must and self.filter_name_must not in json_basename:
                    if self.filter_collection:
                        self.filter_collection.insert_one(get_json(json_path))
                elif self.filter_name_cant and self.filter_name_cant in json_basename:
                    if self.filter_collection:
                        self.filter_collection.insert_one(get_json(json_path))
                elif self.default_collection:
                    self.default_collection.insert_one(get_json(json_path))
            except (json.decoder.JSONDecodeError, KeyError, PermissionError, ValueError, InvalidDocument) as err:
                print('\n', err.__class__.__name__, ": ", json_path, err)
                continue


## mongodb import scripts

In [None]:
# %%capture output

print('Import humanities_keywords and reddit')

client = MongoClient('mongodb://mongo/')
db = client['we1s']

upload_list = []

hum_zip_path_list = zip_scanner_excludedirs(
    source_path='/home/jovyan/data/parsed/humanities-keywords/',
    exclude_list=[''], join=True)
hum_uploader = BatchJSONUploader2(
    default_collection=db['humanities_keywords'],
    deletes_file = '_deletes.txt',
    deletes_collection=db['deletes_humanities'],
    filter_name_cant='no-exact-match',
    filter_name_must='',
    filter_collection=db['humanities_keywords_no_exact'])
upload_list.append(hum_zip_path_list, hum_uploader)
 
rzip_path_list = zip_scanner_excludedirs(
    source_path='/home/jovyan/data/parsed/reddit/',
    exclude_list=[''], join=True)
reddit_uploader = BatchJSONUploader2(
    default_collection=db['reddit'],
    deletes_file = '_deletes.txt',
    deletes_collection=db['deletes_reddit'],
    filter_name_cant='',
    filter_name_must='',
    filter_collection=db['deletes_reddit'])
upload_list.append(rzip_path_list, reddit_uploader)

for zip_path_list, uploader in scanner_lists:
    for zip_path in zip_path_list:
        zp = ZipProcessor(zip_path, uploader)
        zp.process()
        # print('...processed: ', zip_path)
        # zp.open()
        # x = os.listdir(zp.getdir())
        # if '_deletes.txt' in x:
        #     print(x)
        #     print()
        # zp.close()


In [None]:
print('Import comparison corpus')

client = MongoClient('mongodb://mongo/')
db = client['we1s']

comp_zip_path_list = zip_scanner_excludedirs(source_path='/home/jovyan/data/parsed/comparison-corpus/',
                                        exclude_list=[''], join=True)
comp_all_uploader = BatchJSONUploader2(
    default_collection=db['comparison-not-humanities'],
    deletes_file = '_deletes.txt',
    deletes_collection=db['deletes_comparison-not-humanities'],
    filter_name_cant='',
    filter_name_must='no-exact-match',
    filter_collection=db['comparison-not-humantiies-filter'])

comp_science_uploader = BatchJSONUploader2(
    default_collection=db['comparison-sciences'],
    deletes_file = '_deletes.txt',
    deletes_collection=db['deletes_comparison-sciences'],
    filter_name_cant='no-exact-match',
    filter_name_must='',
    filter_collection=db['comparison-sciences-filter'])

# The zips are mixed together in the list, so
# two uploaders are used based on the filename:
for zip_path in comp_zip_path_list:
    if 'humanities_' in zip_path:
        zp = ZipProcessor(zip_path, comp_all_uploader)
        zp.process()
    elif 'sciences_' in zip_path:
        zp = ZipProcessor(zip_path, comp_science_uploader)
        zp.process()
    else:
        print('...missed: ', zip_path)


## sources

In [25]:
print('Import csv to mongodb sources, aliases\n')
client=MongoClient('mongodb://mongo/')
sp = SourcesProcessor(client=MongoClient('mongodb://mongo/'),
                      file_path='../sources_master.csv',
                      source_path=['Sources','Sources'],
                      aliases_path=['Sources','config','source_aliases'])
print(sp)
sp.get_csv_put_mongo()
print('aliases:', len(sp.aliases))
print('source_docs:', len(sp.source_docs))
print('csv_parse_log errors:', len(sp.csv_parse_log))
# view errors:
# print('\n'.join(sp.csv_parse_log))

print('Load sources, aliases from mongodb\n')

sp.clear()

print('source_docs:', len(sp.source_docs))
sp.get_mongo_source_docs()
print('source_docs:', len(sp.source_docs))

print('aliases:', len(sp.aliases))
sp.get_mongo_aliases()
print('aliases:', len(sp.aliases))

Import csv to mongodb sources, aliases

SourcesProcessor(client=MongoClient(host=['mongo:27017'], document_class=dict, tz_aware=False, connect=True), file_path=../sources_master.csv, source_path=['Sources', 'Sources'], aliases_path=['Sources', 'config', 'source_aliases'])
aliases: 1644
source_docs: 1298
csv_parse_log errors: 17
Load sources, aliases from mongodb

source_docs: 0
source_docs: 1298
aliases: 0
aliases: 1644


## Article rewriting

In [52]:
print('Setup ArticleProcessor with SourcesProcessor')
client=MongoClient('mongodb://mongo/')
sp = SourcesProcessor(client=MongoClient('mongodb://mongo/'),
                      file_path='../sources_master.csv',
                      source_path=['Sources','Sources'],
                      aliases_path=['Sources','config','source_aliases'])
ap = ArticleProcessor(sp)

Setup ArticleProcessor with SourcesProcessor


In [61]:
print('Rewrite documents in memory and preview in-memory, no mongo update')

# select data
coll = client['we1s']['deletes_humanities']
docs = coll.aggregate([{ '$sample': { 'size': 4 } }])

# preview config
pop_list = ['features','language_model','content-unscrubbed']
trim_dict={'content':500, 'attachment_id': 30, 'doc_id':40, 'metapath':40, 'content-hash-ssdeep': 30, 'name':40}

# display with single table style
data = []
data.append(('before (preview)', 'after (preview)'))
for doc in docs:
    before, after = ap.json_update_previews(doc, pop_list=pop_list, trim_dict=trim_dict, width=60)
    # the full `doc` has been changed at this point, could be written.
    data.append((before,after))
ap.display_ipython_table(data) 

# # ...or display with floating table rows style
# for doc in docs:
#     before, after = ap.json_update_previews(doc, pop_list=pop_list, trim_dict=trim_dict, width=60)
#     # the full `doc` has been changed at this point, could be written.
#     ap.display_ipython_table([(before,after)]) 


Rewrite documents in memory and preview in-memory, no mongo update


0,1
before (preview),after (preview)
"{'_id': ObjectId('5d3ada4af123b8357f4c3ed5'),  'attachment_id': 'LNCDBE032A334E6199AEB6E6EFA...',  'author': 'Philip Sweeney',  'content': 'Fans of the work of Vincent Van Gogh, of '  'industrial heritage and of vintage Hollywood '  'have a treat waiting in the European Capital '  'of Culture program of Mons in Belgium this '  'spring: all three subjects in one intriguing '  'package of exhibitions, screenings and darkly '  'atmospheric topography. The atmospheric '  'destination is the Borinage, the impoverished '  'region around Mons, where long rows of grimy '  ""workers' terraces and the remains of railways ""  'and canals stretch around a landscape of low '  'hills. Thi...',  'content-hash-ssdeep': '96:tBRCddiM2SXsGf4ZSRw6ESTV...',  'copyright': 'All Rights Reserved',  'database': 'LexisNexis',  'doc_id': '02A6A252C52394AB97B14672E56C2F2F91752...',  'length': '1086',  'metapath': 'Corpus,lexisnexis,8200_8200_theindepe...',  'name': '8200_8200_theindependentunitedkingdom...',  'namespace': 'we1sv2.0',  'ppversion': '0.1',  'pub': None,  'pub_date': '2015-02-17T18:51:00Z',  'readability_scores': [45.95332522187235,  11.90957793137295,  9.994435839024987],  'section': 'EUROPE',  'sources': ['theindependentunitedkingdom'],  'title': None,  'word_count': 1009}","{'_id': ObjectId('5d3ada4af123b8357f4c3ed5'),  'api_data_provider': 'LexisNexis',  'api_software': 'we1s-collector',  'attachment_id': 'LNCDBE032A334E6199AEB6E6EFA...',  'author': 'Philip Sweeney',  'content': 'Fans of the work of Vincent Van Gogh, of '  'industrial heritage and of vintage Hollywood '  'have a treat waiting in the European Capital '  'of Culture program of Mons in Belgium this '  'spring: all three subjects in one intriguing '  'package of exhibitions, screenings and darkly '  'atmospheric topography. The atmospheric '  'destination is the Borinage, the impoverished '  'region around Mons, where long rows of grimy '  ""workers' terraces and the remains of railways ""  'and canals stretch around a landscape of low '  'hills. Thi...',  'content-hash-ssdeep': '96:tBRCddiM2SXsGf4ZSRw6ESTV...',  'copyright': 'All Rights Reserved',  'database': 'LexisNexis',  'doc_id': '02A6A252C52394AB97B14672E56C2F2F91752...',  'length': '1086',  'metapath': 'Corpus,lexisnexis,8200_8200_theindepe...',  'name': '8200_8200_theindependentunitedkingdom...',  'namespace': 'we1sv2.0',  'ppversion': '0.1',  'pub': None,  'pub_date': '2015-02-17T18:51:00Z',  'readability_scores': [45.95332522187235,  11.90957793137295,  9.994435839024987],  'section': 'EUROPE',  'source': 'the-independent-united-kingdom',  'title': None,  'word_count': 1009}"
"{'_id': ObjectId('5d3ac54af123b8357f469437'),  'attachment_id': '...',  'author': 'Gracie DiFazio',  'content': 'The CEO of Bottega Veneta, Carlo Alberto '  'Beretta, gave a lecture on Tradition, '  'Elegance and Craftsmanship: The Timeless '  'Luxury Model of Bottega Veneta in Bartley '  'Hall last Tuesday at 4:30 p. m. This lecture '  'was an event in the lecture series on Italian '  'Business entitled: The Difference Italy '  'Makes. Bottega Veneta is a luxury brand '  'produced and made in the Veneto region of '  'Italy. Their signature woven design acts in '  'place of a logo on their bags. ""When your own '  'initials are enough"" has been th...',  'content-hash-ssdeep': '96:t2MkmZkJ/OSvFYYYYYYYYYYY...',  'copyright': 'All Rights Reserved',  'database': 'LexisNexis',  'doc_id': '02A6A252C52394AB97B14672E56C2F2F37435...',  'length': '687',  'metapath': 'Corpus,lexisnexis,172244_172244_unive...',  'name': '172244_172244_universitywire_bodylibe...',  'namespace': 'we1sv2.0',  'ppversion': '0.1',  'pub': 'The Villanovan: Villanova University',  'pub_date': '2015-11-04T00:00:00Z',  'readability_scores': [47.67224534501645,  10.359008762322016,  8.427629572836803],  'section': 'NEWS; Pg. 1',  'sources': ['universitywire'],  'title': 'CEO brings Italian luxury to campus',  'word_count': 666}","{'_id': ObjectId('5d3ac54af123b8357f469437'),  'api_data_provider': 'LexisNexis',  'api_software': 'we1s-collector',  'attachment_id': '...',  'author': 'Gracie DiFazio',  'content': 'The CEO of Bottega Veneta, Carlo Alberto '  'Beretta, gave a lecture on Tradition, '  'Elegance and Craftsmanship: The Timeless '  'Luxury Model of Bottega Veneta in Bartley '  'Hall last Tuesday at 4:30 p. m. This lecture '  'was an event in the lecture series on Italian '  'Business entitled: The Difference Italy '  'Makes. Bottega Veneta is a luxury brand '  'produced and made in the Veneto region of '  'Italy. Their signature woven design acts in '  'place of a logo on their bags. ""When your own '  'initials are enough"" has been th...',  'content-hash-ssdeep': '96:t2MkmZkJ/OSvFYYYYYYYYYYY...',  'copyright': 'All Rights Reserved',  'database': 'LexisNexis',  'doc_id': '02A6A252C52394AB97B14672E56C2F2F37435...',  'length': '687',  'metapath': 'Corpus,lexisnexis,172244_172244_unive...',  'name': '172244_172244_universitywire_bodylibe...',  'namespace': 'we1sv2.0',  'ppversion': '0.1',  'pub': 'The Villanovan: Villanova University',  'pub_date': '2015-11-04T00:00:00Z',  'readability_scores': [47.67224534501645,  10.359008762322016,  8.427629572836803],  'section': 'NEWS; Pg. 1',  'source': 'the-villanovan-villanova-university',  'title': 'CEO brings Italian luxury to campus',  'word_count': 666}"
"{'_id': ObjectId('5d3ac989f123b8357f479cb8'),  'attachment_id': '...',  'author': 'By, Maria Polletta',  'content': '""We have these new schools, but we also had '  '40,000 college students before we started '  'this. Every year, we step back and evaluate '  ""whether we're effective in partnerships with ""  'them or if there[.] something else they need '  'us to do."" Bill Jabjiniak, Mesa[.] director '  'of economic development Three years after a '  'Mesa push to attract old-school liberal - '  'arts universities landed the city five new '  'branch colleges, two of them have cut their '  'losses and decided to leave town. '  'Missouri-based Westminster...',  'content-hash-ssdeep': '192:6gVBYOR58DNVBWyYYYYYYYY...',  'copyright': 'All Rights Reserved',  'database': 'LexisNexis',  'doc_id': '02A6A252C52394AB97B14672E56C2F2F9EAE1...',  'length': '1324',  'metapath': 'Corpus,lexisnexis,286699_286699_thear...',  'name': '286699_286699_thearizonarepublicphoen...',  'namespace': 'we1sv2.0',  'ppversion': '0.1',  'pub': 'The Arizona Republic (Phoenix)',  'pub_date': '2015-08-05T00:00:00Z',  'readability_scores': [44.13776035253713,  10.343478156216154,  9.564550143958822],  'section': 'SCOTTSDALE REPUBLIC 8; Pg. Z812',  'sources': ['thearizonarepublicphoenix'],  'title': 'Do branch colleges have what it takes to thrive '  'in downtown Mesa?',  'word_count': 1192}","{'_id': ObjectId('5d3ac989f123b8357f479cb8'),  'api_data_provider': 'LexisNexis',  'api_software': 'we1s-collector',  'attachment_id': '...',  'author': 'By, Maria Polletta',  'content': '""We have these new schools, but we also had '  '40,000 college students before we started '  'this. Every year, we step back and evaluate '  ""whether we're effective in partnerships with ""  'them or if there[.] something else they need '  'us to do."" Bill Jabjiniak, Mesa[.] director '  'of economic development Three years after a '  'Mesa push to attract old-school liberal - '  'arts universities landed the city five new '  'branch colleges, two of them have cut their '  'losses and decided to leave town. '  'Missouri-based Westminster...',  'content-hash-ssdeep': '192:6gVBYOR58DNVBWyYYYYYYYY...',  'copyright': 'All Rights Reserved',  'database': 'LexisNexis',  'doc_id': '02A6A252C52394AB97B14672E56C2F2F9EAE1...',  'length': '1324',  'metapath': 'Corpus,lexisnexis,286699_286699_thear...',  'name': '286699_286699_thearizonarepublicphoen...',  'namespace': 'we1sv2.0',  'ppversion': '0.1',  'pub': 'The Arizona Republic (Phoenix)',  'pub_date': '2015-08-05T00:00:00Z',  'readability_scores': [44.13776035253713,  10.343478156216154,  9.564550143958822],  'section': 'SCOTTSDALE REPUBLIC 8; Pg. Z812',  'source': 'the-arizona-republic-phoenix',  'title': 'Do branch colleges have what it takes to thrive '  'in downtown Mesa?',  'word_count': 1192}"
"{'_id': ObjectId('5d3ad901f123b8357f4be792'),  'attachment_id': 'LNCDBE032A334E6199591459EC4...',  'author': 'Gordon Rayner',  'content': 'AN MI6 spy found dead inside a padlocked '  'holdall could have been killed by someone who '  'specialised in ""the dark arts of the secret '  'services,"" a coroner was told yesterday. '  'Gareth Williams could not have locked the bag '  'from the inside, meaning a ""third party"" must '  'have done it, according to a lawyer '  'representing his family. They believe his '  'death in 2010 may have been linked to his '  'work at MI6, where he had recently qualified '  'for ""operational deployment"", and that '  'fingerprints, DNA and other e...',  'content-hash-ssdeep': '96:+WQPuPPPE4uvLW6paOXtuht/...',  'copyright': 'All Rights Reserved',  'database': 'LexisNexis',  'doc_id': '02A6A252C52394AB97B14672E56C2F2F32372...',  'length': '1068',  'metapath': 'Corpus,lexisnexis,8109_8109_thedailyt...',  'name': '8109_8109_thedailytelegraph_bodyplura...',  'namespace': 'we1sv2.0',  'ppversion': '0.1',  'pub': None,  'pub_date': '2012-03-31T00:00:00Z',  'readability_scores': [53.594220623501215,  12.361433710174719,  8.414431277834876],  'section': 'NEWS; Pg. 3',  'sources': ['thedailytelegraph'],  'title': None,  'word_count': 1120}","{'_id': ObjectId('5d3ad901f123b8357f4be792'),  'api_data_provider': 'LexisNexis',  'api_software': 'we1s-collector',  'attachment_id': 'LNCDBE032A334E6199591459EC4...',  'author': 'Gordon Rayner',  'content': 'AN MI6 spy found dead inside a padlocked '  'holdall could have been killed by someone who '  'specialised in ""the dark arts of the secret '  'services,"" a coroner was told yesterday. '  'Gareth Williams could not have locked the bag '  'from the inside, meaning a ""third party"" must '  'have done it, according to a lawyer '  'representing his family. They believe his '  'death in 2010 may have been linked to his '  'work at MI6, where he had recently qualified '  'for ""operational deployment"", and that '  'fingerprints, DNA and other e...',  'content-hash-ssdeep': '96:+WQPuPPPE4uvLW6paOXtuht/...',  'copyright': 'All Rights Reserved',  'database': 'LexisNexis',  'doc_id': '02A6A252C52394AB97B14672E56C2F2F32372...',  'length': '1068',  'metapath': 'Corpus,lexisnexis,8109_8109_thedailyt...',  'name': '8109_8109_thedailytelegraph_bodyplura...',  'namespace': 'we1sv2.0',  'ppversion': '0.1',  'pub': None,  'pub_date': '2012-03-31T00:00:00Z',  'readability_scores': [53.594220623501215,  12.361433710174719,  8.414431277834876],  'section': 'NEWS; Pg. 3',  'source': 'the-daily-telegraph',  'title': None,  'word_count': 1120}"


In [50]:
## From-mongo rewrite w/out update -- could instead rewrite before insert

# # def rewrite_docs(aliases, docs):
# for doc in docs: 
#     rewrite(doc, aliases)
#     print(doc_preview(doc, pop_list=['features','language_model','content-unscrubbed'], trim_dict={'content':500}))

# # def rewrite_mongo_coll(aliases, coll, docs):
# for doc in docs: 
#     rewrite(doc)
#     coll.replace_one({'_id':doc['_id']}, doc, upsert=True)
#     print(doc_preview(doc, pop_list=['features','language_model','content-unscrubbed'], trim_dict={'content':500})

# aliases = client['Sources']['config'].find_one({'_id':'source_aliases'})['aliases']
# coll = client['we1s']['deletes_humanities']
# docs = coll.aggregate([{ '$sample': { 'size': 2 } }])
# rewrite_mongo_coll(aliases, coll, docs)

# docs = client['we1s']['deletes_humanities'].aggregate([{ '$sample': { 'size': 2 } }])
# # docs = client['we1s']['deletes_humanities'].collection.find({})

# OUTPUT:
# rewrite: 5d3acd40f123b8357f488425
# { '_id': ObjectId('5d3acd40f123b8357f488425'),
#   'api_data_provider': 'LexisNexis',
#   'api_software': 'we1s-collector',
#   'attachment_id': 'LNCDBE032A334E6199C0814D4A55AE10E64CD13C619E192CC1',
#   'content': 'She has already penned poems in honour of the last British '
#              'soldiers to fight in the First World War, the wedding of Prince '
#              'William and Kate Middleton, the MPs expenses scandal and an '
# ...


## Reports: Sources

In [33]:
%%time
# ~1 min
# list source by article counts, descending
# NOTE: source is distinct from pub (publications)

print('Sources counts in articles collection (descending)')
client = MongoClient('mongodb://mongo/')
pipeline = [
    {'$group' : {'_id' : '$sources', 'count' : {'$sum' : 1}}},
    { '$sort' : {'count' : -1} }
]
result = client['we1s'].command('aggregate', 'humanities_keywords', pipeline=pipeline, explain=False)
for row in result['cursor']['firstBatch']:
    print(row)
    
# OUTPUT
# {'_id': ['thenewyorktimes'], 'count': 57700}
# {'_id': ['thewashingtonpost'], 'count': 37306}
# {'_id': ['thelatimes'], 'count': 34483}
# {'_id': ['chicagotribune'], 'count': 25541}
# {'_id': ['universitywire'], 'count': 24459}
# {'_id': ['theirishtimes'], 'count': 12741}
# ....

Sources counts in articles collection (descending)
{'_id': ['thenewyorktimes'], 'count': 57700}
{'_id': ['thewashingtonpost'], 'count': 37306}
{'_id': ['thelatimes'], 'count': 34483}
{'_id': ['chicagotribune'], 'count': 25541}
{'_id': ['universitywire'], 'count': 24459}
{'_id': ['theirishtimes'], 'count': 12741}
{'_id': ['newsday'], 'count': 9656}
{'_id': ['deseretmorningnewssaltlakecity'], 'count': 6496}
{'_id': ['theguardianlondon'], 'count': 6414}
{'_id': ['arkansasdemocratgazette'], 'count': 6231}
{'_id': ['mirrorcouk'], 'count': 6136}
{'_id': ['thehoustonchronicle'], 'count': 6071}
{'_id': ['tveyesbbcradio4'], 'count': 5651}
{'_id': ['tveyesbbcworld'], 'count': 5078}
{'_id': ['targetednewsservice'], 'count': 5052}
{'_id': ['globalbroadcastdatabase'], 'count': 4813}
{'_id': ['bbcmonitoringinternationalreports'], 'count': 4538}
{'_id': ['theindependentunitedkingdom'], 'count': 4291}
{'_id': ['tveyesbbcradio5live'], 'count': 3401}
{'_id': ['chronicleofhighereducation'], 'count': 3300

## Reports: Documents

Display document counts (or other collstats) across a database, or all databases on the server.

In [242]:
# %%time
# # fast

client=MongoClient('mongodb://mongo/')

print("\nCount collection documents in one database\n")
report = report_collstats(db_colls(client, 'we1s'), key='count')
print_table(report)

print("\nSize collection documents in one datanbase\n")
report = report_collstats(db_colls(client, 'we1s'), key='avgObjSize')
print_table(report)


Count collection documents in one database

count       db          coll        
752243      we1s        reddit      
508490      we1s        humanities_keywords_no_exact
12          we1s        comparison-not-humantiies-filter
418302      we1s        humanities_keywords
6607        we1s        comparison-sciences-filter
628317      we1s        comparison-sciences
111396      we1s        deletes_humanities
635495      we1s        comparison-not-humanities
47320       we1s        deletes_reddit
44114       we1s        deletes_comparison-not-humanities
66612       we1s        deletes_comparison-sciences
1           we1s        _config     

Size collection documents in one datanbase

avgObjSize  db          coll        
19788       we1s        reddit      
105391      we1s        humanities_keywords_no_exact
475810      we1s        comparison-not-humantiies-filter
137407      we1s        humanities_keywords
61186       we1s        comparison-sciences-filter
126635      we1s        compa

In [243]:
# %%time
# # fast

client=MongoClient('mongodb://mongo/')

print("\nCount collection documents in each db\n")
report = report_collstats(client_colls(client), key='count')
print_table(report)


# DEPRECATED
# print("Count documents in each db collection\n")
# mc = MongoCounter(client=MongoClient('mongodb://mongo/'))
# mc.count_docs_report()


# # DEPRECATED
# # list records in all collections
# d = dict((db, [collection for collection in client[db].list_collection_names()])
#              for db in client.list_database_names())
# for db in d:
#     for coll in d[db]:
#         # print(db, coll)
#         print((client[db].command("collstats", coll))['count'], db, coll)



Count collection documents in each db

count       db          coll        
1298        Sources     Sources     
1           admin       system.version
1           app         apps        
0           app         streams     
1           app         metadata    
240         app         deployments 
0           app         drafts      
2           auth        passwords   
10          auth        devices     
1           auth        groups      
0           auth        pushMessages
95          auth        refreshTokens
0           auth        apiKeys     
2           auth        users       
26          config      system.sessions
0           events      unordered_queue
1           events      counters    
1           events      ordered_queue
0           hosting     assets      
0           hosting     usage       
80          local       startup_log 
664         log         log         
0           metadata    settings    
1           metadata    dashboards  
1           metadata    i

In [244]:
# %%time
# # fast

client=MongoClient('mongodb://mongo/')

print("\nManipulate collection documents report as data\n")
report = report_collstats(client_colls(client), key='count', header=False)
s = sum(row[0] for row in report)
print('Documents across collections:')
print(s)


Manipulate collection documents report as data

Documents across collections:
3769681


## Reports: Fields

Summarize counts of field presence/absence in a collection, in all collections in a database,
or across all databases on the server.

Some other possibly approaches --
including server-side map-reduce and a tool valled Variety --
are discussed here:

-  https://stackoverflow.com/questions/2298870/get-names-of-all-keys-in-the-collection

In [32]:
%%time
## SLOW -- ~1 minute
print("Display distinct values for one field in a collection.")
client['we1s']['humanities_keywords'].distinct('term')

# OUTPUT:
# ['LexisNexis',
#  'chomp',
#  'Ethnic NewsWatch',
#  'Ethnic NewsWatch; GenderWatch',
#  'ProQuest',
#  'GenderWatch',
#  'Ethnic NewsWatc h',
#  'Ethnic N ewsWatch',
#  'Global Newsstream',
#  'Global Newsstrea m',
#  'Globa l Newsstream',
#  'Global Newss tream']

# client['we1s']['humanities_keywords'].distinct('term')

# OUTPUT:
# ['humanities', ' humanities', ' liberal_arts']

# OTHER POSSIBLE LOOKUPS:
#     client['we1s']['humanities_keywords'].distinct('api_data_provider')
#     client['we1s']['humanities_keywords'].distinct('api_data_provider_channel')
#     client['we1s']['humanities_keywords'].distinct('api_software')
#     client['we1s']['humanities_keywords'].distinct('database')
#     client['we1s']['humanities_keywords'].distinct('term')

# def store_suggested_lookups():
#     pass


Display distinct values for one field in a collection.
CPU times: user 41.6 ms, sys: 8.78 ms, total: 50.4 ms
Wall time: 1min 9s


In [270]:
%%time
## SLOW

print('\nlist publications by article counts, descending\n')

client=MongoClient('mongodb://mongo/')
coll_list = [(client, 'we1s', 'humanities_keywords')]
pipeline = [
    {'$group' : {'_id' : '$pub', 'count' : {'$sum' : 1}}},
    { '$sort' : {'count' : -1} }
]
result = report_aggregate(coll_list, pipeline)
for row in result:
    print(row)
# for row in result['cursor']['firstBatch']:
#     if row:
#         print(row['count'], '\t', row['_id'])

# ## DEPRECATED
# # list publications by article counts, descending
# client=MongoClient('mongodb://mongo/')
# pipeline = [
#     {'$group' : {'_id' : '$pub', 'count' : {'$sum' : 1}}},
#     { '$sort' : {'count' : -1} }
# ]
# result = client['we1s'].command('aggregate', 'humanities_keywords', pipeline=pipeline, explain=False)
# for row in result['cursor']['firstBatch']:
#     print(row['count'], '\t', row['_id'])

# OUTPUT:
# 89441 	 None
# 55396 	 The New York Times
# 30456 	 Los Angeles Times
# 25541 	 Chicago Tribune
# 23725 	 The Washington Post
# 12741 	 The Irish Times


list publications by article counts, descending

"cursor"
"ok"
CPU times: user 94 ms, sys: 29.7 ms, total: 124 ms
Wall time: 1min 19s


In [268]:
%%time
# list source by article counts, descending
# NOTE: distinct from pub (publications)
pipeline = [
    {'$group' : {'_id' : '$sources', 'count' : {'$sum' : 1}}},
    { '$sort' : {'count' : -1} }
]
result = client['we1s'].command('aggregate', 'humanities_keywords', pipeline=pipeline, explain=False)
for row in result['cursor']['firstBatch']:
    print(row)

## OUTPUT
# {'_id': ['thenewyorktimes'], 'count': 57700}
# {'_id': ['thewashingtonpost'], 'count': 37306}
# {'_id': ['thelatimes'], 'count': 34483}
# {'_id': ['chicagotribune'], 'count': 25541}
# {'_id': ['universitywire'], 'count': 24459}
# {'_id': ['theirishtimes'], 'count': 12741}
# {'_id': ['newsday'], 'count': 9656}

{'_id': ['thenewyorktimes'], 'count': 57700}
{'_id': ['thewashingtonpost'], 'count': 37306}
{'_id': ['thelatimes'], 'count': 34483}
{'_id': ['chicagotribune'], 'count': 25541}
{'_id': ['universitywire'], 'count': 24459}
{'_id': ['theirishtimes'], 'count': 12741}
{'_id': ['newsday'], 'count': 9656}
{'_id': ['deseretmorningnewssaltlakecity'], 'count': 6496}
{'_id': ['theguardianlondon'], 'count': 6414}
{'_id': ['arkansasdemocratgazette'], 'count': 6231}
{'_id': ['mirrorcouk'], 'count': 6136}
{'_id': ['thehoustonchronicle'], 'count': 6071}
{'_id': ['tveyesbbcradio4'], 'count': 5651}
{'_id': ['tveyesbbcworld'], 'count': 5078}
{'_id': ['targetednewsservice'], 'count': 5052}
{'_id': ['globalbroadcastdatabase'], 'count': 4813}
{'_id': ['bbcmonitoringinternationalreports'], 'count': 4538}
{'_id': ['theindependentunitedkingdom'], 'count': 4291}
{'_id': ['tveyesbbcradio5live'], 'count': 3401}
{'_id': ['chronicleofhighereducation'], 'count': 3300}
{'_id': ['thehartfordcourant'], 'count': 2836}
{'

In [253]:
%%time
# VERY SLOW ON LARGE COLLECTIONS

print("Find missing fields in one collection:\n \
EX: Find missing `title` from Sources.Sources\n")
cursor = MongoClient('mongodb://mongo/')['Sources']['Sources'].find({}, {'title': 1})
fcounter = FieldCounter(show_complete=True, includes=['title'])
for doc in cursor:
    fcounter.count_fields(doc)
print(fcounter.report())

## DEPRECATED: count a single field in a single collection
# hits = 0
# for doc in client['we1s']['humanities_keywords'].find({'pub_date': {'$exists': True, '$ne': []}}, {'pub_date':1}):
#     hits += 1;
# print(hits)


Find missing fields in one collection:
 EX: Find missing `title` from Sources.Sources


found     empties   missing   key       
1293      5         5         title     
1298      counted   

CPU times: user 10 ms, sys: 3.83 ms, total: 13.8 ms
Wall time: 18.7 ms


In [254]:
%%time
## VERY SLOW -- MANY MINUTES -- for e.g. we1s database, or whole server
## TEST IS ~30 seconds.
## THIS IS A KEY REPORT, BUT LOOPS COULD BE REFACTORED

print('Count fields in each db.collection\n')
mfc = MongoCounter(MongoClient('mongodb://mongo/'))
mfc.count_fields_db('we1s2018', progress=False, show_complete=False)

# DISABLED BECAUSE VERY SLOW
# print('Count fields in each db, each collection\n')
# mfc.count_fields_all(progress=False, show_complete=False) # everything, including deletes--extremely slow


Count fields in each db.collection

we1s2018.Sources(0 docs)
[no docs]

we1s2018.Corpus(548329 docs)

found     empties   missing   key       
249330    298998    298999    attachment_id
548201    128       128       content   
548328    0         1         doc_id    
548097    231       232       length    
452709    95276     95620     pub_date  
344       0         547985    pub_short 
344       0         547985    search_term
660       0         547669    site      
1         0         548328    source_id 
660       0         547669    term      
497141    51188     51188     title     
1004      0         547325    url       
548329    counted   

we1s2018.testcollection(1 docs)
[no fields empty/missing]

CPU times: user 12.2 s, sys: 6.9 s, total: 19.1 s
Wall time: 27.5 s


## Reports: Indexes

List indexes present on all collections in a database, or across all databases on the server.

In [247]:
# %%time
# # fast

client = MongoClient('mongodb://mongo/')
print("\nList indexes on an individual collection\n")
report = report_indexes(db_colls(client, 'we1s'))
for index in report:
    print_SON(index)

# # DEPRECATED
# print("List indexes on an individual collection")
# idxs = client['we1s']['deletes_humanities'].list_indexes()
# for idx in idxs:
#     print(idx)



List indexes on an individual collection

{'v': 2, 'key': {'_id': 1}, 'name': '_id_', 'ns': 'we1s.reddit'}
{'v': 2, 'key': {'_id': 1}, 'name': '_id_', 'ns': 'we1s.humanities_keywords_no_exact'}
{'v': 2, 'key': {'_id': 1}, 'name': '_id_', 'ns': 'we1s.comparison-not-humantiies-filter'}
{'v': 2, 'key': {'_id': 1}, 'name': '_id_', 'ns': 'we1s.humanities_keywords'}
{'v': 2, 'key': {'_id': 1}, 'name': '_id_', 'ns': 'we1s.comparison-sciences-filter'}
{'v': 2, 'key': {'_id': 1}, 'name': '_id_', 'ns': 'we1s.comparison-sciences'}
{'v': 2, 'key': {'_id': 1}, 'name': '_id_', 'ns': 'we1s.deletes_humanities'}
{'v': 2, 'key': {'_id': 1}, 'name': '_id_', 'ns': 'we1s.comparison-not-humanities'}
{'v': 2, 'key': {'_id': 1}, 'name': '_id_', 'ns': 'we1s.deletes_reddit'}
{'v': 2, 'key': {'_id': 1}, 'name': '_id_', 'ns': 'we1s.deletes_comparison-not-humanities'}
{'v': 2, 'key': {'_id': 1}, 'name': '_id_', 'ns': 'we1s.deletes_comparison-sciences'}
{'v': 2, 'key': {'_id': 1}, 'name': '_id_', 'ns': 'we1s._conf

In [248]:
# %%time
# # fast

print("\nList indexes on all db collections\n")

client = MongoClient('mongodb://mongo/')
report = report_indexes(client_colls(client))
for index in report:
    print_SON(index)

# # DEPRECATED
# mfc = MongoCounter(MongoClient('mongodb://mongo/'))
# mfc.list_indexes()



List indexes on all db collections

{'v': 2, 'key': {'_id': 1}, 'name': '_id_', 'ns': 'Sources.Sources'}
{'v': 2, 'key': {'_id': 1}, 'name': '_id_', 'ns': 'admin.system.version'}
{'v': 2, 'key': {'_id': 1}, 'name': '_id_', 'ns': 'app.apps'}
{'v': 2, 'unique': True, 'key': {'location': 1, 'domain_id_hash': 1, 'clientAppId': 1}, 'name': 'location_1_domain_id_hash_1_clientAppId_1', 'ns': 'app.apps'}
{'v': 2, 'key': {'groupId': 1, 'product': 1}, 'name': 'groupId_1_product_1', 'ns': 'app.apps'}
{'v': 2, 'key': {'eventSubscriptions._id': 1}, 'name': 'eventSubscriptions._id_1', 'ns': 'app.apps'}
{'v': 2, 'key': {'eventSubscriptions.type': 1, 'eventSubscriptions.state.producerState': 1, 'eventSubscriptions.state.updatedAt': 1, 'eventSubscriptions.state.group_notified': 1}, 'name': 'eventSubscriptionsTypeProducerStateUpdatedAtGroupNotified', 'ns': 'app.apps'}
{'v': 2, 'key': {'_id': 1}, 'name': '_id_', 'ns': 'app.streams'}
{'v': 2, 'key': {'servers.hostname': 1}, 'name': 'servers.hostname_1', 

## Aggregation

In [34]:
%%time
# SLOW ~1-2 mins

print("aggregate 'term' by counts, alphabetized")
client = MongoClient('mongodb://mongo/')
pipeline = [
    {'$group' : {'_id' : '$term', 'count' : {'$sum' : 1}}},
    { '$sort' : {'_id' : 1} }
]
result = client['we1s'].command('aggregate', 'humanities_keywords', pipeline=pipeline, explain=False)
for row in result['cursor']['firstBatch']:
    print(row['count'], '\t', row['_id'])

## OUTPUT
# 417733 	 None
# 42 	  humanities
# 53 	  liberal_arts
# 474 	 humanities

## note leading spaces affect alpha-sort

417733 	 None
42 	  humanities
53 	  liberal_arts
474 	 humanities
CPU times: user 63.5 ms, sys: 15.9 ms, total: 79.4 ms
Wall time: 1min 20s


In [35]:
%%time
# SLOW

print("aggregate 'pub' by counts, descending")
client = MongoClient('mongodb://mongo/')
pipeline_pub = [
    {'$group' : {'_id' : '$pub', 'count' : {'$sum' : 1}}},
    { '$sort' : {'count' : -1} }
]
result = client['we1s'].command('aggregate', 'humanities_keywords', pipeline=pipeline, explain=False)
for row in result['cursor']['firstBatch']:
    print(row['count'], '\t', row['_id'])

aggregate 'pub' by counts, descending
417733 	 None
42 	  humanities
53 	  liberal_arts
474 	 humanities
CPU times: user 66.3 ms, sys: 19.9 ms, total: 86.2 ms
Wall time: 1min 19s


In [None]:
# %%time
# # SLOW

# client = MongoClient('mongodb://mongo/')
# pipeline_pub = [
#     {'$group' : {'_id' : '$pub', 'count' : {'$sum' : 1}}},
#     { '$sort' : {'count' : -1} }
# ]
# pipeline_term = [
#     {'$group' : {'_id' : '$term', 'count' : {'$sum' : 1}}},
#     { '$sort' : {'count' : -1} }
# ]
# pipeline = pipeline_term
#
## LOOP OVER EVERYTHING
# d = dict((db, [collection for collection in client[db].list_collection_names()])
#          for db in client.list_database_names())
# for db in d:
#     for coll in d[db]:
#         result = client[db].command('aggregate', 'humanities_keywords', pipeline=pipeline, explain=False)
#         for row in result:
#             json.dumps(row, sort_keys=True, indent=2)

## Index Creation

In [255]:
# WE1S database setup
# -- create indexes
# result = client['we1s']['deletes_humanities'].create_index([('user_id', pymongo.ASCENDING)], unique=True)

# OLD CODE

In [6]:
# list all fields in all articles

# d = dict((db, [collection for collection in client[db].list_collection_names()])
#              for db in client.list_database_names())
# for db in d:
#     for coll in d[db]:
#         print(db, coll)
#         cursor = coll.find({})
#         for doc in cursor:
#             for key in doc:
#                 c[key] += 1



# fc = FieldCounter()
# fc.config('', is_empty=True)

# fc.default_rule(counter_name='empties', value=[None, ''])
# fc.default_rule(counter_name='all')
# fc.field_rule(counter_name='no_content', )



# self.datestamp = datetime.today().strftime('%Y%m%d-%H:%M:%S')
# datetime.today().strftime('%Y-%m-%d')


        
# def my_FieldCounters_across_database:
#     """Iterate across all dbs, all collections, and compute a FieldCounter
#     for each.
#
#     Deprecated. Replaced by a MongoFieldCounter class.
#     """
#
#     results = []
#     d = dict((db, [collection for collection in client[db].list_collection_names()])
#                  for db in client.list_database_names())
#     for db in d:
#         for coll in d[db]:
#             fcounter = FieldCounter(name=db+'.'+coll)
#             cursor = client[db][coll].find({})
#             for doc in cursor:
#                 if fcounter.total !=0 and fcounter.total % 5000 == 0:
#                     fcounter.report()
#                 fcounter.count_fields(doc)
#             print('\n\n', '[FINAL]', db, coll)
#             fcounter.report()
#             results.append(fcounter)
#
#     for result in results:
#         result.report()


In [None]:
# ## DEPRECATED BY FieldCounter

# # list all fields in all articles

# from collections import Counter

# field_dict = Counter()
# field_dict_empty = Counter()

# def key_report(field_dict):
#     print("".join(str(word).ljust(10) for word in ('found', 'missing', 'key')))
#     for key, value in sorted(field_dict.items()):
#         if value != hits:
#             print("".join(str(word).ljust(10) for word in (value, hits-value, key)))
#     print('---------- empty ----------')
#     for key, value in field_dict_empty.items():
#         print(value, '\t\t', key)

# cursor = client['we1s']['humanities_keywords'].find({})
# hits = 0
# for doc in cursor:
#     if hits !=0 and hits % 10000 == 0:
#         print('\n')
#         print('----------', hits ,'----------')
#         key_report(field_dict)
# #     if hits > 5000:
# #         break
#     hits += 1
#     for key, value in doc.items():
#         if value:
#             field_dict[key] += 1
#         else:
#             field_dict_empty[key] += 1
# key_report(field_dict)

# # for key, value in field_dict.items():
# #     if value != hits:
# #         print("".join(str(word).ljust(10) for word in (value, hits-value, key)))

# print('\n\n', field_dict)
# print("Done")

In [None]:
## DEPRECATED -- OLD SOURCE IMPORT CODE

#     def sources_doc_from_mongo(sources_template):
#         result = client[db][collection].find_one({'_id' : sources_template['_id']})
#         return result
        
#     def sources_doc_to_mongo(sources_template, source_name_aliases):
#         sources_template['lookup_canonical'] = source_name_aliases
        
#         db, collection = sources_template['mongo_path']
#         # result = client[db][collection].insert_one(sources_template)
#         result = client[db][collection].replace_one({'_id':'source_name_aliases'}, sources_template)
#         return result
    
        # ## SOURCES IMPORTER
        # # imports Sources / Sources
        # # ... and could import source_name_aliases to _config at the same 
        # source_name_aliases, source_docs = sources_csv_to_mongo('../sources_master.csv')
        # # print('----------')
        # # pp.pprint(source_docs)
        # for key, source_doc in source_docs.items():
        #     client['Sources']['Sources'].replace_one({'_id':source_doc['_id']}, source_doc, upsert=True)
    
        ## Old import _config source_name_aliases
        # source_name_aliases = source_name_aliases_from_csv('../sources_canonical_title_test.csv') # outdated -- now _master.csv
        # result = sources_doc_to_mongo(sources_template, source_name_aliases)
        # print(result)
        # result = sources_doc_from_mongo(sources_template)
        # # print(result)

#     def source_name_aliases_from_csv(filepath):
#         """build a dictionary of canonical source names and
#         non-normalized title lookups from a csv file.
    
#         lookups are based on either 1) query slugs in filenames or
#         2) pub fields from e.g. LexisNexis metadata--which are not
#         normalized and may contain typographic erros. The source_name_aliases
#         maps potential aliases against a canonical name, many to one.
        
#         The csv format is:
#         title,name,canonical_title,tags,tags,tags,tags,tags,tags,country,language
#         """
#         csvfile = open(filepath, 'r')
#         # first row field names: title, canonical_name
#         source_name_aliases = {}
#         reader = csv.DictReader(csvfile)
#         # print(json.dumps( [ row for row in reader ] ))
#         for row in reader:
#             key = mdbkey_encode(row['title'])
#             if key not in source_name_aliases:
#                 source_name_aliases[key] = row['name'].strip()
#             else:
#                 print("duplicate key '{0}' found: '{1}'".format(key, row['name'].strip()))
#                 # raise ValueError("duplicate key '{0}' found".format(key))
#         # pp.pprint(source_name_aliases) ## LONG PREVIEW OUTPUT!
#         return source_name_aliases


# # client['Sources']['Sources'].replace_one({'_id':source_doc['_id']}, source_doc, upsert=True)
# result = sources_doc_to_mongo(sources_template, source_name_aliases)
# print(result)  

# ## Old import _config source_name_aliases
# source_name_aliases = source_name_aliases_from_csv('../sources_canonical_title_test.csv') # outdated -- now _master.csv
# result = sources_doc_to_mongo(sources_template, source_name_aliases)
# print(result)
# result = sources_doc_from_mongo(sources_template)
# # print(result)

In [None]:
## DEPRECATED: previous working version, added to ArticleProcessor class
## displays side-by-side html views of json with pretty formatting.
#
# print('Rewrite in-memory, no mongo update')
# coll = client['we1s']['deletes_humanities']
# docs = coll.aggregate([{ '$sample': { 'size': 2 } }])
# data =[]
# pop_list = ['features','language_model','content-unscrubbed']
# trim_dict={'content':500, 'attachment_id': 30, 'doc_id':40, 'metapath':40, 'content-hash-ssdeep': 30, 'name':40}
# for doc in docs:
#     before = doc_preview(doc, pop_list=pop_list, trim_dict=trim_dict, width=60)
#     ap.json_update(doc)
#     after = doc_preview(doc, pop_list=pop_list, trim_dict=trim_dict, width=60)
#     data.append((before, after))
# from IPython.display import display, HTML
# # css_str = '<style>body{background-color:#000000}; table{width:600px !important}; td{width:200 !important};</style>'
# css_str = '<style>td{border: 1px solid black} td{vertical-align: top}</style>'
# display(HTML(
# #     css_str + '<table style="text-align:left !important; background:red"><tr>{}</tr></table>'.format(
#     css_str + '<table><tr style>{}</tr></table>'.format(
#         '</tr><tr>'.join(
#         '<td><pre>{}</pre></td>'.format('</pre></td><td><pre>'.join(str(_) for _ in row)) for row in data)
#     )), metadata=dict(isolated=True))
