In [1]:
from pymongo import MongoClient

# Connect to the MongoDB server on localhost:27017 and select the
# "final_orcid_database" database used by the cells below.
client = MongoClient(host="localhost", port=27017)
db = client["final_orcid_database"]

In [3]:
# Find the put codes that occur more than once in the "works" collection.
# Each result has the form {"_id": <put_code>, "count": <number of documents sharing it>}.

count_per_put_code = {"$group": {"_id": "$put_code", "count": {"$sum": 1}}}
only_repeated = {"$match": {"count": {"$gt": 1}}}

results = db.works.aggregate([count_per_put_code, only_repeated])

In [7]:
# Find the orcids that occur more than once in the "authors" collection.
# Each result has the form {"_id": <orcid>, "count": <number of documents sharing it>}.

count_per_orcid = {"$group": {"_id": "$orcid", "count": {"$sum": 1}}}
only_repeated = {"$match": {"count": {"$gt": 1}}}

results = db.authors.aggregate([count_per_orcid, only_repeated])

In [2]:
# Deduplicate "works" by put code: keep one document per distinct put code and
# write the surviving documents to the "works_without_duplicates" collection.

# Group by put code, remembering the first full document seen in each group.
keep_one_per_put_code = {"$group": {"_id": "$put_code", "doc": {"$first": "$$ROOT"}}}
# Replace each group record with the document it remembered.
restore_document = {"$replaceWith": "$doc"}
# Store the pipeline output in a separate collection; "works" itself is not modified.
write_output = {"$out": "works_without_duplicates"}

db.works.aggregate([keep_one_per_put_code, restore_document, write_output])

<pymongo.command_cursor.CommandCursor at 0x7fcf7f385c50>

In [5]:
# Remove duplicates from the "authors" collection and put the results in the "authors_without_duplicates" collection.
#
# The grouping key is "$orcid", the same field used by the duplicate check and by
# the index created on "authors". Grouping on a field that the documents do not
# have puts every document into a single group, which would leave only one
# author in the output collection.

db.authors.aggregate([
    {
        # One group per distinct orcid; remember the first full document of each group.
        "$group": {
            "_id": "$orcid",
            "doc": { "$first": "$$ROOT" },
        }
    },
    {
        # Replace each group record with the document it remembered.
        "$replaceWith": "$doc"
    },
    {
        # Write the surviving documents to a separate collection.
        "$out": "authors_without_duplicates"
    }
])

<pymongo.command_cursor.CommandCursor at 0x7fcf7f383910>

In [6]:
# NOTE: the client is intentionally not closed at this point. The cells below
# still use `db`, which is bound to this client, and PyMongo 4+ refuses to run
# operations on a MongoClient after close(). The client is closed once the work
# on final_orcid_database (drop, rename, index creation) is finished.

In [6]:
# Drop the original "works" collection so that its name is free for the
# deduplicated copy.

db["works"].drop()

In [14]:
# Give the deduplicated collection "works_without_duplicates" the name "works".

db["works_without_duplicates"].rename("works")

{'ok': 1.0}

In [15]:
# Rename the "authors_without_duplicates" collection into "authors".
#
# No cell in this notebook drops the original "authors" collection first, so the
# rename is asked to replace it (dropTarget=True). Without that option the server
# rejects the rename when a collection named "authors" already exists.

db.authors_without_duplicates.rename("authors", dropTarget=True)

{'ok': 1.0}

In [16]:
# Build single-field ascending indexes on the "works" collection.
# A bare field name passed to create_index() is indexed in ascending order.

works_collection = db["works"]
works_collection.create_index("title")
works_collection.create_index("publication_year")
works_collection.create_index("put_code")

'put_code_1'

In [17]:
# Build single-field ascending indexes on the "authors" collection.
# A bare field name passed to create_index() is indexed in ascending order.

authors_collection = db["authors"]
authors_collection.create_index("given_names")
authors_collection.create_index("family_name")
authors_collection.create_index("orcid")

'orcid_1'

In [2]:
# Work on this client's database is finished: close the client.
# The next cell creates a new MongoClient for the "orcid" database.
client.close()

In [6]:
from pymongo import MongoClient

# Open a new client for the MongoDB server on localhost:27017 and select the
# "orcid" database.
client = MongoClient(host="localhost", port=27017)
db = client["orcid"]

In [7]:
# Build single-field ascending indexes on the "authors" collection of the "orcid"
# database (the database with all the information extracted from the Summaries file).
# NOTE(review): the name fields indexed here contain a space ("given names",
# "family name"), unlike the "given_names" / "family_name" fields indexed in
# final_orcid_database. Confirm these match the field names stored in this database.

authors_collection = db["authors"]
authors_collection.create_index("given names")
authors_collection.create_index("family name")
authors_collection.create_index("orcid")

'orcid_1'