In [19]:
import pandas as pd
from pymongo import MongoClient

In [20]:
def mongodb_find(collection, query_filter=None, projection=None, *,
                 uri="mongodb://localhost:27017/", db_name="academicworld"):
    """Run a ``find`` query against a MongoDB collection and return a DataFrame.

    Parameters
    ----------
    collection : str
        Name of the collection to query.
    query_filter : dict, optional
        MongoDB filter document. ``None`` (the default) matches every document.
    projection : dict or list, optional
        Fields to include or exclude. ``None`` or an empty value returns
        every field.
    uri : str, keyword-only
        MongoDB connection string.
    db_name : str, keyword-only
        Name of the database that holds ``collection``.

    Returns
    -------
    pandas.DataFrame
        One row per matching document; an empty frame when nothing matches.
    """
    # None defaults instead of ``{}`` so no mutable dict is shared between calls.
    if query_filter is None:
        query_filter = {}
    # Treat an empty projection as "no projection": older PyMongo releases
    # rewrote an empty projection to {"_id": 1}, which would silently drop
    # every other field.
    projection = projection or None

    # The context manager closes the client even when the query raises.
    with MongoClient(uri) as client:
        # Materialise the cursor while the connection is still open.
        documents = list(client[db_name][collection].find(query_filter, projection))
    return pd.DataFrame(documents)

In [21]:
# Look up one faculty member by exact name, returning only the contact fields.
query_filter = {'name': 'Agouris,Peggy'}
projection = {'_id': 0, 'name': 1, 'email': 1, 'phone': 1}  # 0 hides _id, 1 includes the field

# Run the query; ending the cell on a bare expression (instead of print)
# lets the notebook render the frame with its rich table display.
df = mongodb_find("faculty", query_filter, projection)
df.head()

            name email phone
0  Agouris,Peggy  None      


In [30]:
# Fetch the complete document (no projection) for a single faculty member.
query = {'name': 'Agouris,Peggy'}
matches = mongodb_find("faculty", query_filter=query)
if matches.empty:
    # Fail with a message that names the query instead of pandas'
    # generic "positional indexer is out-of-bounds".
    raise IndexError(f"No faculty document matches {query!r}")
df = matches.iloc[0]  # first matching document, as a Series of its fields
df

_id                                          666fc5fd5c96e15153dc04cf
id                                                                  0
name                                                    Agouris,Peggy
position                                                      Provost
researchInterest                                                 None
email                                                            None
phone                                                                
affiliation         {'id': 0, 'name': 'College of William Mary', '...
photoUrl            https://www.wm.edu/news/images/2019/content/ag...
keywords            [{'id': 174, 'name': 'ontologies', 'score': 44...
publications        [9379453, 38697786, 45886936, 72949384, 128007...
Name: 0, dtype: object

In [22]:
def mongodb_aggregate(collection, pipeline, *,
                      uri="mongodb://localhost:27017/", db_name="academicworld"):
    """Run an aggregation pipeline on a MongoDB collection and return a DataFrame.

    Parameters
    ----------
    collection : str
        Name of the collection to aggregate over.
    pipeline : list of dict
        Aggregation stages, passed unchanged to ``Collection.aggregate``.
    uri : str, keyword-only
        MongoDB connection string.
    db_name : str, keyword-only
        Name of the database that holds ``collection``.

    Returns
    -------
    pandas.DataFrame
        One row per document produced by the pipeline; an empty frame when
        the pipeline yields nothing.
    """
    # The context manager closes the client even when the aggregation raises.
    with MongoClient(uri) as client:
        # Materialise the cursor while the connection is still open.
        documents = list(client[db_name][collection].aggregate(pipeline))
    return pd.DataFrame(documents)

In [24]:
# First ten faculty members (alphabetical by name) at one affiliation,
# keeping only the contact fields.
pipeline = [
    # Exact string match: the value has to mirror the stored affiliation
    # name character for character, including its lowercase "illinois".
    {"$match": {"affiliation.name": "University of illinois at Urbana Champaign"}},
    {"$project": {"_id": 0, "name": 1, "email": 1, "phone": 1}},
    {"$sort": {"name": 1}},
    {"$limit": 10}
]

# Perform the aggregation query
faculty_data = mongodb_aggregate("faculty", pipeline)

# End on a bare expression (instead of print) for the notebook's rich table display.
faculty_data

                              name email phone
0               Abdussalam Alawini  None  None
1               Abigail Wooldridge  None  None
2                       Adam Bates  None  None
3                      Alex Kirlik  None  None
4                Alexander Schwing  None  None
5  Andreas Paul Eberhard Kloeckner  None  None
6                    Andrew Miller  None  None
7                  Benjamin Cosman  None  None
8               Bertram Ludaescher  None  None
9                            Bo Li  None  None


In [36]:
# Ten most frequent keywords among publications from 2011.
pipeline = [
    # Filter on year first: it is a top-level field, so matching before
    # $unwind gives the same result while only expanding the keywords of
    # the publications that are actually kept.
    {"$match": {"year": 2011}},
    {"$unwind": "$keywords"},
    # One output row per keyword name, counting the publications that carry it.
    {"$group": {"_id": "$keywords.name", "pub_cnt": {"$sum": 1}}},
    {"$sort": {"pub_cnt": -1}},
    {"$limit": 10}
]

# Perform the aggregation query
publications_data = mongodb_aggregate("publications", pipeline)
publications_data

Unnamed: 0,_id,pub_cnt
0,algorithms,392
1,internet,365
2,users,364
3,queries,290
4,research,289
5,learning,274
6,routing,267
7,attacks,259
8,execution,252
9,genome,251


In [41]:
# Overall range of publication years across the whole collection.
pipeline = [{
    "$group": {
        "_id": None,  # a constant _id collapses every document into one group
        "min_year": {"$min": "$year"},
        "max_year": {"$max": "$year"},
    }
}]

result = mongodb_aggregate("publications", pipeline)
# The single summary row sits at index label 0.
min_year = result.at[0, "min_year"]
max_year = result.at[0, "max_year"]
min_year, max_year

(0, 2021)

In [52]:
# Years below this cutoff are excluded from the range.
# NOTE(review): the unfiltered minimum shown earlier in this notebook is 0,
# which looks like a placeholder for an unknown year - confirm.
MIN_VALID_YEAR = 1900

# One row per distinct publication year, ascending.
pipeline = [
    {"$group": {"_id": "$year"}},
    {"$sort": {"_id": 1}}
]

years_raw = mongodb_aggregate("publications", pipeline)
years = (
    years_raw
    # Coerce to numbers; a missing or non-numeric year becomes NaN ...
    .assign(_id=pd.to_numeric(years_raw["_id"], errors="coerce"))
    # ... and is dropped here, so the integer cast below cannot fail on it.
    .dropna(subset=["_id"])
    .astype({"_id": int})
    .loc[lambda frame: frame["_id"] >= MIN_VALID_YEAR]
)
min_year, max_year = years["_id"].min(), years["_id"].max()
min_year, max_year

(1903, 2021)