In [None]:
# Enable IPython's autoreload extension in mode 2 so that edits to imported
# local modules (e.g. mongo_common, data_loader) are picked up without
# restarting the kernel.
%load_ext autoreload
%autoreload 2

# MongoDB connection prototyping

In [None]:
# imports and config
import pandas as pd
from pymongo import UpdateOne
import numpy as np

# siblings
from mongo_common import print_collection_sizes, get_mongo_client
import data_loader

In [None]:
# Create the client.
# Connection settings for a local MongoDB instance; user/pass/auth_source are
# left empty here (no credentials are supplied).
local_connect_dict = {
    "host":"localhost",
    "port":27017,
    "user":'',
    "pass":'',
    "auth_source":''
}
# NOTE(review): tlsAllowInvalidCertificates=True is passed together with
# tls_flag=False; presumably it has no effect without TLS — confirm against
# get_mongo_client before reusing this call for a non-local server.
mongo_client = get_mongo_client(local_connect_dict, tls_flag=False, tlsAllowInvalidCertificates=True)

# Test the client by reading out the current database names.
# The list is fetched once and reused so that the printed names and the
# existence check below are based on the same server response (and only one
# round trip is made).
existing_db_names = mongo_client.list_database_names()
print(existing_db_names)

# The database handle is needed in both branches, so take it up front.
pv_db = mongo_client['pv_db']
if 'pv_db' in existing_db_names:
    print("Database already exists")
    print_collection_sizes(pv_db)
else:
    print('Creating pv_db database')
    pv_db.create_collection('projects')
    pv_db.create_collection('users')

In [None]:
# Build the data loader against the local pv_db database.
# The three trailing names are presumably the users / projects / project-data
# collection names — TODO confirm against MongoDataLoader's signature.
pv_dl = data_loader.MongoDataLoader(local_connect_dict, 'pv_db','users','projects','project_data' )
# Display whatever the loader returns for user data as the cell output.
pv_dl.getUserData()

In [None]:
# Scratch check: an empty string is falsy, so this prints 'nay'.
project_name = ''
print('yay' if project_name else 'nay')

In [None]:
# Collect the distinct project names returned by the loader.
pv_projects = pv_dl.getProjects()
{project['project_name'] for project in pv_projects}

In [None]:
# Call getProjects with an explicit project_name filter.
# Try project_name='' or project_name='cncr_hist_mc_demo'.
pv_dl.getProjects(project_name='cncr_hist_mc_demo')

In [None]:
# Load the movie_reviews_demo project data with not_annotated_only=True and
# report how many records came back.
prj_data = pv_dl.getProjectData(project_name='movie_reviews_demo', not_annotated_only=True)
print(len(prj_data))

In [None]:
# Same query with not_annotated_only=False. This rebinds prj_data, so the
# cells below operate on this result rather than the previous one.
prj_data = pv_dl.getProjectData(project_name='movie_reviews_demo', not_annotated_only=False)
print(len(prj_data))

In [None]:
# Preview only the first few records rather than dumping the entire result
# set into the notebook output.
prj_data[:5]

In [None]:
# Smoke test: call saveAnnot on the first record with a throwaway label payload.
pv_dl.saveAnnot(prj_data[0],{"label":"testing"})

In [None]:
# Smoke test: call deleteAnnot on the same first record used for saveAnnot.
pv_dl.deleteAnnot(prj_data[0])

In [None]:
# Repeat the delete for the same record — presumably to check that deleting
# an annotation a second time is harmless (TODO confirm intent).
pv_dl.deleteAnnot(prj_data[0])

## Loading/reformatting Example data into JSON

In [None]:
# Project-level tables: class labels per project, users per project, and the
# project definitions themselves.
proj_classes_df = pd.read_csv('../example_database/project_classes.csv')
proj_users = pd.read_csv('../example_database/project_users.csv')
proj_df = pd.read_csv('../example_database/projects.csv')

# Project data - the documents.
proj_data_df = pd.read_csv('../example_database/project_data.csv')
# User information.
user_df = pd.read_csv('../example_database/user_details.csv')

# These data WON'T be loaded into the example database - this structure is
# considered completely legacy. It is read here for inspection only.
annot_df = pd.read_csv('../example_database/annotation_events.csv')

### Merging project data together

In [None]:
# Collapse the one-row-per-(project, user) table into one row per project
# holding the list of its user names.
proj_users_grp = (
    proj_users
    .groupby('project_name')
    .agg(user_list=('user_name', list))
    .reset_index()
)

# Same treatment for the class labels of each project.
proj_classes_grp = (
    proj_classes_df
    .groupby('project_name')
    .agg(class_list=('class', list))
    .reset_index()
)

# Left joins keep every row of proj_df even when a project has no matching
# users or classes (those columns are then NaN).
proj_final_df = (
    proj_df
    .merge(proj_users_grp, how='left', on='project_name')
    .merge(proj_classes_grp, how='left', on='project_name')
)

proj_final_df

In [None]:
# Build the upsert operations for the `projects` collection. Nothing is
# written here; the operations are executed by the bulk_write cell below.
proj_dict_list = proj_final_df.to_dict(orient='records')

# Replace values that should not be stored as-is with Mongo-friendly ones.
# NOTE: this loop is repeated for every collection in this notebook; it is a
# good candidate for a shared helper in a module.
for one_obj in proj_dict_list:
    # Only values of existing keys are reassigned, so mutating the dict while
    # iterating over items() is safe (its size never changes).
    for k, v in one_obj.items():
        if isinstance(v, np.ndarray):
            # Store numpy arrays as plain Python lists.
            one_obj[k] = v.tolist()
        elif pd.api.types.is_scalar(v) and pd.isna(v):
            # Missing scalars (NaN, NaT, pd.NA, None) become None. This uses
            # the public pandas API instead of the private NaTType class;
            # the is_scalar guard keeps list values away from pd.isna.
            one_obj[k] = None

# One upsert per project, keyed on the project name so re-running the
# notebook updates the existing documents instead of duplicating them.
proj_upserts = [UpdateOne({"_id": str(x.get('project_name'))},
                    {"$set": x},
                    upsert=True) for x in proj_dict_list]

In [None]:
# Execute the project upserts against the `projects` collection and display
# the raw result dictionary reported by the bulk API.
bulk_res = pv_db['projects'].bulk_write(proj_upserts)
bulk_api_result_dict = bulk_res.bulk_api_result
bulk_api_result_dict

### Project Data

In [None]:
# Derive the document key as "<project_name>_<example_id>". This adds the
# `_id` column to proj_data_df in place.
proj_data_df["_id"] = proj_data_df['project_name'] + '_' + proj_data_df['example_id'].astype(str)
proj_data_df

In [None]:
# Enforce key uniqueness instead of only displaying it: the upserts built
# from this frame are keyed on `_id`, so duplicate ids would overwrite each
# other in the collection.
assert proj_data_df['_id'].is_unique, "duplicate _id values in proj_data_df"

In [None]:
# Build the upsert operations for the `project_data` collection. Nothing is
# written here; the operations are executed by the bulk_write cell below.
proj_data_dict_list = proj_data_df.to_dict(orient='records')

# Replace values that should not be stored as-is with Mongo-friendly ones.
# NOTE: this loop is repeated for every collection in this notebook; it is a
# good candidate for a shared helper in a module.
for one_obj in proj_data_dict_list:
    # Only values of existing keys are reassigned, so mutating the dict while
    # iterating over items() is safe (its size never changes).
    for k, v in one_obj.items():
        if isinstance(v, np.ndarray):
            # Store numpy arrays as plain Python lists.
            one_obj[k] = v.tolist()
        elif pd.api.types.is_scalar(v) and pd.isna(v):
            # Missing scalars (NaN, NaT, pd.NA, None) become None. This uses
            # the public pandas API instead of the private NaTType class;
            # the is_scalar guard keeps list values away from pd.isna.
            one_obj[k] = None

# One upsert per document, keyed on the `_id` column of each record.
proj_data_upserts = [UpdateOne({"_id": str(x.get('_id'))},
                    {"$set": x},
                    upsert=True) for x in proj_data_dict_list]

In [None]:
# Execute the document upserts against the `project_data` collection and
# display the raw result dictionary reported by the bulk API.
bulk_res = pv_db['project_data'].bulk_write(proj_data_upserts)
bulk_api_result_dict = bulk_res.bulk_api_result
bulk_api_result_dict

### User Data

In [None]:
# Use the user name as the document key. This adds the `_id` column to
# user_df in place.
user_df["_id"] = user_df['user_name']
user_df

In [None]:
# Build the upsert operations for the `users` collection. Nothing is written
# here; the operations are executed by the bulk_write cell below.
user_dict_list = user_df.to_dict(orient='records')

# Replace values that should not be stored as-is with Mongo-friendly ones.
# NOTE: this loop is repeated for every collection in this notebook; it is a
# good candidate for a shared helper in a module.
for one_obj in user_dict_list:
    # Only values of existing keys are reassigned, so mutating the dict while
    # iterating over items() is safe (its size never changes).
    for k, v in one_obj.items():
        if isinstance(v, np.ndarray):
            # Store numpy arrays as plain Python lists.
            one_obj[k] = v.tolist()
        elif pd.api.types.is_scalar(v) and pd.isna(v):
            # Missing scalars (NaN, NaT, pd.NA, None) become None. This uses
            # the public pandas API instead of the private NaTType class;
            # the is_scalar guard keeps list values away from pd.isna.
            one_obj[k] = None

# One upsert per user, keyed on the `_id` column of each record.
user_upserts = [UpdateOne({"_id": str(x.get('_id'))},
                    {"$set": x},
                    upsert=True) for x in user_dict_list]

# Show only how many operations were built; echoing the operations themselves
# would print every user record into the saved notebook output.
len(user_upserts)

In [None]:
# Execute the user upserts against the `users` collection and display the
# raw result dictionary reported by the bulk API.
bulk_res = pv_db['users'].bulk_write(user_upserts)
bulk_api_result_dict = bulk_res.bulk_api_result
bulk_api_result_dict

## Annotation Data

> NOTE! In the Mongo version of this project, there will not be a separate annotation collection. We will simply update the original documents in project_data with additional k:v pairs resulting from the annotation process.

In [None]:
# Legacy annotation events: preview the first rows only rather than
# rendering the whole frame.
annot_df.head()