In [187]:
import json
from datetime import datetime, timezone

import langdetect
import pandas as pd
import requests

# ISO 639-1 lookup table; `get_language` reads its `code` and `name` columns.
ISO_639_1_LANGUAGE_FILE = 'ISO-639-1-language.json'
lang_code = pd.read_json(ISO_639_1_LANGUAGE_FILE)

def get_language(text=None):
    """Detect the language(s) of `text` and label each with its ISO 639-1 name.

    Args:
        text: Free text to analyse (e.g. a README). ``None`` or an empty
            string yields an empty result.

    Returns:
        A list of ``{"name": ..., "code": ...}`` dicts, one per candidate
        returned by ``langdetect.detect_langs`` and in the same order.
        ``code`` is the detected code upper-cased; ``name`` is looked up in
        the module-level `lang_code` table (``''`` when no entry exists).
        The list is empty when `text` is empty or when langdetect cannot
        extract any features from it.

    Note:
        langdetect is non-deterministic unless ``DetectorFactory.seed`` is
        set, so results for short or ambiguous text may vary between runs.
    """
    if not text:
        return []

    try:
        detected = langdetect.detect_langs(text)
    except langdetect.LangDetectException:
        # Raised when the text carries no usable features (e.g. only digits,
        # punctuation or URLs); there is simply no language to report.
        return []

    # Build the code -> name lookup once rather than re-indexing the table
    # for every detected language.
    names_by_code = lang_code.set_index('code')['name']

    langs = []
    for candidate in detected:
        code = candidate.lang
        name = names_by_code.get(code)
        if name is None:
            # langdetect reports regional variants such as "zh-cn"/"zh-tw",
            # which are not ISO 639-1 codes; fall back to the base language.
            name = names_by_code.get(code.split('-')[0], '')
        langs.append({"name": name, "code": code.upper()})

    return langs


def get_software(data):
    """Describe each language listed in a repository record as a software entry.

    Args:
        data: Repository record (dict). Reads ``languages`` (a list of dicts
            shaped like ``{"node": {"name": ...}}``), ``r_libs`` and
            ``py_libs``; every key is optional.

    Returns:
        A list with one ``{"name", "version", "library"}`` dict per language.
        ``library`` is the record's ``r_libs`` for R, ``py_libs`` for Python
        and an empty list for any other language. ``version`` is always ``""``.
    """
    entries = []
    for language in data.get('languages', []):
        language_name = language.get('node', {}).get('name', '')

        # Library lists are only available for R and Python.
        if language_name == 'R':
            libraries = data.get('r_libs', [])
        elif language_name == 'Python':
            libraries = data.get('py_libs', [])
        else:
            libraries = []

        entries.append({'name': language_name, 'version': "", 'library': libraries})

    return entries


def _keep_value(value):
    """Return True when an already-pruned value should stay in its container."""
    # Numbers are always kept so that a meaningful zero (e.g. `published=0`,
    # or a float 0.0) is not mistaken for "empty"; anything else is kept only
    # when it is truthy (non-empty string/list/dict, not None).
    return isinstance(value, (int, float)) or bool(value)


def has_content(node):
    """Recursively remove empty values from a nested dict/list structure.

    Despite its name, this returns the pruned structure, not a boolean.

    Args:
        node: A dict, list or scalar. Other container types (e.g. tuples)
            are treated as scalars and returned unchanged.

    Returns:
        A new dict/list of the same shape with every empty string, ``None``,
        empty list and empty dict removed (after pruning their own children).
        Numeric values, including zero, are always preserved. Scalars are
        returned as-is.
    """
    if isinstance(node, dict):
        pruned = {key: has_content(value) for key, value in node.items()}
        return {key: value for key, value in pruned.items() if _keep_value(value)}

    if isinstance(node, list):
        pruned = [has_content(element) for element in node]
        return [value for value in pruned if _keep_value(value)]

    # str, int, float, None, ... -- nothing to recurse into.
    return node

In [191]:
import pymongo

# Connect to MongoDB on port 27018 (pymongo's default host) and select the
# `github-bq` collection of the `nlp` database.
mongo = pymongo.MongoClient(port=27018)
db = mongo.get_database('nlp')
collection = db.get_collection('github-bq')

In [192]:
# Text index on `readme`; the `$text` queries further down rely on it.
readme_text_keys = [('readme', pymongo.TEXT)]
collection.create_index(readme_text_keys, name='readme_text_idx')
# collection.create_index([('description', pymongo.TEXT)], name='description_text_idx')

'readme_text_idx'

# Fill the script template

In [231]:
def _first_present(record, keys, default=''):
    """Return the first truthy ``record[key]`` among `keys`, else `default`."""
    for key in keys:
        value = record.get(key)
        if value:
            return value
    return default


def build_template_for_github(data, overwrite='no', published=1, repositoryid='central', cleanup=False):
    """Fill the script metadata template from one GitHub repository record.

    Args:
        data: Repository record (dict). ``_id`` (``"owner/repo"`` style
            string), ``owner``, ``repo_created_at`` and ``repo_updated_at``
            are required. ``description``, ``name``, ``readme``,
            ``homepage_url``, ``license_info``, ``languages``, ``r_libs``
            and ``py_libs`` are optional.
        overwrite: Value stored under the template's ``overwrite`` key.
        published: Value stored under the template's ``published`` key.
        repositoryid: Value stored under the template's ``repositoryid`` key.
        cleanup: When True, the template is passed through `has_content`
            so that empty placeholder fields are stripped.

    Returns:
        The populated template as a nested dict.

    Raises:
        KeyError: If one of the required keys is missing from `data`.
    """
    title_idno = data['_id'].replace('/', '_')

    # A stored null (e.g. a repository without a description) must not block
    # the fallback chain; `dict.get(key, default)` only falls back when the
    # key is absent, so the first non-empty value is picked explicitly.
    title = _first_present(data, ('description', 'name'))
    abstract = _first_present(data, ('readme', 'description', 'name'))
    language_sample = _first_present(data, ('readme', 'description', 'name'), default='english')
    homepage_url = data.get('homepage_url') or ''
    license_name = data.get('license_info') or ''

    tp_template = dict(
        repositoryid=repositoryid,
        published=published,
        overwrite=overwrite,
        doc_desc=dict(
            title="",
            idno="",
            producers=[
                dict(
                    name="GitHub Bot",
                    abbr="",
                    affiliation="",
                    role="bot"
                )
            ],
            prod_date=datetime.now().strftime('%d %B %Y'),
            version=""
        ),
        project_desc=dict(
            title_statement=dict(
                idno=title_idno,
                title=title,
                sub_title="",
                alternate_title="",
                translated_title=""
            ),
            production_date=[
                # Month and year only (no day) for the repository creation date.
                pd.to_datetime(data['repo_created_at']).strftime('%B %Y')
            ],
            geographic_units=[
                dict(
                    name="",
                    code="",
                    type=""
                )
            ],
            authoring_entity=[
                dict(
                    name=data['owner'],
                    role="owner",
                    affiliation=homepage_url,
                    abbreviation="",
                    email=""
                )
            ],
            contributors=[
                dict(
                    name="",
                    role="",
                    affiliation="",
                    abbreviation="",
                    email="",
                    url=""
                )
            ],
            curators=[
                dict(
                    name="",
                    role="",
                    affiliation="",
                    abbreviation="",
                    email="",
                    url=""
                )
            ],
            abstract=abstract,
            keywords=[
                dict(
                    name="",
                    vocabulary="",
                    uri=""
                )
            ],
            themes=[
                dict(
                    name="",
                    vocabulary="",
                    uri=""
                )
            ],
            topics=[
                dict(
                    id="",
                    name="",
                    parent_id="",
                    vocabulary="",
                    uri=""
                )
            ],
            disciplines=[
                dict(
                    name="",
                    vocabulary="",
                    uri=""
                )
            ],
            output_types=[
                dict(
                    type="",
                    description="",
                    uri="",
                    doi=""
                )
            ],
            repository_uri=[
                dict(
                    name=data['_id'],
                    type="Github",
                    uri=f"https://github.com/{data['_id']}"
                )
            ],
            project_website=[
                homepage_url
            ],
            version_statement=dict(
                version="latest",
                version_date=pd.to_datetime(data['repo_updated_at']).strftime('%d %B %Y'),
                version_resp="",
                version_notes="Latest update"
            ),
            language=get_language(language_sample),
            methods=[
                dict(
                    name="",
                    note=""
                )
            ],
            software=get_software(data),
            technology_environment="",
            technology_requirements="",
            reproduction_instructions="",
            license=[
                dict(
                    name=license_name,
                    uri=""
                )
            ],
            review_process=[
                dict(
                    submission_date="",
                    reviewer="",
                    review_status="",
                    approval_authority="",
                    approval_date=""
                )
            ],
            disclaimer="",
            confidentiality="",
            citation_requirement="",
            datasets=[
                dict(
                    name="",
                    idno="",
                    note="",
                    access_type="",
                    uri=""
                )
            ],
            sponsors=[
                dict(
                    name="",
                    abbr="",
                    role="",
                    grant_no=""
                )
            ],
            acknowledgements=[
                dict(
                    name="",
                    affiliation="",
                    role=""
                )
            ],
            related_projects=[
                dict(
                    name="",
                    uri="",
                    note=""
                )
            ],
            contacts=[
                dict(
                    name="",
                    affiliation="",
                    uri="",
                    phone=""
                )
            ],
            scripts=[
                dict(
                    file_name="",
                    title="",
                    authors=[
                        dict(
                            name="",
                            abbr="",
                            role=""
                        )
                    ],
                    date="",
                    format="",
                    software="",
                    description="",
                    methods="",
                    dependencies="",
                    instructions="",
                    source_code_repo="",
                    notes=""
                )
            ]
        )
    )

    if cleanup:
        # Drop the empty placeholder fields so only populated metadata remains.
        tp_template = has_content(tp_template)

    return tp_template

# Get data from the database

In [225]:
# Count documents that have a `readme` field and match the quoted term
# "economic" in the text index.
economic_readme_query = {
    '$text': {'$search': '"economic"'},
    'readme': {'$exists': True},
}
collection.count_documents(economic_readme_query)

759

In [235]:
%%time
keywords = ['economic', 'nutrition', 'income inequality', 'agriculture', 'climate change', 'poverty', 'fragility', 'refugee']
payloads = []
# A repository can match several keywords; remember which `_id`s were already
# converted so each one is built (and language-detected) only once.
seen_ids = set()

for kw in keywords:
    # The keyword is wrapped in double quotes inside the `$search` string.
    query = {'$text': {'$search': f'"{kw}"'}, 'readme': {'$exists': True}}
    for data in collection.find(query):
        if data['_id'] in seen_ids:
            continue
        seen_ids.add(data['_id'])
        payloads.append(build_template_for_github(data, overwrite='yes', cleanup=True))

CPU times: user 20 s, sys: 0 ns, total: 20 s
Wall time: 21.6 s


In [236]:
# Persist all generated payloads as a single JSON array.
with open('github_nada_data.json', mode='w') as output_file:
    output_file.write(json.dumps(payloads))

In [211]:
# climate_dataset = collection.find({'$text': {'$search': '"climate change"'}, 'readme': {'$exists': True}})
# poverty_dataset = collection.find({'$text': {'$search': '"poverty"'}, 'readme': {'$exists': True}})
# nutrition_dataset = collection.find({'$text': {'$search': '"nutrition"'}, 'readme': {'$exists': True}})
# refugee_dataset = collection.find({'$text': {'$search': '"refugee"'}, 'readme': {'$exists': True}})
# fragility_dataset = collection.find({'$text': {'$search': '"fragility"'}, 'readme': {'$exists': True}})
# agriculture_dataset = collection.find({'$text': {'$search': '"agriculture"'}, 'readme': {'$exists': True}})
# income_dataset = collection.find({'$text': {'$search': '"income inequality"'}, 'readme': {'$exists': True}})
# economics_dataset = collection.find({'$text': {'$search': '"economics"'}, 'readme': {'$exists': True}})

In [131]:
# data = collection.find_one({'primary_language': 'Python', '$text': {'$search': '"climate change"'}})
# data = collection.find_one({'primary_language': 'Python', '$text': {'$search': 'poverty'}})

In [189]:
# tp_template = build_template_for_github(data, cleanup=True)
# tp_template['doc_desc']

In [140]:
# template = {
#   "repositoryid": 'central',
#   "published": 1,
#   "overwrite": "yes",
#   "doc_desc": {
# #     "title": "",
#     "idno": "",
#     "producers": [
#       {
#         "name": "GitHub Bot",
#         "abbr": "",
#         "affiliation": "",
#         "role": "bot"
#       }
#     ],
#     "prod_date": datetime.now().strftime('%d %B %Y'),
#     "version": "1.0"
#   },
#   "project_desc": {
#     "title_statement": {
#       "idno": data['_id'].replace('/', '_'),
#       "title": data.get('description', data.get('name', '')),
#       "sub_title": "",
#       "alternate_title": "",
#       "translated_title": ""
#     },
#     "production_date": [
#       datetime.now().strftime('%B %Y')
#     ],
#     "authoring_entity": [
#       {
#         "name": data['owner'],
#         "role": "owner",
#         "affiliation": data.get('homepage_url', ''),
#       }
#     ],
#     "abstract": data.get('readme', data.get('description', data.get('name', ''))),

#     "repository_uri": [
#       {
#         "name": data['_id'],
#         "type": "GitHub",
#         "uri": f"https://github.com/{data['_id']}"
#       }
#     ],
#     "project_website": [
#       data.get('homepage_url', '')
#     ],
#     "version_statement": {
#       "version": "latest",
#       "version_date": pd.to_datetime(data['repo_updated_at']).strftime('%d %B %Y'),
#       "version_resp": "",
#       "version_notes": "Latest update"
#     },
#     "language": get_language(data.get('readme', data.get('description', data.get('name', 'english')))),
#     "software": [
#         {
#             "name": lang.get('node', {}).get('name', ''),
#             "version": "",
#             "library": (
#                 [] if lang.get('node', {}).get('name', '') not in ['R', 'Python'] else 
#                 data.get('r_libs', []) if lang.get('node', {}).get('name', '') == 'R' else 
#                 data.get('py_libs', []))
#         } for lang in data.get('languages', [])
#     ],
#     "license": [
#       {
#         "name": data.get('license_info', ''),
#         "uri": ""
#       }
#     ],
#   }
# }

# Post to API

In [None]:
# '<API_KEY>' is a placeholder: supply a real key before running this cell.
headers = {'X-API-KEY': '<API_KEY>'}
api_url = 'http://dev.ihsn.org/nada/index.php/api/datasets/create/script/'

# Post each generated payload under its own idno. The previous version of this
# cell referenced `template`, which is no longer defined, and read
# `project_desc` from a raw database record that has no such key.
# NOTE(review): assumes one POST per payload is the intended API usage -- confirm.
responses = []
for template in payloads:
    idno = template['project_desc']['title_statement']['idno'].replace('/', '_')
    response = requests.post(api_url + idno, headers=headers, json=template)
    responses.append(response)

In [None]:
# collection.find_one({'primary_language': 'R', '$text': {'$search': 'poverty'}})
# collection.find({'$text': {'$search': "\"climate change\""}})

In [59]:
# data = {
#   "_id": "00tau/skyline-addon-easyqc",
#   "description": "Add-on script for performing easy quality control tasks within Skyline",
#   "fork_count": 0,
#   "insertion_date": "2019-11-24T04:36:07.963844+00:00",
#   "languages": [
#     {
#       "node": {
#         "name": "R"
#       }
#     }
#   ],
#   "last_updated_date": "2019-11-24T04:36:07.963844+00:00",
#   "license_info": "GNU General Public License v3.0",
#   "name": "skyline-addon-easyqc",
#   "owner": "00tau",
#   "primary_language": "R",
#   "py_libs": [],
#   "r_libs": [
#     "chron",
#     "ggplot2",
#     "plyr"
#   ],
#   "readme": "# Start using easyQC for statistical process and quality control in mass spectrometry workflows\n\n## Introduction\n\nThe program `easyQC` is an external tool for statistical process and quality\ncontrol in mass spectrometry workflows that integrates nicely in the [Skyline\nTargeted Proteomics\nEnvironment](https://skyline.gs.washington.edu/labkey/project/home/software/Skyline/begin.view).\n\n## Feature list at a glance\n\n- Automatically sorts your data by date and time, and orders your observations\n  with the most recent on the right.  (\"What? Does this mean I don't need to\n  sort my data manually, as it is the case for some other software tools out\n  there?\", \"Yes.\")\n- Dynamically adapts to custom report templates. (See details below.)\n- Flow charts for single peptides can optionally be grouped together by their\n  common protein accession.   (See details below.)\n- Plots are generated in a nice page layout, ready for printing.\n- Observations are colour-coded by a beneficial four-colour-code.  This makes\n  it particularly easy to detect deviations from the norm.\n- Has a built in outlier detection, which provides you with useful robust\n  features.  (See details below.)\n- Plot as _many_ flow charts for as _many_ peptides as you like.\n\n## How to cite this software\n\nThe [Harvard UoB format]\n(http://lrweb.beds.ac.uk/guides/a-guide-to-referencing/cite_computer_program)\nsuggests to cite this software in the following fashion:\n\n    Möbius, T.W. and Malchow, S. (2014) easyQC: Statistical Process and Quality\n    Control in Mass Spectrometry Workflows (Version 1.0) [Computer program].\n    Available at: http://00tau.github.io/skyline-addon-easyqc/ (Accessed 03.\n    April, 2014)\n\nThank you for using (and citing) this software.\n\n## Installation using the skyline GUI\n\nSimply follow the GUI-clicking adventure by successively clicking on `Tools ->\nExternal Tools -> External Tool Store`.  
In the appearing list select (click\non) `easyQC`.  You will be promoted for the path to `Rscript`, which needs to\nbe installed on you system.\n\nWe have realised that since the introduction of \"Live Reports\" in new Versions\nof Skyline, the import of new templates might fail.  If this is the case for\nyou, make sure two switch off \"Live Reports\", restart Skyline, and try the\ninstallation again.\n\nThe underlying code-base of `easyQC` relies on the R-packages\n[ggplot2](http://ggplot2.org/), [plyr](http://plyr.had.co.nz/) and\n[chron](http://cran.r-project.org/web/packages/chron/index.html).  Fortunately,\nall these packages are hosted on [CRAN](http://cran.r-project.org/), and should\nautomatically be installed into your R-environment, when installing `easyQC` in\nSkyline.  If, for some reasons, this should not be the case for you, make sure\nthese three packages are installed in your R-environment.\n\n## Description\n\nThe software comes with an exemplary report template called `easyQC`.  We\nrecommend to just go with this template, but feel free to create your own.  The\nabsolute necessary fields your template should contain are:\n`PeptideModifiedSequence` and `PrecursorMz`.  These two fields are used as\nidentifiers for your peptides, and, thus, all other fields should uniquely be\nidentifiable by these two.  Optionally, the field `ProteinName` can be added to\nyour template.\n\nBy default, the flow charts of ten peptides are grouped together into one plot\neach.  If your report template also contains the associated protein accession\nof each peptide, namely the field `ProteinName`, then all peptides which belong\nto the same protein accession are grouped into one plot.\n\nBefore the calculation of the mean and standard deviations of each flow chart,\nthe software will do some outlier detection of your data, namely [Grubbs' test\nfor outliers](http://en.wikipedia.org/wiki/Grubbs%27_test_for_outliers) will be\napplied.  
Observations which are classified as outliers by this test are\ndiscarded in the estimation of the mean and standard deviations.   This gives\nthe estimated means and standard deviations some desirable\n[robust](http://en.wikipedia.org/wiki/Robust_statistics) features.\n\n## You can also use easyQC as a stand-alone command line program\n\nOn Linux, you simply need to add the directory in which you have cloned\n`easyQC`'s repository to your path.  Also make sure that `easyQC.r` is\nexecutable.\n\n```\n% git clone https://github.com/00tau/skyline-addon-easyqc.git\n% cd skyline-addonn-easyqc\n% chmod +x easyQC.r\n% PATH=$(pwd):$PATH\n```\n\nThe synopsis is as follows:\n\n```\neasyQC.r [OPTIONS] REPORTFILE\n```\n\nWhere `OPTIONS` is either `verbose` or noting.  For example, to produce some\nquality control plots from a file `some-report-file.csv` that has been\ngenerated by Skyline via some report template (e.g. the template `easyQC.skyr`\nshould come in mind here), run either one of the following two code lines from\nthe command line.\n\n```\n% easyQC.r some-report-file.csv\n% easyQC.r verbose some-report-file.csv\n```\n\nThis will produce a file `some-report-file.pdf` with all the plots you need.\n\nYou what to install the most recent and latest version in Skyline\n-----------------------------------------------------------------\n\nIf for some reasons, you are interested in installing the latest GitHub-version\n(or any other version of this software that is available on GitHub), the\nrepository contains a convenient Makefile that will create the necessary files\nfor the installation process for you.  Simply type:\n\n```\n% make\n```\n\nThis will create a `easyQC.zip` file which contains the needed install scripts\nfor Skyline.  Now, just follow your Skyline-GUI.\n\nAuthors\n-------\n\nThomas W. D. Möbius (Maintainer, R-programming), Sebastian Malchow (Skyline wizard)\n",
#   "repo_created_at": "2014-02-25T15:26:30Z",
#   "repo_id": "MDEwOlJlcG9zaXRvcnkxNzE3NzYxOQ==",
#   "repo_updated_at": "2014-04-04T14:56:54Z",
#   "stargazers": 0,
#   "topics": [],
#   "watchers": 1
# }

In [None]:
# collection.create_index([('readme', pymongo.TEXT)], name='readme_text_idx')
# collection.create_index([('description', pymongo.TEXT)], name='description_text_idx')

In [122]:
# template = {
#   "repositoryid": 'central',
#   "published": 1,
#   "overwrite": "yes",
#   "doc_desc": {
# #     "title": "",
#     "idno": "",
#     "producers": [
#       {
#         "name": "GitHub Bot",
#         "abbr": "",
#         "affiliation": "",
#         "role": "bot"
#       }
#     ],
#     "prod_date": datetime.now().strftime('%B %Y'),
#     "version": ""
#   },
#   "project_desc": {
#     "title_statement": {
#       "idno": data['_id'].replace('/', '_'),
#       "title": data.get('description', data.get('name', '')),
#       "sub_title": "",
#       "alternate_title": "",
#       "translated_title": ""
#     },
#     "production_date": [
#       datetime.now().strftime('%B %Y')
#     ],
# #     "geographic_units": [
# #       {
# #         "name": "",
# #         "code": "",
# #         "type": ""
# #       }
# #     ],
#     "authoring_entity": [
#       {
#         "name": data['owner'],
#         "role": "owner",
#         "affiliation": data.get('homepage_url', ''),
# #         "abbreviation": null,
# #         "email": null
#       }
#     ],
# #     "contributors": [
# #       {
# #         "name": "string",
# #         "role": "string",
# #         "affiliation": "string",
# #         "abbreviation": null,
# #         "email": null,
# #         "url": null
# #       }
# #     ],
# #     "curators": [
# #       {
# #         "name": "string",
# #         "role": "string",
# #         "affiliation": "string",
# #         "abbreviation": null,
# #         "email": null,
# #         "url": null
# #       }
# #     ],
#     "abstract": data.get('readme', data.get('description', data.get('name', ''))),
# #     "keywords": [
# #       {
# #         "name": "string",
# #         "vocabulary": "string",
# #         "uri": "string"
# #       }
# #     ],
# #     "themes": [
# #       {
# #         "name": "string",
# #         "vocabulary": "string",
# #         "uri": "string"
# #       }
# #     ],
# #     "topics": [
# #       {
# #         "id": "string",
# #         "name": "string",
# #         "parent_id": "string",
# #         "vocabulary": "string",
# #         "uri": "string"
# #       }
# #     ],
# #     "disciplines": [
# #       {
# #         "name": "string",
# #         "vocabulary": "string",
# #         "uri": "string"
# #       }
# #     ],
# #     "output_types": [
# #       {
# #         "type": "string",
# #         "description": "string",
# #         "uri": "string",
# #         "doi": "string"
# #       }
# #     ],
#     "repository_uri": [
#       {
#         "name": data['_id'],
#         "type": "GitHub",
#         "uri": f"https://github.com/{data['_id']}"
#       }
#     ],
#     "project_website": [
#       data.get('homepage_url', '')
#     ],
#     "version_statement": {
#       "version": "latest",
#       "version_date": pd.to_datetime(data['repo_updated_at']).strftime('%d %B %Y'),
#       "version_resp": "",
#       "version_notes": "Latest update"
#     },
#     "language": get_language(data.get('readme', data.get('description', data.get('name', 'english')))),
# #     "methods": [
# #       {
# #         "name": "string",
# #         "note": "string"
# #       }
# #     ],
#     "software": [
#         {
#             "name": lang.get('node', {}).get('name', ''),
#             "version": "",
#             "library": (
#                 [] if lang.get('node', {}).get('name', '') not in ['R', 'Python'] else 
#                 data.get('r_libs', []) if lang.get('node', {}).get('name', '') == 'R' else 
#                 data.get('py_libs', []))
#         } for lang in data.get('languages', [])
#     ],
# #     "technology_environment": "string",
# #     "technology_requirements": "string",
# #     "reproduction_instructions": "string",
#     "license": [
#       {
#         "name": data.get('license_info', ''),
#         "uri": ""
#       }
#     ],
# #     "review_process": [
# #       {
# #         "submission_date": "string",
# #         "reviewer": "string",
# #         "review_status": "string",
# #         "approval_authority": "string",
# #         "approval_date": "string"
# #       }
# #     ],
# #     "disclaimer": "string",
# #     "confidentiality": "string",
# #     "citation_requirement": "string",
# #     "datasets": [
# #       {
# #         "name": "string",
# #         "idno": "string",
# #         "note": "string",
# #         "access_type": "string",
# #         "uri": "string"
# #       }
# #     ],
# #     "sponsors": [
# #       {
# #         "name": "string",
# #         "abbr": "string",
# #         "role": "string",
# #         "grant_no": "string"
# #       }
# #     ],
# #     "acknowledgements": [
# #       {
# #         "name": "string",
# #         "affiliation": "string",
# #         "role": "string"
# #       }
# #     ],
# #     "related_projects": [
# #       {
# #         "name": "string",
# #         "uri": "string",
# #         "note": "string"
# #       }
# #     ],
# #     "contacts": [
# #       {
# #         "name": "string",
# #         "affiliation": "string",
# #         "uri": "string",
# #         "phone": "string"
# #       }
# #     ],
# #     "scripts": [
# #       {
# #         "file_name": "string",
# #         "title": "string",
# #         "authors": [
# #           {
# #             "name": "string",
# #             "abbr": "string",
# #             "role": "string"
# #           }
# #         ],
# #         "date": "string",
# #         "format": "string",
# #         "software": "string",
# #         "description": "string",
# #         "methods": "string",
# #         "dependencies": "string",
# #         "instructions": "string",
# #         "source_code_repo": "string",
# #         "notes": "string"
# #       }
# #     ]
#   }
# }