# Setting up the server to access GCS

Download the google-cloud-storage package

```
conda install -c conda-forge google-cloud-storage
```

Create a service account key for the server by following the instructions in https://cloud.google.com/docs/authentication/getting-started.

In creating a service account key, the permission can be set to `Role > Project > Viewer` to give read access to all resources. **Make sure to keep the key safe at all times!**

Save the key and add upload it to the server. Use it as specified in the instruction above or in python it can be done as:

```python
import os

os.environ['GOOGLE_APPLICATION_CREDENTIALS'] = 'PATH/TO/SERVICE_KEY.json'
```

A quickstart guide can then be found here: https://googleapis.dev/python/storage/latest/index.html


In [42]:
from google.cloud import storage
import os

os.environ['GOOGLE_APPLICATION_CREDENTIALS'] = './WB Analytics-74d8961073eb.json'

bucket_name = 'wb-analytics'
# prefix = 'ipython_notebooks_python_and_R_scripts_imports_full/'
download_dir = '../data/'

In [40]:
def gcs_download_csv_json_gz(bucket_name, prefix, download_dir):
    storage_client = storage.Client()
    bucket = storage_client.get_bucket(bucket_name)
    target_dir = os.path.join(download_dir, prefix)

    assert(os.path.isdir(download_dir))
    assert(os.path.isdir(target_dir) == False)  # If the target directory exists, back it up first and delete the directory.

    os.makedirs(target_dir)
    
    blobs = bucket.list_blobs(prefix=prefix)  # This includes the prefix

    for ix, blob in enumerate(blobs):
        print(blob.name)
        if blob.name.endswith('.json.gz') or blob.name.endswith('.csv.gz'):
            print(f'Saving {ix + 1}. {blob.name}...')
            filename = blob.name[len(prefix):].replace('/', '__')
            blob.download_to_filename(target_dir + filename)

In [43]:
gcs_download_csv_json_gz(bucket_name=bucket_name, prefix='module_usage_per_repository/', download_dir=download_dir)

module_usage_per_repository/
module_usage_per_repository/module_usage_per_repository_20191120011240.csv.gz
Saving 2. module_usage_per_repository/module_usage_per_repository_20191120011240.csv.gz...
module_usage_per_repository/module_usage_per_repository_20191120233530.csv.gz
Saving 3. module_usage_per_repository/module_usage_per_repository_20191120233530.csv.gz...


In [2]:
gcs_download_csv_json_gz(bucket_name=bucket_name, prefix='ipython_notebooks_python_and_R_scripts_imports_full/', download_dir=download_dir)

Saving 2. ipython_notebooks_python_and_R_scripts_imports_full/ipython_notebooks_python_and_R_scripts_imports_full-000000000000.json.gz...
Saving 3. ipython_notebooks_python_and_R_scripts_imports_full/ipython_notebooks_python_and_R_scripts_imports_full-000000000001.json.gz...
Saving 4. ipython_notebooks_python_and_R_scripts_imports_full/ipython_notebooks_python_and_R_scripts_imports_full-000000000002.json.gz...
Saving 5. ipython_notebooks_python_and_R_scripts_imports_full/ipython_notebooks_python_and_R_scripts_imports_full-000000000003.json.gz...
Saving 6. ipython_notebooks_python_and_R_scripts_imports_full/ipython_notebooks_python_and_R_scripts_imports_full-000000000004.json.gz...
Saving 7. ipython_notebooks_python_and_R_scripts_imports_full/ipython_notebooks_python_and_R_scripts_imports_full-000000000005.json.gz...
Saving 8. ipython_notebooks_python_and_R_scripts_imports_full/ipython_notebooks_python_and_R_scripts_imports_full-000000000006.json.gz...
Saving 9. ipython_notebooks_python

In [44]:
gcs_download_csv_json_gz(bucket_name=bucket_name, prefix='stata_files_github/', download_dir=download_dir)

stata_files_github/
stata_files_github/stata_files_github_20191120233730.csv.gz
Saving 2. stata_files_github/stata_files_github_20191120233730.csv.gz...


In [68]:
import json

In [71]:
import pymongo

mongo = pymongo.MongoClient(port=27018)
db = mongo['nlp']
collection = db['github-bq']

In [70]:
print(json.dumps(collection.find_one({'languages': {'$exists': True}})))

{"_id": "00tau/skyline-addon-easyqc", "description": "Add-on script for performing easy quality control tasks within Skyline", "fork_count": 0, "insertion_date": "2019-11-24T04:36:07.963844+00:00", "languages": [{"node": {"name": "R"}}], "last_updated_date": "2019-11-24T04:36:07.963844+00:00", "license_info": "GNU General Public License v3.0", "name": "skyline-addon-easyqc", "owner": "00tau", "primary_language": "R", "py_libs": [], "r_libs": ["chron", "ggplot2", "plyr"], "readme": "# Start using easyQC for statistical process and quality control in mass spectrometry workflows\n\n## Introduction\n\nThe program `easyQC` is an external tool for statistical process and quality\ncontrol in mass spectrometry workflows that integrates nicely in the [Skyline\nTargeted Proteomics\nEnvironment](https://skyline.gs.washington.edu/labkey/project/home/software/Skyline/begin.view).\n\n## Feature list at a glance\n\n- Automatically sorts your data by date and time, and orders your observations\n  with

In [72]:
collection.count_documents({})

498610

In [81]:
collection.count_documents({})

529976

In [None]:
collection.create_index([('readme', pymongo.TEXT)], name='readme_text_idx')
collection.create_index([('description', pymongo.TEXT)], name='description_text_idx')

In [60]:
# collection.find_one({'primary_language': 'R', 'readme': {'$text': {'$search': 'poverty'}}})
collection.find({'$text': {'$search': "\"climate change\""}})