# Google Drive Cleanup (incomplete)
### Nicolas Chan, 10/17/2017
Looks through Google Drive results folder and finds consecutive ranges of completed results. Combines these results into one folder of a larger range. This also identifies ranges that have not been completed.

In [13]:
# Configuration
results_folder_id = '0B1297pLT9WXLV29iNmlWcTlvakk'
temp_folder = '/global/scratch/nicolaschan/tmp/'

In [18]:
# Copied from AdamAndersonFindSumerianWorkflow

#Make Directories if they do not exist
import os
import sys
import errno

def dir_create(path):
    try:
        os.makedirs(path)
    except OSError as exception:
        if exception.errno != errno.EEXIST:
            raise
        else:
            print('Folder at: ' + path + ' already exists. Skipping...')

# bDrive Authorization
import os
import codecs
import httplib2
from googleapiclient.http import MediaIoBaseDownload
from googleapiclient.http import MediaFileUpload
from apiclient import discovery, errors
from oauth2client import client
from oauth2client import tools
from oauth2client.file import Storage

SCOPES = 'https://www.googleapis.com/auth/drive'
CLIENT_SECRET_FILE = 'client_secret.json'
APPLICATION_NAME = 'gDriveConnect'

import argparse
parser = argparse.ArgumentParser(parents=[tools.argparser])
parser.add_argument('-f', help=argparse.SUPPRESS)

flags = parser.parse_known_args()[0]
flags.noauth_local_webserver = True

def get_credentials():
    
    home_dir = os.path.expanduser('~')
    credential_dir = os.path.join(home_dir, '.credentials')
    if not os.path.exists(credential_dir):
        os.makedirs(credential_dir)
    credential_path = os.path.join(credential_dir, 'gDriveConnect.json')
    
    store = Storage(credential_path)    
    credentials = store.get()
    
    if not credentials or credentials.invalid:
        flow = client.flow_from_clientsecrets(CLIENT_SECRET_FILE, SCOPES)
        flow.user_agent = APPLICATION_NAME
        if flags:
            credentials = tools.run_flow(flow, store, flags)
        else: # Needed only for compatibility with Python 2.6
            credentials = tools.run(flow, store)
        print('Storing credentials to ' + credential_path)
        
    return credentials

credentials = get_credentials()
print('credential_path:', credentials)
http = credentials.authorize(httplib2.Http())
service = discovery.build('drive', 'v3', http=http)

def get_drive_contents(folder_id):
    contents = []
    query="'" + folder_id + "' in parents and trashed=false"
    
    # This implementation is copied from below (commented out)
    page_token = None
    while True:
        response = service.files().list(q=query,
                                             spaces='drive',
                                             fields='nextPageToken, files(id, name)',
                                             pageToken=page_token).execute()
        for file in response.get('files', []):
            # Process change
            # print('Found file: %s (%s)' % (file.get('name'), file.get('id')) )
            tup = (file.get('name'), file.get('id'))
            contents.append(tup)
        page_token = response.get('nextPageToken', None)
        if page_token is None:
            break;
    return contents

import io
def download_file(google_id, destination):
    """Downloads a file from Google Drive"""
    
    request = service.files().get_media(fileId=google_id)
    fh = io.FileIO(destination, mode='wb')
    downloader = MediaIoBaseDownload(fh, request, chunksize=1024*1024)
    
    done = False
    while done is False:
        status, done = downloader.next_chunk()
        
    
# Added by Nicolas Chan, 10/12/2017
def create_drive_folder(name, parent=None):
    "Create a new Google Drive folder, nested under parent"
    
    body = { 'name': name, 'mimeType': 'application/vnd.google-apps.folder' }
    if parent:
        body['parents'] = [parent]
    return service.files().create(body = body).execute()

credential_path: <oauth2client.client.OAuth2Credentials object at 0x2af5b032e630>


In [19]:
def folder_finished(contents):
    """Returns Google IDs of bighitlist.txt and smallhitlist.txt, if they exist"""
    bighitlist = None
    smallhitlist = None
    for file in contents:
        if file[0] == 'bighitlist.txt':
            bighitlist = file[1]
        if file[0] == 'smallhitlist.txt':
            smallhitlist = file[1]
        if bighitlist and smallhitlist:
            return bighitlist, smallhitlist # short circuits for efficiency
    return None, None

In [None]:
results_folders = get_drive_contents(results_folder_id)
results_folders = sorted(results_folders, key=lambda folder: int(folder[0].split('_')[1].split('-')[0]))
# print(results_folders)

consecutive_completed_ranges = [[]]
for results_folder in results_folders:
    contents = get_drive_contents(results_folder[1])
    bighitlist, smallhitlist = folder_finished(contents)
    
    if bighitlist and smallhitlist:
        consecutive_completed_ranges[-1].append({
                'results_folder': results_folder,
                'smallhits': smallhitlist,
                'bighits': bighitlist
            })
    elif len(consecutive_completed_ranges[-1]) > 0:
        consecutive_completed_ranges.append([])
    
    print(results_folder[0], ':', bool(bighitlist and smallhitlist))

# Remove ranges of length 1
consecutive_completed_ranges = [ r for r in consecutive_completed_ranges if len(r) > 1 ]

print(consecutive_completed_ranges)

Results_0-1 : True
Results_2-3 : False
Results_4-4 : True
Results_5-5 : False
Results_6-8 : True
Results_9-11 : False
Results_12-14 : False
Results_15-17 : True
Results_18-22 : False
Results_23-27 : False
Results_28-32 : False
Results_33-33 : True
Results_34-34 : True
Results_35-44 : True
Results_45-45 : True
Results_46-55 : True
Results_56-60 : True
Results_61-65 : True
Results_66-70 : True
Results_71-75 : True
Results_76-80 : True
Results_81-85 : True
Results_86-90 : False
Results_91-95 : True
Results_96-100 : True
Results_101-105 : True


In [21]:
# concatenate_files based on https://stackoverflow.com/a/13613375/8706910
def concatenate_files(files, output_file):
    with open(output_file, 'w') as output:
        for file in files:
            print(file)
            with open(file) as input_file:
                for line in input_file:
                    output.write(line)
                

def results_folder_to_range(folder_name):
    start, end = folder_name.split('_')[1].split('-')
    return int(start), int(end)

new_ranges = []
for ranges in consecutive_completed_ranges:
    smallhits = []
    bighits = []
    start_index = None
    end_index = None
    for folder in ranges:
        folder_name = folder['results_folder'][0]
        start, end = results_folder_to_range(folder_name)
        
        start_index = start if start_index == None  else min(start, start_index)
        end_index = end if end_index == None else max(end, end_index)
        
        path = temp_folder + folder_name + '/'
        dir_create(path)
        smallhit_file = path + 'smallhitlist.txt'
        bighit_file = path + 'bighitlist.txt'
        download_file(folder['smallhits'], smallhit_file)
        download_file(folder['bighits'], bighit_file)
        smallhits.append(smallhit_file)
        bighits.append(bighit_file)
        
    combined_folder = temp_folder + 'Combined_'  + str(start_index) + '-' + str(end_index)
    dir_create(combined_folder)
    
    concatenate_files(smallhits, combined_folder + '/smallhitlist.txt')
    concatenate_files(bighits, combined_folder + '/bighitlist.txt')
    print(smallhits, bighits, start_index, end_index)
    
    # print('range', start_index, end_index)
        

Folder at: /global/scratch/nicolaschan/tmp/Results_33-33/ already exists. Skipping...
Folder at: /global/scratch/nicolaschan/tmp/Results_34-34/ already exists. Skipping...


HttpError: <HttpError 416 when requesting https://www.googleapis.com/drive/v3/files/0B1297pLT9WXLNTZWaGNUSk90NjA?alt=media returned "Request range not satisfiable">