In [1]:
# imports
import os
import shutil
import pandas as pd
import pickle
from googleapiclient.discovery import build
from googleapiclient.http import MediaFileUpload
from google_auth_oauthlib.flow import InstalledAppFlow
from google.auth.transport.requests import Request
from googleapiclient.errors import HttpError

In [2]:
# stored information
actual_folder_destination = "TODO - replace with your actual folder's ID"
test_folder_destination = "TODO - replace with your test folder's ID. this is what is currently being used in the script"
client_secret_path = "TODO - replace with the path to your client secret"

In [3]:
# authenticate and create Google Drive service
def authenticate_google_drive():
    SCOPES = ['https://www.googleapis.com/auth/drive']
    creds = None
    
    # load previously saved credentials
    if os.path.exists('token.pickle'):
        with open('token.pickle', 'rb') as token:
            creds = pickle.load(token)

    # if no valid credentials, authenticate user
    if not creds or not creds.valid:
        if creds and creds.expired and creds.refresh_token:
            creds.refresh(Request())
        else:
            flow = InstalledAppFlow.from_client_secrets_file(
                client_secret_path, SCOPES)
            creds = flow.run_local_server(port=0)

        # save credentials
        with open('token.pickle', 'wb') as token:
            pickle.dump(creds, token)

    return build('drive', 'v3', credentials=creds)

# upload a file to Google Drive
def upload_to_drive(service, file_path, parent_folder_id, mime_type):
    file_metadata = {'name': os.path.basename(file_path), 'parents': [parent_folder_id]}
    media = MediaFileUpload(file_path, mimetype=mime_type)
    file = service.files().create(body=file_metadata, media_body=media, fields='id').execute()
    return file.get('id')

# upload a folder to Google Drive
def upload_folder_to_drive(service, folder_path, parent_folder_id):
    folder_name = os.path.basename(folder_path)
    folder_metadata = {
        'name': folder_name,
        'mimeType': 'application/vnd.google-apps.folder',
        'parents': [parent_folder_id]
    }
    folder = service.files().create(body=folder_metadata, fields='id').execute()
    folder_id = folder.get('id')

    for root, _, files in os.walk(folder_path):
        for file in files:
            file_path = os.path.join(root, file)
            mime_type = 'application/vnd.google.colaboratory' if file.endswith('.ipynb') else None
            if mime_type or file.endswith('.py'):
                new_file_path = convert_to_colab_format(file_path)
                upload_to_drive(service, new_file_path, folder_id, mime_type)
            else:
                upload_to_drive(service, file_path, folder_id, None)

    return folder_id

# convert .ipynb or .py files to Colab-compatible format
def convert_to_colab_format(file_path):
    if file_path.endswith('.ipynb') or file_path.endswith('.py'):
        return file_path  # no conversion needed
    return file_path


In [4]:
def main():
    # load DataFrame with 'folder URL' and 'repo URL' columns
    df = pd.read_csv("download_git_TEST.csv")
    df_copy = df.copy()

    # temporary directory for cloning repositories
    temp_dir = 'temp'
    os.makedirs(temp_dir, exist_ok=True)

    # Google Drive parent folder ID
    parent_folder_id = test_folder_destination  
    service = authenticate_google_drive()

    for index, row in df.iterrows():
        print()
        folder_url = row.get('folder URL', None)
        print("folder_url:", folder_url)
        repo_url = row.get('repo URL', None)

        try:
            if isinstance(folder_url, str):
                # if folder URL is provided
                df_copy['Resource'] = folder_url+'||'+df['file name']
                print(f"Processing folder URL: {folder_url}")
                if '/tree/' in folder_url:
                    repo_base_url, folder_path = folder_url.split('/tree/', 1)
                    folder_subpath = folder_path.split('/', 1)[-1] if '/' in folder_path else folder_path
                else:
                    print(f"Invalid folder URL format: {folder_url}. Skipping...")
                    continue
                repo_url = repo_base_url
            elif isinstance(repo_url, str):
                # if repository URL is provided
                print(f"Processing repository URL: {repo_url}")
                folder_subpath = ""  # process entire repository
            else:
                print(f"Invalid row: no URL provided. Skipping...")
                continue
            
            # clone repository
            repo_name = os.path.basename(repo_url)
            local_repo_path = os.path.join(temp_dir, repo_name)
            print(f"Cloning {repo_url} to {local_repo_path}...")
            clone_result = os.system(f'git clone {repo_url} {local_repo_path}')

            if clone_result != 0:
                print(f"Failed to clone repository: {repo_url}. Skipping...")
                continue

            # verify the folder path
            if folder_subpath:
                local_folder_path = os.path.join(local_repo_path, folder_subpath)
                if not os.path.exists(local_folder_path):
                    print(f"Subfolder {folder_subpath} not found in {repo_url}. Skipping...")
                    continue
            else:
                local_folder_path = local_repo_path

            print(f"Uploading folder: {local_folder_path} to Google Drive...")
            uploaded_folder_id = upload_folder_to_drive(service, local_folder_path, parent_folder_id)
            print(f"Uploaded folder ID: {uploaded_folder_id}")

        except HttpError as e:
            print(f"Google Drive API error: {e}")
        except Exception as e:
            print(f"Error processing row {index}: {e}")

        finally:
            # remove cloned repository
            if os.path.exists(local_repo_path):
                shutil.rmtree(local_repo_path)

    # remove temporary directory
    shutil.rmtree(temp_dir)

    # reorder columns and save to new CSV
    df_reordered = df_copy.loc[:, ['Resource', 'Computing Topics', 'Context Topics', 'Libraries Used', 'Language', 'Level', 'Last Updated', 'Source Institution']]
    df_reordered.to_csv("updated_resources_GIT_test.csv", index=False)
    print("All done!")

In [5]:
if __name__ == '__main__':
    main()


folder_url: https://github.com/ds-modules/LINGUIS-110/tree/master/FormantsUpdated
Processing folder URL: https://github.com/ds-modules/LINGUIS-110/tree/master/FormantsUpdated
Cloning https://github.com/ds-modules/LINGUIS-110 to temp/LINGUIS-110...


Cloning into 'temp/LINGUIS-110'...


Uploading folder: temp/LINGUIS-110/FormantsUpdated to Google Drive...
Uploaded folder ID: 1AS2qGVgDfUrWN8VCYdBjRMJFXjic3V8e

folder_url: https://github.com/ds-modules/LINGUIS-110/tree/master/VOT
Processing folder URL: https://github.com/ds-modules/LINGUIS-110/tree/master/VOT
Cloning https://github.com/ds-modules/LINGUIS-110 to temp/LINGUIS-110...


Cloning into 'temp/LINGUIS-110'...


Uploading folder: temp/LINGUIS-110/VOT to Google Drive...
Uploaded folder ID: 1CUMQaVW7HQlrJz0EQKy9TsFJCLf9KY3i

folder_url: nan
Processing repository URL: https://github.com/ds-modules/SOC-130AC
Cloning https://github.com/ds-modules/SOC-130AC to temp/SOC-130AC...


Cloning into 'temp/SOC-130AC'...


Uploading folder: temp/SOC-130AC to Google Drive...
Uploaded folder ID: 13ocTpuqL6CYUFHTCFZP7x_G8VZiN4OWc

folder_url: https://github.com/ds-modules/ECON-101B/tree/master/Previous/Problem%20Set%201
Processing folder URL: https://github.com/ds-modules/ECON-101B/tree/master/Previous/Problem%20Set%201
Cloning https://github.com/ds-modules/ECON-101B to temp/ECON-101B...


Cloning into 'temp/ECON-101B'...


Subfolder Previous/Problem%20Set%201 not found in https://github.com/ds-modules/ECON-101B. Skipping...

folder_url: https://github.com/ds-modules/ECON-101B/tree/master/Problem%20Set%203
Processing folder URL: https://github.com/ds-modules/ECON-101B/tree/master/Problem%20Set%203
Cloning https://github.com/ds-modules/ECON-101B to temp/ECON-101B...


Cloning into 'temp/ECON-101B'...


Subfolder Problem%20Set%203 not found in https://github.com/ds-modules/ECON-101B. Skipping...

folder_url: nan
Processing repository URL: https://github.com/ds-modules/XENGLIS-31AC
Cloning https://github.com/ds-modules/XENGLIS-31AC to temp/XENGLIS-31AC...


Cloning into 'temp/XENGLIS-31AC'...


Uploading folder: temp/XENGLIS-31AC to Google Drive...
Uploaded folder ID: 18D7fjMdPLF--rQARRFp9reMUUWlVU508

folder_url: https://github.com/ds-modules/PSYCH-167AC/blob/master/01-Intro-to-Importing-Data-Tables-Graphs.ipynb
Processing folder URL: https://github.com/ds-modules/PSYCH-167AC/blob/master/01-Intro-to-Importing-Data-Tables-Graphs.ipynb
Invalid folder URL format: https://github.com/ds-modules/PSYCH-167AC/blob/master/01-Intro-to-Importing-Data-Tables-Graphs.ipynb. Skipping...

folder_url: https://github.com/ds-modules/PSYCH-167AC/blob/master/02-Correlation-Regression.ipynb
Processing folder URL: https://github.com/ds-modules/PSYCH-167AC/blob/master/02-Correlation-Regression.ipynb
Invalid folder URL format: https://github.com/ds-modules/PSYCH-167AC/blob/master/02-Correlation-Regression.ipynb. Skipping...

folder_url: nan
Processing repository URL: https://github.com/ds-modules/PSYCH-167AC/tree/master
Cloning https://github.com/ds-modules/PSYCH-167AC/tree/master to temp/master...


Cloning into 'temp/master'...
fatal: repository 'https://github.com/ds-modules/PSYCH-167AC/tree/master/' not found
Cloning into 'temp/XRHETOR-R1A'...


Failed to clone repository: https://github.com/ds-modules/PSYCH-167AC/tree/master. Skipping...

folder_url: https://github.com/ds-modules/XRHETOR-R1A/tree/master/01%20-%20Data%20Science%20in%20xRhetoric%20Intro
Processing folder URL: https://github.com/ds-modules/XRHETOR-R1A/tree/master/01%20-%20Data%20Science%20in%20xRhetoric%20Intro
Cloning https://github.com/ds-modules/XRHETOR-R1A to temp/XRHETOR-R1A...
Subfolder 01%20-%20Data%20Science%20in%20xRhetoric%20Intro not found in https://github.com/ds-modules/XRHETOR-R1A. Skipping...

folder_url: https://github.com/ds-modules/XRHETOR-R1A/tree/master/02-Moral-Foundations-Analysis
Processing folder URL: https://github.com/ds-modules/XRHETOR-R1A/tree/master/02-Moral-Foundations-Analysis
Cloning https://github.com/ds-modules/XRHETOR-R1A to temp/XRHETOR-R1A...


Cloning into 'temp/XRHETOR-R1A'...


Uploading folder: temp/XRHETOR-R1A/02-Moral-Foundations-Analysis to Google Drive...
Uploaded folder ID: 1LjOidJcxK12UiSJen-0YlNyoiyiqtE8h

folder_url: https://github.com/ds-modules/XRHETOR-R1A/tree/master/03-Rhetoric-of-Data
Processing folder URL: https://github.com/ds-modules/XRHETOR-R1A/tree/master/03-Rhetoric-of-Data
Cloning https://github.com/ds-modules/XRHETOR-R1A to temp/XRHETOR-R1A...


Cloning into 'temp/XRHETOR-R1A'...


Uploading folder: temp/XRHETOR-R1A/03-Rhetoric-of-Data to Google Drive...
Uploaded folder ID: 1uYKyZde51-3sTyTdfBXfCpKnG92-fsNO

folder_url: nan
Processing repository URL: https://github.com/ds-modules/CUNEIF-102A/tree/master
Cloning https://github.com/ds-modules/CUNEIF-102A/tree/master to temp/master...


Cloning into 'temp/master'...
fatal: repository 'https://github.com/ds-modules/CUNEIF-102A/tree/master/' not found
Cloning into 'temp/master'...


Failed to clone repository: https://github.com/ds-modules/CUNEIF-102A/tree/master. Skipping...

folder_url: nan
Processing repository URL: https://github.com/ds-modules/CUNEIF-102A/tree/master
Cloning https://github.com/ds-modules/CUNEIF-102A/tree/master to temp/master...


fatal: repository 'https://github.com/ds-modules/CUNEIF-102A/tree/master/' not found
Cloning into 'temp/LEGALST-190'...


Failed to clone repository: https://github.com/ds-modules/CUNEIF-102A/tree/master. Skipping...

folder_url: https://github.com/ds-modules/LEGALST-190/tree/master/labs/3-22
Processing folder URL: https://github.com/ds-modules/LEGALST-190/tree/master/labs/3-22
Cloning https://github.com/ds-modules/LEGALST-190 to temp/LEGALST-190...


Updating files: 100% (502/502), done.


Uploading folder: temp/LEGALST-190/labs/3-22 to Google Drive...
Uploaded folder ID: 1AKkLgz-YZa7DSSvyojSQ3iWXFcUBQV8b
All done!
