<a href="https://colab.research.google.com/github/wawan-ikhwan/colab-content-backup-and-recover/blob/main/colab_content_backup_and_recover.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# **Bare Cells**


In [4]:
#@title ##**List Available Contents**
#@markdown (*require mount permission*)
from google.colab import drive, userdata
import os
import pandas as pd
from subprocess import getoutput

!apt install xattr &> /dev/null
# Function to get Google Drive file ID
def get_fid(fn):
  fid = getoutput("xattr -p 'user.drive.id' '%s'" % fn)
  return fid

# Get GDrive Absolute Path
try:
  BARE_DIRECTORY = userdata.get('BARE_DIRECTORY')
except:
  BARE_DIRECTORY = 'bare-colab-contents' # default bare directory
print('Using BARE_DIRECTORY:', BARE_DIRECTORY)
GDRIVE_ROOT = '/drive/MyDrive'
BARE_PATH = os.path.join(GDRIVE_ROOT, BARE_DIRECTORY)

# List available contents
def list_contents():
  data = !ls -al {BARE_PATH}/*.zip
  # Extracting the information and creating a list of dictionaries
  result_list = []
  for entry in data[1:]:
    parts = entry.split()
    size = int(parts[4])
    timestamp = f'{parts[5]} {parts[6]} {parts[7]}'
    identifier = int(parts[-1].split('_')[0])
    is_protected = parts[-1].split('_')[1] == 'protected'
    name = ''.join(parts[-1].split('_')[2:]).replace('.zip', '')
    file_path = os.path.join(BARE_PATH, entry.split()[-1])

    # Get Google Drive file ID
    drive_id = get_fid(file_path)

    result_dict = {
      'identifier': identifier,
      'name': name,
      'size': '{:.2f} {}'.format(size / 1024**3, 'GB') if size >= 1024**3 else ('{:.2f} {}'.format(size / 1024**2, 'MB') if size >= 1024**2 else '{:.2f} {}'.format(size / 1024, 'KB')),
      'bytes': size,
      'is_protected': is_protected,
      'timestamp': timestamp,
      'drive_id': drive_id
    }
    result_list.append(result_dict)

  # Creating a pandas DataFrame from the list of dictionaries
  df = pd.DataFrame(result_list)
  try:
    return df[['identifier', 'name', 'size', 'bytes', 'is_protected', 'timestamp', 'drive_id']]
  except KeyError:
    print('There is no available contents in:', BARE_DIRECTORY)
    return pd.DataFrame([{
      'identifier': None,
      'name': None,
      'size': None,
      'bytes': None,
      'is_protected': None,
      'timestamp': None,
      'drive_id': None
    }])

# GDrive Mount
drive.mount('/drive', force_remount=True)

# Display List of Contents in DataFrame
display(list_contents())
# GDrive Unmount
drive.flush_and_unmount()
print('Unmounted from /drive')


Using BARE_DIRECTORY: signal-batch-5-progress
Mounted at /drive
There is no available contents in: signal-batch-5-progress


Unnamed: 0,identifier,name,size,bytes,is_protected,timestamp,drive_id
0,,,,,,,


Unmounted from /drive


In [22]:
#@title ##**Recover**
#@markdown (*Doesn't require mount permission*)
from re import search

DRIVE_ID = '1VcSB2YiomGS-3fwc0kSD0RNxTgBkNXVt' #@param {"type":"string"}

#@markdown * Make sure **SHARE** permission of **BARE_DIRECTORY** is set to **VIEWER TO ANYONE** in Google Drive.
#@markdown * To check **DRIVE_ID**, run cell **List Available Contents** (*require mount*) or check directly in Google Drive (*no need mount*).
#@markdown * Set **DRIVE_ID** with `identifier == 0` to recover **DIFFERENTIAL BACKUP** content.
#@markdown * To enable decryption, use *Secrets* Google Colaboratory feature with setting password with filling value at name `BARE_ZIPKEY`.

# DRIVE_ID Normalization
if 'drive.google.com' in DRIVE_ID: # If it's an URL
  DRIVE_ID = search(r'/d/([^/]+)/', DRIVE_ID).group(1)

# Create the Working Directory and Download the Content
!mkdir -p '/tmp/bare_download'
%cd '/tmp/bare_download'
is_downloaded = !ls .
if is_downloaded:
  print('Already downloaded, do nothing!')
else:
  !gdown {DRIVE_ID}
%cd '/content'

from datetime import datetime
import pandas as pd

list_of_downloaded_file = !ls /tmp/bare_download
try:
  FILE_NAME = list_of_downloaded_file[0]
except:
  raise Exception("You didn't set share permission of BARE_DIRECTORY or the file to VIEWER TO ANYONE.")
CONTENT_NAME = FILE_NAME.split('_')[2:][0].split('.')[:-1][0]
IS_PROTECTED = FILE_NAME.split('_')[1] == 'protected'
IDENTIFIER = int(FILE_NAME.split('_')[0])
FILE_PATH = f'/tmp/bare_download/{FILE_NAME}'
SIZE = !wc -c < {FILE_PATH}
SIZE = int(str(SIZE[0]))
TIMESTAMP = !ls -al /tmp/bare_download/{FILE_NAME}
TIMESTAMP = ' '.join([TIMESTAMP[0].split()[5], TIMESTAMP[0].split()[6], TIMESTAMP[0].split()[7]])

CONTENT_METADATA = {
  'identifier': IDENTIFIER,
  'name': CONTENT_NAME,
  'size': '{:.2f} {}'.format(SIZE / 1024**3, 'GB') if SIZE >= 1024**3 else ('{:.2f} {}'.format(SIZE / 1024**2, 'MB') if SIZE >= 1024**2 else '{:.2f} {}'.format(SIZE / 1024, 'KB')),
  'bytes': SIZE,
  'is_protected': IS_PROTECTED,
  'timestamp': TIMESTAMP,
  'drive_id': DRIVE_ID
}

# Display in DataFrame
display(pd.DataFrame(CONTENT_METADATA, index=[0]))

# Change Directory to Root
%cd '/'

# Backup Current Content
!mv '/content' '/content_backup'

# Getting BARE_ZIPKEY
BARE_ZIPKEY = 'this_is_bare_default_password'
if IS_PROTECTED:
  try:
    from google.colab import userdata
    BARE_ZIPKEY = userdata.get('BARE_ZIPKEY')
    print('Using custom ZIPKEY, protection enabled!')
  except:
    print("Warning: You didn't specify BARE_ZIPKEY in Secrets or you didn't grant BARE_ZIPKEY secret access. Using default password, protection disabled!")

# Extracting Content
!unzip -oqP {BARE_ZIPKEY} {FILE_PATH} -d '/'

# Check Return Code, 0 means success.
from IPython import get_ipython
exit_code = get_ipython().__dict__['user_ns']['_exit_code']
# print(exit_code)
if exit_code != 0:
  # Recovery current content due to failure.
  !rm -rf '/content'
  !mv '/content_backup' '/content'
  # Raise an error
  if IS_PROTECTED and BARE_ZIPKEY == 'this_is_bare_default_password':
    raise Exception("This content is protected but you didn't specify BARE_ZIPKEY in Secrets! Error code:", exit_code)
  raise Exception('Invalid password! Error code:', exit_code)

# Remove Temporary Content Backup
!rm -rf '/content_backup'

# Remove Temporary Download Directory
!rm -rf '/tmp/bare_download'

# Change Directory to Initial State
%cd '/content'

# Notify Success
!echo 'The content has been recovered: {CONTENT_NAME}'

/tmp/bare_download
Downloading...
From: https://drive.google.com/uc?id=1VcSB2YiomGS-3fwc0kSD0RNxTgBkNXVt
To: /tmp/bare_download/0_protected_hello-world.ipynb.zip
100% 11.1k/11.1k [00:00<00:00, 21.2MB/s]
/content


Unnamed: 0,identifier,name,size,bytes,is_protected,timestamp,drive_id
0,0,hello-world,10.81 KB,11068,True,Nov 29 10:30,1VcSB2YiomGS-3fwc0kSD0RNxTgBkNXVt


/
Using custom ZIPKEY, protection enabled!
0
/content
The content has been recovered: hello-world


In [3]:
#@title ##**Backup**
#@markdown (*require mount permission*)
from google.colab import drive, userdata
from time import time
import os

# Define function to get notebook name for default CONTENT_NAME.
def get_notebook_name():
  colab_ip = %system hostname -I   # uses colab magic to get list from bash
  colab_ip = colab_ip[0].strip()   # returns "172.28.0.12"
  colab_port = 9000                # could use 6000, 8080, or 9000
  import requests
  return requests.get(f'http://{colab_ip}:{colab_port}/api/sessions').json()[0]['name']

# Environment Setup
CONTENT_NAME = '' #@param {"type":"string"}
if CONTENT_NAME == '':
  CONTENT_NAME = get_notebook_name()
  print(f'CONTENT_NAME empty, using default name: {CONTENT_NAME}')

OVERWRITE = True #@param {"type":"boolean"}

#@markdown * Leave **CONTENT_NAME** empty to use default name (notebook name).
#@markdown * It will perform differential backup if **OVERWRITE** (saving GD storage), else incremental backup (timestamp recovery).
#@markdown * To enable encryption, use *Secrets* Google Colaboratory feature with setting password with filling value at name `BARE_ZIPKEY`.
try:
  BARE_ZIPKEY = userdata.get('BARE_ZIPKEY')
  print('Using custom BARE_ZIPKEY, protection enabled!')
except:
  BARE_ZIPKEY = 'this_is_bare_default_password'
  print("Warning: You didn't specify BARE_ZIPKEY in Secrets or You didn't grant BARE_ZIPKEY secret access. Using default password, protection disabled!")

# Get GDrive Absolute Path
try:
  BARE_DIRECTORY = userdata.get('BARE_DIRECTORY')
except:
  BARE_DIRECTORY = 'bare-colab-contents' # default bare directory
print('Using BARE_DIRECTORY:', BARE_DIRECTORY)
GDRIVE_ROOT = '/drive/MyDrive'
BARE_PATH = os.path.join(GDRIVE_ROOT, BARE_DIRECTORY)
print('BARE_PATH:', BARE_PATH)

FILE_NAME = f"{'0' if OVERWRITE else int(time())}_{'un' if BARE_ZIPKEY == 'this_is_bare_default_password' else ''}protected_{CONTENT_NAME}.zip"

# Delete Built-in Dataset
!rm -rf /content/sample_data/ &> /dev/null

# Compress Entire Content Folder
# Use the -P flag for password protection
!zip -qrP {BARE_ZIPKEY} /tmp/{FILE_NAME} /content/

# GDrive Mount
drive.mount('/drive', force_remount=True)

# Create New Root Directory if not exists
!mkdir -p {BARE_PATH} &> /dev/null

# Backing Up
if OVERWRITE:

  OLD_CONTENT = !ls {BARE_PATH}/0_*_{CONTENT_NAME}.zip 2> /dev/null
  if len(OLD_CONTENT) != 0: # Backing Up OLD_CONTENT Temporarily
    OLD_CONTENT = str(OLD_CONTENT[0])
    print('Overwriting old content:', OLD_CONTENT)
    OLD_CONTENT_BACKUP = OLD_CONTENT.replace(f'{BARE_PATH}/0_', f'{BARE_PATH}/1_')
    !mv {OLD_CONTENT} {OLD_CONTENT_BACKUP}
  else:
    print("No old content to overwrite, it's fine.")
  !mv /tmp/{FILE_NAME} {BARE_PATH}

  # Check Return Code, 0 means success.
  exit_code = get_ipython().__dict__['user_ns']['_exit_code']
  if exit_code != 0:
    # Recovery old content due to failure.
    if len(OLD_CONTENT) != 0:
      !mv {OLD_CONTENT_BACKUP} {OLD_CONTENT}
    drive.flush_and_unmount()
    print('/drive has been unmounted!')
    # Raise an error
    raise Exception('Error code:', exit_code)
  else:
    print('Backup operation is success.')

  # Old Content Backup is not needed anymore
  !rm -rf {BARE_PATH}/1_*_{CONTENT_NAME}.zip
else:
  !mv /tmp/{FILE_NAME} {BARE_PATH}

# GDrive Umount
drive.flush_and_unmount()
print('/drive has been unmounted!')

# Notify
print(f"The content has been backed up: {os.path.join(BARE_PATH, FILE_NAME)}")

# Clear BARE_ZIPKEY
BARE_ZIPKEY = None

CONTENT_NAME empty, using default name: hello-world.ipynb
Using custom BARE_ZIPKEY, protection enabled!
Using BARE_DIRECTORY: signal-batch-5-progress
BARE_PATH: /drive/MyDrive/signal-batch-5-progress
Mounted at /drive
No old content to overwrite, it's fine.
Backup operation is success.
/drive has been unmounted!
The content has been backed up: /drive/MyDrive/signal-batch-5-progress/0_protected_hello-world.ipynb.zip


### How it works?
* It will create directory based on `BARE_DIRECTORY` environment variable in secret Google Colaboratory feature. If it's not set, it will create default directory so your MyDrive will looks like `MyDrive/bare-colab-contents/`. Setting this environment will be good idea for colaborating with other colaborators with *Add to Shortcut* to *My Drive*.
* The directory will be used to store compressed contents in zip format.
* The compression is protected by password that based on `BARE_KEY` environment variable in secret Google Colaboratory feature. If it's not set, it will use default password.
* To backup, simply run the **Backup** cell. It will zipping entire `/content` and store it to the `BARE_DIRECTORY`. Backup requires mount permission.
* To recover, simply input public **DRIVE_ID** which means the zipped content file or **BARE_DIRECTORY** must be set to **VIEWER** then run **Recover** cell. Restore doesn't require mount permission.
* To list available **DRIVE_ID**, you can simply run **List Available Contents** cell. But, it requires mount permission. If you don't want to mount the runtime, you can **manually** inspect the **DRIVE_ID** in **BARE_DIRECTORY**.

By: Muhammad Ikhwan Perwira

Last Modified: 29/11/2023