# Tool to create image data set repo on github

Use this notebook when the connection to GitHub is not good enough to clone normally. The steps are:

1. Clone empty github repo on colab
2. Add images to colab repo
3. Push changes to github

In [1]:
!pip install -Uqq fastbook
!pip install git+https://github.com/vtecftwy/ecutils.git@develop -q

import fastbook
import os
import shutil
import subprocess
import urllib

from ecutils import ml
from fastbook import *
from google.colab import drive
from pathlib import Path

# Mount Google Drive: the config file holding the API key and the GitHub
# credentials is read from /content/gdrive/MyDrive further down.
drive.mount('/content/gdrive')

[K     |████████████████████████████████| 727kB 12.9MB/s 
[K     |████████████████████████████████| 1.2MB 20.4MB/s 
[K     |████████████████████████████████| 51kB 5.9MB/s 
[K     |████████████████████████████████| 194kB 34.8MB/s 
[K     |████████████████████████████████| 51kB 6.1MB/s 
[K     |████████████████████████████████| 61kB 6.9MB/s 
[K     |████████████████████████████████| 12.8MB 38.5MB/s 
[K     |████████████████████████████████| 776.8MB 22kB/s 
[31mERROR: torchtext 0.9.0 has requirement torch==1.8.0, but you'll have torch 1.7.1 which is incompatible.[0m
[?25h  Building wheel for ecutils (setup.py) ... [?25l[?25hdone
Mounted at /content/gdrive


## 1. Clone the github repo on Colab

In [2]:
# GitHub repository receiving the image archives: its name, and the https url built from it
repo_name = 'unpackai-image-set'
repo_url = f'https://github.com/vtecftwy/{repo_name}.git'

In [3]:
def run_cli(cmd='ls -l', suppress_cmd_echo=False):
    """
    Run a shell command with `subprocess.run` and print everything it wrote.

    stdout and stderr of the command are both captured and printed, so that
    messages written to stderr are visible in the notebook as well.

    Args:
        cmd (str): command line to execute; it is handed to the shell as is.
        suppress_cmd_echo (bool): when True, the command line itself is never
            shown: neither in the echo line nor in the error raised on failure.
            Use it for commands embedding credentials. The output of the
            command is still printed.

    Raises:
        subprocess.CalledProcessError: when the command exits with a non-zero
            status. With `suppress_cmd_echo=True`, the error carries a
            placeholder instead of the real command line.
    """
    if suppress_cmd_echo:
        print('Command echo suppressed')
    else:
        print(f"> {cmd}")

    # check=False: the exit status is tested below, after the captured output
    # has been printed, so that the messages of a failing command are not lost.
    p = subprocess.run(cmd, stdout=subprocess.PIPE, stderr=subprocess.STDOUT, shell=True, check=False)
    # errors='replace': a stray non UTF-8 byte must not prevent showing the output
    print(str(p.stdout, 'utf-8', errors='replace'))

    if p.returncode != 0:
        # The message of CalledProcessError includes the command line: replace it
        # by a placeholder when the caller asked not to show the command.
        shown_cmd = '<command suppressed>' if suppress_cmd_echo else cmd
        raise subprocess.CalledProcessError(p.returncode, shown_cmd, output=p.stdout)

In [4]:
# Folder layout on Colab: everything lives under /content, the clone goes in a
# sub-folder named after the repo.
p2content = Path('/content')
p2repo = p2content.joinpath(repo_name)

os.chdir(p2content)
p2repo.mkdir(parents=True, exist_ok=True)
Path.cwd()  # not the last expression of the cell: its value is not displayed

# run_cli() without argument runs its default command, `ls -l`
run_cli()

> ls -l
total 12
drwx------ 5 root root 4096 Mar  9 12:58 gdrive
drwxr-xr-x 1 root root 4096 Mar  5 14:37 sample_data
drwxr-xr-x 2 root root 4096 Mar  9 12:59 unpackai-image-set



In [5]:
# Clone only once, and always into the absolute path of the repo folder:
# - `git clone` fails when the target already holds a repository, which would
#   break re-running this cell;
# - a relative target depends on the current folder, which this very cell changes.
if (p2repo / '.git').exists():
    print(f"{p2repo} is already a git repository: clone skipped")
else:
    run_cli(f"git clone {repo_url} {p2repo}")

# All following git commands are run from inside the clone
os.chdir(p2repo)

run_cli('git status')
run_cli('git branch')

> git clone https://github.com/vtecftwy/unpackai-image-set.git unpackai-image-set

> git status
On branch main
Your branch is up to date with 'origin/main'.

nothing to commit, working tree clean

> git branch
* main



In [6]:
# List the local branches of the clone (run from the current folder)
run_cli('git branch')

> git branch
* main



In [None]:
# Removes the entire local clone of the repo (folder `p2repo` on the Colab disk, not on Google Drive)
# shutil.rmtree(p2repo)

## 2. Add images to colab repo

### Retrieve Azure key from gdrive config file

In [8]:
# The Azure key for the Bing Image Search API is kept in a config file on the mounted drive
path_to_config = Path('/content/gdrive/MyDrive/fastai/config-api-keys.cfg')
key = ml.get_config_value(
    'azure',
    'fastai-image-search-2021-1',
    path_to_config_file=path_to_config,
)

### Go to the repo folder and load images from Azure

In [None]:
# Work from the root of the local clone and list its content
os.chdir(p2repo)
!ls -l

total 4
-rw-r--r-- 1 root root 210 Mar  8 16:28 README.md


In [None]:
bear_types = 'grizzly','black','teddy'
path = p2repo / 'bears'

# One sub-folder per class under `bears`. The check is done per class, not once
# for the whole `bears` folder: after a partial failure (folder created but some
# classes missing or empty), re-running the cell downloads what is still missing
# instead of silently doing nothing.
path.mkdir(exist_ok=True)
for o in bear_types:
    dest = (path/o)
    if dest.exists() and any(dest.iterdir()):
        # this class already has files: do not query the search API again
        continue
    dest.mkdir(exist_ok=True)
    results = search_images_bing(key, f'{o} bear')
    download_images(dest, urls=results.attrgot('contentUrl'))

### Make ZIP files

Create one zip file per class of images, then move each zip file to the root of the repo.

In [None]:
# Archives are built from inside `bears`, so that each zip holds a single
# top-level folder named after the class; each one is then moved to the repo root.
os.chdir(p2repo / 'bears')
for t in bear_types:
    archive = shutil.make_archive(base_name=t, format='zip', base_dir=t)
    shutil.move(archive, p2repo / f"{t}.zip")

In [None]:
# The current folder is still `bears` (set by the zip cell): list the class folders
! ls -l

total 12
drwxr-xr-x 2 root root 4096 Mar  8 16:30 black
drwxr-xr-x 2 root root 4096 Mar  8 16:29 grizzly
drwxr-xr-x 2 root root 4096 Mar  8 16:30 teddy


## 3. Commit and Push changes to github

#### Technical notes:
Stackoverflow for how to configure git: [here](https://stackoverflow.com/questions/22147574/fatal-could-not-read-username-for-https-github-com-no-such-file-or-directo).
- Git command is: `git remote add origin https://{username}:{password}@github.com/{username}/project.git`. 
- Also could use `git remote set-url origin ....` in order not to have to remove the original `origin`
- Because the user name and password may include special characters, they must be URL-encoded using `urllib.parse.quote`.
- For security reasons, the user name and password are retrieved from a config file instead of being written in the notebook.

In [10]:
# GitHub credentials and git identity all come from the [github] section of the config file
path_to_config = Path('/content/gdrive/MyDrive/fastai/config-api-keys.cfg')

github_user, github_pswd, git_user, git_email = (
    ml.get_config_value('github', option, path_to_config)
    for option in ('github_username', 'github_password', 'git_name', 'git_email')
)

In [11]:
# Display the git identity read from the config file.
# NOTE(review): the saved output contains a real name and email address.
git_user, git_email

('Etienne Charlier', 'github@procurasia.com')

In [None]:
# Build https://{user}:{password}@github.com/... from the plain repo url.
# safe='': by default `quote` leaves '/' unescaped, and a '/' in the user name or
# the password would corrupt the user:password part of the url.
encoded_user = urllib.parse.quote(github_user, safe='')
encoded_pswd = urllib.parse.quote(github_pswd, safe='')
# the slice drops the leading 'https://' of repo_url
url = f"https://{encoded_user}:{encoded_pswd}@{repo_url[len('https://'):]}"

In [None]:
! git remote

! git remote remove origin

# use run_cli to prevent user and passwords to be visible in the notebook
run_cli(cmd=f"git remote add origin {url}", suppress_cmd_echo=True)
run_cli(cmd=f'git config --global user.name "{git_user}" ',  suppress_cmd_echo=True)
run_cli(cmd=f'git config --global user.email "{git_email}" ',  suppress_cmd_echo=True)

In [None]:
# Back to the root of the clone: the zip files should now sit next to README.md
os.chdir(p2repo)
! ls -l

total 200572
drwxr-xr-x 5 root root     4096 Mar  8 16:30 bears
-rw-r--r-- 1 root root 74338880 Mar  8 16:30 black.zip
-rw-r--r-- 1 root root 55916868 Mar  8 16:30 grizzly.zip
-rw-r--r-- 1 root root      210 Mar  8 16:28 README.md
-rw-r--r-- 1 root root 75112817 Mar  8 16:30 teddy.zip


In [None]:
# State of the working tree before staging anything
! git status

On branch main
Untracked files:
  (use "git add <file>..." to include in what will be committed)

	[31mbears/[m
	[31mblack.zip[m
	[31mgrizzly.zip[m
	[31mteddy.zip[m

nothing added to commit but untracked files present (use "git add" to track)


In [None]:
os.chdir(p2repo)
! ls -l

for t in bear_types:
    run_cli(cmd=f"git add {t}.zip")

total 200572
drwxr-xr-x 5 root root     4096 Mar  8 16:30 bears
-rw-r--r-- 1 root root 74338880 Mar  8 16:30 black.zip
-rw-r--r-- 1 root root 55916868 Mar  8 16:30 grizzly.zip
-rw-r--r-- 1 root root      210 Mar  8 16:28 README.md
-rw-r--r-- 1 root root 75112817 Mar  8 16:30 teddy.zip
> git add grizzly.zip

> git add black.zip

> git add teddy.zip



In [None]:
# Check what is staged before committing
! git status

On branch main
Changes to be committed:
  (use "git reset HEAD <file>..." to unstage)

	[32mnew file:   black.zip[m
	[32mnew file:   grizzly.zip[m
	[32mnew file:   teddy.zip[m

Untracked files:
  (use "git add <file>..." to include in what will be committed)

	[31mbears/[m



In [None]:
# Commit the staged files
! git commit -m "Add bear sets in three zip files"

[main a76a234] Add bear sets in three zip files
 3 files changed, 0 insertions(+), 0 deletions(-)
 create mode 100644 black.zip
 create mode 100644 grizzly.zip
 create mode 100644 teddy.zip


In [None]:
# Push `main` to `origin` and set it as the upstream of the local branch
! git push --set-upstream origin main

Counting objects: 5, done.
Delta compression using up to 2 threads.
Compressing objects: 100% (5/5), done.
Writing objects: 100% (5/5), 195.89 MiB | 13.56 MiB/s, done.
Total 5 (delta 0), reused 0 (delta 0)
To https://github.com/vtecftwy/unpackai-image-set.git
   2c4827f..a76a234  main -> main
Branch 'main' set up to track remote branch 'main' from 'origin'.


## Load datasets from repo

In [None]:
# Download each class archive from the GitHub repo and extract it under /content/bears
destination = Path('/content/bears')
destination.mkdir(parents=True, exist_ok=True)

for t in ['black', 'grizzly','teddy']:
    print(t)
    archive_name = f"{t}.zip"
    path = untar_data(
        f"https://github.com/vtecftwy/unpackai-image-set/blob/main/{archive_name}?raw=true",
        fname=archive_name,
        dest=destination,
    )

black
/content/bears/black
True
[Path('/content/bears/black/00000065.jpg'), Path('/content/bears/black/00000039.jpeg'), Path('/content/bears/black/00000020.png'), Path('/content/bears/black/00000044.jpg'), Path('/content/bears/black/00000121.jpg'), Path('/content/bears/black/00000120.jpg'), Path('/content/bears/black/00000084.jpg'), Path('/content/bears/black/00000063.jpg'), Path('/content/bears/black/00000054.jpg'), Path('/content/bears/black/00000001.jpg'), Path('/content/bears/black/00000093.jpg?ve=1&tl=1?ve=1&tl=1'), Path('/content/bears/black/00000129.jpg'), Path('/content/bears/black/00000149.jpg'), Path('/content/bears/black/00000012.JPG'), Path('/content/bears/black/00000112.jpg'), Path('/content/bears/black/00000110.jpg'), Path('/content/bears/black/00000142.jpg'), Path('/content/bears/black/00000000.jpg?width=3200&height=1680&fit=crop'), Path('/content/bears/black/00000051.jpg'), Path('/content/bears/black/00000144.jpg'), Path('/content/bears/black/00000138.jpg'), Path('/cont