# This is the Jupyter Notebook for the MAGICODE project - TEST

In [1]:
import os
os.chdir('datasets')

Unzip the dataset

In [None]:
# join the volumes together into a single zip file
!zip -s 0 dataset.zip -O dataset_joined.zip
# unzip the newly assembled archive
!unzip dataset_joined.zip -d ./all_data
print ('files unzipped')

Split dataset into training and evaluation sets

In [5]:
!pip install sklearn

You should consider upgrading via the '/home/va281/projects/python-stuff/jupyterenvironment/bin/python -m pip install --upgrade pip' command.[0m


In [21]:
import sklearn.model_selection as model_selection
import glob
import shutil
from pathlib import Path
from os.path import join

# define source folder
source = join('all_data', 'dataset')
# get all file paths
all_files = os.listdir(source)
# build a generic image path (e.g. 'all_data/dataset/*.png')
images_path = os.path.join(source, '*.png')
# get all images paths
img_files = glob.glob(images_path)
# remove files extension from files paths
img_files_without_extension = [Path(img_file).stem for img_file in img_files]

# training set will be 6 times the size of the evaluation set
distribution=6
TRAINING_SET_NAME = "training_set"
EVALUATION_SET_NAME = "eval_set"

# splits randomly the files into two sets
# Path(*).stem returns the file name without extension
train_set,eval_set = model_selection.train_test_split(img_files_without_extension, train_size=(distribution / 10))

# create the TRAINING_SET_NAME and EVALUATION_SET_NAME directories if they do not exist
if not os.path.exists(join(source, TRAINING_SET_NAME)):
    os.makedirs(join(source, TRAINING_SET_NAME))
if not os.path.exists(os.path.join(source, EVALUATION_SET_NAME)):
    os.makedirs(join(source, EVALUATION_SET_NAME))

# copy the files (img and gui) from the all_data folder into the training_set folder
for file in train_set:
    shutil.copyfile(join(source, file + '.png'), join(source, TRAINING_SET_NAME, file + '.png'))
    shutil.copyfile(join(source, file + '.gui'), join(source, TRAINING_SET_NAME, file + '.gui'))

# copy the files (img and gui) from the all_data folder into the eval_set folder
for file in eval_set:
    shutil.copyfile(join(source, file + '.png'), join(source, EVALUATION_SET_NAME, file + '.png'))
    shutil.copyfile(join(source, file + '.gui'), join(source, EVALUATION_SET_NAME, file + '.gui'))

print("Training dataset: {}".format(join(source, TRAINING_SET_NAME)))
print("Evaluation dataset: {}".format(join(source, EVALUATION_SET_NAME)))


Transoform training set into numpy arrays

In [27]:
# install opencv to use cv2 module in get_preprocessed_img
!pip install opencv-python

Collecting opencv-python
  Downloading opencv_python-4.5.1.48-cp38-cp38-manylinux2014_x86_64.whl (50.4 MB)
[K     |████████████████████████████████| 50.4 MB 9.7 MB/s eta 0:00:01
Installing collected packages: opencv-python
Successfully installed opencv-python-4.5.1.48
You should consider upgrading via the '/home/va281/projects/python-stuff/jupyterenvironment/bin/python -m pip install --upgrade pip' command.[0m


In [29]:
from model.classes.Utils import *

IMAGE_SIZE = 256

#define source and destination folders
source = join('all_data', 'dataset', 'training_set')
destination = join('all_data', 'dataset', 'training_features')

# create the training_features directory if it does not exist
if not os.path.exists(destination):
    os.makedirs(destination)

# transform images in training dataset (i.e. normalized pixel values and resized pictures) to numpy arrays (smaller files, useful if uploading the set to train a model in the cloud)
for f in os.listdir(source):
    if f.find(".png") != -1:
        img = Utils.get_preprocessed_img(join(source, f), IMAGE_SIZE)
        file_name = f[:f.find(".png")]

        np.savez_compressed(join(destination, file_name), features=img)
        retrieve = np.load(join(destination, file_name + '.npz'))["features"]
        
        assert np.array_equal(img, retrieve)
        
        shutil.copyfile(join(source, file_name + '.gui'), join(destination, file_name + '.gui'))


In [30]:
os.chdir('../')
if not os.path.exists('bin'):
    os.mkdir('bin')
os.chdir('model')

Train the model using a generator (to avoid having to fit all the data in memory)

In [33]:
!pip install tensorflow

Collecting tensorflow
  Downloading tensorflow-2.4.0-cp38-cp38-manylinux2010_x86_64.whl (394.8 MB)
[K     |████████████████████████████████| 394.8 MB 39 kB/s s eta 0:00:01    |████████████▋                   | 156.0 MB 1.1 MB/s eta 0:03:36     |███████████████████▊            | 243.2 MB 10.0 MB/s eta 0:00:16     |███████████████████████████████ | 382.6 MB 8.3 MB/s eta 0:00:02
Collecting absl-py~=0.10
  Downloading absl_py-0.11.0-py3-none-any.whl (127 kB)
[K     |████████████████████████████████| 127 kB 3.0 MB/s eta 0:00:01
Collecting astunparse~=1.6.3
  Downloading astunparse-1.6.3-py2.py3-none-any.whl (12 kB)
Collecting flatbuffers~=1.12.0
  Downloading flatbuffers-1.12-py2.py3-none-any.whl (15 kB)
Collecting gast==0.3.3
  Downloading gast-0.3.3-py2.py3-none-any.whl (9.7 kB)
Collecting google-pasta~=0.2
  Downloading google_pasta-0.2.0-py3-none-any.whl (57 kB)
[K     |████████████████████████████████| 57 kB 3.6 MB/s eta 0:00:011
Collecting grpcio~=1.32.0
  Downloading grpcio-1.32.0

  Building wheel for wrapt (setup.py) ... [?25ldone
[?25h  Created wheel for wrapt: filename=wrapt-1.12.1-cp38-cp38-linux_x86_64.whl size=78560 sha256=a87fda820ba2ad9b64c0215a45717981266c7f96905431c94660063352c23e09
  Stored in directory: /home/va281/.cache/pip/wheels/5f/fd/9e/b6cf5890494cb8ef0b5eaff72e5d55a70fb56316007d6dfe73
Successfully built termcolor wrapt
Installing collected packages: urllib3, pyasn1, idna, chardet, certifi, rsa, requests, pyasn1-modules, oauthlib, cachetools, requests-oauthlib, google-auth, werkzeug, tensorboard-plugin-wit, protobuf, markdown, grpcio, google-auth-oauthlib, absl-py, wrapt, typing-extensions, termcolor, tensorflow-estimator, tensorboard, opt-einsum, keras-preprocessing, h5py, google-pasta, gast, flatbuffers, astunparse, tensorflow
Successfully installed absl-py-0.11.0 astunparse-1.6.3 cachetools-4.2.0 certifi-2020.12.5 chardet-4.0.0 flatbuffers-1.12 gast-0.3.3 google-auth-1.24.0 google-auth-oauthlib-0.4.2 google-pasta-0.2.0 grpcio-1.32.0 h5py-2

In [38]:
import tensorflow as tf
sess = tf.compat.v1.Session(config=tf.compat.v1.ConfigProto(log_device_placement=True))

import sys

from classes.dataset.Generator import *
from classes.model.pix2code import *

np.random.seed(1234)
BATCH_SIZE = 64

training_features = join('..', 'datasets', 'all_data', 'dataset', 'training_features')
output_path = join('..', 'bin')

dataset = Dataset()
dataset.load(training_features, generate_binary_sequences=True)
dataset.save_metadata(output_path)
dataset.voc.save(output_path)

gui_paths, img_paths = Dataset.load_paths_only(training_features)

input_shape = dataset.input_shape
output_size = dataset.output_size
steps_per_epoch = dataset.size / BATCH_SIZE
voc = Vocabulary()
voc.retrieve(output_path)

generator = Generator.data_generator(voc, gui_paths, img_paths, batch_size=BATCH_SIZE, generate_binary_sequences=True)

model = pix2code(input_shape, output_size, output_path)

model.fit_generator(generator, steps_per_epoch=steps_per_epoch)

Device mapping: no known devices.
Loading data...
Generating sparse vectors...
Dataset size: 75787
Vocabulary size: 26
Input shape: (256, 256, 3)
Output size: 26
Parsing data...
Epoch 1/10


KeyboardInterrupt: 