In [1]:
import argparse
import os
import numpy as np
import time
import csv
import sklearn
from datetime import datetime
from sklearn import preprocessing
from skimage.color import rgb2gray
from sklearn import svm
from sklearn.metrics import accuracy_score, fbeta_score
from sklearn.model_selection import train_test_split, KFold
from sklearn.naive_bayes import GaussianNB, MultinomialNB, BernoulliNB
from sklearn.externals import joblib
from sklearn.utils import shuffle
from pymongo import MongoClient
import helpers

from azureml.core.run import Run

In [2]:
def get_data(data_dir, number_of_samples, shape, color_insensitive):
    if color_insensitive:
        converter = lambda x: 1 - rgb2gray(x)  # inverting the image makes it process faster
    else:
        converter = None
    X, y = helpers.images_to_dataset(dataset_path=data_dir,
                                     shape=shape,
                                     smoothing=0.1,
                                     denoising=0.0,
                                     with_hog_attached=True,
                                     with_dominant_color_attached=True,
                                     pixels_per_cell=(shape[0] / 8, shape[0] / 8),
                                     cells_per_block=(8, 8),
                                     orientations=9,
                                     samples=number_of_samples,
                                     converter=converter,
                                     debug=False)
    X = preprocessing.scale(X, with_mean=False)
    return X, y

In [11]:
client = MongoClient(host="localhost", port=27017)
natural_data = client.get_database("lego_vision").get_collection("natural_data_hog_dom")
synthetic_data = client.get_database("lego_vision").get_collection("synthetic_data_hog_dom")
for data_dir, is_syn in [("D:\LEGO Vision Datasets\classification-natural-data", False),
                         ("D:\LEGO Vision Datasets\classification-synthetic-data", True)]:
    for shape in ['32', '64', '128', '256']:
        shape = (int(shape), int(shape))
        for color_insensitive in [0, 1]:
            if is_syn:
                X, y = get_data(data_dir, 3200, shape, color_insensitive)
                synthetic_data.insert_many(documents=[
                    dict(_id=hash(an_X.tostring()), features=an_X.tolist(), label=an_y, shape=shape[0],
                         color=True if color_insensitive else False) for an_X, an_y in zip(X, y)])
            else:
                X, y = get_data(data_dir, 3200, shape, color_insensitive)      
                natural_data.insert_many(documents=[
                    dict(_id=hash(an_X.tostring()), features=an_X.tolist(), label=an_y, shape=shape[0],
                         color=True if color_insensitive else False) for an_X, an_y in zip(X, y)])

BulkWriteError: batch op errors occurred