In [1]:
import numpy as np
from matplotlib import pyplot as plt
import math
import random

import cv2
from sklearn.metrics import precision_score, recall_score, log_loss

import glob
import ntpath
from collections import Counter
import pickle
import copyreg
from tqdm import tqdm


In [2]:
%matplotlib notebook

In [3]:
def find_keypoints(path=r'/home/syakushin/DONE_SAME_SCALE/dress/*.jpg'):
    """Function that calculates all KAZE keypoints of images 
    corresponding to path template
    
    first part of the name before '_' (FIRSTPART_1_2_3.jpg) 
    is supposed to be similar for the photos of one thing

    Args:
        path: full path to folder with images. '*.jpg' is needed

    Returns:
        A list of (full_filename, keypoint_coordinate, keypoint_description)

    """
    kp_des = []
    kaze = cv2.KAZE_create()
    for filename_a in tqdm(glob.glob(path)):
        img1 = cv2.imread(filename_a, 0)
        kp1, des1 = kaze.detectAndCompute(img1,None)
        if (des1 is not None):
            kp_des.append((filename_a, kp1, des1))
        kp_des = sorted(kp_des, key = lambda x:x[0])
    return kp_des

In [4]:
def getComponents(normalised_homography):
  '''Calculates translationx, translationy, rotation, 
  scalex, scaley and  shear of homography matrix
  
    Args:
    normalised_homography: (3,3) numpy array

    Returns:
        translationx, translationy, rotation, 
        scalex, scaley and shear
  '''
  a = normalised_homography[0,0]
  b = normalised_homography[0,1]
  c = normalised_homography[0,2]
  d = normalised_homography[1,0]
  e = normalised_homography[1,1]
  f = normalised_homography[1,2]

  p = math.sqrt(a*a + b*b)
  r = (a*e - b*d)/(p)
  q = (a*d+b*e)/(a*e - b*d)

  translation_x, translation_y = (c,f)
  scale_x, scale_y = (p,r)
  shear = q
  theta = math.atan2(b,a)

  return (translation_x, translation_y, theta, scale_x, scale_y, shear)

In [20]:
def create_dataset_p(kp_des, MIN_MATCH_COUNT=4, compare_limit=256, negative_groups=1, ransacReprojThreshold=4, h=840, w=560):
    """Function that creates dataset from keypoints  of 'find_keypoints'
    
    Args:
        kp_des: keypoints from 'find_keypoints' output
        MIN_MATCH_COUNT: minimal number of keypoint matches to find a homography matrix
        compare_limit: max number of image comparison with images of particular item
        negative_groups: how many items are compared with another item. 
            Value 1 leads to more balanced dataset. 
            Value 0 leads to only positive examples.
        ransacReprojThreshold: findHomography parameter of RANSAC distance theshold

    Returns:
        X_data_bc: list of normalized features for each pair:
                number of matched pairs in homography
                translationx
                translationy
                rotation
                maximal  scale change
                scale change aspect ratio
                shear
                mean distance of matching keypoints description 
                max distance of matching keypoints description
                mean distance between homography and true keypoint 
                max distance between homography and true keypoint 
        y_data_b: list of true answers if a pair of images belongs to a single item 
        idx_data_b: indices of two images according to kp_des for described pairs
        idx_data_empty_b: indices of two images according to kp_des 
            for pairs with failed matching
        y_data_empty_b: list of true answers for pairs with failed matching 

    """
    kaze = cv2.KAZE_create()

    X_data_b = []
    y_data_b = []
    idx_data_b = []
    idx_data_empty_b = []
    y_data_empty_b = []                 

    kp_des_size = len(kp_des)

    for idx, (filename_a, kp1, des1) in tqdm(enumerate(kp_des)):
            next_touched = 0
            next_name = ''
            for idx_b in range(idx, np.minimum(kp_des_size, idx + compare_limit)):
                if next_touched <= negative_groups:
                    filename_b, kp2, des2 = kp_des[idx_b]
                    if not filename_a==filename_b:
                        if not ntpath.basename(filename_a).split("_")[0]==ntpath.basename(filename_b).split("_")[0]:
                            if next_touched < negative_groups:
                                next_touched += 1
                                next_name = ntpath.basename(filename_b).split("_")[0]
                            elif next_touched == negative_groups and not ntpath.basename(filename_b).split("_")[0]==next_name:
                                break
                        bf = cv2.BFMatcher(crossCheck=True)
                        idx_data_empty_b.append((idx, idx_b))
                        y_data_empty_b.append(int(int(ntpath.basename(filename_a).split("_")[0]) == int(ntpath.basename(filename_b).split("_")[0])))
                        if des2 is not None:
                            matches = bf.match(des1,des2)
                            matches = sorted(matches, key = lambda x:x.distance)
                            if len(matches) >= MIN_MATCH_COUNT:
                                src_pts = np.float32([ kp1[m.queryIdx].pt for m in matches ]).reshape(-1,1,2)
                                dst_pts = np.float32([ kp2[m.trainIdx].pt for m in matches ]).reshape(-1,1,2)
                                features = []
                                M, mask = cv2.findHomography(src_pts, dst_pts, cv2.RANSAC, ransacReprojThreshold)
                                
                                if M is not None:
                                    matchesMask = np.where(np.array(mask.ravel().tolist())==1)
                                    distances = np.abs([o.distance for o in matches])[matchesMask]
                                    match_max = np.max(distances)
                                    match_meansq = np.mean(distances)

                                    dst = np.array(cv2.perspectiveTransform(src_pts[matchesMask], M))

                                    dist = np.linalg.norm(dst_pts[matchesMask] - dst, axis=2)
                                    src_arr_pts = np.array(src_pts[matchesMask]).reshape(-1,2)
#                                     print(src_arr_pts)
                                    size = ((np.max(src_arr_pts[:,0]) - np.min(src_arr_pts[:,0]))*
                                            (np.max(src_arr_pts[:,1]) - np.min(src_arr_pts[:,1])))
                                    
#                                     print(np.min(src_arr_pts[:,1]))
                                    
                                    meansq_dist = np.mean(dist)
                                    max_dist = np.max(dist)
                                    translation_x, translation_y, rotation, scale_x, scale_y, shear = getComponents(M)
                                    features_i = [(np.sum(mask)-3)**(-1),
                                                  np.abs(translation_x)/2e6, 
                                                  np.abs(translation_y)/2e6, 
                                                  np.abs(rotation)/math.pi, 
                                                  np.min([np.abs(scale_x)**(-1), 
                                                         np.abs(scale_x),
                                                         np.abs(scale_y)**(-1), 
                                                         np.abs(scale_y)]), 
                                                  np.minimum(np.abs(scale_x/scale_y), np.abs(scale_y/scale_x)), 
                                                  np.abs(shear)/1e7,
                                                  match_max,
                                                  match_meansq,
                                                  0.1*meansq_dist/ransacReprojThreshold,
                                                  0.1*max_dist/ransacReprojThreshold,
                                                  size/w/h]    
                                    features.append(features_i)
                                    X_data_b.append(features)
                                    y_data_b.append(int(int(ntpath.basename(filename_a).split("_")[0]) == int(ntpath.basename(filename_b).split("_")[0])))
                                    del y_data_empty_b[-1]
                                    idx_data_b.append((idx, idx_b))
                                    del idx_data_empty_b[-1]
    X_data_bc = np.array(X_data_b)

    return X_data_bc, y_data_b, idx_data_b, idx_data_empty_b, y_data_empty_b

In [39]:
# from skimage import data
# from skimage.util import img_as_float
# from skimage.feature import (corner_harris, corner_subpix, corner_peaks,
#                              plot_matches)
from skimage.transform import warp, AffineTransform
# from skimage.exposure import rescale_intensity
# from skimage.color import rgb2gray
from skimage.measure import ransac

In [79]:
def create_dataset_affine(kp_des, max_trials=1000, MIN_MATCH_COUNT=4, compare_limit=256, negative_groups=1, ransacReprojThreshold=4, h=840, w=560):
    """Function that creates dataset from keypoints  of 'find_keypoints'
    
    Args:
        kp_des: keypoints from 'find_keypoints' output
        MIN_MATCH_COUNT: minimal number of keypoint matches to find a homography matrix
        compare_limit: max number of image comparison with images of particular item
        negative_groups: how many items are compared with another item. 
            Value 1 leads to more balanced dataset. 
            Value 0 leads to only positive examples.
        ransacReprojThreshold: parameter of RANSAC distance theshold for Affine matrix

    Returns:
        X_data_bc: list of normalized features for each pair:
                number of matched pairs in homography
                translationx
                translationy
                rotation
                maximal  scale change
                scale change aspect ratio
                shear
                mean distance of matching keypoints description 
                max distance of matching keypoints description
                mean distance between homography and true keypoint 
                max distance between homography and true keypoint 
        y_data_b: list of true answers if a pair of images belongs to a single item 
        idx_data_b: indices of two images according to kp_des for described pairs
        idx_data_empty_b: indices of two images according to kp_des 
            for pairs with failed matching
        y_data_empty_b: list of true answers for pairs with failed matching 

    """
    kaze = cv2.KAZE_create()
    X_data_b = []
    y_data_b = []
    idx_data_b = []
    idx_data_empty_b = []
    y_data_empty_b = []                 
    kp_des_size = len(kp_des)
    for idx, (filename_a, kp1, des1) in tqdm(enumerate(kp_des)):
            next_touched = 0
            next_name = ''
            for idx_b in range(idx, np.minimum(kp_des_size, idx + compare_limit)):
                if next_touched <= negative_groups:
                    filename_b, kp2, des2 = kp_des[idx_b]
                    if not filename_a==filename_b:
                        if not ntpath.basename(filename_a).split("_")[0]==ntpath.basename(filename_b).split("_")[0]:
                            if next_touched < negative_groups:
                                next_touched += 1
                                next_name = ntpath.basename(filename_b).split("_")[0]
                            elif next_touched == negative_groups and not ntpath.basename(filename_b).split("_")[0]==next_name:
                                break
                        bf = cv2.BFMatcher(crossCheck=True)
                        idx_data_empty_b.append((idx, idx_b))
                        y_data_empty_b.append(int(int(ntpath.basename(filename_a).split("_")[0]) == int(ntpath.basename(filename_b).split("_")[0])))
                        if des2 is not None:
                            matches = bf.match(des1,des2)
                            matches = sorted(matches, key = lambda x:x.distance)
                            if len(matches) >= MIN_MATCH_COUNT:
                                src_pts = np.float32([ kp1[m.queryIdx].pt for m in matches ]).reshape(-1,2)[:,[1, 0]]
                                dst_pts = np.float32([ kp2[m.trainIdx].pt for m in matches ]).reshape(-1,2)[:,[1, 0]]
                                features = []
                                features_i = [0,                                                  
                                              1, 
                                              1, 
                                              1, 
                                              0, 
                                              0, 
                                              0,
                                              1,
                                              1,
                                              1,
                                              1,
                                              0]
                                try:
                                    model_robust, inliers = ransac((src_pts, dst_pts), 
                                                               AffineTransform, 
                                                                max_trials=max_trials,
                                                               residual_threshold=ransacReprojThreshold,
                                                               min_samples=MIN_MATCH_COUNT
                                                              )
                                    outliers = inliers == False
                                    if model_robust is not None:
                                        matchesMask = np.array(inliers)
                                        distances = np.abs([o.distance for o in matches])[matchesMask]
                                        match_max = np.max(distances)
                                        match_meansq = np.mean(distances)
                                        dist = model_robust.residuals(src_pts[matchesMask], dst_pts[matchesMask])
                                        src_arr_pts = np.array(src_pts[matchesMask]).reshape(-1,2)
                                        size = ((np.max(src_arr_pts[:,0]) - np.min(src_arr_pts[:,0]))*
                                                (np.max(src_arr_pts[:,1]) - np.min(src_arr_pts[:,1])))
                                        meansq_dist = np.mean(dist)
                                        max_dist = np.max(dist)
                                        translation_x, translation_y = model_robust.translation[0], model_robust.translation[1] 
                                        scale_x, scale_y = model_robust.scale[0], model_robust.scale[1] 
                                        features_i = [np.sum(inliers),
                                                      np.abs(translation_x), 
                                                      np.abs(translation_y), 
                                                      np.abs(model_robust.rotation)/math.pi, 
                                                      np.min([np.abs(scale_x)**(-1), 
                                                             np.abs(scale_x),
                                                             np.abs(scale_y)**(-1), 
                                                             np.abs(scale_y)]), 
                                                      np.minimum(np.abs(scale_x/scale_y), np.abs(scale_y/scale_x)), 
                                                      np.abs(model_robust.shear),
                                                      match_max,
                                                      match_meansq,
                                                      0.1*meansq_dist/ransacReprojThreshold,
                                                      0.1*max_dist/ransacReprojThreshold,
                                                      size/w/h]
                                
                                except:
                                    pass
                                M = model_robust.params
                                features.append(features_i)
                                X_data_b.append(features)
                                y_data_b.append(int(int(ntpath.basename(filename_a).split("_")[0]) == int(ntpath.basename(filename_b).split("_")[0])))
                                del y_data_empty_b[-1]
                                idx_data_b.append((idx, idx_b))
                                del idx_data_empty_b[-1]
    X_data_bc = np.array(X_data_b)

    return X_data_bc, y_data_b, idx_data_b, idx_data_empty_b, y_data_empty_b

In [6]:
def get_safe_balanced_split(target, trainSize=0.8, getTestIndexes=True, shuffle=False, seed=None):
    """Function that splits dataset to a balanced train and test
    
    Args:
        target: 'y' - answers
        trainSize:
        getTestIndexes: 
        shuffle: 
        seed: random seed

    Returns:
        trainIndexes, [testIndexes]

    """
    classes, counts = np.unique(target, return_counts=True)
    nPerClass = float(len(target))*float(trainSize)/float(len(classes))
    if nPerClass > np.min(counts):
        print("Insufficient data to produce a balanced training data split.")
        print("Classes found %s"%classes)
        print("Classes count %s"%counts)
        ts = float(trainSize*np.min(counts)*len(classes)) / float(len(target))
        print("trainSize is reset from %s to %s"%(trainSize, ts))
        trainSize = ts
        nPerClass = float(len(target))*float(trainSize)/float(len(classes))
    # get number of classes
    nPerClass = int(nPerClass)
    print("Data splitting on %i classes and returning %i per class"%(len(classes),nPerClass ))
    # get indexes
    trainIndexes = []
    for c in classes:
        if seed is not None:
            np.random.seed(seed)
        cIdxs = np.where(target==c)[0]
        cIdxs = np.random.choice(cIdxs, nPerClass, replace=False)
        trainIndexes.extend(cIdxs)
    # get test indexes
    testIndexes = None
    if getTestIndexes:
        testIndexes = list(set(range(len(target))) - set(trainIndexes))
    # shuffle
    if shuffle:
        trainIndexes = random.shuffle(trainIndexes)
        if testIndexes is not None:
            testIndexes = random.shuffle(testIndexes)
    # return indexes
    return trainIndexes, testIndexes

In [7]:
kp_des = find_keypoints(r'/srv/hd1/data/syakushin/dress/*.jpg')

100%|██████████| 5158/5158 [30:57<00:00,  2.78it/s]


In [None]:
# kp_des_blouse = find_keypoints(r'/home/syakushin/DONE_SAME_SCALE/blouse/*.jpg')

In [None]:
X_data_dress_8_aff, y_data_dress_8_aff, idx_data_dress_8_aff, idx_data_empty_dress_8_aff, y_data_empty_dress_8_aff = create_dataset_affine(kp_des, MIN_MATCH_COUNT=4, compare_limit=256, negative_groups=2, ransacReprojThreshold=8)
np.save("X_data_dress_8_aff", X_data_dress_8_aff)
np.save("y_data_dress_8_aff", y_data_dress_8_aff)
np.save("idx_data_dress_8_aff", idx_data_dress_8_aff)
np.save("idx_data_empty_dress_8_aff", idx_data_empty_dress_8_aff)
np.save("y_data_empty_dress_8_aff", y_data_empty_dress_8_aff)










0it [00:00, ?it/s][A[A[A[A[A[A[A[A[A








1it [00:01,  1.52s/it][A[A[A[A[A[A[A[A[A








  return umr_minimum(a, axis, None, out, keepdims)









3it [00:04,  1.50s/it][A[A[A[A[A[A[A[A[A








  out=out, **kwargs)
  ret, rcount, out=ret, casting='unsafe', subok=False)
  rms = math.sqrt(np.sum((points - centroid) ** 2) / points.shape[0])









5it [00:08,  1.67s/it][A[A[A[A[A[A[A[A[A








6it [00:10,  1.70s/it][A[A[A[A[A[A[A[A[A








7it [00:11,  1.67s/it][A[A[A[A[A[A[A[A[A








8it [00:13,  1.63s/it][A[A[A[A[A[A[A[A[A








9it [00:14,  1.58s/it][A[A[A[A[A[A[A[A[A








10it [00:15,  1.54s/it][A[A[A[A[A[A[A[A[A








11it [00:16,  1.49s/it][A[A[A[A[A[A[A[A[A








12it [00:17,  1.44s/it][A[A[A[A[A[A[A[A[A








13it [00:18,  1.42s/it][A[A[A[A[A[A[A[A[A








14it [00:19,  1.36s/it][A[A[A[A[A[A[A[A[A








15it [00:20,  1.35s/i

127it [01:37,  1.30it/s][A[A[A[A[A[A[A[A[A








128it [01:40,  1.28it/s][A[A[A[A[A[A[A[A[A








129it [01:42,  1.26it/s][A[A[A[A[A[A[A[A[A








130it [01:44,  1.24it/s][A[A[A[A[A[A[A[A[A








131it [01:46,  1.23it/s][A[A[A[A[A[A[A[A[A








132it [01:48,  1.21it/s][A[A[A[A[A[A[A[A[A








133it [01:50,  1.20it/s][A[A[A[A[A[A[A[A[A








134it [01:52,  1.19it/s][A[A[A[A[A[A[A[A[A








135it [01:54,  1.18it/s][A[A[A[A[A[A[A[A[A








136it [01:57,  1.16it/s][A[A[A[A[A[A[A[A[A








137it [01:59,  1.15it/s][A[A[A[A[A[A[A[A[A








138it [02:01,  1.14it/s][A[A[A[A[A[A[A[A[A








139it [02:03,  1.12it/s][A[A[A[A[A[A[A[A[A








140it [02:05,  1.11it/s][A[A[A[A[A[A[A[A[A








141it [02:07,  1.10it/s][A[A[A[A[A[A[A[A[A








142it [02:09,  1.09it/s][A[A[A[A[A[A[A[A[A








143it [02:11,  1.09it/s][A[A[A[A[A

400it [05:18,  1.25it/s][A[A[A[A[A[A[A[A[A








401it [05:19,  1.26it/s][A[A[A[A[A[A[A[A[A








402it [05:20,  1.26it/s][A[A[A[A[A[A[A[A[A








403it [05:20,  1.26it/s][A[A[A[A[A[A[A[A[A








404it [05:21,  1.26it/s][A[A[A[A[A[A[A[A[A








405it [05:22,  1.26it/s][A[A[A[A[A[A[A[A[A








406it [05:23,  1.26it/s][A[A[A[A[A[A[A[A[A








407it [05:23,  1.26it/s][A[A[A[A[A[A[A[A[A








408it [05:24,  1.26it/s][A[A[A[A[A[A[A[A[A








409it [05:25,  1.26it/s][A[A[A[A[A[A[A[A[A








410it [05:26,  1.25it/s][A[A[A[A[A[A[A[A[A








411it [05:27,  1.25it/s][A[A[A[A[A[A[A[A[A








412it [05:28,  1.25it/s][A[A[A[A[A[A[A[A[A








413it [05:29,  1.25it/s][A[A[A[A[A[A[A[A[A








414it [05:29,  1.25it/s][A[A[A[A[A[A[A[A[A








415it [05:30,  1.26it/s][A[A[A[A[A[A[A[A[A








416it [05:31,  1.25it/s][A[A[A[A[A

672it [10:42,  1.05it/s][A[A[A[A[A[A[A[A[A








673it [10:43,  1.05it/s][A[A[A[A[A[A[A[A[A








674it [10:44,  1.05it/s][A[A[A[A[A[A[A[A[A








675it [10:45,  1.04it/s][A[A[A[A[A[A[A[A[A








676it [10:47,  1.04it/s][A[A[A[A[A[A[A[A[A








677it [10:48,  1.04it/s][A[A[A[A[A[A[A[A[A








678it [10:49,  1.04it/s][A[A[A[A[A[A[A[A[A








679it [10:49,  1.04it/s][A[A[A[A[A[A[A[A[A








680it [10:52,  1.04it/s][A[A[A[A[A[A[A[A[A








681it [10:55,  1.04it/s][A[A[A[A[A[A[A[A[A








682it [10:58,  1.04it/s][A[A[A[A[A[A[A[A[A








683it [11:00,  1.03it/s][A[A[A[A[A[A[A[A[A








684it [11:03,  1.03it/s][A[A[A[A[A[A[A[A[A








685it [11:05,  1.03it/s][A[A[A[A[A[A[A[A[A








686it [11:08,  1.03it/s][A[A[A[A[A[A[A[A[A








687it [11:10,  1.02it/s][A[A[A[A[A[A[A[A[A








688it [11:12,  1.02it/s][A[A[A[A[A

944it [16:22,  1.04s/it][A[A[A[A[A[A[A[A[A








945it [16:23,  1.04s/it][A[A[A[A[A[A[A[A[A








946it [16:23,  1.04s/it][A[A[A[A[A[A[A[A[A








947it [16:24,  1.04s/it][A[A[A[A[A[A[A[A[A








948it [16:25,  1.04s/it][A[A[A[A[A[A[A[A[A








949it [16:25,  1.04s/it][A[A[A[A[A[A[A[A[A








950it [16:26,  1.04s/it][A[A[A[A[A[A[A[A[A








951it [16:27,  1.04s/it][A[A[A[A[A[A[A[A[A








952it [16:29,  1.04s/it][A[A[A[A[A[A[A[A[A








953it [16:30,  1.04s/it][A[A[A[A[A[A[A[A[A








954it [16:31,  1.04s/it][A[A[A[A[A[A[A[A[A








955it [16:32,  1.04s/it][A[A[A[A[A[A[A[A[A








956it [16:33,  1.04s/it][A[A[A[A[A[A[A[A[A








957it [16:34,  1.04s/it][A[A[A[A[A[A[A[A[A








958it [16:36,  1.04s/it][A[A[A[A[A[A[A[A[A








959it [16:39,  1.04s/it][A[A[A[A[A[A[A[A[A








960it [16:41,  1.04s/it][A[A[A[A[A

In [99]:
X_data = X_data_dress_1_aff

In [100]:
y_data = np.array(y_data_dress_1_aff)

In [109]:
trainIndexes, testIndexes = get_safe_balanced_split(y_data)

Insufficient data to produce a balanced training data split.
Classes found [0 1]
Classes count [58807 31729]
trainSize is reset from 0.8 to 0.5607316426614828
Data splitting on 2 classes and returning 25383 per class


In [110]:
X_train = [X_data[x] for x in trainIndexes]
y_train = [y_data[x] for x in trainIndexes]
X_test = [X_data[x] for x in testIndexes]
y_test = [y_data[x] for x in testIndexes]

**ДАЛЕЕ ЕЩЕ НЕ ДОШЛО, НИЖЕ - ПРЕДЫДУЩИЕ РЕЗУЛЬТАТЫ**

In [111]:
np.random.seed(0)
from sklearn import datasets
from sklearn.naive_bayes import GaussianNB
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier
from sklearn.svm import LinearSVC
from sklearn.calibration import calibration_curve

In [112]:
lr = LogisticRegression(solver='lbfgs', C=6, verbose=1, class_weight='balanced', max_iter=1000, n_jobs=8)
lr.fit(X_train, y_train)
y_pred_lr = lr.predict(X_test)
print("recall = {}, precision = {}".format(recall_score(y_test, y_pred_lr), precision_score(y_test, y_pred_lr)))

[Parallel(n_jobs=8)]: Done   1 out of   1 | elapsed:   21.8s finished


recall = 0.5885597226599433, precision = 0.2678187293847698


In [113]:
classifier_lsvc = LinearSVC(random_state=0, C=0.1, class_weight='balanced', max_iter=1000, tol=1e-6)
classifier_lsvc.fit(X_train, y_train)
y_pred_lr = classifier_lsvc.predict(X_test)
print("recall = {}, precision = {}".format(recall_score(y_test, y_pred_lr), precision_score(y_test, y_pred_lr)))

recall = 0.5824141191301607, precision = 0.2702742230347349


In [114]:
model_rf = RandomForestClassifier(n_estimators = 2000, n_jobs = -1, random_state =50, max_features = 9, min_samples_leaf = 5)
model_rf.fit(X_train, y_train)
y_pred_lr = model_rf.predict(X_test)
y_proba=model_rf.predict_proba(X_test)[:,1]
print("recall = {}, precision = {}, logloss = {}".format(recall_score(y_test, y_pred_lr), precision_score(y_test, y_pred_lr), log_loss(y_test, y_proba)))


recall = 0.5568862275449101, precision = 0.3237746220797068, logloss = 0.5892866711624382


In [119]:
X_data = np.hstack((X_data_dress_1_p.tolist(), 
                    X_data_dress_4_p.tolist()[:12860] + X_data_dress_4_p.tolist()[12861:50469] + X_data_dress_4_p.tolist()[50470:57096] + X_data_dress_4_p.tolist()[57097:], 
                    X_data_dress_8_p.tolist()[:2596] + 
                    X_data_dress_8_p.tolist()[2597:12861]+
                    X_data_dress_8_p.tolist()[12862:38453]+
                    X_data_dress_8_p.tolist()[38454:50471]+
                    X_data_dress_8_p.tolist()[50472:55280]+
                    X_data_dress_8_p.tolist()[55281:57099]+
                    X_data_dress_8_p.tolist()[57100:88026]+
                    X_data_dress_8_p.tolist()[88027:],
                    X_data_dress_16_p.tolist()[:2596] + 
                    X_data_dress_16_p.tolist()[2597:12861]+
                    X_data_dress_16_p.tolist()[12862:38453]+
                    X_data_dress_16_p.tolist()[38454:50471]+
                    X_data_dress_16_p.tolist()[50472:55280]+
                    X_data_dress_16_p.tolist()[55281:57099]+
                    X_data_dress_16_p.tolist()[57100:88026]+
                    X_data_dress_16_p.tolist()[88027:]))
X_data = X_data.transpose(0,2,1).reshape(-1, 4*11)

In [120]:
trainIndexes, testIndexes = get_safe_balanced_split(y_data)

Insufficient data to produce a balanced training data split.
Classes found [0 1]
Classes count [58807 31729]
trainSize is reset from 0.8 to 0.5607316426614828
Data splitting on 2 classes and returning 25383 per class


In [121]:
X_train = [X_data[x] for x in trainIndexes]
y_train = [y_data[x] for x in trainIndexes]
X_test = [X_data[x] for x in testIndexes]
y_test = [y_data[x] for x in testIndexes]

In [126]:
lr = LogisticRegression(solver='lbfgs', C=10, verbose=1, class_weight='balanced', max_iter=1000, n_jobs=8)
lr.fit(X_train, y_train)
y_pred_lr = lr.predict(X_test)
print("recall = {}, precision = {}".format(recall_score(y_test, y_pred_lr), precision_score(y_test, y_pred_lr)))

[Parallel(n_jobs=8)]: Done   1 out of   1 | elapsed:   29.6s finished


recall = 0.5877718247715096, precision = 0.26696249642141423


In [138]:
classifier_lsvc = LinearSVC(random_state=0, C=0.05, class_weight='balanced', max_iter=1000, tol=1e-6)
classifier_lsvc.fit(X_train, y_train)
y_pred_lr = classifier_lsvc.predict(X_test)
print("recall = {}, precision = {}".format(recall_score(y_test, y_pred_lr), precision_score(y_test, y_pred_lr)))

recall = 0.5820989599747872, precision = 0.2707218761451081


In [135]:
model_rf = RandomForestClassifier(n_estimators = 2000, n_jobs = -1, random_state =50, max_features = 44, min_samples_leaf = 5)
model_rf.fit(X_train, y_train)
y_pred_lr = model_rf.predict(X_test)
y_proba=model_rf.predict_proba(X_test)[:,1]
print("recall = {}, precision = {}, logloss = {}".format(recall_score(y_test, y_pred_lr), precision_score(y_test, y_pred_lr), log_loss(y_test, y_proba)))


recall = 0.5624015127639458, precision = 0.31522699169757995, logloss = 0.592270003260718


In [139]:
lr_0 = LogisticRegression(solver='lbfgs', C=6, verbose=1, class_weight='balanced', n_jobs=8)
lr_0.fit(np.array(X_train)[:,0:11], y_train)
lr_1 = LogisticRegression(solver='lbfgs', C=6, verbose=1, class_weight='balanced', n_jobs=8)
lr_1.fit(np.array(X_train)[:,11:22], y_train)
lr_2 = LogisticRegression(solver='lbfgs', C=6, verbose=1, class_weight='balanced', n_jobs=8)
lr_2.fit(np.array(X_train)[:,22:33], y_train)
y_pred_lr0 = lr_0.predict(np.array(X_test)[:,0:11])
y_pred_lr1 = lr_1.predict(np.array(X_test)[:,11:22])
y_pred_lr2 = lr_2.predict(np.array(X_test)[:,22:33])

model_rf0 = RandomForestClassifier(n_estimators = 2000,random_state =50, max_features = 9, min_samples_leaf = 5, n_jobs=8)
model_rf0.fit(np.array(X_train)[:,0:11], y_train)
model_rf1 = RandomForestClassifier(n_estimators = 2000, random_state =50, max_features = 9, min_samples_leaf = 5, n_jobs=8)
model_rf1.fit(np.array(X_train)[:,11:22], y_train)
model_rf2 = RandomForestClassifier(n_estimators = 2000, random_state =50, max_features = 9, min_samples_leaf = 5, n_jobs=8)
model_rf2.fit(np.array(X_train)[:,22:33], y_train)
y_pred_rf0 = model_rf0.predict(np.array(X_test)[:,0:11])
y_pred_rf1 = model_rf1.predict(np.array(X_test)[:,11:22])
y_pred_rf2 = model_rf2.predict(np.array(X_test)[:,22:33])

classifier_lsvc0 = LinearSVC(random_state=0, C=0.1, class_weight='balanced', max_iter=1000, tol=1e-6)
classifier_lsvc0.fit(np.array(X_train)[:,0:11], y_train)
classifier_lsvc1 = LinearSVC(random_state=0, C=0.1, class_weight='balanced', max_iter=1000, tol=1e-6)
classifier_lsvc1.fit(np.array(X_train)[:,11:22], y_train)
classifier_lsvc2 = LinearSVC(random_state=0, C=0.1, class_weight='balanced', max_iter=1000, tol=1e-6)
classifier_lsvc2.fit(np.array(X_train)[:,22:33], y_train)



y_proba0=lr_0.predict_proba(np.array(X_test)[:,0:11])[:,1]
y_proba1=lr_1.predict_proba(np.array(X_test)[:,11:22])[:,1]
y_proba2=lr_2.predict_proba(np.array(X_test)[:,22:33])[:,1]

y_probarf0=model_rf0.predict_proba(np.array(X_test)[:,0:11])[:,1]
y_probarf1=model_rf1.predict_proba(np.array(X_test)[:,11:22])[:,1]
y_probarf2=model_rf2.predict_proba(np.array(X_test)[:,22:33])[:,1]



y_tr_proba0=lr_0.predict_proba(np.array(X_train)[:,0:11])[:,1]
y_tr_proba1=lr_1.predict_proba(np.array(X_train)[:,11:22])[:,1]
y_tr_proba2=lr_2.predict_proba(np.array(X_train)[:,22:33])[:,1]

y_tr_probarf0=model_rf0.predict_proba(np.array(X_train)[:,0:11])[:,1]
y_tr_probarf1=model_rf1.predict_proba(np.array(X_train)[:,11:22])[:,1]
y_tr_probarf2=model_rf2.predict_proba(np.array(X_train)[:,22:33])[:,1]

y_pred_svc0 = classifier_lsvc0.predict(np.array(X_test)[:,0:11])
y_pred_svc1 = classifier_lsvc1.predict(np.array(X_test)[:,11:22])
y_pred_svc2 = classifier_lsvc2.predict(np.array(X_test)[:,22:33])

y_tr_pred_svc0 = classifier_lsvc0.predict(np.array(X_train)[:,0:11])
y_tr_pred_svc1 = classifier_lsvc1.predict(np.array(X_train)[:,11:22])
y_tr_pred_svc2 = classifier_lsvc2.predict(np.array(X_train)[:,22:33])

model_rf_common.fit(np.vstack(
(
y_tr_proba0,
y_tr_proba1,
y_tr_proba2,
y_tr_probarf0,
y_tr_probarf1,
y_tr_probarf2,
y_tr_pred_svc0,
y_tr_pred_svc1,
y_tr_pred_svc2
)
).T, y_train)

y_rf_common = model_rf_common.predict(np.vstack(
(
y_proba0,
y_proba1,
y_proba2,
y_probarf0,
y_probarf1,
y_probarf2,
y_pred_svc0,
y_pred_svc1,
y_pred_svc2
)).T)

y_proba_rf_common = model_rf_common.predict_proba(np.vstack(
(y_proba0,
y_proba1,
y_proba2,
y_probarf0,
y_probarf1,
y_probarf2,
y_pred_svc0,
y_pred_svc1,
y_pred_svc2
)).T)


print("recall = {}, precision = {}, logloss = {}".format(recall_score(y_test, y_rf_common), precision_score(y_test, y_rf_common), log_loss(y_test, y_proba_rf_common)))

recall = 0.43696816892530727, precision = 0.3443864878291108, logloss = 2.749522206370469


In [57]:
np.max(X_data_dress_1, axis=0)

array([[9.75200000e+03, 2.55209691e+06, 2.19701085e+06, 3.14154559e+00,
        6.22084175e+04, 2.32094736e+04, 1.16327147e+07]])

In [58]:
np.min(np.abs(X_data_dress_1), axis=0)

array([[4.00000000e+00, 7.14365367e-15, 2.93899502e-14, 2.87911484e-17,
        1.73345649e-03, 4.34100568e-08, 7.97470452e-17]])

In [59]:
np.min(X_data_dress_1, axis=0)

array([[ 4.00000000e+00, -2.47461028e+06, -3.68806252e+06,
        -3.14158745e+00,  1.73345649e-03, -4.01423666e+03,
        -1.33809768e+06]])

In [62]:
np.min(X_data_dress_1, axis=0)

array([[1.02574623e-04, 3.57182683e-21, 1.46949751e-20, 9.16450716e-18,
        4.34100568e-08, 6.92604894e-10, 7.97470452e-24]])

In [63]:
for attr in range(7):
    print("{}: {} vs {}".format(attr, np.mean(np.array(X_data_dress_1)[np.where(np.array(y_data_dress_1)==0)[0].tolist(),0,attr]),
          np.mean(np.array(X_data_dress_1)[np.where(np.array(y_data_dress_1)==1)[0].tolist(),0,attr])))

0: 0.48287702830795576 vs 0.434274035989538
1: 0.0003169627214605548 vs 0.0002857639096767088
2: 0.0005039974729245563 vs 0.00041675132147805
3: 0.7197063460906475 vs 0.6492341947938064
4: 0.15474356772741413 vs 0.2023425192310393
5: 0.8364813976405246 vs 0.7794135771094866
6: 6.753206951445352e-05 vs 7.084766362914253e-05


In [64]:
for attr in range(7):
    print("{}: {} vs {}".format(attr, np.median(np.array(X_data_dress_1)[np.where(np.array(y_data_dress_1)==0)[0].tolist(),0,attr]),
          np.median(np.array(X_data_dress_1)[np.where(np.array(y_data_dress_1)==1)[0].tolist(),0,attr])))

0: 0.5 vs 0.5
1: 0.00014277453101972691 vs 0.00013799833877229723
2: 0.0001703617487787965 vs 0.0001569680822585866
3: 0.8371754222524841 vs 0.8053717001370825
4: 0.07821734231351637 vs 0.12089038071084868
5: 1.0 vs 1.0
6: 1.105834251972017e-06 vs 5.774624236566336e-07


In [179]:
trainIndexes_dress_1, testIndexes_dress_1 = get_safe_balanced_split(y_data_dress_1)

Data splitting on 2 classes and returning 3919 per class


In [194]:
X_train_dress_1 = [X_data_dress_1[x] for x in trainIndexes_dress_1]

In [195]:
y_train_dress_1 = [y_data_dress_1[x] for x in trainIndexes_dress_1]

In [196]:
X_test_dress_1 = [X_data_dress_1[x] for x in testIndexes_dress_1]

In [197]:
y_test_dress_1 = [y_data_dress_1[x] for x in testIndexes_dress_1]

In [199]:
scale_thr = 3

y_simple_pred = []
for features in X_test_s:
    y_simple_pred.append((features[0]<1/6 and
                         (0.5 < features[5]) and 
                        (1/scale_thr < features[4]) and
                         np.abs(features[3]) < 0.3))

In [202]:
precision_score(y_test_s, y_simple_pred, average='macro')  

0.57878787878787885

In [203]:
precision_score(y_test_s, y_simple_pred, average='micro')

0.85051020408163269

In [204]:
precision_score(y_test_s, y_simple_pred, average='weighted')

0.78735312306740879

In [205]:
precision_score(y_test_s, y_simple_pred)

0.29090909090909089

In [206]:
recall_score(y_test_s, y_simple_pred, average='macro')  

0.51809116809116806

In [207]:
recall_score(y_test_s, y_simple_pred, average='micro')

0.85051020408163269

In [208]:
recall_score(y_test_s, y_simple_pred)

0.059259259259259262

In [60]:
np.random.seed(0)
from sklearn import datasets
from sklearn.naive_bayes import GaussianNB
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier
from sklearn.svm import LinearSVC
from sklearn.calibration import calibration_curve

In [209]:
lr = LogisticRegression(solver='lbfgs', C=1, verbose=1, class_weight='balanced', n_jobs=16)
lr.fit(X_train_dress_1, y_train_dress_1)


[Parallel(n_jobs=1)]: Done   1 out of   1 | elapsed:    0.0s finished


LogisticRegression(C=1.0, class_weight='balanced', dual=False,
          fit_intercept=True, intercept_scaling=1, max_iter=100,
          multi_class='ovr', n_jobs=1, penalty='l2', random_state=None,
          solver='lbfgs', tol=0.0001, verbose=1, warm_start=False)

In [210]:
lr.score(X_train_s, y_train_s)

0.51288594029089052

In [211]:
lr.score(X_test_s, y_test_s)

0.62397959183673468

In [212]:
y_pred_lr = lr.predict(X_test_s)

In [219]:
precision_score(y_test_s, y_pred_lr, average='macro')  

0.51404347441190434

In [220]:
precision_score(y_test_s, y_pred_lr, average='micro')

0.62397959183673468

In [221]:
precision_score(y_test_s, y_pred_lr, average='weighted')

0.7733612264771188

In [222]:
precision_score(y_test_s, y_pred_lr)

0.1561119293078056

In [224]:
recall_score(y_test_s, y_pred_lr, average='macro')  

0.52676966907736134

In [225]:
recall_score(y_test_s, y_pred_lr, average='micro')

0.62397959183673468

In [226]:
recall_score(y_test_s, y_pred_lr, average='weighted')

0.62397959183673468

In [227]:
recall_score(y_test_s, y_pred_lr)

0.3925925925925926

In [153]:
y_proba=lr.predict_proba(X_test)[:,1]

In [155]:
log_loss(y_test, y_proba)

0.73541963757310214

In [157]:
lr.intercept_

array([-0.01676302])

In [156]:
lr.coef_

array([[  3.31340167e-02,  -1.00369353e-04,  -1.77983958e-05,
          1.33083926e-02,  -3.48076988e-02,   4.06765520e-02,
         -6.32365498e-06,  -1.53714891e-02,   8.58076131e-06,
          2.70563267e-02]])

In [163]:
for f, g in zip(lr.coef_[0], ["np.sum(mask)", "translation_x", "translation_y", "rotation", "scale_x", "scale_x/scale_y", "shear"]):
    print("{:15s} - {:+1.6f}".format(g,f))

np.sum(mask)    - +0.033134
translation_x   - -0.000100
translation_y   - -0.000018
rotation        - +0.013308
scale_x         - -0.034808
scale_y         - +0.040677
shear           - -0.000006
1/scale_x       - -0.015371
scale_x/scale_y - +0.000009
scale_y/scale_x - +0.027056


In [228]:
import keras, keras.layers as L
import tensorflow as tf

config = tf.ConfigProto( device_count = {'GPU': 1 , 'CPU': 8} ) 
sess = tf.Session(config=config) 
keras.backend.set_session(sess)

In [232]:
model = keras.models.Sequential()
model.add(L.InputLayer(input_shape=[7]))
model.add(L.BatchNormalization())
model.add(L.Dense(512, activation='relu'))
model.add(L.BatchNormalization())
model.add(L.Dropout(0.1))
model.add(L.Dense(512, activation='relu'))
model.add(L.BatchNormalization())
model.add(L.Dropout(0.1))
model.add(L.Dense(1, activation='sigmoid'))
model.compile(optimizer='adam', loss='binary_crossentropy')

In [None]:
model.fit(np.array(X_train_s), (np.array(y_train_s)), epochs=10000, validation_data=[(np.array(X_test_s)), (np.array(y_test_s))])

Train on 7838 samples, validate on 1960 samples
Epoch 1/10000
Epoch 2/10000
Epoch 3/10000
Epoch 4/10000
Epoch 5/10000
Epoch 6/10000
Epoch 7/10000
Epoch 8/10000
Epoch 9/10000
Epoch 10/10000
Epoch 11/10000
Epoch 12/10000
Epoch 13/10000
Epoch 14/10000
Epoch 15/10000
Epoch 16/10000
Epoch 17/10000
Epoch 18/10000
Epoch 19/10000
Epoch 20/10000
Epoch 21/10000
Epoch 22/10000
Epoch 23/10000
Epoch 24/10000
Epoch 25/10000
Epoch 26/10000
Epoch 27/10000
Epoch 28/10000
Epoch 29/10000
Epoch 30/10000
Epoch 31/10000
Epoch 32/10000
Epoch 33/10000
Epoch 34/10000
Epoch 35/10000
Epoch 36/10000
Epoch 37/10000
Epoch 38/10000
Epoch 39/10000
Epoch 40/10000
Epoch 41/10000
Epoch 42/10000
Epoch 43/10000
Epoch 44/10000
Epoch 45/10000
Epoch 46/10000
Epoch 47/10000
Epoch 48/10000
Epoch 49/10000
Epoch 50/10000
Epoch 51/10000
Epoch 52/10000
Epoch 53/10000
Epoch 54/10000
Epoch 55/10000
Epoch 56/10000
Epoch 57/10000
Epoch 58/10000
Epoch 59/10000
Epoch 60/10000
Epoch 61/10000
Epoch 62/10000
Epoch 63/10000
Epoch 64/10000
E

Epoch 152/10000
Epoch 153/10000
Epoch 154/10000
Epoch 155/10000
Epoch 156/10000
Epoch 157/10000
Epoch 158/10000
Epoch 159/10000
Epoch 160/10000
Epoch 161/10000
Epoch 162/10000
Epoch 163/10000
Epoch 164/10000
Epoch 165/10000
Epoch 166/10000
Epoch 167/10000
Epoch 168/10000
Epoch 169/10000
Epoch 170/10000
Epoch 171/10000
Epoch 172/10000
Epoch 173/10000
Epoch 174/10000
Epoch 175/10000
Epoch 176/10000
Epoch 177/10000
Epoch 178/10000
Epoch 179/10000
Epoch 180/10000
Epoch 181/10000
Epoch 182/10000
Epoch 183/10000
Epoch 184/10000
Epoch 185/10000
Epoch 186/10000
Epoch 187/10000
Epoch 188/10000
Epoch 189/10000
Epoch 190/10000
Epoch 191/10000
Epoch 192/10000
Epoch 193/10000
Epoch 194/10000
Epoch 195/10000
Epoch 196/10000
Epoch 197/10000
Epoch 198/10000
Epoch 199/10000
Epoch 200/10000
Epoch 201/10000
Epoch 202/10000
Epoch 203/10000
Epoch 204/10000
Epoch 205/10000
Epoch 206/10000
Epoch 207/10000
Epoch 208/10000
Epoch 209/10000
Epoch 210/10000
Epoch 211/10000
Epoch 212/10000
Epoch 213/10000
Epoch 21

Epoch 302/10000
Epoch 303/10000
Epoch 304/10000
Epoch 305/10000
Epoch 306/10000
Epoch 307/10000
Epoch 308/10000
Epoch 309/10000
Epoch 310/10000
Epoch 311/10000
Epoch 312/10000
Epoch 313/10000
Epoch 314/10000

In [102]:
log_loss(y_test_s, model.predict(np.array(X_test_s)))

  loss = -(transformed_labels * np.log(y_pred)).sum(axis=1)
  loss = -(transformed_labels * np.log(y_pred)).sum(axis=1)


nan

In [103]:
log_loss(y_train_s, model.predict(np.array(X_train_s)))

  loss = -(transformed_labels * np.log(y_pred)).sum(axis=1)
  loss = -(transformed_labels * np.log(y_pred)).sum(axis=1)


nan

In [108]:
np.sum(model.predict(np.array(X_test_s)) > 0.1)

1953

In [117]:
model2 = keras.models.Sequential()
model2.add(L.InputLayer(input_shape=[7]))
model2.add(L.BatchNormalization())
model2.add(L.Dense(64, activation='relu'))
model2.add(L.BatchNormalization())
model2.add(L.Dropout(0.1))
model2.add(L.Dense(64, activation='relu'))
model2.add(L.BatchNormalization())
model2.add(L.Dropout(0.1))
model2.add(L.Dense(64, activation='relu'))
model2.add(L.BatchNormalization())
model2.add(L.Dense(1, activation='sigmoid'))
model2.compile(optimizer='adam', loss='binary_crossentropy')

In [118]:
model2.fit(np.array(X_train_s), (np.array(y_train_s)), epochs=10000, validation_data=[(np.array(X_test_s)), (np.array(y_test_s))])

Train on 7838 samples, validate on 1960 samples
Epoch 1/10000
Epoch 2/10000
Epoch 3/10000
Epoch 4/10000
Epoch 5/10000
Epoch 6/10000
Epoch 7/10000
Epoch 8/10000
Epoch 9/10000
Epoch 10/10000
Epoch 11/10000
Epoch 12/10000
Epoch 13/10000
Epoch 14/10000
Epoch 15/10000
Epoch 16/10000
Epoch 17/10000
Epoch 18/10000
Epoch 19/10000
Epoch 20/10000
Epoch 21/10000
Epoch 22/10000
Epoch 23/10000
Epoch 24/10000
Epoch 25/10000
Epoch 26/10000
Epoch 27/10000
Epoch 28/10000
Epoch 29/10000
Epoch 30/10000
Epoch 31/10000
Epoch 32/10000
Epoch 33/10000
Epoch 34/10000
Epoch 35/10000
Epoch 36/10000
Epoch 37/10000
Epoch 38/10000
Epoch 39/10000
Epoch 40/10000
Epoch 41/10000
Epoch 42/10000
Epoch 43/10000
Epoch 44/10000
Epoch 45/10000
Epoch 46/10000
 672/7838 [=>............................] - ETA: 1s - loss: 0.6864

KeyboardInterrupt: 