In [None]:
import multiprocessing
import os
import time
from multiprocessing import Pool

import cv2
import matplotlib
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import seaborn as sns
import statsmodels.api as sm
from PIL import Image as pil_img
from sklearn.decomposition import PCA
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import (
    accuracy_score,
    confusion_matrix,
    f1_score,
    mean_squared_error,
    precision_recall_curve,
    precision_score,
    recall_score,
    roc_auc_score,
    roc_curve,
)
from sklearn.model_selection import (
    GridSearchCV,
    RepeatedStratifiedKFold,
    StratifiedKFold,
    cross_val_score,
    train_test_split,
)
from sklearn.preprocessing import MinMaxScaler, StandardScaler

In [None]:
# Larger default font sizes for axis labels, titles and tick labels on every
# figure drawn after this cell.
plt_params = {
    'axes.labelsize': 'xx-large',
    'axes.titlesize': 'xx-large',
    'xtick.labelsize': 'x-large',
    'ytick.labelsize': 'xx-large',
}
plt.rcParams.update(plt_params)

In [None]:
class Image():
    """One particle image plus the contour-based shape descriptors derived from it.

    The feature methods depend on state created by earlier calls, in this order:
    ``resize_stretch`` (sets ``self.im``, ``self.height``, ``self.width``) ->
    ``find_contours`` (sets ``self.gray``, ``self.thresh``, ``self.contours``) ->
    optionally ``morph_contours`` (rewrites ``self.thresh`` and ``self.contours``).
    """

    def __init__(self, open_dir, filename):
        """Read ``filename`` from ``open_dir`` and keep it as a 3-channel RGB array.

        Raises:
            OSError: if OpenCV cannot read/decode the file (``imread`` returned None).
        """
        self.filename = filename
        path = os.path.join(open_dir, self.filename)
        raw = cv2.imread(path, cv2.IMREAD_UNCHANGED)
        if raw is None:
            # cv2.imread signals failure by returning None rather than raising.
            raise OSError(f'could not read image file: {path}')
        if raw.ndim == 2:
            # IMREAD_UNCHANGED keeps single-channel files 2-D; expand so the
            # (height, width, channel) layout used below always holds.
            self.image_og = cv2.cvtColor(raw, cv2.COLOR_GRAY2RGB)
        else:
            self.image_og = cv2.cvtColor(raw, cv2.COLOR_BGR2RGB)
        self.height_og, self.width_og = self.image_og.shape[:2]

    def resize_stretch(self, desired_size):
        """Stretch the original image to a ``desired_size`` x ``desired_size`` square."""
        self.im = cv2.resize(self.image_og, (desired_size, desired_size), interpolation = cv2.INTER_AREA)
        self.height, self.width = self.im.shape[:2]

    def find_contours(self):
        """Threshold the resized image and store its external contours, largest first."""
        # NOTE(review): self.im is RGB at this point, so BGR2GRAY applies the red
        # and blue weights swapped. Left unchanged so feature values stay
        # comparable with previously saved tables - confirm before switching.
        self.gray = cv2.cvtColor(self.im, cv2.COLOR_BGR2GRAY)
        # Inverse binary threshold: pixels darker than 50 become 255.
        self.thresh = cv2.threshold(self.gray, 50, 255, cv2.THRESH_BINARY_INV)[1]
        self.contours, hierarchy = cv2.findContours(self.thresh, cv2.RETR_EXTERNAL, cv2.CHAIN_APPROX_SIMPLE)
        self.contours = sorted(self.contours, key=cv2.contourArea, reverse = True)

    def morph_contours(self):
        """Close small gaps, fill the contours into ``self.thresh`` and re-detect them."""
        kernel = np.ones((5,5), dtype='uint8')
        image_close = cv2.morphologyEx(self.thresh, cv2.MORPH_CLOSE, kernel)

        self.contours, hierarchy = cv2.findContours(image_close, cv2.RETR_EXTERNAL, cv2.CHAIN_APPROX_SIMPLE)
        # Both drawing calls write into self.thresh itself; `draw` is that same array.
        cv2.drawContours(self.thresh, self.contours, -1, (0,0,255), 2)
        draw = cv2.fillPoly(self.thresh, self.contours, color=(255,255,255))

        self.contours, hierarchy = cv2.findContours(draw, cv2.RETR_EXTERNAL, cv2.CHAIN_APPROX_SIMPLE)

    def mask_background(self):
        """Zero every pixel of ``self.im`` outside the largest contour."""
        mask = np.zeros(self.im.shape[:2], dtype="uint8")
        cv2.drawContours(mask, [self.largest_contour()], 0, (255,255,255), -1)
        self.im = cv2.bitwise_and(self.im, self.im, mask=mask)

    def cutoff(self):
        """Return the share of the image border covered by non-zero ``thresh`` pixels.

        The result is ``100 * border_pixels / (2*height + 2*width)``.
        """
        rows, cols = np.nonzero(self.thresh)
        on_border = (
            (rows == 0) | (cols == 0)
            | (rows == self.height - 1) | (cols == self.width - 1)
        )
        count = int(np.count_nonzero(on_border))  # pixels touching border
        cutoff_perc = (count/(2*self.height+2*self.width))*100
        return cutoff_perc

    def contrast(self):
        """Return the standard deviation of all values in ``self.im``."""
        return self.im.std()

    def laplacian(self):
        """Return the variance of the Laplacian of the grayscale image."""
        return cv2.Laplacian(self.gray,cv2.CV_64F).var()

    def edges(self):
        """Return the Canny edge map of ``self.im`` with thresholds tied to its mean."""
        mean_value = np.mean(self.im)
        min_threshold = 0.66 * mean_value
        max_threshold = 1.33 * mean_value
        return cv2.Canny(self.im, min_threshold, max_threshold)

    def largest_contour(self):
        """Return the contour with the largest area (first one on ties).

        Raises:
            ValueError: if ``self.contours`` is empty.
        """
        return max(self.contours, key=cv2.contourArea)

    def area(self):
        """Return the area of the largest contour."""
        return cv2.contourArea(self.largest_contour())

    def perim(self):
        """Return the arc length of the largest contour."""
        # NOTE(review): closed=False leaves out the segment joining the last point
        # back to the first; kept as-is so existing feature values do not shift.
        return cv2.arcLength(self.largest_contour(), False)

    def phi(self):
        """Return width/length of the minimum-area rectangle around the largest contour."""
        rect = cv2.minAreaRect(self.largest_contour()) #box ONLY around the largest contour
        x = rect[1][0]
        y = rect[1][1]
        self.rect_length = max(x,y)
        self.rect_width = min(x,y)
        return self.rect_width/self.rect_length

    def extreme_points(self):
        """Return the std-dev over the coordinates of the four extreme contour points."""
        cnt = self.largest_contour()
        leftmost = tuple(cnt[cnt[:,:,0].argmin()][0])
        rightmost = tuple(cnt[cnt[:,:,0].argmax()][0])
        topmost = tuple(cnt[cnt[:,:,1].argmin()][0])
        bottommost = tuple(cnt[cnt[:,:,1].argmax()][0])
        return np.std([leftmost, rightmost, topmost, bottommost])

    def filled_circular_area_ratio(self):  #similar to solidity
        """Return contour area divided by the area of its minimum enclosing circle."""
        # The former debug ring drawn into self.thresh is gone: it changed the
        # result of any cutoff() call made after this method.
        (x,y), radius = cv2.minEnclosingCircle(self.largest_contour())
        return self.area()/(np.pi*radius**2)

    def circularity(self):
        """Return ``4*pi*area / perimeter**2`` for the largest contour."""
        return (4.*np.pi*self.area())/(self.perim()**2)

    def roundness(self):
        """Return ``4*pi*area / convex_perimeter**2`` using the closed hull perimeter."""
        return (4.*np.pi*self.area())/(self.convex_perim(True)**2)

    def perim_area_ratio(self):
        """Return perimeter divided by area for the largest contour."""
        return self.perim()/self.area()

    def convex_perim(self, closed_cnt=True):
        """Return the arc length of the convex hull of the largest contour.

        Args:
            closed_cnt: passed to ``cv2.arcLength`` as its ``closed`` flag.
        """
        hull = cv2.convexHull(self.largest_contour())
        return cv2.arcLength(hull, closed_cnt)

    def convexity(self):
        """Return closed convex-hull perimeter divided by contour perimeter."""
        # Pass the flag explicitly; the bare call used to raise TypeError.
        return self.convex_perim(True)/self.perim()

    def complexity(self):
        """Return ``10 * (0.1 - area / (sqrt(area/hull_area) * perimeter**2))``."""
        return 10*(0.1-(self.area()/(np.sqrt(self.area()/self.hull_area())*self.perim()**2)))

    def solidity(self):
        """Return contour area divided by convex-hull area."""
        return float(self.area())/self.hull_area()

    def equiv_d(self):
        """Return the diameter of the circle whose area equals the contour area."""
        return np.sqrt(4*self.area()/np.pi)

    def hull_area(self):
        """Return the area of the convex hull of the largest contour."""
        hull = cv2.convexHull(self.largest_contour())
        return cv2.contourArea(hull)

    def save_image(self, save_dir, flip = False):
        """Write ``self.im`` into ``save_dir`` under the original file name.

        Args:
            save_dir: target directory, created if missing.
            flip: if True, delegate to ``self.flip_imgs(save_dir)`` instead.
        """
        os.makedirs(save_dir, exist_ok=True)

        if flip:
            # NOTE(review): flip_imgs is not defined in this class as shown here -
            # confirm it is supplied elsewhere before calling with flip=True.
            self.flip_imgs(save_dir)
        else:
            # imwrite expects BGR; convert into a local so self.im stays RGB and
            # a second save does not write swapped channels.
            bgr = cv2.cvtColor(self.im, cv2.COLOR_BGR2RGB)
            cv2.imwrite(os.path.join(save_dir,str(self.filename)), bgr)

    def show_image(self):
        """Display the original (un-resized) image with matplotlib."""
        plt.imshow(self.image_og)
        plt.show()

In [None]:
def main():
    """Build the SPHERES training table with one row of features per image file.

    Every file in the two hard-coded directories is loaded, stretched to a
    square, thresholded and measured. Images without a usable contour (none
    found, or largest area 0) are displayed and stored as an all-zero row.

    Returns:
        pandas.DataFrame: column ``good_bad`` (0 for files from the first
        directory, 1 otherwise) followed by the 21 feature columns.
    """
    #open_dirs = ['../cpi_data/training_datasets/SPHERES/good/', '../cpi_data/training_datasets/SPHERES/bad/']
    open_dirs = ['../cpi_data/training_datasets/SPHERES/good/', '../cpi_data/training_datasets/SPHERES/bad/']
    desired_size = 1000

    # Predictors, in the column order of the returned frame.
    feature_keys = ['height', 'width', 'lapl', 'contours', 'edges', 'std', 'cnt_area',
                    'contrast', 'circularity', 'solidity', 'complexity', 'equiv_d', 'convex_perim',
                    'hull_area', 'perim', 'phi', 'cutoff', 'extreme_points',
                    'filled_circular_area_ratio', 'roundness', 'perim_area_ratio']
    # good_bad is the response (dependent) variable.
    keys = ['good_bad'] + feature_keys

    records = []
    for direct in open_dirs:
        label = 0 if direct == open_dirs[0] else 1

        for filename in os.listdir(direct):
            image = Image(direct, filename)
            image.resize_stretch(desired_size)
            image.find_contours()

            if len(image.contours)!=0 and image.area() != 0.0:
                image.morph_contours()
                #image.mask_background()

                # Run Canny once and reuse the map for both edge features.
                edge_map = image.edges()
                count_edge_px = np.count_nonzero(edge_map)
                # The dict is filled in the original measurement order; the
                # DataFrame constructor below reorders the columns.
                row = {
                    'std': np.std(np.nonzero(edge_map)) if count_edge_px > 0 else 0,
                    'lapl': image.laplacian(),
                    'contours': len(image.contours),
                    'edges': count_edge_px,
                    'contrast': image.contrast(),
                    'height': image.height_og,
                    'width': image.width_og,
                    'cnt_area': image.area(),
                    'solidity': image.solidity(),
                    'complexity': image.complexity(),
                    'equiv_d': image.equiv_d(),
                    'convex_perim': image.convex_perim(True),
                    'hull_area': image.hull_area(),
                    'perim': image.perim(),
                    'phi': image.phi(),
                    'circularity': image.circularity(),
                    'cutoff': image.cutoff(),
                    'perim_area_ratio': image.perim_area_ratio(),
                    'roundness': image.roundness(),
                    'filled_circular_area_ratio': image.filled_circular_area_ratio(),
                    'extreme_points': image.extreme_points(),
                }
            else:
                # Nothing measurable: show the image and record zeros everywhere.
                image.show_image()
                row = dict.fromkeys(feature_keys, 0)

            row['good_bad'] = label
            records.append(row)

    # Same per-column length report as before; all columns share one length now.
    for key in keys:
        print(key, len(records))

    df = pd.DataFrame(records, columns=keys)
    return df

In [None]:
%%time 
# Build the SPHERES feature table from the training image folders (see main()).
# The result is only kept in memory as `df`: the pickle export below is
# commented out, and the later cells load their tables from disk instead.
if __name__ == '__main__':
    df = main()
    # Disabled: drop rows whose original image was exactly 1000 x 1000.
    #df_new = df.drop(df[(df['height'] == 1000) & (df['width'] == 1000)].index)
    # Disabled: persist the table for later sessions.
    #df.to_pickle("../saved_models/SPHERES.pkl")

# Transform SPHERES model

In [None]:
# Read in the dataframe of image attributes used to train the SPHERES model.
# NOTE(review): pd.read_pickle executes whatever the pickle contains - only load
# files from a trusted source. The absolute path ties this cell to one machine.
df_SPHERES = pd.read_pickle("/data/data/saved_models/no_mask/spheres_df.pkl")

In [None]:
# fillna returns a new frame; assign it back so the replacement is actually kept
# (the bare call only displayed a filled copy and left df_SPHERES unchanged).
df_SPHERES = df_SPHERES.fillna(0)
df_SPHERES.head()

In [None]:
# Separate the label column from the predictor columns.
y = df_SPHERES['good_bad']
x = df_SPHERES.drop(columns='good_bad')
# Hold out 30% of the rows for testing; the seed keeps the split repeatable.
X_train, X_test, y_train, y_test = train_test_split(
    x,
    y,
    test_size=0.3,
    random_state=13,
)

In [None]:
# Predictors (column names of the training split):
X_train.columns
# height and width: dimensions of the original (un-resized) image
# lapl: variance of the Laplacian (blurriness)
# contours: number of contours
# edges: number of Canny edge pixels
# std: standard deviation of the edge-pixel locations
# cnt_area: area of the largest contour
# extreme_points: standard deviation of the extreme contour points (top, left, bottom, right)
# the rest are different measures of roundness/circularity/etc. or self-explanatory

In [None]:
# Min-max scaling: every attribute is mapped onto the fixed range [0, 1].
# Each scaler is fitted on the training split only and then applied to both splits.
scaler_SPHERES_norm = MinMaxScaler().fit(X_train)
X_norm_train = scaler_SPHERES_norm.transform(X_train)
X_norm_test = scaler_SPHERES_norm.transform(X_test)

# Standard scaling: every attribute gets mean 0 and standard deviation 1.
scaler_SPHERES_stand = StandardScaler().fit(X_train)
X_stand_train = scaler_SPHERES_stand.transform(X_train)
X_stand_test = scaler_SPHERES_stand.transform(X_test)

# Rule of thumb: standardize quantities that are roughly normally distributed,
# normalize the others. Both variants are tried below.

In [None]:
#also include principal component analysis to speed predictions up
#let's see how many components are needed to explain variance
pca = PCA()
X_train_pca = pca.fit_transform(X_stand_train)
X_test_pca = pca.transform(X_stand_test)

cumulative_variance = np.cumsum(pca.explained_variance_ratio_)
print(cumulative_variance)
# The x values are derived from the fit (no hard-coded 21) and start at 1 so
# the axis really reads as "number of components".
n_components = np.arange(1, len(cumulative_variance) + 1)
plt.scatter(n_components, cumulative_variance)
plt.xlabel('number of components')
plt.ylabel('cumulative explained variance');

In [None]:
labels = ['height', 'width', 'blurriness', 'contours', 'edge count', 'edge location spread', 'contour area',
       'contrast', 'circularity', 'solidity', 'complexity', 'equiv-diameter',
       'convex perim', 'hull area', 'perimeter', 'aspect ratio', 'cutoff', 'extreme points',
       'circle-area ratio', 'roundness', 'perim-area ratio']


def _feature_boxplot(data, title, tick_labels, ylim=None):
    """Draw one box per column of ``data`` and label the boxes with ``tick_labels``.

    Args:
        data: 2-D array or DataFrame, one column per box.
        title: axes title.
        tick_labels: one label per column, drawn rotated by 90 degrees.
        ylim: optional ``(low, high)`` y-axis limits.

    Returns:
        The ``(fig, ax)`` pair of the new figure.
    """
    fig, ax = plt.subplots(figsize=(8, 5))
    sns.boxplot(data=pd.DataFrame(np.asarray(data), columns=list(tick_labels)), ax=ax)
    ax.set_title(title)
    ax.set_xticks(range(len(tick_labels)))
    ax.set_xticklabels(tick_labels, rotation=90)
    if ylim is not None:
        ax.set_ylim(*ylim)
    return fig, ax


fig, ax = _feature_boxplot(X_train, 'Original Data', labels)
#fig.savefig('../plots/spheres_og_boxplot_dist.png', dpi=300, bbox_inches='tight')

fig, ax = _feature_boxplot(X_norm_train, 'Normalized Data', labels)
#fig.savefig('../plots/spheres_norm_boxplot_dist.png', dpi=300, bbox_inches='tight')

fig, ax = _feature_boxplot(X_stand_train, 'Standardized Data', labels, ylim=(-6, 11))
#fig.savefig('../plots/spheres_stand_boxplot_dist.png', dpi=300, bbox_inches='tight')

# Principal components are mixtures of all features, so they are labelled
# PC1..PCn rather than with the original feature names.
pc_labels = [f'PC{i + 1}' for i in range(X_train_pca.shape[1])]
fig, ax = _feature_boxplot(X_train_pca, 'PCA', pc_labels, ylim=(-6, 11))
#fig.savefig('../plots/spheres_pca_boxplot_dist.png', dpi=300, bbox_inches='tight')

In [None]:
%%time

#define binary logistic regression model:
#balance uneven classes in training set for spheres/not spheres (good/bad)
#set random state for reproducibility and consistency between runs
lg1 = LogisticRegression(random_state=13, class_weight='balanced')

#standardized transformations:
lg1.fit(X_stand_train,y_train)  #don't normalize/standardize y since already between 0-1 for categorical
y_pred = lg1.predict(X_stand_test)
# ROC AUC ranks continuous scores; hard 0/1 predictions collapse the curve to a
# single operating point, so use the predicted probability of class 1 instead.
y_score = lg1.predict_proba(X_stand_test)[:, 1]

# performance
print(f'Accuracy Score: {accuracy_score(y_test,y_pred)}')
print(f'Confusion Matrix: \n{confusion_matrix(y_test, y_pred)}')
print(f'Area Under Curve: {roc_auc_score(y_test, y_score)}')
print(f'Recall score: {recall_score(y_test,y_pred)}')

In [None]:
%%time 

#now try with normalized transformation
# define model
lg1 = LogisticRegression(random_state=13, class_weight='balanced')
# fit it
lg1.fit(X_norm_train,y_train)  #don't normalize/standardize y since already between 0-1 for categorical
# test
y_pred = lg1.predict(X_norm_test)
# ROC AUC ranks continuous scores; hard 0/1 predictions collapse the curve to a
# single operating point, so use the predicted probability of class 1 instead.
y_score = lg1.predict_proba(X_norm_test)[:, 1]
# performance
print(f'Accuracy Score: {accuracy_score(y_test,y_pred)}')
print(f'Confusion Matrix: \n{confusion_matrix(y_test, y_pred)}')
print(f'Area Under Curve: {roc_auc_score(y_test, y_score)}')
print(f'Recall score: {recall_score(y_test,y_pred)}')

In [None]:
%%time 

#and lastly with pca
# define model
lg1 = LogisticRegression(random_state=13, class_weight='balanced')
# fit it
lg1.fit(X_train_pca,y_train)  #don't normalize/standardize y since already between 0-1 for categorical
# test
y_pred = lg1.predict(X_test_pca)
# ROC AUC ranks continuous scores; hard 0/1 predictions collapse the curve to a
# single operating point, so use the predicted probability of class 1 instead.
y_score = lg1.predict_proba(X_test_pca)[:, 1]
# performance
print(f'Accuracy Score: {accuracy_score(y_test,y_pred)}')
print(f'Confusion Matrix: \n{confusion_matrix(y_test, y_pred)}')
print(f'Area Under Curve: {roc_auc_score(y_test, y_score)}')
print(f'Recall score: {recall_score(y_test,y_pred)}')

In [None]:
#loop through all transformations to determine what's the best in terms of RMSE:
# (on 0/1 class predictions the RMSE equals the square root of the error rate)

rmse = []
# raw, normalized, standardized and PCA training and testing data
trainX = [X_train, X_norm_train, X_stand_train, X_train_pca]
testX = [X_test, X_norm_test, X_stand_test, X_test_pca]
transform_names = ['Original', 'Normalized', 'Standardized', 'PCA']

# One fresh model per transform: refitting the shared lg1 in place left it
# trained on whichever transform came last (PCA).
fitted_models = {}
for name, train_features, test_features in zip(transform_names, trainX, testX):
    model = LogisticRegression(random_state=13, class_weight='balanced')
    model.fit(train_features, y_train)
    pred = model.predict(test_features)
    rmse.append(np.sqrt(mean_squared_error(y_test, pred)))
    fitted_models[name] = model

# Later cells feed lg1 with features scaled by scaler_SPHERES_stand, so leave it
# bound to the model trained on the standardized features.
lg1 = fitted_models['Standardized']

# visualizing the result
df_lg_rmse = pd.DataFrame({'RMSE':rmse},index=transform_names)
df_lg_rmse

In [None]:
#have a look at a summary of most important predictors or coefficients based on p-value
#uses statsmodel library
#can change 'X_norm_train' to 'X_stand_train' for diff transforms

# Take the names from the training frame so every coefficient is labelled with
# the column it was fitted on; a hand-written list can drift out of order
# relative to the table's columns.
predictors = list(X_train.columns)

# NOTE(review): sm.Logit adds no intercept on its own; the model is fitted
# without a constant term here, as before - confirm that this is intended.
logit_model=sm.Logit(y_train, X_norm_train, missing='drop')
result=logit_model.fit()
print(result.summary2(xname=predictors))

In [None]:
# Histograms (30 bins each) of every column of the untransformed SPHERES table,
# to see the distribution shapes; the trailing ';' suppresses the axes repr.
df_SPHERES.hist(bins=30,figsize=(15,13));

Normalization is a good technique to use when you do not know the distribution of your data or when you know the distribution is not Gaussian (a bell curve). Normalization is useful when your data has varying scales and the algorithm you are using does not make assumptions about the distribution of your data, such as k-nearest neighbors and artificial neural networks.

Standardization assumes that your data has a Gaussian (bell curve) distribution. This does not strictly have to be true, but the technique is more effective if your attribute distribution is Gaussian. Standardization is useful when your data has varying scales and the algorithm you are using does make assumptions about your data having a Gaussian distribution, such as linear regression, logistic regression, and linear discriminant analysis.

# Transform SIFT (separate ice for training) model
Separates high- and low-quality images.

In [None]:
# Load the image-attribute table used to train the SIFT model.
# NOTE(review): pd.read_pickle executes whatever the pickle contains - only load
# files from a trusted source. The absolute path ties this cell to one machine.
df_SIFT = pd.read_pickle("/data/data/saved_models/no_mask/sift_df.pkl")

In [None]:
# Separate the label column from the predictor columns.
y = df_SIFT['good_bad']
x = df_SIFT.drop(columns='good_bad')
# Hold out 30% of the rows for testing; the seed keeps the split repeatable.
X_train, X_test, y_train, y_test = train_test_split(
    x,
    y,
    test_size=0.3,
    random_state=13,
)

In [None]:
# Min-max scaler, fitted on the SIFT training split and applied to both splits.
scaler_SIFT_norm = MinMaxScaler().fit(X_train)
X_norm_train = scaler_SIFT_norm.transform(X_train)
X_norm_test = scaler_SIFT_norm.transform(X_test)

# Standard scaler (mean 0, std 1), fitted and applied the same way.
scaler_SIFT_stand = StandardScaler().fit(X_train)
X_stand_train = scaler_SIFT_stand.transform(X_train)
X_stand_test = scaler_SIFT_stand.transform(X_test)

In [None]:
# PCA with all components kept, fitted on the standardized SIFT training split;
# the test split is projected with the same fitted components.
# This rebinds pca / X_train_pca / X_test_pca from the SPHERES section.
pca = PCA()
X_train_pca = pca.fit_transform(X_stand_train)
X_test_pca = pca.transform(X_stand_test)

In [None]:
# define model
lg_SIFT = LogisticRegression(random_state=13, class_weight='balanced')
# fit it
lg_SIFT.fit(X_stand_train,y_train)  #don't normalize/standardize y since already between 0-1 for categorical
# test
y_pred = lg_SIFT.predict(X_stand_test)
# ROC AUC ranks continuous scores; hard 0/1 predictions collapse the curve to a
# single operating point, so use the predicted probability of class 1 instead.
y_score = lg_SIFT.predict_proba(X_stand_test)[:, 1]
# performance
print(f'Accuracy Score: {accuracy_score(y_test,y_pred)}')
print(f'Confusion Matrix: \n{confusion_matrix(y_test, y_pred)}')
print(f'Area Under Curve: {roc_auc_score(y_test, y_score)}')
print(f'Recall score: {recall_score(y_test,y_pred)}')

In [None]:
#Determine if normalization or standardization is better:
# (on 0/1 class predictions the RMSE equals the square root of the error rate)

rmse = []
# raw, normalized, standardized and PCA training and testing data
trainX = [X_train, X_norm_train, X_stand_train, X_train_pca]
testX = [X_test, X_norm_test, X_stand_test, X_test_pca]
transform_names = ['Original', 'Normalized', 'Standardized', 'PCA']

# One fresh model per transform: refitting the shared lg_SIFT in place left it
# trained on whichever transform came last (PCA).
fitted_models = {}
for name, train_features, test_features in zip(transform_names, trainX, testX):
    model = LogisticRegression(random_state=13, class_weight='balanced')
    model.fit(train_features, y_train)
    pred = model.predict(test_features)
    rmse.append(np.sqrt(mean_squared_error(y_test, pred)))
    fitted_models[name] = model

# Later cells feed lg_SIFT with features scaled by scaler_SIFT_stand, so leave
# it bound to the model trained on the standardized features.
lg_SIFT = fitted_models['Standardized']

# visualizing the result
df_lg_rmse = pd.DataFrame({'RMSE':rmse},index=transform_names)
df_lg_rmse

In [None]:
font = {'family' : 'serif',
        'size'   : 12}
matplotlib.rc('font', **font)

# Raw string for the TeX label: '\s' is not a valid Python escape sequence.
labels=['height', 'width', 'laplacian', '# contours', 'edges', r'$\sigma$', 'contour area',
       'contrast', 'circularity', 'solidity', 'complexity', 'equiv diameter',
       'convex perimeter', 'hull area', 'perimeter', 'aspect ratio', 'cutoff', 'extreme points',
       'area ratio', 'roundness', 'perim-area ratio']

fig, (ax1, ax2) = plt.subplots(2,1, sharex=True, figsize=(8,7))

# NOTE(review): the bars are only "feature" importances if lg1 / lg_SIFT were
# last fitted on per-feature inputs (not PCA components) whose column order
# matches `labels` - confirm against the fitting cells.
panels = ((ax1, lg1, 'SPHERES Model'), (ax2, lg_SIFT, 'SIFT Model'))
for ax, model, title in panels:
    # One fitted coefficient per input column of the binary model.
    importance = model.coef_[0]
    ax.bar(np.arange(len(importance)), importance)
    ax.set_ylabel('Feature\nImportance', fontsize=18)
    ax.set_title(title, fontsize=20)
    ax.set_ylim(-3.2,3.2)

# Tick positions follow the label list instead of a hard-coded 21.
ax2.set_xticks(np.arange(len(labels)))
ax2.set_xticklabels(labels, rotation=90)

plt.tight_layout()
plt.savefig('/data/data/plots/feature_importance_SPHERES_SIFT.pdf', dpi=300, bbox_inches = "tight")
plt.show()

In [None]:
predictors = ['lapl', 'height', 'width','contours', 'edges', 'std', 'cnt_area', \
           'contrast', 'circularity', 'solidity','complexity','equiv_d','convex_perim',\
           'hull_area', 'perim', 'phi', 'cutoff', 'extreme_points' ,\
            'filled_circular_area_ratio','roundness','perim_area_ratio']
X = df_SIFT[predictors]
# Use fresh scaler objects here: calling fit_transform on scaler_SIFT_stand /
# scaler_SIFT_norm would overwrite their train-split fit, which the prediction
# cell further down still relies on.
X_stand = StandardScaler().fit_transform(X)
X_norm = MinMaxScaler().fit_transform(X)

# Label column of the SIFT table.
# NOTE(review): presumably 0 = good-quality ice, 1 = bad - confirm the coding.
y = df_SIFT['good_bad']
logit_model=sm.Logit(y, X_norm, missing='drop')
result=logit_model.fit()
# X_norm was built in `predictors` order, so these names line up with the rows.
print(result.summary2(xname=predictors))

In [None]:
# Logistic fit of the good/bad label against roundness; the small vertical
# jitter only spreads the 0/1 points apart for display.
sns.regplot(x=df_SIFT['roundness'], y=df_SIFT['good_bad'], y_jitter=0.03, logistic = True)

In [None]:
# Logistic fit of the good/bad label against the edge-pixel count; the small
# vertical jitter only spreads the 0/1 points apart for display.
sns.regplot(x=df_SIFT['edges'], y=df_SIFT['good_bad'], y_jitter=0.03, logistic = True)

# Loop through a new data set (not training) and make predictions for SPHERES and SIFT
### show images with predictions 

In [None]:
#campaign = '2002_CRYSTAL-FACE-NASA'
# Full campaign list, kept for reference; the next assignment limits the run.
campaigns=['ARM', 'CRYSTAL_FACE_NASA', 'CRYSTAL_FACE_UND', 'AIRS_II',
          'Midcix', 'ICE_L', 'MPACE', 'OLYMPEX']
campaigns=['CRYSTAL_FACE_UND']
desired_size = 1000
# Counters run across all campaigns; they are not reset per campaign.
spheres_count = 0
good_ice_count = 0
bad_ice_count = 0
cutoff_allowed=10
# Set to a file path to append one summary line per campaign. The old code
# wrote to a handle `f` that was never opened.
summary_path = None

keys = ['lapl', 'height', 'width','contours', 'edges', 'std', 'cnt_area', \
   'contrast', 'circularity', 'solidity','complexity','equiv_d','convex_perim',\
   'hull_area', 'perim', 'phi', 'cutoff', 'extreme_points' ,\
    'filled_circular_area_ratio','roundness','perim_area_ratio']


def _columns_as_fitted(frame, scaler):
    """Return ``frame`` with its columns in the order ``scaler`` was fitted on.

    Scalers fitted on a DataFrame by recent scikit-learn versions expose
    ``feature_names_in_``; without it the frame is returned unchanged.
    """
    fitted_names = getattr(scaler, 'feature_names_in_', None)
    if fitted_names is None:
        return frame
    return frame[list(fitted_names)]


def _image_features(image):
    """Measure one prepared image and return its features as a dict.

    ``cutoff`` is measured before ``filled_circular_area_ratio``, keeping the
    order the features were originally computed in.
    """
    edge_map = image.edges()  # run Canny once, reuse for both edge features
    n_edge_px = np.count_nonzero(edge_map)
    return {
        'edges': n_edge_px,
        'lapl': image.laplacian(),
        'contours': len(image.contours),
        'contrast': image.contrast(),
        'std': np.std(np.nonzero(edge_map)) if n_edge_px > 0 else 0,
        'height': image.height_og,
        'width': image.width_og,
        'cnt_area': image.area(),
        'solidity': image.solidity(),
        'complexity': image.complexity(),
        'equiv_d': image.equiv_d(),
        'convex_perim': image.convex_perim(True),
        'hull_area': image.hull_area(),
        'perim': image.perim(),
        'phi': image.phi(),
        'circularity': image.circularity(),
        'cutoff': image.cutoff(),
        'perim_area_ratio': image.perim_area_ratio(),
        'roundness': image.roundness(),
        'filled_circular_area_ratio': image.filled_circular_area_ratio(),
        'extreme_points': image.extreme_points(),
    }


# NOTE(review): pairing lg1 / lg_SIFT with the *_stand scalers is only valid if
# both models were last fitted on standardized (not raw/normalized/PCA)
# features - confirm before trusting the counts.
for campaign in campaigns:
    print(campaign)
    start_time = time.time()
    open_dir = '../cpi_data/campaigns/'+campaign+'/single_imgs/'
    #open_dir = 'cpi_data/training_datasets/SPHERES/bad/'
    save_dir_good = '../cpi_data/campaigns/'+campaign+'/good_lowcutoff5/'
    save_dir_bad = '../cpi_data/campaigns/'+campaign+'/bad/'
    for filename in os.listdir(open_dir):

        image = Image(open_dir, filename)
        image.resize_stretch(desired_size)
        image.find_contours()
        if len(image.contours)!=0 and image.area() != 0.0:
            image.morph_contours()
            #image.mask_background()
            features = _image_features(image)
            # Reuse the value the model sees instead of calling cutoff() again
            # after later feature methods have touched the image state.
            border_cutoff = features['cutoff']
        else:
            # No usable contour: all-zero feature row, as in training.
            features = dict.fromkeys(keys, 0)
            border_cutoff = image.cutoff()

        df_pred = pd.DataFrame(features, index=[0])[keys]

        #Regression model prediction
        pred_SPHERES = lg1.predict(
            scaler_SPHERES_stand.transform(_columns_as_fitted(df_pred, scaler_SPHERES_stand))
        )

        if pred_SPHERES[0] < 0.25:
            # Predicted sphere; only counted when it is not cut off by the border.
            if len(image.contours) != 0 and border_cutoff < cutoff_allowed:
                spheres_count+=1
                #image.save_image(save_dir_good)
        else:
            #If non sphere, find ice that is not blurry or broken
            pred_SIFT = lg_SIFT.predict(
                scaler_SIFT_stand.transform(_columns_as_fitted(df_pred, scaler_SIFT_stand))
            )
            if pred_SIFT[0] < 0.25 and border_cutoff < cutoff_allowed:
                # Good ice: show it and count it.
                image.show_image()
                good_ice_count +=1
                #image.save_image(save_dir_good)
            else:
                bad_ice_count +=1
                #image.save_image(save_dir_bad)

    end_time = time.time()
    summary = (campaign, spheres_count, good_ice_count, bad_ice_count, end_time-start_time)
    print(*summary)
    if summary_path is not None:
        with open(summary_path, 'a') as f:
            f.write(' '.join(str(item) for item in summary) + '\n')


In [None]:
#run the above in parallel (was saving only good images of ice)
# NOTE(review): make_new_prediction is not defined in the cells shown here; it
# must exist (taking one file name) before this cell can run. The paths below
# also lack the '../' prefix used by the serial loop - confirm which is right.
campaigns = ['MPACE']
for campaign in campaigns:
    print(campaign)
    time_start = time.time()
    open_dir = 'cpi_data/campaigns/'+campaign+'/single_imgs/'
    save_dir_good = 'cpi_data/campaigns/'+campaign+'/good_lowcutoff_timing/'
    save_dir_bad = 'cpi_data/campaigns/'+campaign+'/bad/'
    desired_size = 1000

    iterable = os.listdir(open_dir)
    # Leave one core free, but never ask for fewer than one worker
    # (Pool(0) raises ValueError on a single-core machine).
    n_workers = max(1, multiprocessing.cpu_count() - 1)
    # The context manager releases the worker processes even if map() raises.
    with Pool(n_workers) as p:
        p.map(make_new_prediction, iterable)
        print(time.time() - time_start)
