In [22]:
import sys
import pandas as pd
import numpy as np
from ipynb.fs.full.SIFT_to_Features import SIFT_path_to_Features,SIFT_df_to_Features,SIFT_np_to_Features
sys.path.insert(0, '..\src\data')
import make_dataset_beta as md
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder
from imblearn.over_sampling import (RandomOverSampler, 
                                    SMOTE, 
                                    ADASYN)
from sklearn.decomposition import PCA
from sklearn.ensemble import RandomForestClassifier
from sklearn.feature_selection import SelectFromModel

In [42]:
import sys
import numpy as np
from sklearn.model_selection import train_test_split
import matplotlib.pyplot as plt
from scipy.cluster.vq import vq, kmeans
from scipy.spatial.distance import cdist
import pandas as pd
import cv2

In [40]:
# Get Image Descriptors, which are a combination of points on an image and the description of surrounding pixels.

def get_descriptors(nparrays,nfeatures):
    sift = cv2.SIFT_create(nfeatures = nfeatures)
    container = []
    for i in nparrays:
        img_bw = cv2.cvtColor(i, cv2.COLOR_BGR2GRAY)
        keypoint,descriptor = sift.detectAndCompute(img_bw,None)
        container.append([keypoint,descriptor])
    return container

#For a collection of image area descriptions, get the Kmeans of n clusters. This will be what future images are compared to.

def get_vocab(descriptors,n):
    descriptor_container = []
    for i in descriptors:
        if i[1] is None:
            continue
        for j in i[1]:
            descriptor_container.append(j)
    vocab = kmeans(descriptor_container,n)
    return vocab

# For each image, get its descriptors. For each descriptor, get the closest Kmean descriptor in vocab and add 1 to its index in a histogram.
# Return a histogram per image. This histogram will be passed as a feature for modeling.

def descriptor_to_vocab(nparrays,vocab):
    sift = cv2.SIFT_create()
    container = []
    for i in nparrays:
        img_bw = cv2.cvtColor(i, cv2.COLOR_BGR2GRAY)
        keypoint,descriptor = sift.detectAndCompute(img_bw,None)
        if descriptor is not None:
            dist = cdist(descriptor,vocab[0],'euclidean')
            bin_assignment = np.argmin(dist,axis = 1)
        else:
            bin_assignment = []
        image_feats = np.zeros(len(vocab[0]))
        for j in bin_assignment:
            image_feats[j] += 1
        container.append(image_feats)
    return container

# Normalizes histograms from images so that they may be used in ML inputs

def normalize_histograms(histarray):
    histarray = np.array(histarray)
    feats_norm_div = np.linalg.norm(histarray,axis = 1)
    for i in range(0,histarray.shape[0]):
        histarray[i] = histarray[i]/feats_norm_div[i]
    return histarray

In [23]:
# Rebalancing labels
def Rebalance_Labels(Xdf,column):
    nonadf = Xdf.dropna()
    X = nonadf['file']
    y = nonadf[column].astype(int)
    X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.33, random_state=42)
    y_train_encoded = LabelEncoder().fit_transform(y_train)
    oversample = RandomOverSampler()
    X_balanced_train, y_balanced_train = oversample.fit_resample(X_train, y_train_encoded)
    train_df = pd.DataFrame(np.column_stack((X_balanced_train, y_balanced_train)))
    test_df = pd.DataFrame(np.column_stack((X_test, y_test)))
    return [train_df,test_df]

In [24]:
# PCA
def PCA_pipeline(Xcolumn,n_components):
    pca = PCA(n_components = n_components)
    reduced_column = pca.fit_transform(Xcolumn)
    return reduced_column

In [25]:
# RFS
def RFS_pipeline(Xdf,ydf,n_estimators):
    selector = SelectFromModel(RandomForestClassifier(n_estimators = n_estimators))
    selector.fit(Xdf, ydf)
    selected_feature= Xdf.columns[(selector.get_support())]
    selected_df = Xdf.loc[:, selected_feature]
    return [selected_df,ydf]

In [37]:
agetestnp = np.load('../data/raw/BalancedRaw/nprawtestbalancedlabelsage.npy',allow_pickle=True)

In [39]:
agetrainnp = np.load('../data/raw/BalancedRaw/nprawtrainbalancedlabelsage.npy',allow_pickle=True)

In [44]:
descriptors = get_descriptors(agetrainnp,100)
vocab = get_vocab(descriptors[:100],200)
histarraytrain = descriptor_to_vocab(agetrainnp,vocab)
histarraytest = descriptor_to_vocab(agetestnp,vocab)
normalizehisttrain = normalize_histograms(histarraytrain)
normalizehisttest = normalize_histograms(histarraytest)

  histarray[i] = histarray[i]/feats_norm_div[i]


In [50]:
np.save('../data/processed/SIFT/BalancedSIFT/npsifttrainbalancedlabelsage.npy',normalizehisttrain)

In [51]:
np.save('../data/processed/SIFT/BalancedSIFT/npsifttestbalancedlabelsage.npy',normalizehisttest)

In [56]:
gendertestnp = np.load('../data/raw/BalancedRaw/nprawtestbalancedlabelsgender.npy',allow_pickle=True)
gendertrainnp = np.load('../data/raw/BalancedRaw/nprawtrainbalancedlabelsgender.npy',allow_pickle=True)

In [58]:
descriptors = get_descriptors(agetrainnp,100)
vocab = get_vocab(descriptors[:100],200)
histarraytrain = descriptor_to_vocab(gendertrainnp,vocab)
histarraytest = descriptor_to_vocab(gendertestnp,vocab)
normalizehisttrain = normalize_histograms(histarraytrain)
normalizehisttest = normalize_histograms(histarraytest)

  histarray[i] = histarray[i]/feats_norm_div[i]


In [59]:
np.save('../data/processed/SIFT/BalancedSIFT/gender/npsifttrainbalancedlabelsgender.npy',normalizehisttrain)

In [60]:
np.save('../data/processed/SIFT/BalancedSIFT/gender/npsifttestbalancedlabelsgender.npy',normalizehisttest)

In [61]:
racetestnp = np.load('../data/raw/BalancedRaw/nprawtestbalancedlabelsrace.npy',allow_pickle=True)
racetrainnp = np.load('../data/raw/BalancedRaw/nprawtrainbalancedlabelsrace.npy',allow_pickle=True)

In [62]:
descriptors = get_descriptors(agetrainnp,100)
vocab = get_vocab(descriptors[:100],200)
histarraytrain = descriptor_to_vocab(racetrainnp,vocab)
histarraytest = descriptor_to_vocab(racetestnp,vocab)
normalizehisttrain = normalize_histograms(histarraytrain)
normalizehisttest = normalize_histograms(histarraytest)

  histarray[i] = histarray[i]/feats_norm_div[i]


In [63]:
np.save('../data/processed/SIFT/BalancedSIFT/race/npsifttrainbalancedlabelsrace.npy',normalizehisttrain)

In [64]:
np.save('../data/processed/SIFT/BalancedSIFT/race/npsifttestbalancedlabelsrace.npy',normalizehisttest)