In [4]:
import os
from meta.paths import PATH_TO_DATA, PATH_TO_TRAINING_IMAGES

# NN on image embeddings

In [1]:
# Load the model and get the feature layer
import tensorflow.keras as keras

In [2]:
model = keras.models.load_model("model.h5")
model = model.layers[2]

In [37]:
# from tensorflow.keras.applications import EfficientNetB0
# model = EfficientNetB0(weights='imagenet',include_top=False, pooling='avg', input_shape=None)

In [33]:
# from tensorflow.keras.applications import VGG16
# model = VGG16(weights='imagenet',include_top=False, pooling='avg', input_shape=None)

In [25]:
# Read training data; need to change this to not include test data
import pandas as pd
path_to_training_data = os.path.join(PATH_TO_DATA, "subset_test.csv")
train_df = pd.read_csv(path_to_training_data)
train_df['target'] = train_df.label_group.map(train_df.groupby('label_group').posting_id.agg('unique').to_dict())
train_df['target'] = train_df['target'].apply(list)

In [39]:
# Get image embeddings
from preprocessing.data_generator import build_plain_generator
generator = build_plain_generator()
image_embeddings = model.predict(generator.flow_from_dataframe(train_df, x_col='image', target_size=(100, 100), y_col='target', directory=PATH_TO_TRAINING_IMAGES))

Found 3423 validated image filenames belonging to 3423 classes.


In [27]:
# Get 50 nearest neighbors
from sklearn.neighbors import NearestNeighbors
KNN = 50

def get_nn_distances(embeddings):
    model = NearestNeighbors(n_neighbors=KNN, metric='cosine')
    model.fit(embeddings)
    distances, indices = model.kneighbors(embeddings)
    return distances, indices

distances, indices = get_nn_distances(image_embeddings)

In [28]:
distances.max()

0.9054594

In [15]:
# Predictions
def predict_knn(df, distances, indices, threshold):
    def predict(row):
        for i, val in enumerate(distances[row]):
            if val > threshold:
                break
        neighbor_indices = indices[row][:i]
        return list(df.iloc[neighbor_indices]['posting_id'])
    return [predict(i) for i in range(len(train_df))]
train_df['preds'] = predict_knn(train_df, distances, indices, .05)

In [16]:
import numpy as np
def getMetric(col):
    def f1score(row):
        n = len(np.intersect1d(row['target'], row[col]))
        return 2*n / (len(row['target']) + len(row[col]))
    return f1score

train_df['f1'] = train_df.apply(getMetric('preds'),axis=1)
print('CV score =',train_df['f1'].mean())

CV score = 0.10124382059783311


In [36]:
for threshold in np.arange(.000000001, .00000001, .000000001):
    train_df['preds'] = predict_knn(train_df, distances, indices, threshold)
    train_df['f1'] = train_df.apply(getMetric('preds'),axis=1)
    print(f'threshold {threshold}: f1={train_df["f1"].mean()}')

threshold 1e-09: f1=0.4621050239476637
threshold 2e-09: f1=0.4621050239476637
threshold 3.0000000000000004e-09: f1=0.4621050239476637
threshold 4e-09: f1=0.4621050239476637
threshold 5e-09: f1=0.4621050239476637
threshold 6e-09: f1=0.4621050239476637
threshold 7.000000000000001e-09: f1=0.4621050239476637
threshold 8e-09: f1=0.4621050239476637
threshold 9.000000000000001e-09: f1=0.4621050239476637
