In [4]:
import os
from meta.paths import PATH_TO_DATA, PATH_TO_TRAINING_IMAGES

# NN on image embeddings

In [2]:
# Load the model and get the feature layer
import tensorflow.keras as keras
model = keras.models.load_model("model.h5")
feature_layer = model.layers[2]

In [5]:
# Read training data; need to change this to not include test data
import pandas as pd
path_to_training_data = os.path.join(PATH_TO_DATA, "train.csv")
train_df = pd.read_csv(path_to_training_data)
train_df['target'] = train_df.label_group.map(train_df.groupby('label_group').posting_id.agg('unique').to_dict())
train_df['target'] = train_df['target'].apply(list)

In [6]:
# Get image embeddings
from preprocessing.data_generator import build_plain_generator
generator = build_plain_generator()
image_embeddings = feature_layer.predict(generator.flow_from_dataframe(train_df, x_col='image', target_size=(100, 100), y_col='target', directory=PATH_TO_TRAINING_IMAGES))

Found 34250 validated image filenames belonging to 34250 classes.


In [8]:
# Get 50 nearest neighbors
from sklearn.neighbors import NearestNeighbors
KNN = 50
model = NearestNeighbors(n_neighbors=KNN)
model.fit(image_embeddings)
distances, indices = model.kneighbors(image_embeddings)

In [25]:
# Predictions
THRESHOLD = .1
def predict(df, row, distances, indices):
    for i, val in enumerate(distances[row]):
        if val > THRESHOLD:
            break
    neighbor_indices = indices[row][:i]
    return list(df.iloc[neighbor_indices]['posting_id'])
train_df['preds'] = [predict(train_df, i, distances, indices) for i in range(len(train_df))]

In [27]:
train_df['preds']

0                                        [train_129225211]
1                                       [train_3386243561]
2        [train_2288590299, train_1580839663, train_351...
3                                       [train_2406599165]
4                                       [train_3369186413]
                               ...                        
34245                                   [train_4028265689]
34246    [train_769054909, train_3515199340, train_1143...
34247                                    [train_614977732]
34248                                   [train_3630949769]
34249                                   [train_1792180725]
Name: preds, Length: 34250, dtype: object

In [30]:
import numpy as np
def getMetric(col):
    def f1score(row):
        n = len( np.intersect1d(row.target,row[col]) )
        return 2*n / (len(row.target)+len(row[col]))
    return f1score

train_df['f1'] = train_df.apply(getMetric('preds'),axis=1)
print('CV score =',train_df.f1.mean())

CV score = 0.3869275355788712
