In [62]:
import os
os.environ['TF_CPP_MIN_LOG_LEVEL'] = '3' 
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import keras 
from keras import Sequential, Input
from keras.layers import Conv2D, Dense, UpSampling2D, MaxPool2D, Conv2DTranspose
from keras.preprocessing.image import ImageDataGenerator
from tensorflow.keras.preprocessing.image import load_img, img_to_array
from keras.models import load_model 
import glob
import cv2
from tensorflow.keras.preprocessing import image_dataset_from_directory
import tensorflow as tf
import pickle
import json

In [2]:
from sklearn.metrics import confusion_matrix, classification_report
from sklearn.metrics import cohen_kappa_score
from sklearn.preprocessing import MinMaxScaler
from xgboost import XGBClassifier
from sklearn.metrics import accuracy_score
from sklearn.metrics import fbeta_score, make_scorer

In [3]:
from tqdm import tqdm

In [6]:
from TextModel import predict_text

In [64]:
tf.get_logger().setLevel('ERROR')

In [7]:
# Locations of the saved models and the CSV splits used in this notebook.
# Saved EfficientNet classifier (loaded with load_model further down).
EFFICIENT_NET_MODEL_DIR = "weights/efficientNet-dropout-0.3-30"
# Saved under-complete autoencoder (loaded with load_model further down).
UNDER_COMPLETE_AUTOENCODER_MODEL_DIR = "weights/autoencoder/model.150.epochs"
# NOTE: the "validation" split is read from the file named test-data.csv.
VALIDATION_CSV = "test-data.csv"
TRAINING_CSV = "train-data.csv"

In [8]:
df_train = pd.read_csv(TRAINING_CSV)
df_train.head()

Unnamed: 0,Type,Name,Age,Breed1,Breed2,Gender,Color1,Color2,Color3,MaturitySize,...,Health,Quantity,Fee,State,RescuerID,VideoAmt,Description,PetID,PhotoAmt,AdoptionSpeed
0,2,Eun Nik,4,266,0,1,2,7,0,2,...,1,1,150,41326,28524862fb4125cdeb126f24ca89e75d,0,Active and friendly kitty. Very sociable and l...,b6c159277,3,1
1,2,Kitten RA15,2,266,0,2,5,7,0,2,...,1,1,0,41401,b770bac0ca797cf1433c48a35d30c4cb,0,Female-2 1/2mths+. Done 2nd dewormed. very pla...,09a41530d,3,2
2,2,Yoo Hee (Thanx Atye),3,265,0,1,1,7,0,2,...,1,1,50,41326,23e54862081be8893f32f88cfb2f4bad,0,"He likes to play so much, and sleep with peopl...",efe569ca6,2,2
3,2,Female Gray Tabby,12,299,0,2,1,6,7,1,...,1,1,0,41326,3d705260a5eb049510ce87c718f30f92,0,This female cat was rescued by me when I saw h...,2.21E+07,3,2
4,1,Dreamer - ( German Sherped Mix),5,307,0,2,1,0,0,2,...,1,1,0,41326,3b074cadd2350de62dca7056b9bab6f4,1,Dreamer is affectionate and loves to be with p...,92f84c5b7,15,3


In [9]:
df_val = pd.read_csv(VALIDATION_CSV)
df_val.head()

Unnamed: 0,Type,Name,Age,Breed1,Breed2,Gender,Color1,Color2,Color3,MaturitySize,...,Health,Quantity,Fee,State,RescuerID,VideoAmt,Description,PetID,PhotoAmt,AdoptionSpeed
0,2,Turmeric And Snow,15,266,266,3,4,5,7,2,...,1,2,0,41326,faad9c80666c6dd58ae52d530863105e,0,These mother cat and her kitten have been arou...,73de2789e,4,2
1,1,11 Month Old,11,307,0,2,5,7,0,2,...,1,1,0,41326,07184fe71e0dc6ad58406e228719fd21,0,"Great personality, gentle and loving girl! :) ...",447be811f,1,1
2,2,Snowy,10,265,0,1,7,0,0,2,...,1,1,0,41326,88a21c30883ab70a93e09035e0a9a754,0,Snowy as the name suggested is a white colour ...,6e4724604,1,2
3,1,Sushi,84,49,0,2,7,0,0,2,...,1,1,0,41401,cb18bafcf8484f971018cb0836bcd9f3,0,This is a funny clever dog . Very hyper active...,92189aa31,1,4
4,2,Cally,42,266,0,2,1,4,7,1,...,1,1,0,41326,2ebac67d1aac94488703313d85406c7f,1,For serious & committed adopters. Adopted cat ...,249af4201,4,2


# Image

In [10]:
# load image
def load_image_local(img_path, size=(256,256)):
    """Read one image file and return it as a single-item batch array.

    Args:
        img_path: Path of the image file, forwarded to ``load_img``.
        size: Forwarded to ``load_img`` as ``target_size``.

    Returns:
        The ``img_to_array`` result with one extra leading axis of length 1.
        No pixel rescaling is applied here.
    """
    pixels = img_to_array(load_img(img_path, target_size=size))
    # Models are called on batches, so prepend a batch axis.
    return np.expand_dims(pixels, axis=0)

#### EfficientNet B0

In [11]:
IMAGE_SIZE_EFFICIENT_NET = (224,224)

In [12]:
efficientNet_model = load_model(EFFICIENT_NET_MODEL_DIR)



In [13]:
efficientNet_model.summary()

Model: "EfficientNetB0"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
input_1 (InputLayer)         [(None, 224, 224, 3)]     0         
_________________________________________________________________
efficientnetb0 (Functional)  (None, None, None, 1280)  4049571   
_________________________________________________________________
batch_normalization (BatchNo (None, 7, 7, 1280)        5120      
_________________________________________________________________
avg_pool_final (GlobalAverag (None, 1280)              0         
_________________________________________________________________
dense (Dense)                (None, 512)               655872    
_________________________________________________________________
feature_extractor (Dense)    (None, 128)               65664     
_________________________________________________________________
pred (Dense)                 (None, 5)              

In [14]:
# Sub-model that stops at the "feature_extractor" layer of the loaded
# EfficientNet classifier; per the model summary that layer outputs a
# 128-dimensional vector per image.
feature_extractor_efficientNet = keras.Model(
    inputs=efficientNet_model.inputs,
    outputs=efficientNet_model.get_layer(name="feature_extractor").output
)


In [15]:
image = load_image_local("dataset/test/2/00f3a3993-1.jpg", size=IMAGE_SIZE_EFFICIENT_NET)

In [16]:
np.array(feature_extractor_efficientNet(image))[:,:20]

array([[0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.,
        0., 0., 0., 0.]], dtype=float32)

#### Undercomplete Autoencoder

In [17]:
IMAGE_SIZE_AUTOENCODER = (256,256)

In [18]:
autoencoder_model = load_model(UNDER_COMPLETE_AUTOENCODER_MODEL_DIR)

In [19]:
autoencoder_model.summary()

Model: "sequential"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
conv1-3x3x16 (Conv2D)        (None, 256, 256, 16)      448       
_________________________________________________________________
max_pooling2d (MaxPooling2D) (None, 128, 128, 16)      0         
_________________________________________________________________
conv2-3x3x32 (Conv2D)        (None, 128, 128, 32)      4640      
_________________________________________________________________
max_pooling2d_1 (MaxPooling2 (None, 64, 64, 32)        0         
_________________________________________________________________
conv3-3x3x64 (Conv2D)        (None, 64, 64, 64)        18496     
_________________________________________________________________
max_pooling2d_2 (MaxPooling2 (None, 32, 32, 64)        0         
_________________________________________________________________
conv4-3x3x32 (Conv2D)        (None, 32, 32, 32)        1

In [20]:
# Sub-model that stops at the "max_pooling2d_4" layer of the loaded
# autoencoder. The cells below reshape its output to 8*8*8 = 512 values
# per image.
feature_extractor_autoencoder = keras.Model(
    inputs=autoencoder_model.inputs,
    outputs=autoencoder_model.get_layer(name="max_pooling2d_4").output
)


In [21]:
image = load_image_local("dataset/test/2/00f3a3993-1.jpg", IMAGE_SIZE_AUTOENCODER)

In [22]:
np.array(feature_extractor_autoencoder(image)).reshape(1,8*8*8).shape

(1, 512)

# Tabular

In [99]:
# MinMaxScaler shared by feature_engineering: fitted in "train" state and
# reused afterwards. Re-running this cell resets it to None.
scaler = None

# Breed1 codes that get their own one-hot column.
TOP_BREEDS = [283, 152,  20, 189, 213, 103, 243, 254, 109, 179, 218, 205, 141, 285, 292, 264, 299, 265, 266, 307]

# Output column names for each one-hot encoded group.
breeds = [f"Breed_{n}" for n in range(1, len(TOP_BREEDS) + 1)]
# Slot 0 holds the number of colours; slots 1-7 flag the individual codes.
colors = ["Num_Colors"] + [f"Color_{n}" for n in range(1, 8)]
maturities = [f"Maturity_{n}" for n in range(1, 6)]
furlengths = [f"FurLength_{n}" for n in range(1, 5)]
vaccinatss = [f"Vaccinated_{n}" for n in range(1, 4)]
dewormes = [f"Dewormed_{n}" for n in range(1, 4)]
sterilizes = [f"Sterilized_{n}" for n in range(1, 4)]
healths = [f"Health_{n}" for n in range(1, 5)]
states = [f"State_{n}" for n in range(1, 16)]

def breed_transform(breeds):
    """Flag a row as single-breed.

    Args:
        breeds: Mapping/row exposing ``"Breed1"`` and ``"Breed2"``.

    Returns:
        1 when both breed codes are equal or ``Breed2`` is 0, otherwise 0.
    """
    primary, secondary = breeds["Breed1"], breeds["Breed2"]
    single_breed = primary == secondary or secondary == 0
    return 1 if single_breed else 0
    
def breed_category_transform(breed1):
    """One-hot encode a primary breed code against ``TOP_BREEDS``.

    Args:
        breed1: Breed code to look up in ``TOP_BREEDS``.

    Returns:
        An integer ``pd.Series`` with one slot per entry of ``TOP_BREEDS``;
        the slot at the breed's position is 1. Codes that are not in the
        list (including NaN) yield an all-zero Series.
    """
    # Size the vector from the list itself so the two cannot drift apart.
    results = [0] * len(TOP_BREEDS)
    try:
        results[TOP_BREEDS.index(breed1)] = 1
    except ValueError:
        # list.index raises ValueError for a value that is not present;
        # such breeds simply get no flag.
        pass
    return pd.Series(results)

def gender_transform(gender):
    """One-hot encode a 1-based gender code into three integer slots.

    Args:
        gender: Integer code; slot ``gender - 1`` is set to 1.

    Returns:
        An integer ``pd.Series`` of length 3.
    """
    one_hot = [0, 0, 0]
    # NOTE: a code of 0 indexes slot -1, i.e. the last slot.
    one_hot[gender - 1] = 1
    return pd.Series(one_hot)
    
    

def color_transform(colors):
    """Encode a row of colour codes as a count plus per-colour flags.

    Args:
        colors: Object with ``to_numpy()`` (e.g. a row Series) holding the
            colour codes; values <= 0 are ignored.

    Returns:
        A float ``pd.Series`` of length 8: slot 0 is the number of positive
        codes, and slot ``c`` is 1 for every positive code ``c``.
    """
    codes = colors.to_numpy()
    present = codes[codes > 0]
    encoded = np.zeros(8)
    encoded[0] = len(present)
    for code in present:
        encoded[code] = 1
    return pd.Series(encoded)

def maturity_size_transform(maturity):
    """One-hot encode a maturity-size code into five float slots.

    Args:
        maturity: Integer code used directly as the slot index.

    Returns:
        A float ``pd.Series`` of length 5 with slot ``maturity`` set to 1.
    """
    # Row ``maturity`` of the 5x5 identity matrix is the one-hot vector.
    return pd.Series(np.eye(5)[maturity])

def fur_length_transform(fur_length):
    """One-hot encode a fur-length code into four float slots.

    Args:
        fur_length: Integer code used directly as the slot index.

    Returns:
        A float ``pd.Series`` of length 4 with slot ``fur_length`` set to 1.
    """
    identity = np.eye(4)
    return pd.Series(identity[fur_length])

def vaccination_transform(vacc):
    """One-hot encode a 1-based vaccination code into three float slots.

    Args:
        vacc: Integer code; slot ``vacc - 1`` is set to 1.

    Returns:
        A float ``pd.Series`` of length 3.
    """
    slot = vacc - 1
    return pd.Series(np.eye(3)[slot])

def deworme_transform(dew):
    """One-hot encode a 1-based dewormed code into three float slots.

    Args:
        dew: Integer code; slot ``dew - 1`` is set to 1.

    Returns:
        A float ``pd.Series`` of length 3.
    """
    flags = np.zeros(3)
    slot = dew - 1
    flags[slot] = 1
    return pd.Series(flags)

def sterilizes_tranform(ster):
    """One-hot encode a 1-based sterilized code into three float slots.

    Args:
        ster: Integer code; slot ``ster - 1`` is set to 1.

    Returns:
        A float ``pd.Series`` of length 3.
    """
    # Row ``ster - 1`` of the 3x3 identity matrix is the one-hot vector.
    return pd.Series(np.eye(3)[ster - 1])

def health_transform(health):
    """One-hot encode a health code into four float slots.

    Args:
        health: Integer code used directly as the slot index.

    Returns:
        A float ``pd.Series`` of length 4 with slot ``health`` set to 1.
    """
    one_hot_rows = np.eye(4)
    encoded = one_hot_rows[health]
    return pd.Series(encoded)

def state_transform(state):
    """One-hot encode a state code against the fixed list of 15 state codes.

    Args:
        state: State code; must be one of the codes listed in ``STATES``.

    Returns:
        A float ``pd.Series`` of length 15 with the slot at the code's
        position in ``STATES`` set to 1.

    Raises:
        ValueError: If ``state`` is not in ``STATES``.
    """
    STATES = [41336,41325,41367,41401,41415,41324,41332,41335,41330,41380,41327,41345,41342,41326,41361]
    position = STATES.index(state)
    flags = np.zeros(len(STATES))
    flags[position] = 1
    return pd.Series(flags)
    
def feature_engineering(df, state="train"):
    """Turn raw pet-listing rows into the numeric tabular feature frame.

    Steps: coerce selected columns to numeric, drop rows whose
    ``AdoptionSpeed`` is missing or greater than 4 (when that column exists),
    min-max scale the ordinal columns, one-hot encode the categorical columns
    and drop the raw / identifier columns.

    Args:
        df: DataFrame with the raw listing columns. It is copied first, so
            the caller's frame is never modified.
        state: ``"train"`` fits a new ``MinMaxScaler`` and stores it in the
            module-level ``scaler``; any other value reuses that fitted
            scaler. With ``"test"`` the ``AdoptionSpeed`` column is not
            coerced to numeric.

    Returns:
        A new DataFrame containing ``Type`` (0 = dog, 1 = cat), the scaled
        ordinal columns, the purebred flag and the one-hot columns. Rows
        removed by the ``AdoptionSpeed`` filter are absent, so use the
        returned index to align labels. ``AdoptionSpeed`` itself is removed.

    Raises:
        RuntimeError: If called with a non-train state before a ``"train"``
            call has fitted the scaler.
    """
    global scaler

    ordinal_features = ['Age', 'Quantity', 'Fee', 'VideoAmt', 'PhotoAmt']
    numeric_cols = ['Age', 'Breed1', 'PhotoAmt', 'VideoAmt', 'AdoptionSpeed']

    # Work on a private copy: every assignment below would otherwise leak
    # into the frame owned by the caller.
    df = df.copy()

    for col in numeric_cols:
        if state == "test" and col == "AdoptionSpeed":
            continue
        df[col] = pd.to_numeric(df[col], errors='coerce')

    # Frames without a label column (e.g. the submission set) skip the filter.
    if 'AdoptionSpeed' in df.columns:
        df = df.dropna(how='any', subset=['AdoptionSpeed'])
        try:
            df = df[df['AdoptionSpeed'] <= 4]
        except TypeError:
            # Labels were not coerced (state == "test") and are not
            # comparable with an int; keep the rows unfiltered.
            pass
        # Detach from the filtered selection so later assignments are clean.
        df = df.copy()

    df['PhotoAmt'] = df['PhotoAmt'].fillna(0)
    df['VideoAmt'] = df['VideoAmt'].fillna(0)

    if state == "train":
        scaler = MinMaxScaler()
        df[ordinal_features] = scaler.fit_transform(df[ordinal_features].to_numpy())
    else:
        if scaler is None:
            raise RuntimeError(
                "feature_engineering(state='train') must be run first to fit the scaler"
            )
        df[ordinal_features] = scaler.transform(df[ordinal_features].to_numpy())

    processed_df = df  # already a private copy

    # dog: 0 - cat: 1
    processed_df["Type"] = processed_df["Type"].apply(lambda t: 0 if t == 1 else 1)
    # NOTE: the column name carries a trailing space; kept as-is so the
    # produced columns stay unchanged.
    processed_df["Purebred "] = processed_df[["Breed1","Breed2"]].apply(breed_transform, axis=1)

    # (output columns, source column(s), encoder). The order defines the
    # column order of the resulting feature vector, so do not reorder.
    encodings = [
        (breeds, "Breed1", breed_category_transform),
        (["Gender_Male","Gender_Female","Gender_Mixed"], "Gender", gender_transform),
        (colors, ["Color1","Color2","Color3"], color_transform),
        (maturities, "MaturitySize", maturity_size_transform),
        (furlengths, "FurLength", fur_length_transform),
        (vaccinatss, "Vaccinated", vaccination_transform),
        (dewormes, "Dewormed", deworme_transform),
        (sterilizes, "Sterilized", sterilizes_tranform),
        (healths, "Health", health_transform),
        (states, "State", state_transform),
    ]
    for target_cols, source, transform in encodings:
        if isinstance(source, list):
            # Multi-column sources are encoded row by row.
            encoded = processed_df[source].apply(transform, axis=1)
        else:
            encoded = processed_df[source].apply(transform)
        # Create the target columns first, then fill them with the encoding.
        processed_df[target_cols] = 0
        processed_df[target_cols] = encoded

    processed_df = processed_df.drop(columns=[
        "Name","Breed1","Breed2","Gender","Color1","Color2","Color3","MaturitySize",
        "FurLength","Vaccinated","Dewormed","Sterilized","Health","State",
        "RescuerID","Description","PetID",
    ])
    # The label is optional (absent for the submission set).
    processed_df = processed_df.drop(columns=["AdoptionSpeed"], errors="ignore")

    # Ages that could not be parsed stay NaN through scaling; zero them here.
    processed_df['Age'] = processed_df['Age'].fillna(0)

    return processed_df

In [102]:
df_train_features = feature_engineering(df_train)
# feature_engineering can drop rows (missing / out-of-range AdoptionSpeed),
# so select the labels by the surviving index to keep both aligned.
df_train_label = df_train.loc[df_train_features.index, "AdoptionSpeed"]
df_train_features.head()

Unnamed: 0,Type,Age,Quantity,Fee,VideoAmt,PhotoAmt,Purebred,Breed_1,Breed_2,Breed_3,...,State_6,State_7,State_8,State_9,State_10,State_11,State_12,State_13,State_14,State_15
0,1,0.016807,0.0,0.05,0.0,0.068966,1,0,0,0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0
1,1,0.008403,0.0,0.0,0.0,0.068966,1,0,0,0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2,1,0.012605,0.0,0.016667,0.0,0.034483,1,0,0,0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0
3,1,0.05042,0.0,0.0,0.0,0.068966,1,0,0,0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0
4,0,0.021008,0.0,0.0,0.125,0.482759,1,0,0,0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0


In [25]:
df_test_features = feature_engineering(df_val, state="test")
# feature_engineering can drop rows (missing / out-of-range AdoptionSpeed),
# so select the labels by the surviving index to keep both aligned.
df_test_label = df_val.loc[df_test_features.index, "AdoptionSpeed"]
df_test_features.head()

Unnamed: 0,Type,Age,Quantity,Fee,VideoAmt,PhotoAmt,Purebred,Breed_1,Breed_2,Breed_3,...,State_6,State_7,State_8,State_9,State_10,State_11,State_12,State_13,State_14,State_15
0,1,0.063025,0.052632,0.0,0.0,0.103448,1,0,0,0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0
1,0,0.046218,0.0,0.0,0.0,0.0,1,0,0,0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0
2,1,0.042017,0.0,0.0,0.0,0.0,1,0,0,0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0
3,0,0.352941,0.0,0.0,0.0,0.0,1,0,0,0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
4,1,0.176471,0.0,0.0,0.125,0.103448,1,0,0,0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0


In [26]:
embedding_vector = feature_engineering(df_val[df_val["PetID"] == "6e4724604"].copy(), state="test").to_numpy()
embedding_vector.shape

(1, 75)

# Text

In [30]:
GRU_MODEL_PATH = "gru_lstm_model/save_gru"
LSTM_MODEL_PATH = "gru_lstm_model/save_LSTM"

In [31]:
input_text = 'This is a funny clever dog . Very hyper active . Friendly all time to live with 2 dogs , and caring to my kids too . She can " EAT " anything , so have to care with that . Because of migrate issues I force to give up to continue love her . Is there anyone can help me take care her ( must take good care ) , I can offers pay monthly caring fees .'

In [36]:
input_text_features = predict_text(input_text, True, LSTM_MODEL_PATH)
input_text_features.shape

(1, 64)

# Aggregate all features from the image, tabular and text vectors

In [39]:
IMAGES_SOURCE = "petfinder-adoption-prediction/train_images"

In [40]:
# input for aggregate function
temp = df_val[df_val["PetID"] == "249af4201"].copy()
temp

Unnamed: 0,Type,Name,Age,Breed1,Breed2,Gender,Color1,Color2,Color3,MaturitySize,...,Health,Quantity,Fee,State,RescuerID,VideoAmt,Description,PetID,PhotoAmt,AdoptionSpeed
4,2,Cally,42,266,0,2,1,4,7,1,...,1,1,0,41326,2ebac67d1aac94488703313d85406c7f,1,For serious & committed adopters. Adopted cat ...,249af4201,4,2


In [109]:
def aggregate_features(df_agg, submit=False):
    """Build the fused (image + tabular + text) feature vector for one pet.

    Args:
        df_agg: DataFrame whose first row is the listing to encode. It must
            provide ``PetID``, ``Description`` and the columns needed by
            ``feature_engineering``. The frame is not modified.
        submit: When true, images are looked up in the competition
            test-image folder instead of ``IMAGES_SOURCE``.

    Returns:
        ``(features, label)``. ``features`` concatenates, along axis 1, the
        (1, 512) mean autoencoder image features (all zeros when the pet has
        no image), the tabular features and the text features; in this
        notebook that yields shape (1, 651). ``label`` is the first row's
        ``AdoptionSpeed`` as ``int``, or ``None`` when it is missing or not
        convertible.
    """
    pet_id = str(df_agg["PetID"].to_numpy()[0])

    # image features: mean over every image stored for this pet
    if submit:
        images_dir = "petfinder-adoption-prediction/test_images"
    else:
        images_dir = IMAGES_SOURCE
    image_paths = glob.glob(os.path.join(images_dir, pet_id + "-*.jpg"))

    img_features = np.zeros((1,512))
    for img_path in image_paths:
        img = load_image_local(img_path, size=IMAGE_SIZE_AUTOENCODER)
        img_features += np.array(feature_extractor_autoencoder(img)).reshape(1,512)
    if image_paths:
        img_features /= len(image_paths)

    # tabular features: hand over a copy so df_agg stays untouched regardless
    # of what feature_engineering does with its argument
    tabular_features = feature_engineering(df_agg.copy(), state="test")
    tabular_features = tabular_features.to_numpy().astype(float)

    # text features
    # NOTE(review): Description may be NaN for listings without text;
    # confirm predict_text handles that input.
    desc = df_agg["Description"].values[0]
    text_features = predict_text(desc, True, LSTM_MODEL_PATH)

    # label: optional, e.g. absent for the submission set
    try:
        label = int(df_agg["AdoptionSpeed"].to_numpy()[0])
    except (KeyError, ValueError, TypeError, OverflowError):
        label = None

    fused = np.concatenate((img_features, tabular_features, text_features), axis=1)
    return fused, label

In [66]:
aggregate_features(temp)[0].shape

(1, 651)

In [67]:
training_set = None
label_train_set = []

In [68]:
# Collect the per-row vectors and stack them once at the end; concatenating
# inside the loop re-copies the whole matrix on every iteration.
# Both accumulators are reset here so re-running the cell is idempotent.
train_rows = []
label_train_set = []
for i in tqdm(df_train.index):
    df_agg = df_train.loc[[i]].copy()
    agg_features, label = aggregate_features(df_agg)
    train_rows.append(agg_features)
    label_train_set.append(label)
training_set = np.concatenate(train_rows) if train_rows else None

100%|██████████████████████████████████████████████████████████████████████████| 10256/10256 [3:34:47<00:00,  1.26s/it]


In [69]:
validation_set = None
label_set = []

In [70]:
# Collect the per-row vectors and stack them once at the end; concatenating
# inside the loop re-copies the whole matrix on every iteration.
# Both accumulators are reset here so re-running the cell is idempotent.
validation_rows = []
label_set = []
for i in tqdm(df_val.index):
    df_agg = df_val.loc[[i]].copy()
    agg_features, label = aggregate_features(df_agg)
    validation_rows.append(agg_features)
    label_set.append(label)
validation_set = np.concatenate(validation_rows) if validation_rows else None

100%|████████████████████████████████████████████████████████████████████████████| 4396/4396 [1:28:17<00:00,  1.21s/it]


In [None]:
validation_set.shape

In [None]:
label_set_np = np.array(label_set)
label_set_np.shape

#### XGBClassifier on the fused Image + Tabular + Text features

In [72]:
# Fit an XGBClassifier with default hyper-parameters on the fused training
# matrix; the labels are the per-row values collected in label_train_set.
model = XGBClassifier()
model.fit(training_set, np.array(label_train_set), verbose=1)





XGBClassifier(base_score=0.5, booster='gbtree', colsample_bylevel=1,
              colsample_bynode=1, colsample_bytree=1, gamma=0, gpu_id=-1,
              importance_type='gain', interaction_constraints='',
              learning_rate=0.300000012, max_delta_step=0, max_depth=6,
              min_child_weight=1, missing=nan, monotone_constraints='()',
              n_estimators=100, n_jobs=8, num_parallel_tree=1,
              objective='multi:softprob', random_state=0, reg_alpha=0,
              reg_lambda=1, scale_pos_weight=None, subsample=1,
              tree_method='exact', validate_parameters=1, verbosity=None)

In [73]:
Y_pred_1 = model.predict(validation_set)

In [78]:
cohen_kappa_score(Y_pred_1, label_set, weights="quadratic")

0.28639616303583204

In [80]:
# Persist the fitted classifier with pickle. Only ever unpickle this file
# from a trusted source: pickle.load can execute arbitrary code.
with open("weights/early_fusion_512_75_64", "wb") as f:
    pickle.dump(model, f)

# Submission

In [82]:
df_test = pd.read_csv("test.csv")
df_test.head()

Unnamed: 0,Type,Name,Age,Breed1,Breed2,Gender,Color1,Color2,Color3,MaturitySize,...,Sterilized,Health,Quantity,Fee,State,RescuerID,VideoAmt,Description,PetID,PhotoAmt
0,2,Dopey & Grey,8,266,266,1,2,6,7,1,...,2,1,2,0,41326,2ece3b2573dcdcebd774e635dca15fd9,0,"Dopey Age: 8mths old Male One half of a pair, ...",e2dfc2935,2.0
1,2,Chi Chi,36,285,264,2,1,4,7,2,...,1,2,1,0,41326,2ece3b2573dcdcebd774e635dca15fd9,0,"Please note that Chichi has been neutered, the...",f153b465f,1.0
2,2,Sticky,2,265,0,1,6,7,0,2,...,2,1,1,200,41326,e59c106e9912fa30c898976278c2e834,0,"Sticky, named such because of his tendency to ...",3c90f3f54,4.0
3,1,Dannie & Kass [In Penang],12,307,0,2,2,5,0,2,...,1,1,2,0,41326,e59c106e9912fa30c898976278c2e834,0,Dannie and Kass are mother and daughter. We en...,e02abc8a3,5.0
4,2,Cuddles,12,265,0,1,2,3,7,2,...,1,1,1,0,41326,e59c106e9912fa30c898976278c2e834,0,"Extremely cuddly cat, hence the origin of his ...",09f0df7d1,5.0


In [83]:
df_submission = pd.read_csv("sample_submission.csv")

In [84]:
df_submission.head()

Unnamed: 0,PetID,AdoptionSpeed
0,e2dfc2935,0
1,f153b465f,0
2,3c90f3f54,0
3,e02abc8a3,0
4,09f0df7d1,0


In [110]:
# Predict one test row at a time and write the class into the submission
# frame at the same index label.
# NOTE(review): this assumes df_test and df_submission share the same row
# order / index (the heads shown above agree) — confirm for the full files.
for i in tqdm(df_test.index):
    df_agg = df_test[df_test.index == i].copy()
    # submit=True makes aggregate_features read from the test-image folder.
    agg_features, _ = aggregate_features(df_agg, submit=True)
    y_pred = model.predict(agg_features)
    df_submission.loc[i, "AdoptionSpeed"] = int(y_pred[0])

100%|████████████████████████████████████████████████████████████████████████████| 3972/3972 [2:07:20<00:00,  1.92s/it]


In [113]:
# index=False: keep the file to the PetID / AdoptionSpeed columns; the
# default would prepend the row index as an extra unnamed column.
df_submission.to_csv("submission_final.csv", index=False)