In [None]:
#!g1.1

%matplotlib inline
import matplotlib
import matplotlib.pyplot as plt
import os
import io
from os import path
from glob import iglob
from tqdm import tqdm
import itertools
import numpy as np
import pandas as pd

from PIL import Image
from sklearn.model_selection import StratifiedKFold
from sklearn.metrics import confusion_matrix, roc_auc_score, accuracy_score, precision_score, recall_score, f1_score
import lightgbm as lgb
import h5py
import boto3
import keras

from tensorflow.keras.applications.resnet50 import ResNet50
from keras.preprocessing import image
from keras.applications.imagenet_utils import preprocess_input, decode_predictions

# NOTE(review): `keras` has already been imported above, so setting
# KERAS_BACKEND here presumably comes too late to affect backend selection —
# confirm, and move it before the keras import if the setting is really needed.
os.environ["KERAS_BACKEND"] = "cntk"
# Region name; read back from the environment when the boto3 session is created.
os.environ['AWS_DEFAULT_REGION'] = "ru-central1"


## Connect to S3

In [None]:
#!g1.1

# Name of your bucket with images; used for the object listing and for every
# image download performed in this notebook.
bucket_name = 'bucketwithvideo'

In [None]:
#!g1.1

ENDPOINT = "https://storage.yandexcloud.net"

# Credentials and region are taken from environment variables so that no
# secret is stored in the notebook itself.
session = boto3.Session(
    aws_access_key_id=os.environ['token'],
    aws_secret_access_key=os.environ['key_value'],
    region_name=os.environ['AWS_DEFAULT_REGION'],
)

s3 = session.client("s3", endpoint_url=ENDPOINT)


def _list_bucket_objects(prefix, max_keys=1000):
    """Return the object descriptions stored under `prefix` in `bucket_name`.

    Args:
        prefix: key prefix to list.
        max_keys: upper bound on the number of objects requested in the call.

    Returns:
        The list of object dicts from the 'Contents' field of the response.

    Raises:
        ValueError: if nothing is stored under the prefix (the response then
            has no 'Contents' field, which used to surface as a bare KeyError).
    """
    response = s3.list_objects_v2(Bucket=bucket_name, Prefix=prefix, MaxKeys=max_keys)
    contents = response.get('Contents', [])
    if not contents:
        raise ValueError(
            "No objects found in bucket '{0}' under prefix '{1}'".format(bucket_name, prefix)
        )
    if response.get('IsTruncated'):
        # A single call returns at most `max_keys` objects; make the cut-off visible.
        print("Warning: listing for prefix '{0}' is truncated to {1} objects".format(
            prefix, len(contents)))
    return contents


# Get list of bucket objects
fail_files = _list_bucket_objects('bus/bus')
pass_files = _list_bucket_objects('car/car')

df_fail_files = pd.DataFrame(fail_files)
df_pass_files = pd.DataFrame(pass_files)


In [None]:
#!g1.1
def from_s3(bucket, key):
    """Fetch one object from S3 and open it as a PIL image.

    Args:
        bucket: name of the bucket that holds the object.
        key: key of the object inside the bucket.

    Returns:
        The `PIL.Image` opened from the downloaded bytes.
    """
    response = s3.get_object(Bucket=bucket, Key=key)
    raw_bytes = response['Body'].read()
    # Wrap the bytes in an in-memory buffer so PIL can read them like a file.
    return Image.open(io.BytesIO(raw_bytes))
    

## Labeling

In [None]:
#!g1.1
# Keys of the "fail" images come first, followed by the "pass" images;
# each group is sorted on its own.
fail_keys = sorted(df_fail_files.Key.values)
pass_keys = sorted(df_pass_files.Key.values)
image_paths = fail_keys + pass_keys

n_fail, n_pass = len(fail_files), len(pass_files)
total_files = n_fail + n_pass

# Label 0 for every "fail" image, label 1 for every "pass" image,
# in the same order as `image_paths`.
labels = np.concatenate([np.zeros(n_fail), np.ones(n_pass)])
print(labels)
print('Number of fail images {0} pass images {1} total {2};'.format(n_fail, n_pass, len(labels)))

In [None]:
#!g1.1
# Preview one image as a sanity check. The index is clamped to the last
# available image so the cell also works when fewer than 951 images are listed.
sample_index = min(950, len(image_paths) - 1)
pil_image = from_s3(bucket_name, image_paths[sample_index])
print(pil_image.size)
pil_image


## Computing image features

In [None]:
#!g1.1
# ResNet50 with ImageNet weights, expecting 224x224 inputs with 3 channels.
# NOTE(review): `include_top` is not passed, so the network presumably keeps
# its classification head and outputs class scores rather than pooled
# convolutional features — confirm this is the intended featurization.
model = ResNet50(weights='imagenet',  input_shape=(224, 224, 3))

In [None]:
#!g1.1
def file_batch(file_list, batch_size):
    """Yield consecutive slices of `file_list` holding up to `batch_size` items.

    The last slice is shorter when the list length is not a multiple of
    `batch_size`; an empty list yields nothing.
    """
    starts = range(0, len(file_list), batch_size)
    yield from (file_list[start:start + batch_size] for start in starts)

In [None]:
#!g1.1
def featurize_images(file_list, model, batch_size=32):
    """Run every image in `file_list` through `model` and collect the outputs.

    Images are downloaded from `bucket_name`, converted to RGB, resized to
    224x224, preprocessed and pushed through the model one batch at a time.

    Args:
        file_list: S3 keys of the images to featurize.
        model: model exposing `predict_on_batch`.
        batch_size: number of images sent to the model per call.

    Returns:
        A NumPy array with one row per image; each row is the flattened model
        output for that image.
    """
    features = []
    n_batches = -(-len(file_list) // batch_size)  # ceiling division, for the progress bar

    for fb in tqdm(file_batch(file_list, batch_size), total=n_batches):
        batch_arrays = []
        for file_path in fb:
            # Force three channels so grayscale/RGBA files match the model input.
            img = from_s3(bucket_name, file_path).convert('RGB').resize((224, 224))
            x = image.img_to_array(img)
            batch_arrays.append(preprocess_input(np.expand_dims(x, axis=0)))
        # Predict once per batch, after all of its images are loaded
        # (previously this ran again for every image added to the batch).
        arr_pred = np.asarray(model.predict_on_batch(np.concatenate(batch_arrays)))
        # Keep the batch axis explicit: squeeze() would collapse a one-image
        # batch to 1-D and extend() would then add scalars instead of one row.
        features.extend(arr_pred.reshape(len(fb), -1))
    return np.array(features)


In [None]:
#!g1.1

# Executing this cell takes 10-15 minutes
%%time


# The features are cached in a .npy file: when the file already exists it is
# loaded, otherwise the features are computed and then saved to that file.
features_filename = "../features_resnet50.npy"
if path.isfile(features_filename):
    print("Features found!")
    features = np.load(features_filename)
else:
    print("Computing features")
    features = featurize_images(image_paths, model) 
    np.save(features_filename, features)


In [None]:
#!g1.1

# Check the shape of the feature array
# (expected: one row per entry of `image_paths` — verify if a cached file was loaded)
features.shape

## Training and cross-validation

In [None]:
#!g1.1
n_splits = 5

# Shuffled, stratified folds; the fixed seed makes the split repeatable.
skf = StratifiedKFold(
    n_splits=n_splits,
    shuffle=True,
    random_state=2048,
)

# Empty table that will receive the metrics of the folds.
metric_names = ['Accuracy', 'Precision', 'Recall', 'F1', 'AUC', 'Confusion Matrix']
cv_results = pd.DataFrame(columns=metric_names)


The function `classification_metrics` is used to calculate quality metrics for each fold.

In [None]:
#!g1.1

def classification_metrics(y_true, y_pred_proba, threshold=0.5):
    """Compute binary-classification quality metrics.

    Args:
        y_true: ground-truth labels.
        y_pred_proba: predicted scores for the positive class.
        threshold: scores strictly above this value are predicted as class 1.

    Returns:
        A dict with the keys 'Accuracy', 'Precision', 'Recall', 'F1', 'AUC'
        and 'Confusion Matrix' (the latter as a nested list).
    """
    # Turn the scores into hard 0/1 predictions.
    hard_pred = np.where(y_pred_proba > threshold, 1, 0)
    return {
        'Accuracy': accuracy_score(y_true, hard_pred),
        'Precision': precision_score(y_true, hard_pred),
        'Recall': recall_score(y_true, hard_pred),
        'F1': f1_score(y_true, hard_pred),
        # AUC is computed from the raw scores, not from the thresholded labels.
        'AUC': roc_auc_score(y_true, y_pred_proba),
        'Confusion Matrix': confusion_matrix(y_true, hard_pred).tolist(),
    }

Try different parameters of LightGBM model to see how it affects the results.

In [None]:
#!g1.1
# LightGBM training parameters (binary objective, logging silenced).
params = dict(
    num_leaves=512,
    learning_rate=0.1,
    min_split_gain=0.1,
    min_child_weight=30,
    reg_lambda=1,
    subsample=1,
    objective='binary',
    task='train',
    verbose=-1,
)

In [None]:
#!g1.1
# Train and evaluate one model per fold, collecting the metrics of each fold.
fold_metrics = []
for train_index, test_index in tqdm(skf.split(features, labels), total=skf.get_n_splits()):
    X_train, X_test = features[train_index], features[test_index]
    y_train, y_test = labels[train_index], labels[test_index]
    lgb_train = lgb.Dataset(X_train, y_train, free_raw_data=False)
    # `verbose_eval` is no longer accepted by recent LightGBM releases and had
    # no effect here (no validation sets); 'verbose': -1 in `params` silences logging.
    clf = lgb.train(params, lgb_train, num_boost_round=500)
    y_pred_proba = clf.predict(X_test)
    # Compute the metrics once per fold and reuse them for printing and storage.
    cm_dict = classification_metrics(y_test, y_pred_proba)
    print(cm_dict)
    fold_metrics.append(cm_dict)

# Build the result table in one go: `DataFrame.append` was removed in pandas 2.0,
# and rebuilding from the list keeps the cell idempotent when it is re-run.
cv_results = pd.DataFrame(fold_metrics, columns=cv_results.columns)

In [None]:
#!g1.1

# Fit the final classifier on all available features and labels.
lgb_train = lgb.Dataset(data=features, label=labels, free_raw_data=False)
clf = lgb.train(params=params, train_set=lgb_train, num_boost_round=500)

# Scores for the same samples the classifier was trained on.
y_pred_proba = clf.predict(data=features)

In [None]:
#!g1.1

# Function for displaying the confusion matrix
def plot_confusion_matrix(cm, classes, normalize=False, title='Confusion matrix', cmap=plt.cm.Blues):
    """Draw a confusion matrix as an annotated heat map.

    Based on: http://scikit-learn.org/stable/auto_examples/model_selection/plot_confusion_matrix.html

    Args:
        cm: 2-D NumPy array holding the confusion matrix.
        classes: tick labels, one per class.
        normalize: when True, every row is divided by its sum before plotting.
        title: title of the figure.
        cmap: matplotlib colormap used for the cells.
    """
    # Color scale: never start above zero; top out at 1 for normalized data.
    lower = min(cm.min(), 0)
    upper = 1 if normalize else cm.max()
    if normalize:
        row_totals = cm.sum(axis=1)[:, np.newaxis]
        cm = cm.astype('float') / row_totals

    plt.imshow(cm, interpolation='nearest', cmap=cmap)
    plt.title(title)
    plt.colorbar()
    positions = np.arange(len(classes))
    plt.xticks(positions, classes, rotation=45)
    plt.yticks(positions, classes)
    midpoint = upper / 2.
    plt.clim(lower, upper)

    # Write each value into its cell, switching to white text on dark cells.
    for row in range(cm.shape[0]):
        for col in range(cm.shape[1]):
            value = cm[row, col]
            if value > midpoint:
                text_color = "white"
            else:
                text_color = "black"
            plt.text(col, row,
                     round(value, 3),  # 3 decimals when the values are floats
                     horizontalalignment="center",
                     color=text_color)
    plt.ylabel('True label')
    plt.xlabel('Predicted label')
    plt.show()

In [None]:
#!g1.1
# Confusion matrix of the final classifier, plotted row-normalized and
# printed with the raw counts.
final_metrics = classification_metrics(labels, y_pred_proba)
cm = np.asarray(final_metrics['Confusion Matrix'])
plot_confusion_matrix(cm, classes=['fail', 'pass'], normalize=True)
print('Confusion Matrix:')
print(cm)

## Saving the model

Save the model to the project folder. You will download this file in the next notebook to visualize the solution.

In [None]:
#!g1.1
# File the trained classifier is written to (a path relative to the current
# working directory).
model_path = 'lightgbm_classifier.model'
clf.save_model(model_path)