In [4]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here's several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
# for dirname, _, filenames in os.walk('/kaggle/input'):
#     for filename in filenames:
#         print(os.path.join(dirname, filename))

# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

In [3]:
import os
import numpy as np
import pandas as pd

from PIL import Image
from sklearn import ensemble
from sklearn import metrics
from sklearn import model_selection
from tqdm import tqdm


def create_dataset(training_df, image_dir):
    """
    This function takes the training dataframe
    and outputs training array and labels
    :param training_df: dataframe with ImageId, Target columns
    :param image_dir: location of images (folder), string
    :return: X, y (training array with features and labels)
    """
    # create empty list to store image vectors
    images = []

    # create empty list to store targets
    targets = []

    # loop over the dataframe
    for index, row in tqdm(
        training_df.iterrows(),
        total=len(training_df),
        desc="processing images"
    ):
        # get image id
        image_id = row['ImageId']
        # create image path
        image_path = os.path.join(image_dir, image_id)
        # open image using PIL
        image = Image.open(image_path + '.png')
        # resize image to 256x256. we use bilinear resamplng
        image = image.resize((256, 256), resample=Image.BILINEAR)
        # convert image to array
        image = np.array(image)
        # ravel
        image = image.ravel()
        # append images and targets lists
        images.append(image)
        targets.append(int(row['target']))
    # convert list of list of images to numpy array
    images = np.array(images)
    print(images.shape)
    return images, targets

In [10]:
csv_path = '../input/siim-png-train-csv/train.csv'
image_path = '../input/siim-png-images/train_png'

# read_csv with imageid and target columns
df = pd.read_csv(csv_path)

# we create a new column called kfold and fill it with -1
df['kfold'] = -1

# the next step is to randomize the rows of the data
df = df.sample(frac=1).reset_index(drop=True)

# fetch labels
y = df.target.values

# initiate the kfold class from model_selection module
kf = model_selection.StratifiedKFold(n_splits=5)

# fill the new kfold column
for f, (t_, v_) in enumerate(kf.split(X=df, y=y)):
    df.loc[v_, 'kfold'] = f

# we go over the folds created
for fold_ in range(5):
    # temporary dataframes for train and test
    train_df = df[df.kfold != fold_].reset_index(drop=True)
    test_df = df[df.kfold == fold_].reset_index(drop=True)

    # create train dataset
    # you can move this outside to save some computation time
    xtrain, ytrain = create_dataset(train_df, image_path)

    # create test dataset
    # you can move this outsiet to save some compuation time
    xtest, ytest = create_dataset(test_df, image_path)

    # fit random forest without any modification of time
    clf = ensemble.RandomForestClassifier(n_jobs=-1)
    clf.fit(xtrain, ytrain)

    # predict probability of class 1
    preds = clf.predict_proba(xtest)[:, 1]

    # print results
    print(f'FOLD: {fold_}')
    print(f'AUC = {metrics.roc_auc_score(ytest, preds)}')
    print("")

processing images: 100%|██████████| 8540/8540 [02:25<00:00, 58.56it/s]
processing images:   0%|          | 6/2135 [00:00<00:37, 57.15it/s]

(8540, 65536)


processing images: 100%|██████████| 2135/2135 [00:46<00:00, 45.55it/s]


(2135, 65536)


processing images:   0%|          | 5/8540 [00:00<03:14, 43.87it/s]

FOLD: 0
AUC = 0.7140368805750149



processing images: 100%|██████████| 8540/8540 [03:11<00:00, 44.55it/s]
processing images:   0%|          | 6/2135 [00:00<00:36, 58.53it/s]

(8540, 65536)


processing images: 100%|██████████| 2135/2135 [00:49<00:00, 43.19it/s]


(2135, 65536)


processing images:   0%|          | 6/8540 [00:00<02:30, 56.83it/s]

FOLD: 1
AUC = 0.7265621438448797



processing images: 100%|██████████| 8540/8540 [02:26<00:00, 58.41it/s]
processing images:   0%|          | 6/2135 [00:00<00:35, 59.63it/s]

(8540, 65536)


processing images: 100%|██████████| 2135/2135 [00:36<00:00, 58.60it/s]


(2135, 65536)


processing images:   0%|          | 6/8540 [00:00<02:29, 56.90it/s]

FOLD: 2
AUC = 0.7169899605411785



processing images: 100%|██████████| 8540/8540 [02:26<00:00, 58.22it/s]
processing images:   0%|          | 7/2135 [00:00<00:35, 60.44it/s]

(8540, 65536)


processing images: 100%|██████████| 2135/2135 [00:36<00:00, 58.85it/s]


(2135, 65536)


processing images:   0%|          | 6/8540 [00:00<02:33, 55.51it/s]

FOLD: 3
AUC = 0.7462149416728717



processing images: 100%|██████████| 8540/8540 [02:25<00:00, 58.68it/s]
processing images:   0%|          | 6/2135 [00:00<00:36, 58.08it/s]

(8540, 65536)


processing images: 100%|██████████| 2135/2135 [00:36<00:00, 58.56it/s]


(2135, 65536)
FOLD: 4
AUC = 0.7128928344958783

