# Download the dataset with image info from the GitHub repo

In [None]:
!mkdir hotels-50k
!wget -P hotels-50k https://github.com/GWUvision/Hotels-50K/raw/master/input/dataset.tar.gz
!tar -xvzf hotels-50k/dataset.tar.gz -C hotels-50k
!rm hotels-50k/dataset.tar.gz

# Load data info

In [None]:
import pandas as pd
import tqdm

In [None]:
# Chain table from the extracted dataset folder.
chain_info_path = "./hotels-50k/dataset/chain_info.csv"
chain_df = pd.read_csv(chain_info_path)

In [None]:
# Hotel table from the extracted dataset folder.
hotel_info_path = "./hotels-50k/dataset/hotel_info.csv"
hotel_df = pd.read_csv(hotel_info_path)

In [None]:
import plotly
import plotly.express as px
import plotly.graph_objects as go
from plotly.subplots import make_subplots

In [None]:
# The training list is read without a header row, so the column names are supplied here.
train_columns = ['image_id', 'hotel_id', 'url', 'source', 'timestamp']
train_df = pd.read_csv(
    './hotels-50k/dataset/train_set.csv',
    names=train_columns,
    header=None,
)

# Check Hotels-50k data

In [None]:
# Attach hotel columns, then chain columns, to every training image row.
images_with_hotels = train_df.merge(hotel_df, on="hotel_id")
data_df = images_with_hotels.merge(chain_df, on="chain_id")

# Store the three id columns as strings.
for id_column in ("image_id", "hotel_id", "chain_id"):
    data_df[id_column] = data_df[id_column].astype(str)

### Hotel and image count per chain

In [None]:
# Distinct hotel and image counts for every chain name.
chain_group_df = data_df.groupby("chain_name").agg(
    hotel_id_nunique=("hotel_id", "nunique"),
    image_id_nunique=("image_id", "nunique"),
)
# Sort by hotel count and reverse, so the chains with the most hotels come first.
chain_group_df = chain_group_df.reset_index().sort_values("hotel_id_nunique").iloc[::-1]

In [None]:
# Bubble chart: one marker per chain, hotel count on a log y-axis,
# marker size and colour both driven by the image count.
fig = px.scatter(
    chain_group_df,
    x="chain_name",
    y="hotel_id_nunique",
    color="image_id_nunique",
    size="image_id_nunique",
    size_max=75,
    log_y=True,
    hover_name=None,
)

image_count_colorbar = dict(colorbar=dict(title="Image count"))
fig.update_layout(title="Hotel and image count per chain", coloraxis=image_count_colorbar)
# NOTE(review): the x-axis is titled "Chain ID" although it plots chain_name — confirm the label.
fig.update_xaxes(title_text="Chain ID")
fig.update_yaxes(title_text="Hotel count")
fig.update_traces(hovertemplate="Chain: %{x} <br>Hotel count: %{y:%d}<br>Image count: %{marker.size:%d}")
fig.show()

### Image count per hotel

In [None]:
# Number of image rows per hotel, hotels with the most images first.
images_per_hotel = data_df.groupby(["hotel_id"]).size()
group_df = images_per_hotel.to_frame("image_count").sort_values("image_count")[::-1].reset_index()

In [None]:
# Number of image rows per source, busiest source first.
# A dedicated name is used so the per-hotel `group_df` is not overwritten; the
# "image count per hotel" histogram still reads `group_df`.
source_group_df = data_df.groupby(["source"]).size().to_frame("image_count").sort_values("image_count")[::-1].reset_index()

fig = px.bar(source_group_df, x="source", y="image_count", height=500)
fig.update_layout(title="Image count per source")
fig.update_traces(hovertemplate="Source: %{x:%d} <br>Image count: %{y:%d}")
fig.update_yaxes(title_text="Image count")
fig.update_xaxes(title_text="Source")
fig.show()

In [None]:
# Histogram (with a marginal box plot) of the image_count column.
# Expects group_df to hold one row per hotel.
fig = px.histogram(
    group_df,
    x="image_count",
    nbins=100,
    marginal="box",
    height=500,
)
fig.update_traces(hovertemplate="Image count: %{x} <br>Hotel count: %{y:%d}")
fig.update_layout(title="Distribution of image count per hotel")
fig.update_xaxes(title_text="Image count")
fig.update_yaxes(title_text="Hotel count")
fig.show()

# Only select same hotels as present in Hotel-ID

In [None]:
# Count of image_id values per hotel, indexed by hotel_id.
image_ids_by_hotel = data_df.groupby(by=["hotel_id"])["image_id"]
hotel_group_df = image_ids_by_hotel.count().to_frame("image_count")

In [None]:
import csv

# Hotel-IDs of Kaggle's original dataset will be stored in hotels_training:
# the distinct values of the first column of train.csv, in first-seen order.
# NOTE(review): assumes the first column of train.csv is the hotel id — confirm against the file.
with open('../input/traincsv/train.csv', newline='') as file_obj:
    reader_obj = csv.reader(file_obj)

    # The first row is the header, not data.
    next(reader_obj, None)

    # dict keys de-duplicate in constant time per row and keep insertion order;
    # blank rows are skipped instead of failing on row[0].
    hotels_training = list(dict.fromkeys(row[0] for row in reader_obj if row))

In [None]:
# Chain names to leave out of the sample.
hotel_names = ['unknown']
# Keep the rows whose hotel_id is NOT in hotels_training, whose chain_name is not
# listed in hotel_names, and whose hotel_id appears in sample_hotels.index.
# NOTE(review): sample_hotels is not defined in any visible cell — confirm where it is created.
# NOTE(review): the section title talks about hotels present in Hotel-ID, while the first
# mask excludes the hotels found in hotels_training — confirm which one is intended.
sample_df = data_df[~data_df['hotel_id'].isin(hotels_training) & ~data_df['chain_name'].isin(hotel_names) & data_df['hotel_id'].isin(sample_hotels.index)].reset_index(drop=True)

In [None]:
# Distinct hotel and image counts per chain, this time for the sampled rows only.
chain_group_df = sample_df.groupby("chain_name").agg(
    hotel_id_nunique=("hotel_id", "nunique"),
    image_id_nunique=("image_id", "nunique"),
)
# Sort by hotel count and reverse, so the chains with the most hotels come first.
chain_group_df = chain_group_df.reset_index().sort_values("hotel_id_nunique").iloc[::-1]

# Bubble chart: one marker per chain, marker size and colour driven by the image count.
fig = px.scatter(
    chain_group_df,
    x="chain_name",
    y="hotel_id_nunique",
    color="image_id_nunique",
    size="image_id_nunique",
    size_max=75,
    hover_name=None,
)

image_count_colorbar = dict(colorbar=dict(title="Image count"))
fig.update_layout(title="Sampled data <br>Hotel and image count per chain", coloraxis=image_count_colorbar)
# NOTE(review): the x-axis is titled "Chain ID" although it plots chain_name — confirm the label.
fig.update_xaxes(title_text="Chain ID")
fig.update_yaxes(title_text="Hotel count")
fig.update_traces(hovertemplate="Chain: %{x} <br>Hotel count: %{y:%d}<br>Image count: %{marker.size:%d}")
fig.show()

# Download sampled images

## Prepare to download images
The SSL certificate of the image URLs has expired, so we create an SSL context that skips hostname checking and certificate verification for these downloads.

In [None]:
from __future__ import print_function
import csv, multiprocessing, cv2, os
import numpy as np
import urllib
import urllib.request

import ssl

# SSL context used for the image downloads, with verification switched off.
ctx = ssl.create_default_context()
# Hostname checking is disabled first; the ssl module rejects CERT_NONE while it is still enabled.
ctx.check_hostname = False
# Do not validate the server certificate at all.
ctx.verify_mode = ssl.CERT_NONE

In [None]:
# Root folder for the downloaded data and the sub-folder holding the training images.
output_folder = "hotels-50k/images"
output_image_folder = output_folder + "/train"

# exist_ok keeps the cell re-runnable: without it a second run raises FileExistsError.
os.makedirs(output_image_folder, exist_ok=True)

## Download images

We will download the images, pad each one to a square, resize it to 512x512, and keep the original folder structure: hotels-50k/images/train/chain_id/hotel_id/source/image_id.jpeg

In [None]:
def url_to_image(url):
    """Download ``url`` and decode the response body with OpenCV.

    The request is made with the module-level ``ctx`` SSL context.

    Args:
        url: Address of the image to fetch.

    Returns:
        Whatever ``cv2.imdecode`` yields for the downloaded bytes when called
        with ``cv2.IMREAD_UNCHANGED``.
    """
    # The context manager closes the HTTP response as soon as the body is read,
    # instead of leaving the connection open until garbage collection.
    with urllib.request.urlopen(url, context=ctx) as resp:
        payload = resp.read()

    raw_bytes = np.asarray(bytearray(payload), dtype="uint8")
    return cv2.imdecode(raw_bytes, cv2.IMREAD_UNCHANGED)


def download_images(imList, width=512, height=512):
    """Download, square-pad and resize every image described in ``imList``.

    Each image is written to
    ``output_image_folder/<chain_id>/<hotel_id>/<source>/<image_id>.<ext>``,
    where ``<ext>`` is the text after the last ``.`` of the url. Files that
    already exist are not downloaded again. A failure on one image is printed
    together with its url and the loop moves on to the next image.

    Args:
        imList: 2d list, rows are samples; columns are
            "chain_id", "hotel_id", "source", "image_id", "url".
        width: Width of the saved image in pixels.
        height: Height of the saved image in pixels.
    """
    for im in imList:
        try:
            save_dir = os.path.join(output_image_folder, im[0], im[1], im[2])
            # exist_ok removes the check-then-create race: another pool worker
            # may create the same folder between the check and the call.
            os.makedirs(save_dir, exist_ok=True)

            save_path = os.path.join(save_dir, str(im[3]) + '.' + im[4].split('.')[-1])
            if os.path.isfile(save_path):
                print('Already exists: ' + save_path)
                continue

            img = url_to_image(im[4])
            if img is None:
                raise ValueError('could not decode image')

            # Only the first two axes (rows, columns) are needed, so images
            # without a channel axis are handled as well.
            rows, cols = img.shape[:2]
            if rows > cols:
                # Taller than wide: add black borders left and right.
                pad = (rows - cols) // 2
                img = cv2.copyMakeBorder(img, 0, 0, pad, pad, cv2.BORDER_CONSTANT, value=0)
            else:
                # Wider than tall (or square): add black borders top and bottom.
                pad = (cols - rows) // 2
                img = cv2.copyMakeBorder(img, pad, pad, 0, 0, cv2.BORDER_CONSTANT, value=0)

            img = cv2.resize(img, (width, height))
            # imwrite signals failure through its return value, not an exception.
            if not cv2.imwrite(save_path, img):
                raise IOError('could not write ' + save_path)
        except Exception as e:
            # Best effort: report the failure and continue with the next image.
            print(e, ': ' + im[4])

In [None]:
%%time

# Rows handed to the workers, in the column order download_images indexes them.
image_data_array = sample_df[["chain_id", "hotel_id", "source", "image_id", "url"]].values

# One worker process per CPU; worker k receives rows k, k + N, k + 2N, ...
NUM_THREADS = multiprocessing.cpu_count()
pool = multiprocessing.Pool(NUM_THREADS)
pending = [
    pool.apply_async(download_images, [image_data_array[cpu::NUM_THREADS]])
    for cpu in range(NUM_THREADS)
]

pool.close()
pool.join()

# apply_async keeps a worker's exception inside its AsyncResult; get() re-raises
# it here so a failed worker is not silently ignored.
for result in pending:
    result.get()

Not every image is available, so let's check how many images were successfully downloaded.

In [None]:
# Count the regular files below the download folder; {output_image_folder} is
# filled in from the Python variable of the same name.
!find {output_image_folder} -type f | wc -l

## Check downloaded data

In [None]:
# update the sample data frame with path, image name and whether it was downloaded
# The columns are built in one pass each; writing cell by cell through .loc inside
# iterrows() is very slow on a frame with this many rows.

# Expected folder of every image: <output_image_folder>/<chain_id>/<hotel_id>/<source>
image_folders = [
    os.path.join(output_image_folder, chain_id, hotel_id, source)
    for chain_id, hotel_id, source in zip(
        sample_df["chain_id"], sample_df["hotel_id"], sample_df["source"]
    )
]
# Expected file name of every image: <image_id>.<text after the last '.' of the url>
image_names = [
    image_id + '.' + url.split('.')[-1]
    for image_id, url in zip(sample_df["image_id"], sample_df["url"])
]
# Whether that file is present on disk.
found_on_disk = [
    os.path.exists(os.path.join(folder, name))
    for folder, name in zip(image_folders, image_names)
]

sample_df["downloaded"] = found_on_disk
# Name and folder stay None for images that were not downloaded.
sample_df["image_name"] = [
    name if found else None for name, found in zip(image_names, found_on_disk)
]
sample_df["image_folder"] = [
    folder if found else None for folder, found in zip(image_folders, found_on_disk)
]

In [None]:
# Show the first rows of the sample frame.
display(sample_df.head())

In [None]:
# This total should match the number of files counted in output_image_folder.
downloaded_total = sample_df["downloaded"].sum()
print("Number of downloaded images:", downloaded_total)

In [None]:
# Write the sample frame, without its index column, next to the images.
sample_csv_path = "hotels-50k/sample.csv"
sample_df.to_csv(sample_csv_path, index=False)

# Create zip and clean up
Compress the downloaded images into a zip file and delete the uncompressed data.

In [None]:
# Archive the whole hotels-50k folder (recursively, quietly) into hotels-50K-sample.zip.
!zip -r -qq hotels-50K-sample.zip hotels-50k
# Remove the uncompressed folder once the archive has been written.
!rm -rf hotels-50k