# Extracting data

In [1]:
import os
import numpy as np
import pytube
import cv2
from PIL import Image
import glob
import random
import shutil
import pandas as pd

# NOTE(review): this module-level name is not referenced by any code visible in
# this notebook (the OpenCV property ids are accessed as cv2.CAP_PROP_*).
# Presumably a leftover; confirm before removing or relying on its value.
CAP_PROP_POS_MSEC = 0


def get_all_video_names():
    """Return the hard-coded YouTube video URLs, grouped by category and entity.

    Returns:
        dict : maps a category label (0 for the indoor group, 1 for the
            outdoor group) to a dict that maps each entity name to the
            list of video URLs collected for that entity
    """
    return {
        # Category 0: indoor scenes
        0: {
            "room": [
                "https://www.youtube.com/watch?v=N9a9abjsqbE",
                "https://www.youtube.com/watch?v=dmxbVa8mZlY",
                "https://www.youtube.com/watch?v=20BPIbSO72M",
            ],
            "BedRoom": [
                "https://www.youtube.com/watch?v=hFmXTgqJ98Q",
                "https://www.youtube.com/watch?v=Yc_RwaVrZEk",
            ],
            "Restaurant": [
                "https://www.youtube.com/watch?v=JjsM2DeyyZM",
                "https://www.youtube.com/watch?v=NkGKRN9yoeM",
            ],
            "bathroom": [
                "https://www.youtube.com/watch?v=kiyaIyuF47Q",
                "https://www.youtube.com/watch?v=dMfZE6XWK1k",
                "https://www.youtube.com/watch?v=pgmVSPbxeGw",
            ],
        },
        # Category 1: outdoor scenes
        1: {
            "oceans": [
                "https://www.youtube.com/watch?v=9ntinpHGlec",
                "https://www.youtube.com/watch?v=IYePs7Q-se8",
                "https://www.youtube.com/watch?v=2V3DGJGOalo",
            ],
            "mountains": [
                "https://www.youtube.com/watch?v=o1-TOwCaKBQ",
                "https://www.youtube.com/watch?v=2SaOEUZQ2G8",
            ],
            "building": ["https://www.youtube.com/watch?v=TDOU34ThXeY"],
            "city": ["https://www.youtube.com/watch?v=UwlA4ZUkc-g"],
            "tree": ["https://www.youtube.com/watch?v=9q7q2Ygo2Cs"],
        },
    }

def download_video(all_videos, videos_path):
    """Download every video listed in the category/entity/URL dictionary.

    Each video is saved under ``videos_path`` with the file name
    ``video_<category>_<n>``, where ``n`` counts the videos downloaded so
    far within that category (across all of its entities).

    Args:
        all_videos (dict) : maps category -> entity -> list of video URLs
        videos_path (str) : the directory path to download the videos to
    Returns:
        None
    """
    for category, entities in all_videos.items():
        # Numbering restarts for each category and runs across its entities
        ctr = 0
        for entity, urls in entities.items():
            for video in urls:
                print("Downloading video {}".format(video))
                yt = pytube.YouTube(video)
                stream = yt.streams.filter(file_extension="mp4").first()
                if stream is None:
                    # .first() yields None when no stream matches the filter;
                    # skip instead of failing on None.download(...)
                    print("No mp4 stream found for {}, skipping".format(video))
                    continue
                saved_path = stream.download(
                    videos_path,
                    filename='video_{}_{}'.format(category, ctr))

                # Report where the file really went, not a guessed title path
                print("Saved '{}' to {}".format(yt.title, saved_path))
                ctr += 1
                
                


def download_all_videos(video_path, all_video_names):
    """Make sure the download directory exists, then download every video.

    Args:
        video_path (str) : the directory the videos are downloaded to
        all_video_names (dict) : maps category -> entity -> list of video URLs
    Returns:
        None
    """
    # makedirs (unlike mkdir) also creates missing parent directories, and
    # exist_ok avoids failing when the directory is already there
    os.makedirs(video_path, exist_ok=True)
    download_video(all_video_names, video_path)


def get_all_frames(filename,sample_period=3.0,offset=10,image_folder=""):
    """Save one frame of a video as a PNG image every ``sample_period`` seconds.

    The class label and the video number are the last two
    underscore-separated fields of the file's base name (for example
    ``video_0_3.mp4`` gives label ``0`` and video number ``3``). Images are
    written to ``image_folder`` as ``image_<label>_<video>_<k>.png``, where
    ``k`` counts the saved images starting at 0.

    Args:
        filename (str) : The file name of the video
        sample_period (float) : The period at which to sample the video (seconds)
        offset (float) : Skip the video until this position (seconds)
        image_folder (str) : The directory where the images are saved; it is
            created when missing (an empty string means the current directory)
    Returns:
        None
    Raises:
        ValueError : if the base name has fewer than two underscore-separated
            fields, or the video reports a non-positive frame rate
        IOError : if the video file cannot be opened
    """
    # Parse the base name only, so dots or underscores in directory names
    # cannot corrupt the label / video number
    stem = os.path.splitext(os.path.basename(filename))[0]
    name_parts = stem.split("_")
    if len(name_parts) < 2:
        raise ValueError(
            "Expected a file name like video_<label>_<number>, got {}".format(filename))
    class_label, video_number = name_parts[-2], name_parts[-1]

    if image_folder:
        os.makedirs(image_folder, exist_ok=True)

    video_capture = cv2.VideoCapture()
    try:
        if not video_capture.open(filename):
            raise IOError("Could not open video file {}".format(filename))

        num_frames = int(video_capture.get(cv2.CAP_PROP_FRAME_COUNT))
        fps = video_capture.get(cv2.CAP_PROP_FPS)
        if fps <= 0:
            raise ValueError("Video {} reports an invalid frame rate".format(filename))

        # Number of frames between two saved images; at least 1 so that the
        # modulo below can never divide by zero
        step = max(1, int(fps * sample_period))
        start_frame = int(fps * offset)

        # Actually move the decoder to the offset; without this the loop
        # index starts at the offset while the frames still come from frame 0.
        # NOTE(review): seeking may be approximate for some codecs/backends.
        video_capture.set(cv2.CAP_PROP_POS_FRAMES, start_frame)

        second_number = 0
        for idx in range(start_frame, num_frames):
            is_frame, frame = video_capture.read()
            if not is_frame:
                # The reported frame count can exceed what is decodable
                break
            if idx % step != 0:
                continue
            # OpenCV delivers BGR channel order while PIL expects RGB
            rgb_frame = cv2.cvtColor(frame, cv2.COLOR_BGR2RGB)
            image_name = os.path.join(
                image_folder,
                "image_{}_{}_{}.png".format(class_label, video_number, second_number))
            Image.fromarray(rgb_frame).save(image_name)
            second_number += 1
    finally:
        # Always free the decoder, even when an error interrupts the loop
        video_capture.release()




In [2]:
# Build the category -> entity -> URL mapping, then download every listed video
video_catalog = get_all_video_names()
download_all_videos("E:/datasets/indoor&outdoor/vids", video_catalog)

Downloading video https://www.youtube.com/watch?v=N9a9abjsqbE
videos\TBNRfrags Final Goodbye Apartment Tour.mp4
Downloading video https://www.youtube.com/watch?v=dmxbVa8mZlY
videos\شقة للبيع في اسطنبول اكسراي 4 غرف وصالون.mp4
Downloading video https://www.youtube.com/watch?v=20BPIbSO72M
videos\Alquiler Apartamentos Amoblados en Cúcuta - Amoblados CIO - Helogar Apto.203.mp4
Downloading video https://www.youtube.com/watch?v=hFmXTgqJ98Q
videos\Episode 71 - Boho Bedroom.mp4
Downloading video https://www.youtube.com/watch?v=Yc_RwaVrZEk
videos\Neo Vertika Condo Miami - Video Tour - 2 Bedroom.mp4
Downloading video https://www.youtube.com/watch?v=JjsM2DeyyZM
videos\Rice Krispie Treat Competition at America's Test Kitchen.mp4
Downloading video https://www.youtube.com/watch?v=NkGKRN9yoeM
videos\The Disney Hotel Santa Fe at Disneyland Paris.mp4
Downloading video https://www.youtube.com/watch?v=kiyaIyuF47Q
videos\Bushwick 2BR Box apt. L train to Myrtle-Wyckoff ave.mp4
Downloading video https://www

In [3]:
# Extract sampled frames from every downloaded video into a per-category folder.
vids_dir = "E:/datasets/indoor&outdoor/vids"

# Category character found at index 6 of the file name -> output folder
frame_folders = {
    "0": "E:/datasets/indoor&outdoor/frames/indoor",
    "1": "E:/datasets/indoor&outdoor/frames/outdoor",
}
for folder in frame_folders.values():
    # Saving an image into a missing folder would fail
    os.makedirs(folder, exist_ok=True)

vid_names = os.listdir(vids_dir)

for i in vid_names:
    file_name = vids_dir + "/" + i
    # The slice i[6:7] is empty (instead of raising IndexError) for names
    # shorter than 7 characters, so stray files are skipped safely
    image_folder = frame_folders.get(i[6:7])
    if image_folder is None:
        continue
    get_all_frames(file_name, sample_period=2.0, offset=12, image_folder=image_folder)
        

# Splitting data (train, validation and test)

In [4]:
# Shuffle the indoor frames and split them 80% / 15% / 5% into train / val / test.
# NOTE(review): the shuffle is unseeded, so every run produces a different split.
indoor_frames_dir = "E:/datasets/indoor&outdoor/frames/indoor"
# Keep regular files only; shutil.copyfile cannot copy a directory
indoors = [name for name in os.listdir(indoor_frames_dir)
           if os.path.isfile(indoor_frames_dir + "/" + name)]
random.shuffle(indoors)

l = len(indoors)
train_end = int(l*0.8)
val_end = int(l*0.95)
# Computed once (not inside a loop over the files), so the three lists are
# defined even when the folder is empty
train = indoors[:train_end]
val = indoors[train_end:val_end]
test = indoors[val_end:]

indoor_splits = [
    (train, "E:/datasets/indoor&outdoor/train/indoor"),
    (val, "E:/datasets/indoor&outdoor/val/indoor"),
    (test, "E:/datasets/indoor&outdoor/test"),
]
for names, destination_dir in indoor_splits:
    # copyfile does not create missing destination folders
    os.makedirs(destination_dir, exist_ok=True)
    for i in names:
        source = indoor_frames_dir + "/" + i
        destination = destination_dir + "/" + i
        shutil.copyfile(source, destination)



In [5]:
# Shuffle the outdoor frames and split them 80% / 15% / 5% into train / val / test.
# NOTE(review): the shuffle is unseeded, so every run produces a different split.
outdoor_frames_dir = "E:/datasets/indoor&outdoor/frames/outdoor"
# Keep regular files only; shutil.copyfile cannot copy a directory
outdoors = [name for name in os.listdir(outdoor_frames_dir)
            if os.path.isfile(outdoor_frames_dir + "/" + name)]
random.shuffle(outdoors)

l = len(outdoors)
train_end = int(l*0.8)
val_end = int(l*0.95)
# Computed once (not inside a loop over the files), so an empty folder yields
# empty lists instead of silently reusing lists left over from another cell
train = outdoors[:train_end]
val = outdoors[train_end:val_end]
test = outdoors[val_end:]

outdoor_splits = [
    (train, "E:/datasets/indoor&outdoor/train/outdoor"),
    (val, "E:/datasets/indoor&outdoor/val/outdoor"),
    (test, "E:/datasets/indoor&outdoor/test"),
]
for names, destination_dir in outdoor_splits:
    # copyfile does not create missing destination folders
    os.makedirs(destination_dir, exist_ok=True)
    for i in names:
        source = outdoor_frames_dir + "/" + i
        destination = destination_dir + "/" + i
        shutil.copyfile(source, destination)



# splitting data into indoors and outdoors

In [6]:
# Load every indoor frame as a 50x50 grayscale array with values divided by 255.
# `label` is (re)initialised here and gets one entry per loaded image.
# NOTE(review): indoor images are labelled 1 here although get_all_video_names
# uses category 0 for the indoor group; confirm this is intended.
indoor_frames_dir = "E:/datasets/indoor&outdoor/frames/indoor"
indoor_data = []
label=[]
for i in os.listdir(indoor_frames_dir):
    image_path = indoor_frames_dir+"/"+i
    if os.path.isfile(image_path):
        # The context manager closes the image file once it has been read
        with Image.open(image_path) as frame:
            # Image.LANCZOS is the same filter as the Image.ANTIALIAS alias,
            # which was removed in Pillow 10
            gray_frame = frame.convert("L").resize((50,50),Image.LANCZOS)

        indoor_data.append(np.asarray(gray_frame)/255.0)
        label.append(1)  # Label 1 for indoors

In [7]:
# Load every outdoor frame as a 50x50 grayscale array with values divided by 255.
# Labels are appended to the existing `label` list, so that list must already
# exist; re-running only this cell appends the outdoor labels a second time.
outdoor_frames_dir = "E:/datasets/indoor&outdoor/frames/outdoor"
outdoor_data = []
for i in os.listdir(outdoor_frames_dir):
    image_path = outdoor_frames_dir+"/"+i
    if os.path.isfile(image_path):
        # The context manager closes the image file once it has been read
        with Image.open(image_path) as frame:
            # Image.LANCZOS is the same filter as the Image.ANTIALIAS alias,
            # which was removed in Pillow 10
            gray_frame = frame.convert("L").resize((50,50),Image.LANCZOS)

        outdoor_data.append(np.asarray(gray_frame)/255.0)
        label.append(0)  # Label 0 for outdoors

In [8]:
# Stack the indoor images followed by the outdoor images along the first axis
x = np.concatenate((indoor_data, outdoor_data), axis=0)
# One label per image, shaped as a column vector
y = np.asarray(label).reshape(x.shape[0], 1)
print(x.shape)
# Flatten every 2D image into a single row of pixels (3D -> 2D)
pixels_per_image = x.shape[1] * x.shape[2]
x = x.reshape(x.shape[0], pixels_per_image)
print("x shape:{}\ny shape{}".format(x.shape, y.shape))

(1926, 50, 50)
x shape:(1926, 2500)
y shape(1926, 1)


In [35]:
# Put the label column to the right of the pixel columns and export as CSV
data = np.concatenate((x, y), axis=1)
df = pd.DataFrame(data)
# Columns are named "pixel 1", "pixel 2", ...; the last column holds the label
l = ["pixel" + " " + str(column + 1) for column in df.columns]
l[-1] = 'label'
df.columns = l
df.to_csv('indoor&outdoor_data.csv')
