# SYSC 5906:
## **Room Detection - Dataset Processor: CSV**

---
Script to process MIT Indoor Scenes dataset. This generates a new dataset 
to train a room/environment classifier.

This script **outputs the data as a .csv file**, for use with scikit-learn.

### Step 1: Access Drive
Mount the Google Drive that contains the provided .zip file of code.

In [None]:
#Mount Google Drive at /gdrive; the dataset paths used later in this notebook all start with '/gdrive/My Drive/'
#NOTE(review): force_remount=True presumably re-mounts even when the drive is already mounted - confirm against the Colab docs
from google.colab import drive
drive.mount('/gdrive',force_remount=True)

Mounted at /gdrive


In [None]:
#Install missing libraries
%%capture 
!pip install pandas_read_xml

## Step 2: Setup
Import the relevant libraries, load the original MIT dataset (without the images) into Colab, and get the folders inside it.

In [None]:
import pandas as pd
import xml.etree.ElementTree as et
import os
from pathlib import Path
import glob
import csv
import numpy as np
from collections import Counter
import pandas_read_xml as pdx
from pandas_read_xml import flatten, fully_flatten, auto_separate_tables

#Root of the MIT Indoor Scenes data on the mounted drive; every path below hangs off it
_MIT_INDOORS_ROOT = '/gdrive/My Drive/Colab Notebooks/SYSC 5906/datasets/mit_indoors/'
_PROCESSED_ROOT = _MIT_INDOORS_ROOT + 'processed/'

#Folder that is scanned for room folders, each holding annotation .xml files
DATASET_DIRECTORY = _MIT_INDOORS_ROOT + 'indoorCVPR_09annotations/Annotations/'

#Output CSV files used when every room type is kept
#(despite the *_DIRECTORY names, these four are file paths)
PROCESSED_FULLSET_DIRECTORY = _PROCESSED_ROOT + 'data_fullset/processedData.csv'
OBJ_FULLSET_DIRECTORY = _PROCESSED_ROOT + 'data_fullset/objData.csv'

#Output CSV files used when only the desired room subset is kept
PROCESSED_SUBSET_DIRECTORY = _PROCESSED_ROOT + 'data_subset/processedData_subset.csv'
OBJ_SUBSET_DIRECTORY = _PROCESSED_ROOT + 'data_subset/objData_subset.csv'

In [None]:
#List of desired rooms to create a subset of the MIT dataset.
#Entries must match the dataset's folder names exactly; the dataset spells the
#staircase folder 'stairscase', so that spelling is used here (the previous
#'staircase' entry never matched any folder).
desiredRoomSubset = [
    'bathroom',
    'bedroom',
    'children_room',
    'dining_room',
    'corridor',
    'garage',
    'livingroom',
    'kitchen',
    'office',
    'pantry',
    'computerroom',
    'stairscase',
    'closet',
]

#Subset toggle: True keeps only desiredRoomSubset, False keeps every room type
subsetTog = False

In [None]:
#Names of the room folders that will be processed
folderList = []

#Walk the top level of the dataset folder
with os.scandir(DATASET_DIRECTORY) as entries:
    for entry in entries:
        #Only directories are room folders; a stray file here would otherwise
        #be passed to os.listdir() when the folders are parsed and crash the run
        if not entry.is_dir():
            continue
        #Either keep all room types in the dataset or just the desired subset
        if not subsetTog or entry.name in desiredRoomSubset:
            folderList.append(entry.name)

#Generate dataframe with folder names (a single column of names)
folderDF = pd.DataFrame(folderList)

print(folderList)

['tv_studio', 'inside_subway', 'movietheater', 'nursery', 'toystore', 'winecellar', 'corridor', 'meeting_room', 'poolinside', 'greenhouse', 'lobby', 'studiomusic', 'bedroom', 'dentaloffice', 'livingroom', 'gym', 'cloister', 'stairscase', 'children_room', 'classroom', 'buffet', 'kindergarden', 'computerroom', 'warehouse', 'dining_room', 'auditorium', 'bar', 'jewelleryshop', 'hairsalon', 'florist', 'pantry', 'waitingroom', 'videostore', 'mall', 'clothingstore', 'laboratorywet', 'restaurant', 'inside_bus', 'fastfood_restaurant', 'kitchen', 'deli', 'operating_room', 'hospitalroom', 'bathroom', 'trainstation', 'prisoncell', 'artstudio', 'gameroom', 'library', 'bakery', 'office', 'airport_inside', 'elevator', 'museum', 'church_inside', 'laundromat', 'concert_hall', 'shoeshop', 'grocerystore', 'bookstore', 'bowling', 'casino', 'restaurant_kitchen', 'garage', 'locker_room', 'closet', 'subway']


## Step 3: Parse XML
Extract object data from the .xml files in the MIT dataset.

In [None]:
#Document iterator
def iter_docs(author):
    """Yield one dict per 'document' element found under *author*.

    Each dict starts from the attributes of *author*, is overlaid with the
    attributes of the document element (the document wins on a name clash),
    and carries the document's text under the 'data' key.
    """
    shared_attrs = author.attrib
    for document in author.iter('document'):
        record = {**shared_attrs, **document.attrib}
        record['data'] = document.text
        yield record

#Author iterator: flattens every 'author' element into its document rows
def iter_author(etree):
    """Yield every row that iter_docs() produces for each 'author' element in *etree*."""
    for author_elem in etree.iter('author'):
        yield from iter_docs(author_elem)

#Objects found in each annotation file (one inner list per .xml file)
listOfAllObj = []
#Room name (folder) of each annotation file, index-aligned with listOfAllObj
listOfAllRooms = []
numFiles = 0

#Set of all unique object names
uniqueObjs = set()

#Set of all unique room types (folders)
roomNames = set()

for folder in folderDF.iloc[:,0]:
    roomNames.add(folder) #Record all of the possible room names
    folderPath = os.path.join(DATASET_DIRECTORY, folder)
    for file in os.listdir(folderPath):
        #Ignore anything that is not an annotation file. Previously the object
        #list was appended for these files too, which duplicated the previous
        #file's objects and left listOfAllObj longer than listOfAllRooms.
        if not file.endswith("xml"):
            continue
        numFiles += 1

        #Read the xml inside a context manager so the handle is always closed
        #NOTE(review): 'xmlcharrefreplace' is documented as a write-side error
        #handler; confirm it is what is wanted for undecodable bytes on read
        with open(os.path.join(folderPath, file), "r",
                  encoding="UTF-8", errors='xmlcharrefreplace') as curXML:
            fileContents = curXML.read()

        #Create an element tree from the xml
        etree = et.fromstring(fileContents)

        #Collect the name of every object tag inside the element tree
        fileObj = []
        for obj in etree.findall(".//object"):
            nameElem = obj.find("name")
            #Skip objects without a usable <name> instead of crashing on them
            if nameElem is None or nameElem.text is None:
                continue
            cleanObjStr = nameElem.text.strip('\n')
            fileObj.append(cleanObjStr)
            uniqueObjs.add(cleanObjStr)

        #Record the room and its objects together so both lists stay aligned
        listOfAllRooms.append(folder)
        listOfAllObj.append(fileObj)

## Step 4: Generate CSV
Process the extracted data into a new CSV format.

The format looks like:\
[instance obj1 obj2 obj3 ... objn room]\
[1         0     0   0   ...  0  "kitchen"]

In [None]:
#Fail before touching any output file if the parsed lists are out of step;
#each file's object list needs exactly one room label
if len(listOfAllRooms) != len(listOfAllObj):
    raise ValueError("listOfAllRooms and listOfAllObj must have the same length")

#Pick the output paths once, based on the subset toggle
if subsetTog:
    processedPath = PROCESSED_SUBSET_DIRECTORY
    objPath = OBJ_SUBSET_DIRECTORY
    print("Building subset CSV")
else:
    processedPath = PROCESSED_FULLSET_DIRECTORY
    objPath = OBJ_FULLSET_DIRECTORY
    print("Building fullset CSV")

#Column names: the room label followed by every unique object.
#Sorted so the object CSV is identical from run to run (set order is not stable).
columns = ['room'] + sorted(uniqueObjs)

#Record all of the unique detected objects in their own CSV, one per row
with open(objPath, 'w', newline='', encoding='utf-8') as objectsData:
    csvWriter2 = csv.writer(objectsData, delimiter=',')
    csvWriter2.writerows([obj] for obj in columns[1:])

#Count each instance of an object in every xml file
objectCounts = [Counter(fileObjs) for fileObjs in listOfAllObj]

#Same counts as (object, count) pairs ordered by frequency; the leading empty
#list is kept from the original layout
countedListOfObjects = [[]] + [count.most_common() for count in objectCounts]

#Store counted objects in the dataframe: one row per file, one column per object
data = pd.DataFrame.from_records(objectCounts)
#The room names are the "ground truths" for each row
data["RoomName"] = listOfAllRooms
data = data.fillna(0) #Objects absent from a file come back as NaN; store them as zero

#Insert the dataframe into a CSV (pandas opens and closes the file itself)
data.to_csv(processedPath, sep=',', index=True)

Building fullset CSV
