In [1]:
import os
import re
import pandas as pd
import numpy as np

import requests
from bs4 import BeautifulSoup
import json

## Location page

In [2]:
url = "https://www.industriousoffice.com/locations"
page = BeautifulSoup(requests.get(url).content,'lxml')   

In [3]:
# get location links
loc_links_ls = list()

locations = page.find_all('a',{'class':'gtm-all-locations-link'})
for loc in locations:
    link = loc.get('href')
    name = loc.get_text()
    loc_links_ls.append((name, link))

In [4]:
loc_links_ls

[('Phoenix', 'https://www.industriousoffice.com/m/phoenix'),
 ('Scottsdale  Fashion Square',
  'https://www.industriousoffice.com/l/phoenix/fashion-square/7014-east-camelback-rd'),
 ('Scottsdale  Old Town',
  'https://www.industriousoffice.com/l/phoenix/old-town/4250-n-drinkwater-blvd'),
 ('  Mill Ave',
  'https://www.industriousoffice.com/l/phoenix/tempe/222-south-mill-ave'),
 ('  Biltmore',
  'https://www.industriousoffice.com/l/phoenix/biltmore/2801-e-camelback-road'),
 ('Los Angeles', 'https://www.industriousoffice.com/m/los-angeles'),
 ('  Pasadena  Pasarroyo',
  'https://www.industriousoffice.com/l/los-angeles/pasarroyo/251-south-lake-ave'),
 ('  Old Pasadena',
  'https://www.industriousoffice.com/l/los-angeles/old-pasadena/21-miller-alley'),
 ('  Century City',
  'https://www.industriousoffice.com/l/los-angeles/century-city/1925-century-park-e'),
 ('  Playa District',
  'https://www.industriousoffice.com/l/los-angeles/playa-district/6060-center-drive'),
 ('  West Hollywood',
  '

In [5]:
len(loc_links_ls)

276

## Download images from each location page

In [6]:
# for each location page, get its image links
def get_img_links(url):
    
    page = BeautifulSoup(requests.get(url).content,'lxml')
    
    img_links_ls = list()
    images = page.find_all('picture',{'class':'embed-responsive-item'})
    for i in images:
        link = i.find('img',{'class':'webpexpress-processed'}).get('src')
        img_links_ls.append(link[:-7])
        
    return img_links_ls

In [8]:
# get all locations' image links
all_img_links = list()
for loc in loc_links_ls:
    if 'www.industriousoffice.com/l' in loc[1]:
        img_links_ls = get_img_links(loc[1])
        all_img_links.append((loc[0],img_links_ls))        

In [9]:
all_img_links[0]

('Scottsdale  Fashion Square',
 ['https://www.industriousoffice.com/wp-content/uploads/2019/06/Scottsdale-FS-Gallery8-1230x692.jpg',
  'https://www.industriousoffice.com/wp-content/uploads/2020/12/scottsdale_3.jpg',
  'https://www.industriousoffice.com/wp-content/uploads/2020/12/Office8-1230x692.jpg',
  'https://www.industriousoffice.com/wp-content/uploads/2019/06/Scottsdale-FS-Gallery4-1-1230x692.jpg',
  'https://www.industriousoffice.com/wp-content/uploads/2020/12/WellnessRoom6-1230x692.jpg',
  'https://www.industriousoffice.com/wp-content/uploads/2020/12/Scottsdale-FashionSquare-Common-1230x692.jpg',
  'https://www.industriousoffice.com/wp-content/uploads/2020/12/Scottsdale-FashionSquare-Entry-1230x692.jpg'])

In [15]:
# save all images: one file for one location
img_directory = 'image'
industrious_directory = 'image/industrious'

if not os.path.exists(img_directory):
    os.mkdir(img_directory)
if not os.path.exists(industrious_directory):
    os.mkdir(industrious_directory)

for loc_img in all_img_links:
    for i,link in enumerate(loc_img[1]):
        loc = re.sub('[\s+]', '', loc_img[0])
        img = requests.get(link).content
        if not os.path.exists(industrious_directory+'/'+loc):
            os.mkdir(industrious_directory+'/'+loc)
        with open(industrious_directory+'/'+loc+'/%d.jpg'%i,'wb') as f:
            f.write(img)

In [30]:
# record tag for each image
img_tag_ls = list()

for loc in loc_links_ls:
    if 'www.industriousoffice.com/l' in loc[1]:
        loc_name = re.sub('[\s+]', '', loc[0])
        loc_page = BeautifulSoup(requests.get(loc[1]).content,'lxml')
        tag_ls = list()
        for t in loc_page.find_all('p',{'class':'gallery-caption'}):
            i = loc_page.find_all('p',{'class':'gallery-caption'}).index(t)
            tag_ls.append((industrious_directory+'/'+loc_name+'/%d.jpg'%i, t.get_text()))
    img_tag_ls.extend(tag_ls)

In [31]:
df = pd.DataFrame(img_tag_ls, columns=['img_path', 'tag'])
df.head()

Unnamed: 0,img_path,tag
0,image/industrious/ScottsdaleFashionSquare/0.jpg,Common area
1,image/industrious/ScottsdaleFashionSquare/1.jpg,Cafe
2,image/industrious/ScottsdaleFashionSquare/2.jpg,Example shown: Private office
3,image/industrious/ScottsdaleFashionSquare/3.jpg,Conference room
4,image/industrious/ScottsdaleFashionSquare/4.jpg,Example shown: Wellness room


In [32]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1430 entries, 0 to 1429
Data columns (total 2 columns):
 #   Column    Non-Null Count  Dtype 
---  ------    --------------  ----- 
 0   img_path  1430 non-null   object
 1   tag       1430 non-null   object
dtypes: object(2)
memory usage: 22.5+ KB


In [35]:
df.to_csv("industrious_images_tag.csv")

## Classify into outdoor, indoor, gym&swimming pool, floorplan

In [2]:
df = pd.read_csv("industrious_images_tag.csv")

In [9]:
df.drop(['Unnamed: 0'], axis=1, inplace=True)
df.head()

Unnamed: 0,img_path,tag
0,image/industrious/ScottsdaleFashionSquare/0.jpg,Common area
1,image/industrious/ScottsdaleFashionSquare/1.jpg,Cafe
2,image/industrious/ScottsdaleFashionSquare/2.jpg,Example shown: Private office
3,image/industrious/ScottsdaleFashionSquare/3.jpg,Conference room
4,image/industrious/ScottsdaleFashionSquare/4.jpg,Example shown: Wellness room


In [10]:
df['tag'].value_counts()

Common area                                                                             344
Conference room                                                                         146
Private office                                                                          106
Cafe                                                                                     63
Example shown: Conference room                                                           63
                                                                                       ... 
This meeting room is just the right size for a small team huddle.                         2
Outdoor terrace                                                                           2
Outdoor Space                                                                             2
Carlyle Tower fitness center                                                              2
Members receive complimentary breakfast and an afternoon snack daily in the kitc

In [12]:
tag_ls = list(df['tag'].unique())

In [13]:
tag_ls

['Common area',
 'Cafe',
 'Example shown: Private office',
 'Conference room',
 'Example shown: Wellness room',
 'Mall entrance',
 'Common Area',
 'Office',
 'Conference Room',
 'Café',
 'Wellness Room',
 'Outdoor Space',
 'Example shown: Common area',
 'Example shown: Office',
 'Example shown: Cafe',
 'Example shown: Conference room',
 'Example shown: Phone booths',
 'Private office',
 'Phone booths',
 'Wellness room',
 'Canvas suite',
 'Event space',
 'Fitness center',
 'Campus basketball court',
 'Campus lounge area',
 nan,
 'Example shown: Focus area',
 'Reception',
 'Café and lounge',
 'Reception and common area',
 'Large team suite',
 'Building lobby',
 'Tenant lounge',
 'Library',
 'Lounge and conference rooms',
 'Example shown: Café',
 'Outdoor terrace',
 'Lounge',
 'Front desk',
 'Focus area',
 'Industrious entrance and building lobby',
 'Carlyle Tower fitness center',
 'Carlyle Tower tenant lounge',
 'Carlyle Tower conference center',
 'Huddle room',
 'Rooftop terrace | Image

In [28]:
# df.loc[df["tag"].isnull()]
df.loc[df['tag'] == "Building amenity: Fitness center"]

Unnamed: 0,img_path,tag
572,image/industrious/WellsFargoCapitolCenter/9.jpg,Building amenity: Fitness center
1290,image/industrious/WellsFargoCapitolCenter/9.jpg,Building amenity: Fitness center
