## Imports

In [1]:
from geopy.geocoders import GoogleV3
from bs4 import BeautifulSoup
import requests
import time
import pandas as pd
import numpy as np
import urllib3
import re
from textwrap import shorten
import os

In [2]:
df = pd.read_csv('Bay Area companies that have decided to shift HQs, Jan. 1, 2022 to July 5, 2022 - Sheet1.csv')

In [3]:
df.columns

Index(['Company name', 'Category', 'Address new HQ', 'Size of new HQ, in sf',
       'Address old HQ', 'Difference in size between old and new HQ, in sf',
       'Subleasing HQ? If yes, name of sublessor ',
       'Date TRD wrote about HQ shift ', 'Link to story'],
      dtype='object')

## Image URL collector

In [63]:
# Drop last n rows of a df. In case there are sums that need clipped
# df.drop(df.tail(n).index, inplace=True)

In [4]:
df['image_links'] = ''
links = []
for url in df['Link to story']:
    try:
        page = requests.get(url)
        soup = BeautifulSoup(page.content, 'html.parser')
        images = soup.find_all('img')
        counter = 0
        for image in images:
            counter += 1
            if 'jpg' in image['src']:
                links.append(image['src'])
                if counter == len(images):
                    links.append('no image found')
                break
    except:
        links.append('no_image')

df['image_links'] = links

## Article link formatter

In [7]:
links

['https://therealdeal.com/sanfrancisco/wp-content/uploads/2022/01/SFO-Redwood-City-robotics-company-leases-158K-sf-site-of-new-San-Jose-HQ-MAIN-r1-705x439.jpg',
 'https://therealdeal.com/sanfrancisco/wp-content/uploads/2022/01/Main_relocation.jpg',
 'https://therealdeal.com/sanfrancisco/wp-content/uploads/2022/06/Iconiq-Capital-moves-HQ-across-SFs-Financial-District-705x466.jpg',
 'https://therealdeal.com/sanfrancisco/wp-content/uploads/2022/06/main-SF-Pure-Storage-upgrades-HQ-in-move-to-Santa-Clara-705x439.jpg',
 'https://mms.businesswire.com/media/20220616005375/en/1488739/4/EikonHQ-ACLS_Millbrae.jpg?download=1',
 'https://therealdeal.com/sanfrancisco/wp-content/uploads/2022/01/Main_allay-705x439.jpg',
 'https://therealdeal.com/sanfrancisco/wp-content/uploads/2022/01/SFO-Sephora-takes-286K-sf-in-SF-citys-biggest-Covid-era-office-lease-FEATUREIMG.jpg',
 'https://therealdeal.com/sanfrancisco/wp-content/uploads/2022/06/main-SF-Farella-Braun-Martel-to-downsize-HQ-in-San-Francisco-705x439

In [9]:
df['Company name']

0      Procept BioRobotics
1            Graphite Bio 
2           Iconiq Capital
3            Pure Storage 
4       Eikon Therapeutics
              ...         
993                    NaN
994                    NaN
995                    NaN
996                    NaN
997                    NaN
Name: Company name, Length: 998, dtype: object

In [18]:
df = df.dropna()

In [19]:
df["short_description"] = df["Company name"].apply(lambda s: shorten(s, width=20, placeholder=""))
df['remaining_desc'] = df.apply(lambda row : row['Company name'].replace(str(row['short_description']), ''), axis=1)
df['remaining_desc']

df["description_link"] = '<a href="' + df["Link to story"] + '">' + df["short_description"] + "</a>" + df["remaining_desc"]
df['description_link']

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df["short_description"] = df["Company name"].apply(lambda s: shorten(s, width=20, placeholder=""))
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df['remaining_desc'] = df.apply(lambda row : row['Company name'].replace(str(row['short_description']), ''), axis=1)
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versu

0     <a href="https://therealdeal.com/sanfrancisco/...
1     <a href="https://therealdeal.com/sanfrancisco/...
2     <a href="https://therealdeal.com/sanfrancisco/...
3     <a href="https://therealdeal.com/sanfrancisco/...
4     <a href="https://www.businesswire.com/news/hom...
5     <a href="https://therealdeal.com/sanfrancisco/...
6     <a href="https://therealdeal.com/sanfrancisco/...
7     <a href="https://therealdeal.com/sanfrancisco/...
8     <a href="https://therealdeal.com/sanfrancisco/...
9     <a href="https://therealdeal.com/sanfrancisco/...
10    <a href="https://therealdeal.com/sanfrancisco/...
Name: description_link, dtype: object

In [20]:
df.description_link[0]

'<a href="https://therealdeal.com/sanfrancisco/2022/01/06/redwood-city-robotics-company-leases-158k-sf-for-new-san-jose-hq/">Procept BioRobotics</a>'

## Google Maps API Geolocater Setup

In [21]:
%store -r google_maps_API_Key
geolocator = GoogleV3(api_key=google_maps_API_Key)

In [23]:
df['geo_address'] = df['Address new HQ'] + ' CA'
df['loc'] = df['geo_address'].apply(geolocator.geocode, timeout=10)
df['point'] = df['loc'].apply(lambda loc: tuple(loc.point) if loc else None)
df[['lat','lon','altitude']] = pd.DataFrame(df['point'].to_list(),index=df.index)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df['geo_address'] = df['Address new HQ'] + ' CA'
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df['loc'] = df['geo_address'].apply(geolocator.geocode, timeout=10)
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df['point'] = df['loc'].apply(lambda loc: tuple(loc.point) if loc else None)
A value is 

## HTML popup formatter

In [28]:
def popup_html(row):
    i = row
    address = df['Address new HQ'].iloc[i]
    developer = df['Size of new HQ, in sf'].iloc[i]
    description = df['description_link'].iloc[i]
    image = df['image_links'].iloc[i]
    
    html = '''<!DOCTYPE html>
    <html>
    <img src={} width="256" height="156">'''.format(image) + '''<br>______________________________________<br>
    Address: <em>{}'''.format(address) + '''</em><br>
    Size of new HQ: <strong><em>{}'''.format(developer) + '''</strong></em><br><br>
    Company Name: <strong>{}'''.format(description) + '''</strong></html>
    '''
    return html


### HTML reservoir

In [72]:
#     Developer(s): <strong><em>{}'''.format(developer) + '''</strong></em><br><br>

## Map Maker

In [88]:
df.description_link[0]

'<a href="https://therealdeal.com/sanfrancisco/2022/05/12/state-bar-to-sell-250k-sf-hq-building-in-san-francisco/">State Bar of</a> California '

In [35]:
import folium
import branca

f = folium.Figure(width=750, height=750)
m = folium.Map(location=df[["lat", "lon"]].mean().to_list(),zoom_start=9)

title_html = '''
              <h3 align="center" style="font-size:16px"><b>{}</b></h3>
             '''.format(f'Bay Area companies that have decided to shift HQs, Jan. 1, 2022 to July 5, 2022')

for i in range(0,len(df)):
    html = popup_html(i)
    iframe = branca.element.IFrame(html=html)
    popup = folium.Popup(folium.Html(html, script=True))
    folium.Marker([df['lat'].iloc[i],df['lon'].iloc[i]],
                 popup=popup).add_to(m)

m.get_root().html.add_child(folium.Element(title_html))
m.fit_bounds(bounds=df[['lat','lon']].mean().to_list(),max_zoom=15)
# m.add_to(f)
m

In [32]:
m.save('index.html')

## Map URL snagger

Map template URL: `https://trd-digital.github.io/trd-news-interactive-maps/{map-folder-name}`

In [33]:
base_name = 'https://trd-digital.github.io/trd-news-interactive-maps/'

In [34]:
cwd = os.getcwd()

cwd = cwd.split('/')

final_name = base_name  + cwd[-1]
print(final_name)

https://trd-digital.github.io/trd-news-interactive-maps/bay_area_shift_hq
