In [1]:
import math
import arrow

import ipynb 
import os.path
import json
import pandas as pd
import networkx as nx
import matplotlib.pyplot as plt
import seaborn as sns
import numpy as np

from dotenv import load_dotenv
from networkx.algorithms import bipartite
from importlib import reload

from typing import List

load_dotenv(verbose=True)

True

In [2]:
# import functions from py file 

import functions.fun
reload(functions.fun)
from functions.fun import CB_data_cleaning, df_from_api_CB, extract_nodes, extract_data_from_column
from functions.fun import nx_dip_graph_from_pandas, filter_dict, check_desc
from functions.fun import extract_classes_company_tech, degree_bip, insert_data_classes

In [3]:
# import functions from py file 

import functions.fun_external_factors
reload(functions.fun_external_factors)
from functions.fun_external_factors import rank_comparison, calibrate_analytic, create_exogenous_rank, haversine_distance, extract_coordinates_location

In [4]:
# import classes 

import classes
reload(classes)
from classes import Company, Investor, Technology

### Download data from CSV

In [5]:
df_start = pd.read_csv("data/data_cb/organizations.csv")

In [6]:
df_start.head()

Unnamed: 0,uuid,name,type,permalink,cb_url,rank,created_at,updated_at,legal_name,roles,...,phone,facebook_url,linkedin_url,twitter_url,logo_url,alias1,alias2,alias3,primary_role,num_exits
0,e1393508-30ea-8a36-3f96-dd3226033abd,Wetpaint,organization,wetpaint,https://www.crunchbase.com/organization/wetpaint,158955.0,2007-05-25 13:51:27,2019-06-24 22:19:25,,company,...,206-859-6300,https://www.facebook.com/Wetpaint,https://www.linkedin.com/company/wetpaint,https://twitter.com/wetpainttv,https://res.cloudinary.com/crunchbase-producti...,,,,company,
1,bf4d7b0e-b34d-2fd8-d292-6049c4f7efc7,Zoho,organization,zoho,https://www.crunchbase.com/organization/zoho,6686.0,2007-05-26 02:30:28,2018-10-27 00:29:49,,"investor,company",...,,http://www.facebook.com/zoho,http://www.linkedin.com/company/zoho-corporati...,http://twitter.com/zoho,https://res.cloudinary.com/crunchbase-producti...,,,,company,1.0
2,5f2b40b8-d1b3-d323-d81a-b7a8e89553d0,Digg,organization,digg,https://www.crunchbase.com/organization/digg,7793.0,2007-05-26 03:03:23,2018-12-10 10:09:14,"Digg Holdings, LLC",company,...,877-342-7222,http://www.facebook.com/digg,http://www.linkedin.com/company/digg,http://twitter.com/digg,https://res.cloudinary.com/crunchbase-producti...,,,,company,
3,f4d5ab44-058b-298b-ea81-380e6e9a8eec,Omidyar Network,organization,omidyar-network,https://www.crunchbase.com/organization/omidya...,136861.0,2007-05-26 03:21:34,2019-06-19 12:17:48,,investor,...,650.482.2500,http://www.facebook.com/OmidyarNetwork,http://www.linkedin.com/company/22806,http://twitter.com/OmidyarNetwork,https://res.cloudinary.com/crunchbase-producti...,,,,investor,38.0
4,df662812-7f97-0b43-9d3e-12f64f504fbb,Facebook,organization,facebook,https://www.crunchbase.com/organization/facebook,47.0,2007-05-26 04:22:15,2021-04-14 23:52:25,"Facebook, Inc.","investor,company",...,,https://www.facebook.com/facebook/,http://www.linkedin.com/company/facebook,https://twitter.com/facebook,https://res.cloudinary.com/crunchbase-producti...,,,,company,


In [7]:
df_start.columns

Index(['uuid', 'name', 'type', 'permalink', 'cb_url', 'rank', 'created_at',
       'updated_at', 'legal_name', 'roles', 'domain', 'homepage_url',
       'country_code', 'state_code', 'region', 'city', 'address',
       'postal_code', 'status', 'short_description', 'category_list',
       'category_groups_list', 'num_funding_rounds', 'total_funding_usd',
       'total_funding', 'total_funding_currency_code', 'founded_on',
       'last_funding_on', 'closed_on', 'employee_count', 'email', 'phone',
       'facebook_url', 'linkedin_url', 'twitter_url', 'logo_url', 'alias1',
       'alias2', 'alias3', 'primary_role', 'num_exits'],
      dtype='object')

### Data Cleaning

for now we decide to use as key the name. it would be better to use the uuid

In [8]:
to_drop = [
    'type',
    'permalink',
    'cb_url',   
    'created_at',
    'domain',
    'address',
    'state_code',
    'updated_at',
    'legal_name',
    'roles',
    'postal_code',
    'homepage_url',
    'num_funding_rounds',
    'total_funding_currency_code',
    'phone',
    'email',
    'num_exits',
    'alias2',
    'alias3',
    'num_exits',
    'logo_url',
    'alias1',
    'last_funding_on',
    'twitter_url',
    'facebook_url'
]

to_rename = { 'category_groups_list': 'category_groups' }

drop_if_nan = [
    'category_groups',
    'rank'
]

to_check_double = {}

sort_by = "rank"

In [9]:
df = CB_data_cleaning(df_start, to_drop, to_rename, to_check_double, drop_if_nan, sort_by)

In [10]:
df.head()

Unnamed: 0,uuid,name,rank,country_code,region,city,status,short_description,category_list,category_groups,total_funding_usd,total_funding,founded_on,closed_on,employee_count,linkedin_url,primary_role
1178,74a20af3-f4dd-6188-de60-c4ee6cd0ca4a,Ant Group,1.0,CHN,Zhejiang,Hangzhou,operating,Ant Group strives to enable all consumers and ...,"Banking,Financial Services,FinTech,Payments","Financial Services,Lending and Investments,Pay...",22000000000.0,22000000000.0,2014-10-01,,5001-10000,https://www.linkedin.com/company/antgroup/,company
4042,022417b5-4980-6c54-0f3c-6736bbbb1a5e,Spotify,2.0,SWE,Stockholms Lan,Stockholm,ipo,Spotify is a commercial music streaming servic...,"Audio,Cloud Computing,Music,Music Streaming,Vi...","Content and Publishing,Internet Services,Media...",2085425000.0,2085425000.0,2006-04-23,,5001-10000,http://www.linkedin.com/company/spotify,company
349,468bef9f-2f50-590e-6e78-62e3adb05aa1,Citi,3.0,USA,New York,New York,ipo,Citigroup is a diversified financial services ...,"Banking,Credit Cards,Financial Services,Wealth...","Financial Services,Lending and Investments,Pay...",8700000000.0,8700000000.0,1998-10-08,,10000+,https://www.linkedin.com/company/citi,investor
211260,a40d0a1f-f32c-a1e9-1bbd-a10bb0eca2e7,Deliveroo,4.0,GBR,England,London,ipo,Deliveroo is an online food delivery service t...,"Delivery,Food and Beverage,Food Delivery,Same ...","Administrative Services,Food and Beverage,Tran...",1712683000.0,1712683000.0,2012-01-01,,5001-10000,https://www.linkedin.com/company/deliveroo,company
621119,00daca16-8311-454b-84e0-24a40d16be9c,Antler,5.0,SGP,Central Region,Singapore,operating,Antler is a global early-stage venture capital...,Venture Capital,"Financial Services,Lending and Investments",78000000.0,78000000.0,2017-01-01,,101-250,https://www.linkedin.com/company/antlerglobal/,investor


In [11]:
df.columns

Index(['uuid', 'name', 'rank', 'country_code', 'region', 'city', 'status',
       'short_description', 'category_list', 'category_groups',
       'total_funding_usd', 'total_funding', 'founded_on', 'closed_on',
       'employee_count', 'linkedin_url', 'primary_role'],
      dtype='object')

In [12]:
df_places = df[['country_code', 'region', 'city']]
df_places.head()

Unnamed: 0,country_code,region,city
1178,CHN,Zhejiang,Hangzhou
4042,SWE,Stockholms Lan,Stockholm
349,USA,New York,New York
211260,GBR,England,London
621119,SGP,Central Region,Singapore


In [13]:
# only in USA and not duplicate 

df_places = df_places[df_places['country_code']=='USA']
df_places = df_places.drop_duplicates('region')
df_places.head(10)

Unnamed: 0,country_code,region,city
349,USA,New York,New York
85286,USA,California,San Francisco
2005,USA,Massachusetts,Cambridge
287787,USA,North Carolina,Cary
176113,USA,Georgia,Atlanta
502612,USA,Texas,Austin
59901,USA,Maryland,Bethesda
281077,USA,Florida,Clearwater
364540,USA,Michigan,Detroit
287392,USA,Pennsylvania,Philadelphia


In [14]:
df_places = df_places[:10]

In [15]:
import haversine as hs

In [16]:
loc1=(28.426846,77.088834)
loc2=(28.394231,77.050308)
hs.haversine(loc1,loc2)


5.229712941541709

In [17]:
from geopy.geocoders import Nominatim
import geopandas
import geocoder

In [18]:
geolocator = Nominatim(user_agent='myapplication')

### Investors' location

Let us suppose the investor is located in NY

In [21]:
city_inv = "New York"
region_inv = "New York"
country_inv = "USA"

str_place = city_inv + ', ' + region_inv + ', ' + country_inv

location = geolocator.geocode(str_place) # coversion to conventional address (valid for the next command)

lat_inv = location.latitude
lon_in= location.longitude

In [22]:
h_dis = []
lat_list = []
lon_list = []

for index, row in df_places.iterrows():
    
    str_place = row['city'] + ', ' + row['region'] #+ ', ' +  row['country_code']

    location = geolocator.geocode(str_place) # coversion to conventional address (valid for the next command)
    
    lat = location.latitude
    lon = location.longitude
    
    lat_list.append(lat)
    lon_list.append(lon)
    
    # Haversine distance between the company and teh investor
    h = haversine_distance(lat, lon, lat_inv, lon_in)
    h_dis.append(h)
    
    

In [14]:
!pip install sympy

Collecting sympy
  Downloading sympy-1.8-py3-none-any.whl (6.1 MB)
[K     |████████████████████████████████| 6.1 MB 4.1 MB/s eta 0:00:01
[?25hCollecting mpmath>=0.19
  Downloading mpmath-1.2.1-py3-none-any.whl (532 kB)
[K     |████████████████████████████████| 532 kB 56.1 MB/s eta 0:00:01
[?25hInstalling collected packages: mpmath, sympy
Successfully installed mpmath-1.2.1 sympy-1.8


In [19]:
import sympy
s = "for index, row in df_places.iterrows():"
sympy.latex(eval(s)) 

SyntaxError: invalid syntax (<string>, line 1)

In [23]:
h_dict = {'a': 1, 'b': 2, 'c': 3, 'd': 4, 'e': 5}
h_max=10

s = "{c_name:h/h_max for (c_name,h) in h_dict.items()}"
sympy.latex(eval(s)) 

'\\left\\{ \\mathtt{\\text{a}} : 0.1, \\  \\mathtt{\\text{b}} : 0.2, \\  \\mathtt{\\text{c}} : 0.3, \\  \\mathtt{\\text{d}} : 0.4, \\  \\mathtt{\\text{e}} : 0.5\\right\\}'

In [2]:
! pip install mapview



In [8]:
!garden install mapview

Downloading http://github.com/kivy-garden/garden.mapview/archive/master.zip ...
Progression 1024 | 
Progression 2048 / 
Progression 3072 - 
Progression 4096 \ 
Progression 5120 | 
Progression 6144 / 
Progression 7168 - 
Progression 8192 \ 
Progression 9216 | 
Progression 10240 / 
Progression 11264 - 
Progression 12288 \ 
Progression 13312 | 
Progression 14336 / 
Progression 15360 - 
Progression 16384 \ 
Progression 17408 | 
Progression 18432 / 
Progression 19456 - 
Progression 20480 \ 
Progression 21504 | 
Progression 22528 / 
Progression 23552 - 
Progression 24576 \ 
Progression 25600 | 
Progression 26624 / 
Progression 27648 - 
Progression 28672 \ 
Progression 29696 | 
Progression 30720 / 
Progression 31744 - 
Progression 32768 \ 
Progression 33792 | 
Progression 34816 / 
Progression 35840 - 
Progression 36864 \ 
Progression 37888 | 
Progression 38912 / 
Progression 39936 - 
Progression 40960 \ 
Progression 41984 | 
Progression 43008 / 
Progression 44032 - 
Progression 45056 \ 
Progr

In [42]:
!jupyter nbextension enable --py widgetsnbextension

Enabling notebook extension jupyter-js-widgets/extension...
      - Validating: [32mOK[0m


In [43]:
from arcgis.gis import GIS
m1 = GIS().map('United States')
m1.show()

AttributeError: 'MapView' object has no attribute 'show'

In [27]:
BBox = ((min(lon_list),   max(lon_list), min(lat_list),   max(lat_list)))
BBox

(-122.419906, -71.1056157, 27.9658533, 42.3750997)

In [None]:
ruh_m = plt.imread('C:/.. … /Riyadh_map.png')

In [None]:
fig, ax = plt.subplots(figsize = (8,7))ax.scatter(df.longitude, df.latitude, zorder=1, alpha= 0.2, c='b', s=10)ax.set_title('Plotting Spatial Data on Riyadh Map')
ax.set_xlim(BBox[0],BBox[1])
ax.set_ylim(BBox[2],BBox[3])ax.imshow(ruh_m, zorder=0, extent = BBox, aspect= 'equal')