In [71]:
import numpy as np
import pandas as pd
import requests

In [72]:
# loading first data set which contains company names and descriptions
startups = pd.read_csv('startups.csv')

In [73]:
startups.head()

Unnamed: 0,name,city,tagline,description
0,Campus Bubble,Atlanta,Your Academic Identity,Campus Bubble (“CB”) is the Academic Community...
1,DueProps,Atlanta,Gamifying the $46 Billion Employee Incentives ...,t unprecedented ...
2,SalesLoft,Atlanta,Quickly build high-quality prospect lists,build high-quality prospect lists\n
3,The Coca-Cola Company,Atlanta,,Coca-Cola Journey is a digital magazine that f...
4,EarthLink,Atlanta,,


In [74]:
startups.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 42038 entries, 0 to 42037
Data columns (total 4 columns):
 #   Column       Non-Null Count  Dtype 
---  ------       --------------  ----- 
 0   name         42034 non-null  object
 1   city         42038 non-null  object
 2   tagline      36730 non-null  object
 3   description  37174 non-null  object
dtypes: object(4)
memory usage: 1.3+ MB


In [75]:
startups.isna().sum()

name              4
city              0
tagline        5308
description    4864
dtype: int64

In [76]:
# I need to strip the name columns of all spaces or extra characters and make them lower case, so that the join can get the most possible matches

startups['name_unformatted'] = startups['name']

In [77]:
for i in range(len(startups['name_unformatted'])):
    startups['name_unformatted'][i] = str(startups['name_unformatted'][i]).lower().replace(' ', '')
    if not startups['name_unformatted'][i].isalnum():
        for char in startups['name_unformatted'][i]:
            if not char.isalnum():
                startups['name_unformatted'][i] = startups['name_unformatted'][i].replace(char, '')

In [78]:
startups.head()

Unnamed: 0,name,city,tagline,description,name_unformatted
0,Campus Bubble,Atlanta,Your Academic Identity,Campus Bubble (“CB”) is the Academic Community...,campusbubble
1,DueProps,Atlanta,Gamifying the $46 Billion Employee Incentives ...,t unprecedented ...,dueprops
2,SalesLoft,Atlanta,Quickly build high-quality prospect lists,build high-quality prospect lists\n,salesloft
3,The Coca-Cola Company,Atlanta,,Coca-Cola Journey is a digital magazine that f...,thecocacolacompany
4,EarthLink,Atlanta,,,earthlink


In [79]:
# loading the second dataset which contains more information on the companies such as categories, status, funding, and URLs
companies = pd.read_csv('companies.csv',sep=';')

In [80]:
companies.head()

Unnamed: 0,Name,homepage_url,category_list,funding_total_usd,status,country_code,state_code,region,city,funding_rounds,founded_at,first_funding_at,last_funding_at,permalink
0,#fame,http://livfame.com,Media,10000000,operating,IND,16,Mumbai,Mumbai,1,,05/01/2015,05/01/2015,/organization/-fame
1,:Qounter,http://www.qounter.com,Application Platforms|Real Time|Social Network...,700000,operating,USA,DE,DE - Other,Delaware City,2,04/09/2014,01/03/2014,14/10/2014,/organization/-qounter
2,"(THE) ONE of THEM,Inc.",http://oneofthem.jp,Apps|Games|Mobile,3406878,operating,,,,,1,,30/01/2014,30/01/2014,/organization/-the-one-of-them-inc-
3,0-6.com,http://www.0-6.com,Curated Web,2000000,operating,CHN,22,Beijing,Beijing,1,01/01/2007,19/03/2008,19/03/2008,/organization/0-6-com
4,004 Technologies,http://004gmbh.de/en/004-interact,Software,-,operating,USA,IL,"Springfield, Illinois",Champaign,1,01/01/2010,24/07/2014,24/07/2014,/organization/004-technologies


In [81]:
companies.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 63215 entries, 0 to 63214
Data columns (total 14 columns):
 #   Column             Non-Null Count  Dtype 
---  ------             --------------  ----- 
 0   Name               63214 non-null  object
 1   homepage_url       59069 non-null  object
 2   category_list      63215 non-null  object
 3   funding_total_usd  63215 non-null  object
 4   status             63215 non-null  object
 5   country_code       57799 non-null  object
 6   state_code         56263 non-null  object
 7   region             56760 non-null  object
 8   city               56762 non-null  object
 9   funding_rounds     63215 non-null  int64 
 10  founded_at         49707 non-null  object
 11  first_funding_at   63192 non-null  object
 12  last_funding_at    63215 non-null  object
 13  permalink          63215 non-null  object
dtypes: int64(1), object(13)
memory usage: 6.8+ MB


In [82]:
# creating an unformatted name column here as well to maximize the join
companies['name_unformatted'] = companies['Name']

In [83]:
for i in range(len(companies['name_unformatted'])):
    companies['name_unformatted'][i] = str(companies['name_unformatted'][i]).lower().replace(' ', '')
    if not companies['name_unformatted'][i].isalnum():
        for char in companies['name_unformatted'][i]:
            if not char.isalnum():
                companies['name_unformatted'][i] = companies['name_unformatted'][i].replace(char, '')

IOPub data rate exceeded.
The Jupyter server will temporarily stop sending output
to the client in order to avoid crashing it.
To change this limit, set the config variable
`--ServerApp.iopub_data_rate_limit`.

Current values:
ServerApp.iopub_data_rate_limit=1000000.0 (bytes/sec)
ServerApp.rate_limit_window=3.0 (secs)



In [84]:
# trying to determine status of companies now by testing website URLs 

#companies['current_status'] = companies['status']
#for i in range(len(companies['homepage_url'])):
#    if requests.get(companies['homepage_url'][i]).status_code == 200:
#        companies['current_status'][i] = 'operating'
#    else:
#        companies['current_status'][i] = 'closed' 
        
# r.content.decode('utf-8')

In [85]:
startups_join = startups.join(companies.set_index('name_unformatted'), on='name_unformatted', how='inner', lsuffix='_startups-list', rsuffix='_crunchbase')

In [86]:
startups_join.info()

<class 'pandas.core.frame.DataFrame'>
Index: 10622 entries, 0 to 42023
Data columns (total 19 columns):
 #   Column              Non-Null Count  Dtype 
---  ------              --------------  ----- 
 0   name                10618 non-null  object
 1   city_startups-list  10622 non-null  object
 2   tagline             8200 non-null   object
 3   description         8531 non-null   object
 4   name_unformatted    10622 non-null  object
 5   Name                10618 non-null  object
 6   homepage_url        10495 non-null  object
 7   category_list       10622 non-null  object
 8   funding_total_usd   10622 non-null  object
 9   status              10622 non-null  object
 10  country_code        9997 non-null   object
 11  state_code          9741 non-null   object
 12  region              9911 non-null   object
 13  city_crunchbase     9911 non-null   object
 14  funding_rounds      10622 non-null  int64 
 15  founded_at          9483 non-null   object
 16  first_funding_at    10615 n

In [87]:
startups_join.head()

Unnamed: 0,name,city_startups-list,tagline,description,name_unformatted,Name,homepage_url,category_list,funding_total_usd,status,country_code,state_code,region,city_crunchbase,funding_rounds,founded_at,first_funding_at,last_funding_at,permalink
0,Campus Bubble,Atlanta,Your Academic Identity,Campus Bubble (“CB”) is the Academic Community...,campusbubble,Campus Bubble,http://www.campusbubble.com/,Colleges|Education|Networking|SaaS|Social Medi...,525000,operating,,,,,1,01/03/2012,04/06/2014,04/06/2014,/organization/campus-bubble
1,DueProps,Atlanta,Gamifying the $46 Billion Employee Incentives ...,t unprecedented ...,dueprops,DueProps,http://dueprops.com,Games,200000,operating,USA,GA,Atlanta,Atlanta,1,23/05/2011,01/11/2011,01/11/2011,/organization/dueprops
2,SalesLoft,Atlanta,Quickly build high-quality prospect lists,build high-quality prospect lists\n,salesloft,SalesLoft,http://salesloft.com,B2B|Lead Generation|SaaS|Sales and Marketing|S...,11218000,operating,USA,GA,Atlanta,Atlanta,4,01/10/2011,17/05/2012,01/04/2015,/organization/salesloft
4,EarthLink,Atlanta,,,earthlink,EarthLink,http://www.earthlink.net,Curated Web|Software,23598258,ipo,USA,GA,Atlanta,Atlanta,1,01/01/1994,12/04/2011,12/04/2011,/organization/earthlink
5,REscour,Atlanta,Market intelligence and analytics for commerci...,REscour is a data platform and decision engine...,rescour,REscour,http://www.rescour.com,Commercial Real Estate|Real Estate,2800000,operating,USA,GA,Atlanta,Atlanta,2,01/01/2013,04/03/2015,03/12/2015,/organization/rescour


In [88]:
startups_join = startups_join.reset_index(drop=True)

In [89]:
startups_join.head()

Unnamed: 0,name,city_startups-list,tagline,description,name_unformatted,Name,homepage_url,category_list,funding_total_usd,status,country_code,state_code,region,city_crunchbase,funding_rounds,founded_at,first_funding_at,last_funding_at,permalink
0,Campus Bubble,Atlanta,Your Academic Identity,Campus Bubble (“CB”) is the Academic Community...,campusbubble,Campus Bubble,http://www.campusbubble.com/,Colleges|Education|Networking|SaaS|Social Medi...,525000,operating,,,,,1,01/03/2012,04/06/2014,04/06/2014,/organization/campus-bubble
1,DueProps,Atlanta,Gamifying the $46 Billion Employee Incentives ...,t unprecedented ...,dueprops,DueProps,http://dueprops.com,Games,200000,operating,USA,GA,Atlanta,Atlanta,1,23/05/2011,01/11/2011,01/11/2011,/organization/dueprops
2,SalesLoft,Atlanta,Quickly build high-quality prospect lists,build high-quality prospect lists\n,salesloft,SalesLoft,http://salesloft.com,B2B|Lead Generation|SaaS|Sales and Marketing|S...,11218000,operating,USA,GA,Atlanta,Atlanta,4,01/10/2011,17/05/2012,01/04/2015,/organization/salesloft
3,EarthLink,Atlanta,,,earthlink,EarthLink,http://www.earthlink.net,Curated Web|Software,23598258,ipo,USA,GA,Atlanta,Atlanta,1,01/01/1994,12/04/2011,12/04/2011,/organization/earthlink
4,REscour,Atlanta,Market intelligence and analytics for commerci...,REscour is a data platform and decision engine...,rescour,REscour,http://www.rescour.com,Commercial Real Estate|Real Estate,2800000,operating,USA,GA,Atlanta,Atlanta,2,01/01/2013,04/03/2015,03/12/2015,/organization/rescour


In [90]:
startups_join.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 10622 entries, 0 to 10621
Data columns (total 19 columns):
 #   Column              Non-Null Count  Dtype 
---  ------              --------------  ----- 
 0   name                10618 non-null  object
 1   city_startups-list  10622 non-null  object
 2   tagline             8200 non-null   object
 3   description         8531 non-null   object
 4   name_unformatted    10622 non-null  object
 5   Name                10618 non-null  object
 6   homepage_url        10495 non-null  object
 7   category_list       10622 non-null  object
 8   funding_total_usd   10622 non-null  object
 9   status              10622 non-null  object
 10  country_code        9997 non-null   object
 11  state_code          9741 non-null   object
 12  region              9911 non-null   object
 13  city_crunchbase     9911 non-null   object
 14  funding_rounds      10622 non-null  int64 
 15  founded_at          9483 non-null   object
 16  first_funding_at    10

In [91]:
# ran this to get csv to work with
# startups_join.to_csv('startups_join.csv')

In [92]:
startups_join.isna().sum()

name                     4
city_startups-list       0
tagline               2422
description           2091
name_unformatted         0
Name                     4
homepage_url           127
category_list            0
funding_total_usd        0
status                   0
country_code           625
state_code             881
region                 711
city_crunchbase        711
funding_rounds           0
founded_at            1139
first_funding_at         7
last_funding_at          0
permalink                0
dtype: int64

In [108]:
copy = startups_join

In [109]:
copy.shape

(10622, 19)

In [110]:
# if I drop all nulls I lose 4118 rows
copy.dropna().shape

(6504, 19)

In [111]:
# which rows don't have names?
copy[copy['name'].isna()]

Unnamed: 0,name,city_startups-list,tagline,description,name_unformatted,Name,homepage_url,category_list,funding_total_usd,status,country_code,state_code,region,city_crunchbase,funding_rounds,founded_at,first_funding_at,last_funding_at,permalink
1719,,Boston,,,,,http://tellitin10.com,Startups,25000,closed,USA,OR,"Portland, Oregon",Portland,1,01/10/2011,01/03/2012,01/03/2012,/organization/tell-it-in
1720,,Hong Kong,Consumer Electronics,Consumer electronics.,,,http://tellitin10.com,Startups,25000,closed,USA,OR,"Portland, Oregon",Portland,1,01/10/2011,01/03/2012,01/03/2012,/organization/tell-it-in
1721,,Tel Aviv,,mpaign,,,http://tellitin10.com,Startups,25000,closed,USA,OR,"Portland, Oregon",Portland,1,01/10/2011,01/03/2012,01/03/2012,/organization/tell-it-in
1722,,Washington DC,,,,,http://tellitin10.com,Startups,25000,closed,USA,OR,"Portland, Oregon",Portland,1,01/10/2011,01/03/2012,01/03/2012,/organization/tell-it-in


In [112]:
# all these companies have the same URL and permalink, making sure there aren't others
copy[copy['homepage_url']=='http://tellitin10.com']

Unnamed: 0,name,city_startups-list,tagline,description,name_unformatted,Name,homepage_url,category_list,funding_total_usd,status,country_code,state_code,region,city_crunchbase,funding_rounds,founded_at,first_funding_at,last_funding_at,permalink
1719,,Boston,,,,,http://tellitin10.com,Startups,25000,closed,USA,OR,"Portland, Oregon",Portland,1,01/10/2011,01/03/2012,01/03/2012,/organization/tell-it-in
1720,,Hong Kong,Consumer Electronics,Consumer electronics.,,,http://tellitin10.com,Startups,25000,closed,USA,OR,"Portland, Oregon",Portland,1,01/10/2011,01/03/2012,01/03/2012,/organization/tell-it-in
1721,,Tel Aviv,,mpaign,,,http://tellitin10.com,Startups,25000,closed,USA,OR,"Portland, Oregon",Portland,1,01/10/2011,01/03/2012,01/03/2012,/organization/tell-it-in
1722,,Washington DC,,,,,http://tellitin10.com,Startups,25000,closed,USA,OR,"Portland, Oregon",Portland,1,01/10/2011,01/03/2012,01/03/2012,/organization/tell-it-in


In [113]:
# just drop these
copy = copy.dropna(subset='name')

In [114]:
# checking that 4 rows are gone
copy.shape

(10618, 19)

In [115]:
# do the tagline and description null columns coincide?
copy.dropna(subset='tagline').shape # 8200 rows left
copy.dropna(subset=['tagline', 'description']).shape # 8010 rows left, an extra 190, so they largely do coincide

(8009, 19)

In [116]:
# I'll drop all the rows that are missing descriptions since that's the crux of my project
copy = copy.dropna(subset='description')

In [117]:
copy.shape

(8528, 19)

In [118]:
# I can safely drop city_crunchbase and Name columns since the city_startups-list and name columns have all that information anyway
# I also don't need name_unformatted or permalink
# I don't think I need the state_code and region columns either if I have city and country
copy = copy.drop(columns=['name_unformatted', 'Name', 'city_crunchbase', 'state_code', 'region', 'permalink'], axis=1)

In [119]:
# checking that columns were dropped
copy.shape

(8528, 13)

In [120]:
# I should be able to impute country code from the city column
copy[['']]

In [121]:
# which rows don't have URLs?
copy[copy['homepage_url'].isna()]
# some of these seem to still be operating and have websites, should try to find them and add them in

Unnamed: 0,name,city_startups-list,tagline,description,homepage_url,category_list,funding_total_usd,status,country_code,funding_rounds,founded_at,first_funding_at,last_funding_at
287,Kin Valley,Austin,Connect With Your Kin,The way people connect online has become trivi...,,Social Media,-,operating,USA,1,01/01/2009,20/12/2012,20/12/2012
299,Kambit,Austin,Changing the Way People Create Change,ides a platform that combines two incredible f...,,Software,-,closed,USA,2,22/09/2011,28/02/2012,15/08/2012
542,Noow,Austin,Physical therapy engagement platform,"s, the system reminds them to do their workout...",,Health Care,-,operating,,1,,20/12/2013,20/12/2013
695,Twiddly,Bangalore,Personalised content browser on the Android lo...,f the 5 startups which are in the current batc...,,Content Discovery|Mobile|Personalization,30000,operating,,1,,28/10/2014,28/10/2014
727,Torch,Bangalore,Google Analytics for Offline Retailers,"Torch provides location analytics, customer en...",,Monetization|Service Providers,30000,operating,,1,,28/10/2014,28/10/2014
...,...,...,...,...,...,...,...,...,...,...,...,...,...
10267,TasteAway,Toronto,"You Crave, We deliver. Online food takeout and...",s in adding a technology layer on a fragmented...,,Delivery|E-Commerce|Restaurants,450000,operating,CAN,2,15/12/2012,30/05/2013,30/06/2014
10530,Cotton & Reed Distillery,Washington DC,"Craft Distillery - exerpimental spirits, forag...",Cotton & Reed delivers experimental and natura...,,Restaurants|Services|Wine And Spirits,-,operating,USA,1,01/03/2013,22/10/2013,22/10/2013
10535,YoBucko,Washington DC,YoBucko is an online personal finance guide fo...,YoBucko is a personal finance site for Generat...,,Education|Financial Services|Lead Generation|P...,125125,operating,USA,1,,20/06/2011,20/06/2011
10585,KickUp,Washington DC,KickUp creates peer-to-peer engagement systems...,one reason teachers leave or move from the cla...,,Collaboration|Communities|Education,100000,closed,,1,,17/06/2015,17/06/2015


In [122]:
# for these I can at least impute from first_funding_at, I should see if those seem to correlate
copy[copy['founded_at'].isna()]

Unnamed: 0,name,city_startups-list,tagline,description,homepage_url,category_list,funding_total_usd,status,country_code,funding_rounds,founded_at,first_funding_at,last_funding_at
27,Verdeeco,Atlanta,Energy Data Analytics & Applications Platform,The deployment of smart devices across the nat...,http://www.verdeeco.com,Analytics,1514205,acquired,USA,2,,14/07/2011,28/11/2012
33,Calendly,Atlanta,"Simple, beautiful scheduling","endly, it can take up to 7 emails over 2-3 day...",http://calendly.com,Software,-,operating,USA,1,,25/04/2014,25/04/2014
48,i-nexus,Atlanta,Business Execution Software as a Service,ecution software helps large organisations ali...,http://www.i-nexus.com,Business Services|Software|Software Compliance,6000000,operating,GBR,2,,30/06/2008,25/07/2008
68,abeo,Atlanta,eHarmony/Linkedin/ for employee engagement,ou create a private social network for your or...,http://abeo.com,Health Care,2012700,operating,USA,1,,30/11/2011,30/11/2011
71,Sevamob,Atlanta,Convenient access to primary healthcare in dev...,lth via online health exchange for internet-sa...,http://sevamob.com/,Health Care,-,operating,USA,1,,28/09/2012,28/09/2012
...,...,...,...,...,...,...,...,...,...,...,...,...,...
10583,Workhorse,Washington DC,No bullshit strategy and design.,We're a full-service design studio in Washingt...,http://www.workhorse.co.in,Internet,-,operating,IND,1,,12/05/2015,12/05/2015
10585,KickUp,Washington DC,KickUp creates peer-to-peer engagement systems...,one reason teachers leave or move from the cla...,,Collaboration|Communities|Education,100000,closed,,1,,17/06/2015,17/06/2015
10593,Engage,Washington DC,A digital agency with a passion for disrupting...,"We give brands, candidates, and causes a voice...",http://www.engage.com,Match-Making|Online Dating|Private Social Netw...,5000000,closed,USA,1,,12/07/2006,12/07/2006
10599,Zest,Washington DC,empowering food bloggers everywhere,(In progress),,Blogging Platforms|Monetization|Social Media|S...,25000,operating,USA,1,,01/12/2014,01/12/2014


In [124]:
copy.info()

<class 'pandas.core.frame.DataFrame'>
Index: 8528 entries, 0 to 10620
Data columns (total 13 columns):
 #   Column              Non-Null Count  Dtype 
---  ------              --------------  ----- 
 0   name                8528 non-null   object
 1   city_startups-list  8528 non-null   object
 2   tagline             8009 non-null   object
 3   description         8528 non-null   object
 4   homepage_url        8428 non-null   object
 5   category_list       8528 non-null   object
 6   funding_total_usd   8528 non-null   object
 7   status              8528 non-null   object
 8   country_code        7976 non-null   object
 9   funding_rounds      8528 non-null   int64 
 10  founded_at          7653 non-null   object
 11  first_funding_at    8521 non-null   object
 12  last_funding_at     8528 non-null   object
dtypes: int64(1), object(12)
memory usage: 932.8+ KB


In [123]:
# I can impute most of these from founded at, with the
copy[copy['first_funding_at'].isna()]

Unnamed: 0,name,city_startups-list,tagline,description,homepage_url,category_list,funding_total_usd,status,country_code,funding_rounds,founded_at,first_funding_at,last_funding_at
238,RealMassive,Austin,"Open data, marketing platform for commercial r...","ry: Zillow for commercial real estate, monetiz...",http://www.realmassive.com,Commercial Real Estate|Marketing Automation|Ma...,8000000.0,operating,USA,2,11/02/2013,,14/05/2015
3047,MotionMetrics,London,Wearables for ski instruction,We create hardware and software to help you im...,http://motionmetrics.co,Hardware + Software|Wearables,150000.0,closed,GBR,2,24/09/2013,,01/09/2014
3379,dopay,London,Banking the unbanked,a cloud-based payroll service that allows empl...,http://www.dopay.com,Financial Services,1675550.80940274,operating,GBR,3,04/01/2014,,01/09/2014
6171,Keymetrics,New York,Comprehensive monitoring & management software...,e is positioned as a paid extension of our exi...,https://keymetrics.io/,SaaS,360000.0,operating,USA,3,,,01/10/2015
8449,Topicmarks,San Francisco,Auto-organize your cloud documents meaningfully,Topicmarks has launched in beta a web service ...,http://topicmarks.com,Curated Web|Education|Knowledge Management|Nat...,150000.0,acquired,USA,2,01/01/2009,,18/03/2011
9151,Sontra,Sao Paulo,Uber for Trucks. re-inventing trucking in Brazil,born as a web and mobile based marketplace wit...,http://www.sontra.com.br,Customer Service|Logistics|Transportation,4349586.0,operating,BRA,5,01/01/2013,,23/01/2015
9848,Aniways,Tel Aviv,Contextualized. personalized. monetized,"Aniways has an SDK for adding ""intelligence an...",http://www.aniways.com,Advertising|Chat|Messaging|Mobile|Social Media,1190000.0,operating,ISR,6,01/01/2012,,23/03/2014


In [27]:
# a lot of funding_total_usd rows don't register as null because they have a '-' character
# 1464 rows total (before dropping any nulls)
copy[copy['funding_total_usd'] == '-']

Unnamed: 0,name,city_startups-list,tagline,description,name_unformatted,Name,homepage_url,category_list,funding_total_usd,status,country_code,state_code,region,city_crunchbase,funding_rounds,founded_at,first_funding_at,last_funding_at,permalink
5,viaCycle,Atlanta,"Zipcar for bicycles. Call or text, unlock, and...",viaCycle creates bicycle sharing technology th...,viacycle,viaCycle,http://www.viacycle.com,Hardware + Software|Mobile|Mobility|Transporta...,-,closed,USA,GA,Atlanta,Atlanta,1,01/05/2010,01/06/2012,01/06/2012,/organization/viacycle
8,MOVL,Atlanta,Connecting Mobile Devices and SmartTVs,The MOVL Connect Platform and KontrolTV provid...,movl,MOVL,http://kontrol.tv,Android|Apps|iOS|Social Network Media|Software...,-,operating,USA,GA,Atlanta,Atlanta,1,26/08/2010,01/01/2012,01/01/2012,/organization/movl
26,SportsCrunch,Atlanta,LinkedIn for Sports.,SportsCrunch.com,sportscrunch,SportsCrunch,http://www.SportsCrunch.com,Consumer Goods|Sports,-,operating,USA,GA,Atlanta,Atlanta,3,01/08/2010,01/09/2011,01/02/2013,/organization/xsporture
29,Storyful,Atlanta,The 1st news agency for the social media age,Storyful solves the biggest problem facing the...,storyful,Storyful,http://storyful.com,Media|News|Publishing|Social Media,-,acquired,IRL,7,Dublin,Dublin,4,01/12/2009,01/10/2011,09/09/2013,/organization/storyful
33,Calendly,Atlanta,"Simple, beautiful scheduling","endly, it can take up to 7 emails over 2-3 day...",calendly,Calendly,http://calendly.com,Software,-,operating,USA,GA,Atlanta,Atlanta,1,,25/04/2014,25/04/2014,/organization/calendly
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
10577,Freedom Farms,Washington DC,Growing Prosperity. Feeding Progress,Aquaponics. This is about a new way. This is a...,freedomfarms,Freedom Farms,http://www.freedomfarmsdc.com/,Hospitality,-,operating,USA,DC,"Washington, D.C.",Washington,1,17/08/2013,06/10/2013,06/10/2013,/organization/freedom-farms
10583,Workhorse,Washington DC,No bullshit strategy and design.,We're a full-service design studio in Washingt...,workhorse,Workhorse,http://www.workhorse.co.in,Internet,-,operating,IND,16,Mumbai,Mumbai,1,,12/05/2015,12/05/2015,/organization/workhorse-2
10592,Encounter,Washington DC,Qlik View for medical supply management,ic health record (EHR) software integration us...,encounter,Encounter,http://tryencounter.com,Apps|Mobile|Online Dating|Social Media,-,operating,USA,AZ,Phoenix,Scottsdale,1,15/05/2015,10/11/2015,10/11/2015,/organization/encounter-2
10616,"Fit Body Club, Co.",Washington DC,"A step above Equinox - we sell results, not ac...",Fit Body Club is a results-oriented fitness co...,fitbodyclubco,"Fit Body Club, Co.",http://www.fitbodyclub.co,Fitness|Health and Wellness|Health Care|Nutrition,-,operating,USA,DC,"Washington, D.C.",Washington,1,03/03/2014,14/11/2014,14/11/2014,/organization/fit-body-club-co


Next steps: feature engineer - remove rows with no funding, create new column for length of description, remove redundant columns, split the category_list column into a distinct number of categories in order to dummy, create new column of current status that checks status codes of URLs

In [29]:
copy[['description']].isna().sum()

description    2091
dtype: int64

In [31]:
help(copy.dropna)

Help on method dropna in module pandas.core.frame:

dropna(*, axis: 'Axis' = 0, how: 'AnyAll | NoDefault' = <no_default>, thresh: 'int | NoDefault' = <no_default>, subset: 'IndexLabel' = None, inplace: 'bool' = False, ignore_index: 'bool' = False) -> 'DataFrame | None' method of pandas.core.frame.DataFrame instance
    Remove missing values.
    
    See the :ref:`User Guide <missing_data>` for more on which values are
    considered missing, and how to work with missing data.
    
    Parameters
    ----------
    axis : {0 or 'index', 1 or 'columns'}, default 0
        Determine if rows or columns which contain missing values are
        removed.
    
        * 0, or 'index' : Drop rows which contain missing values.
        * 1, or 'columns' : Drop columns which contain missing value.
    
        Pass tuple or list to drop on multiple axes.
        Only a single axis is allowed.
    
    how : {'any', 'all'}, default 'any'
        Determine if row or column is removed from DataFrame

In [None]:
# I can impute missing URLs by just filling them in with companyname.com
# first add unformatted name column for use in creating URLs
companies['name_unformatted'] = companies['Name']

for i in companies[companies['homepage_url'].isna()].index:
    companies.at[i, 'name_unformatted'] = str(companies['name_unformatted'][i]).lower().replace(' ', '')
    if not companies['name_unformatted'][i].isalnum():
        for char in companies['name_unformatted'][i]:
            if not char.isalnum():
                companies.at[i, 'name_unformatted'] = companies['name_unformatted'][i].replace(char, '')

In [None]:
# create URLs using unformatted names for rows with null values

for i in companies[companies['homepage_url'].isna()].index:
    companies.at[i, 'homepage_url'] = f'{companies["name_unformatted"][i]}.com'

In [None]:
# nulls are gone
companies['homepage_url'].isna().sum()

In [None]:
# double checking a specific instance
companies['homepage_url'][10]

# should I create a new column that indicates that these rows' websites are made up? I might need to if I'm going to create a new status column based off of it

In [None]:
# everything looks pretty good now
# on to some EDA
companies['status'].value_counts() # pretty unbalanced

In [None]:
companies['funding_total_usd'].value_counts().plot(kind='hist') # don't know why this doesn't work

In [None]:
# here's the histogram for funding_total_usd, we can see the mean is around 1 million
sns.histplot(companies['funding_total_usd'], log_scale=True)

In [None]:
# the smallest funding amount is $1 and largest is $30,000,000,000 which is a pretty ridiculous range
# which company raised 30 billion?
companies[companies['funding_total_usd'] > 30000000000 ]

In [None]:
# it's verizon which actually makes sense, but may not count as a startup
# what other companies are there here with huge funding totals?
# filtering for over 1B
companies[companies['funding_total_usd'] > 1000000000 ] # there are 61

In [2]:
import requests

In [3]:
r = requests.get('http://dueprops.com')

In [4]:
r.status_code

404

In [5]:
type(r.content)

bytes

In [31]:
if 'domain' and 'sale' in r.content.decode('utf-8'):
    print(5)

5


In [32]:
r.content

b'<!DOCTYPE html>\n<html lang="en">\n<head>\n\n<script id="cookieyes" type="text/javascript" src="https://cdn-cookieyes.com/client_data/e71bc53f1cb88666d160c1e2/script.js"></script>\n\n<meta http-equiv="Content-Type" content="text/html; charset=UTF-8">\n\n<meta name="viewport" content="width=device-width, initial-scale=1, shrink-to-fit=no">\n<link rel="preconnect" href="https://www.google.com">\n<link rel="preconnect" href="https://www.gstatic.com" crossorigin>\n<link rel="stylesheet" href="https://cdn.jsdelivr.net/gh/fancyapps/fancybox@3.5.7/dist/jquery.fancybox.min.css" />\n<link rel="stylesheet" href="https://static.HugeDomains.com/css/hdv3-css/reboot.min.css">\n<link rel="stylesheet" href="https://static.HugeDomains.com/css/hdv3-css/style.css?aa=2021-06-09a">\n<link rel="stylesheet" href="https://static.HugeDomains.com/css/hdv3-css/responsive.css?aa=2021-06-09a">\n<link rel="stylesheet" href="https://static.HugeDomains.com/css/hdv3-css/hd-style.css?aa=2022-10-33">\n<meta name="them