In [1]:
import numpy as np
import pandas as pd
import re

In [2]:
pd.options.display.float_format = '{:.2f}'.format

### Companies Data Analysis

In [3]:
df_com = pd.read_csv('companies.txt', sep='\t', encoding='ISO-8859-1')

In [4]:
df_com.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 66368 entries, 0 to 66367
Data columns (total 10 columns):
permalink        66368 non-null object
name             66367 non-null object
homepage_url     61310 non-null object
category_list    63220 non-null object
status           66368 non-null object
country_code     59410 non-null object
state_code       57821 non-null object
region           58338 non-null object
city             58340 non-null object
founded_at       51147 non-null object
dtypes: object(10)
memory usage: 5.1+ MB


In [5]:
df_com.describe()

Unnamed: 0,permalink,name,homepage_url,category_list,status,country_code,state_code,region,city,founded_at
count,66368,66367,61310,63220,66368,59410,57821,58338,58340,51147
unique,66368,66102,61191,27296,4,137,311,1092,5111,3978
top,/Organization/Skidos,Peach,http://www.askforoffer.com,Software,operating,USA,CA,SF Bay Area,San Francisco,01-01-2012
freq,1,4,5,3995,53034,37601,12900,8804,3526,2730


In [6]:
df_com.size

663680

In [7]:
df_com.shape

(66368, 10)

In [8]:
df_com.columns

Index(['permalink', 'name', 'homepage_url', 'category_list', 'status',
       'country_code', 'state_code', 'region', 'city', 'founded_at'],
      dtype='object')

In [9]:
df_com.head(10)

Unnamed: 0,permalink,name,homepage_url,category_list,status,country_code,state_code,region,city,founded_at
0,/Organization/-Fame,#fame,http://livfame.com,Media,operating,IND,16,Mumbai,Mumbai,
1,/Organization/-Qounter,:Qounter,http://www.qounter.com,Application Platforms|Real Time|Social Network...,operating,USA,DE,DE - Other,Delaware City,04-09-2014
2,/Organization/-The-One-Of-Them-Inc-,"(THE) ONE of THEM,Inc.",http://oneofthem.jp,Apps|Games|Mobile,operating,,,,,
3,/Organization/0-6-Com,0-6.com,http://www.0-6.com,Curated Web,operating,CHN,22,Beijing,Beijing,01-01-2007
4,/Organization/004-Technologies,004 Technologies,http://004gmbh.de/en/004-interact,Software,operating,USA,IL,"Springfield, Illinois",Champaign,01-01-2010
5,/Organization/01Games-Technology,01Games Technology,http://www.01games.hk/,Games,operating,HKG,,Hong Kong,Hong Kong,
6,/Organization/0Ndine-Biomedical-Inc,Ondine Biomedical Inc.,http://ondinebio.com,Biotechnology,operating,CAN,BC,Vancouver,Vancouver,01-01-1997
7,/Organization/0Xdata,H2O.ai,http://h2o.ai/,Analytics,operating,USA,CA,SF Bay Area,Mountain View,01-01-2011
8,/Organization/1,One Inc.,http://whatis1.com,Mobile,operating,USA,CA,SF Bay Area,San Francisco,01-08-2011
9,/Organization/1-2-3-Listo,"1,2,3 Listo",http://www.123listo.com,E-Commerce,operating,CHL,12,Santiago,Las Condes,01-01-2012


If we observe the permalink, the case is not uniform; lets convert into lower case

In [10]:
df_com['permalink'] = df_com['permalink'].str.lower()

In [11]:
df_com.head(10)

Unnamed: 0,permalink,name,homepage_url,category_list,status,country_code,state_code,region,city,founded_at
0,/organization/-fame,#fame,http://livfame.com,Media,operating,IND,16,Mumbai,Mumbai,
1,/organization/-qounter,:Qounter,http://www.qounter.com,Application Platforms|Real Time|Social Network...,operating,USA,DE,DE - Other,Delaware City,04-09-2014
2,/organization/-the-one-of-them-inc-,"(THE) ONE of THEM,Inc.",http://oneofthem.jp,Apps|Games|Mobile,operating,,,,,
3,/organization/0-6-com,0-6.com,http://www.0-6.com,Curated Web,operating,CHN,22,Beijing,Beijing,01-01-2007
4,/organization/004-technologies,004 Technologies,http://004gmbh.de/en/004-interact,Software,operating,USA,IL,"Springfield, Illinois",Champaign,01-01-2010
5,/organization/01games-technology,01Games Technology,http://www.01games.hk/,Games,operating,HKG,,Hong Kong,Hong Kong,
6,/organization/0ndine-biomedical-inc,Ondine Biomedical Inc.,http://ondinebio.com,Biotechnology,operating,CAN,BC,Vancouver,Vancouver,01-01-1997
7,/organization/0xdata,H2O.ai,http://h2o.ai/,Analytics,operating,USA,CA,SF Bay Area,Mountain View,01-01-2011
8,/organization/1,One Inc.,http://whatis1.com,Mobile,operating,USA,CA,SF Bay Area,San Francisco,01-08-2011
9,/organization/1-2-3-listo,"1,2,3 Listo",http://www.123listo.com,E-Commerce,operating,CHL,12,Santiago,Las Condes,01-01-2012


## As per the description of the Companies data, Permalink is unique to each company

## Answer to the Question - How many unique companies are present in the companies file?

In [12]:
unique_permalink = df_com.permalink.unique()
print(len(unique_permalink))

66368


In [13]:
df_com['name'] = df_com['name'].str.lower()

In [14]:
df_com.head(15)

Unnamed: 0,permalink,name,homepage_url,category_list,status,country_code,state_code,region,city,founded_at
0,/organization/-fame,#fame,http://livfame.com,Media,operating,IND,16,Mumbai,Mumbai,
1,/organization/-qounter,:qounter,http://www.qounter.com,Application Platforms|Real Time|Social Network...,operating,USA,DE,DE - Other,Delaware City,04-09-2014
2,/organization/-the-one-of-them-inc-,"(the) one of them,inc.",http://oneofthem.jp,Apps|Games|Mobile,operating,,,,,
3,/organization/0-6-com,0-6.com,http://www.0-6.com,Curated Web,operating,CHN,22,Beijing,Beijing,01-01-2007
4,/organization/004-technologies,004 technologies,http://004gmbh.de/en/004-interact,Software,operating,USA,IL,"Springfield, Illinois",Champaign,01-01-2010
5,/organization/01games-technology,01games technology,http://www.01games.hk/,Games,operating,HKG,,Hong Kong,Hong Kong,
6,/organization/0ndine-biomedical-inc,ondine biomedical inc.,http://ondinebio.com,Biotechnology,operating,CAN,BC,Vancouver,Vancouver,01-01-1997
7,/organization/0xdata,h2o.ai,http://h2o.ai/,Analytics,operating,USA,CA,SF Bay Area,Mountain View,01-01-2011
8,/organization/1,one inc.,http://whatis1.com,Mobile,operating,USA,CA,SF Bay Area,San Francisco,01-08-2011
9,/organization/1-2-3-listo,"1,2,3 listo",http://www.123listo.com,E-Commerce,operating,CHL,12,Santiago,Las Condes,01-01-2012


In [15]:
unique_name = df_com.name.unique()
len(unique_name)

66038

In [16]:
df_dup_com = df_com[df_com.duplicated(['name'])]

In [17]:
df_dup_com.head()

Unnamed: 0,permalink,name,homepage_url,category_list,status,country_code,state_code,region,city,founded_at
282,/organization/3divaz-3,3divaz,http://www.3divaz.ch/Home,,operating,CHE,1,CHE - Other,Wildegg,01-07-2014
1527,/organization/adtena-2,adtena,http://adtena.com,,closed,,,,,15-11-2014
1996,/organization/agora-6,agora,https://www.agora.co,Android|Apps|Internet|Mobile,operating,USA,NY,New York City,New York,01-07-2013
2460,/organization/alike-2,alike,http://alike.fr/,Design,closed,FRA,A8,Paris,Paris,
3006,/organization/amicus-co,amicus,http://www.amicus.co,Apps|Mobile|Mobile Commerce,operating,IND,7,New Delhi,New Delhi,01-01-2015


In [18]:
# df_dup_com1 = df_com[df_com.duplicated()]

In [19]:
# df_dup_com1.shape

In [20]:
df_dup_com.shape

(330, 10)

There are around 330 companies which are duplicated; lets filter that later and lets continue using the company table
with duplicates

In [21]:
df_com.iloc[281]

permalink           /organization/3divaz-2
name                                3divaz
homepage_url     http://www.3divaz.ch/Home
category_list                          NaN
status                              closed
country_code                           NaN
state_code                             NaN
region                                 NaN
city                                   NaN
founded_at                      01-07-2014
Name: 281, dtype: object

In [22]:
df_com.iloc[282]

permalink           /organization/3divaz-3
name                                3divaz
homepage_url     http://www.3divaz.ch/Home
category_list                          NaN
status                           operating
country_code                           CHE
state_code                               1
region                         CHE - Other
city                               Wildegg
founded_at                      01-07-2014
Name: 282, dtype: object

In [23]:
# TODO - display all duplicated companies

## Rounds

In [24]:
df_rounds = pd.read_csv('rounds2.csv', encoding='ISO-8859-1')

In [25]:
df_rounds.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 114949 entries, 0 to 114948
Data columns (total 6 columns):
company_permalink          114949 non-null object
funding_round_permalink    114949 non-null object
funding_round_type         114949 non-null object
funding_round_code         31140 non-null object
funded_at                  114949 non-null object
raised_amount_usd          94959 non-null float64
dtypes: float64(1), object(5)
memory usage: 5.3+ MB


In [26]:
df_rounds.describe()

Unnamed: 0,raised_amount_usd
count,94959.0
mean,10426869.33
std,114821247.98
min,0.0
25%,322500.0
50%,1680511.0
75%,7000000.0
max,21271935000.0


In [27]:
df_rounds.shape

(114949, 6)

In [28]:
df_rounds.columns

Index(['company_permalink', 'funding_round_permalink', 'funding_round_type',
       'funding_round_code', 'funded_at', 'raised_amount_usd'],
      dtype='object')

In [29]:
df_rounds.head(10)

Unnamed: 0,company_permalink,funding_round_permalink,funding_round_type,funding_round_code,funded_at,raised_amount_usd
0,/organization/-fame,/funding-round/9a01d05418af9f794eebff7ace91f638,venture,B,05-01-2015,10000000.0
1,/ORGANIZATION/-QOUNTER,/funding-round/22dacff496eb7acb2b901dec1dfe5633,venture,A,14-10-2014,
2,/organization/-qounter,/funding-round/b44fbb94153f6cdef13083530bb48030,seed,,01-03-2014,700000.0
3,/ORGANIZATION/-THE-ONE-OF-THEM-INC-,/funding-round/650b8f704416801069bb178a1418776b,venture,B,30-01-2014,3406878.0
4,/organization/0-6-com,/funding-round/5727accaeaa57461bd22a9bdd945382d,venture,A,19-03-2008,2000000.0
5,/ORGANIZATION/004-TECHNOLOGIES,/funding-round/1278dd4e6a37fa4b7d7e06c21b3c1830,venture,,24-07-2014,
6,/organization/01games-technology,/funding-round/7d53696f2b4f607a2f2a8cbb83d01839,undisclosed,,01-07-2014,41250.0
7,/ORGANIZATION/0NDINE-BIOMEDICAL-INC,/funding-round/2b9d3ac293d5cdccbecff5c8cb0f327d,seed,,11-09-2009,43360.0
8,/organization/0ndine-biomedical-inc,/funding-round/954b9499724b946ad8c396a57a5f3b72,venture,,21-12-2009,719491.0
9,/ORGANIZATION/0XDATA,/funding-round/383a9bd2c04f7038bb543ccef5ba3eae,seed,,22-05-2013,3000000.0


In [30]:
df_rounds['company_permalink'] = df_rounds['company_permalink'].str.lower()

In [31]:
df_rounds.head(10)

Unnamed: 0,company_permalink,funding_round_permalink,funding_round_type,funding_round_code,funded_at,raised_amount_usd
0,/organization/-fame,/funding-round/9a01d05418af9f794eebff7ace91f638,venture,B,05-01-2015,10000000.0
1,/organization/-qounter,/funding-round/22dacff496eb7acb2b901dec1dfe5633,venture,A,14-10-2014,
2,/organization/-qounter,/funding-round/b44fbb94153f6cdef13083530bb48030,seed,,01-03-2014,700000.0
3,/organization/-the-one-of-them-inc-,/funding-round/650b8f704416801069bb178a1418776b,venture,B,30-01-2014,3406878.0
4,/organization/0-6-com,/funding-round/5727accaeaa57461bd22a9bdd945382d,venture,A,19-03-2008,2000000.0
5,/organization/004-technologies,/funding-round/1278dd4e6a37fa4b7d7e06c21b3c1830,venture,,24-07-2014,
6,/organization/01games-technology,/funding-round/7d53696f2b4f607a2f2a8cbb83d01839,undisclosed,,01-07-2014,41250.0
7,/organization/0ndine-biomedical-inc,/funding-round/2b9d3ac293d5cdccbecff5c8cb0f327d,seed,,11-09-2009,43360.0
8,/organization/0ndine-biomedical-inc,/funding-round/954b9499724b946ad8c396a57a5f3b72,venture,,21-12-2009,719491.0
9,/organization/0xdata,/funding-round/383a9bd2c04f7038bb543ccef5ba3eae,seed,,22-05-2013,3000000.0


In [32]:
df_rounds.company_permalink.shape

(114949,)

## Answer to the Question - How many unique companies are present in rounds2?

In [33]:
unique_comp_perm = df_rounds.company_permalink.unique()
print(len(unique_comp_perm))

66370


## Analysis - Q.Are there any companies in the rounds2 file which are not  present in companies ? Answer Y/N.

As per the below calculation, there are 2 companies missing in companies data compared to Rounds data.
Also the Missing comapnies listed

In [34]:
set_company_permlink = set(unique_comp_perm)

In [35]:
set_permlink = set(unique_permalink)

In [36]:
len(set_company_permlink)

66370

In [37]:
len(set_permlink)

66368

In [38]:
# 2 companies missing in companies table compared to the rounds table

#### Data present in Rounds table which isnt present in companies table

In [39]:
set_company_permlink.difference(set_permlink)

{'/organization/e-cã\x8abica',
 '/organization/energystone-games-ç\x81µç\x9f³æ¸¸æ\x88\x8f',
 '/organization/huizuche-com-æ\x83\xa0ç§ÿè½¦',
 '/organization/magnet-tech-ç£\x81ç\x9f³ç§\x91æ\x8a\x80',
 '/organization/tipcat-interactive-æ²\x99è\x88ÿä¿¡æ\x81¯ç§\x91æ\x9a\x80',
 '/organization/weiche-tech-å\x96\x82è½¦ç§\x91æ\x8a\x80',
 '/organization/zengame-ç¦\x85æ¸¸ç§\x91æ\x8a\x80'}

#### Another Way

In [40]:
set_diff_df = pd.concat([df_rounds['company_permalink'], df_com['permalink'], df_com['permalink']]).drop_duplicates(keep=False)
print(set_diff_df)

29597                               /organization/e-cãbica
31863          /organization/energystone-games-çµç³æ¸¸æ
45176                  /organization/huizuche-com-æ ç§ÿè½¦
58473                /organization/magnet-tech-ç£ç³ç§æ
101036    /organization/tipcat-interactive-æ²èÿä¿¡æ¯ç...
109969               /organization/weiche-tech-åè½¦ç§æ
113839                   /organization/zengame-ç¦æ¸¸ç§æ
dtype: object


In [41]:
# Even though the diff is 2; the actual difference is 7 (len(set_diff_df))
# so after merge 114949-7 rows should be pesent in the merged df

In [42]:
# data in companies which isnt present in rounds
set_permlink.difference(set_company_permlink)

{'/organization/e-cã\x9abica',
 '/organization/energystone-games-ç\x81µçÿ³æ¸¸æ\x88\x8f',
 '/organization/huizuche-com-æ\x83\xa0ç§\x9fè½¦',
 '/organization/tipcat-interactive-æ²\x99è\x88\x9fä¿¡æ\x81¯ç§\x91æ\x9a\x80',
 '/organization/zengame-ç¦\x85æ¸¸ç§\x91æ\x9a\x80'}

#### merge the data frames
Need to complete the information of rounds from the company df. So have to join both the df based on the intersection values 

In [43]:
master_frame = pd.merge(df_rounds, df_com, how='inner', left_on='company_permalink', right_on='permalink')

In [44]:
master_frame.head(15)

Unnamed: 0,company_permalink,funding_round_permalink,funding_round_type,funding_round_code,funded_at,raised_amount_usd,permalink,name,homepage_url,category_list,status,country_code,state_code,region,city,founded_at
0,/organization/-fame,/funding-round/9a01d05418af9f794eebff7ace91f638,venture,B,05-01-2015,10000000.0,/organization/-fame,#fame,http://livfame.com,Media,operating,IND,16,Mumbai,Mumbai,
1,/organization/-qounter,/funding-round/22dacff496eb7acb2b901dec1dfe5633,venture,A,14-10-2014,,/organization/-qounter,:qounter,http://www.qounter.com,Application Platforms|Real Time|Social Network...,operating,USA,DE,DE - Other,Delaware City,04-09-2014
2,/organization/-qounter,/funding-round/b44fbb94153f6cdef13083530bb48030,seed,,01-03-2014,700000.0,/organization/-qounter,:qounter,http://www.qounter.com,Application Platforms|Real Time|Social Network...,operating,USA,DE,DE - Other,Delaware City,04-09-2014
3,/organization/-the-one-of-them-inc-,/funding-round/650b8f704416801069bb178a1418776b,venture,B,30-01-2014,3406878.0,/organization/-the-one-of-them-inc-,"(the) one of them,inc.",http://oneofthem.jp,Apps|Games|Mobile,operating,,,,,
4,/organization/0-6-com,/funding-round/5727accaeaa57461bd22a9bdd945382d,venture,A,19-03-2008,2000000.0,/organization/0-6-com,0-6.com,http://www.0-6.com,Curated Web,operating,CHN,22,Beijing,Beijing,01-01-2007
5,/organization/004-technologies,/funding-round/1278dd4e6a37fa4b7d7e06c21b3c1830,venture,,24-07-2014,,/organization/004-technologies,004 technologies,http://004gmbh.de/en/004-interact,Software,operating,USA,IL,"Springfield, Illinois",Champaign,01-01-2010
6,/organization/01games-technology,/funding-round/7d53696f2b4f607a2f2a8cbb83d01839,undisclosed,,01-07-2014,41250.0,/organization/01games-technology,01games technology,http://www.01games.hk/,Games,operating,HKG,,Hong Kong,Hong Kong,
7,/organization/0ndine-biomedical-inc,/funding-round/2b9d3ac293d5cdccbecff5c8cb0f327d,seed,,11-09-2009,43360.0,/organization/0ndine-biomedical-inc,ondine biomedical inc.,http://ondinebio.com,Biotechnology,operating,CAN,BC,Vancouver,Vancouver,01-01-1997
8,/organization/0ndine-biomedical-inc,/funding-round/954b9499724b946ad8c396a57a5f3b72,venture,,21-12-2009,719491.0,/organization/0ndine-biomedical-inc,ondine biomedical inc.,http://ondinebio.com,Biotechnology,operating,CAN,BC,Vancouver,Vancouver,01-01-1997
9,/organization/0xdata,/funding-round/383a9bd2c04f7038bb543ccef5ba3eae,seed,,22-05-2013,3000000.0,/organization/0xdata,h2o.ai,http://h2o.ai/,Analytics,operating,USA,CA,SF Bay Area,Mountain View,01-01-2011


In [45]:
master_frame.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 114942 entries, 0 to 114941
Data columns (total 16 columns):
company_permalink          114942 non-null object
funding_round_permalink    114942 non-null object
funding_round_type         114942 non-null object
funding_round_code         31139 non-null object
funded_at                  114942 non-null object
raised_amount_usd          94958 non-null float64
permalink                  114942 non-null object
name                       114941 non-null object
homepage_url               108810 non-null object
category_list              111535 non-null object
status                     114942 non-null object
country_code               106271 non-null object
state_code                 104003 non-null object
region                     104782 non-null object
city                       104785 non-null object
founded_at                 94422 non-null object
dtypes: float64(1), object(15)
memory usage: 14.9+ MB


### [Answer] Q . Merge the two data frames so that all  variables (columns)  in the companies frame are added to the rounds2 data frame. Name the merged frame master_frame. How many observations are present in master_frame ?

In [46]:
master_frame.shape

(114942, 16)

In [47]:
print(df_rounds.shape)
print(df_rounds.columns)

(114949, 6)
Index(['company_permalink', 'funding_round_permalink', 'funding_round_type',
       'funding_round_code', 'funded_at', 'raised_amount_usd'],
      dtype='object')


In [48]:
print(df_com.shape)
print(df_com.columns)

(66368, 10)
Index(['permalink', 'name', 'homepage_url', 'category_list', 'status',
       'country_code', 'state_code', 'region', 'city', 'founded_at'],
      dtype='object')


In [99]:
master_frame.to_csv('master.csv')

#### DAta clean-up for further analysis

In [50]:
master_frame.isnull().all()

company_permalink          False
funding_round_permalink    False
funding_round_type         False
funding_round_code         False
funded_at                  False
raised_amount_usd          False
permalink                  False
name                       False
homepage_url               False
category_list              False
status                     False
country_code               False
state_code                 False
region                     False
city                       False
founded_at                 False
dtype: bool

In [51]:
master_frame.isnull().any()

company_permalink          False
funding_round_permalink    False
funding_round_type         False
funding_round_code          True
funded_at                  False
raised_amount_usd           True
permalink                  False
name                        True
homepage_url                True
category_list               True
status                     False
country_code                True
state_code                  True
region                      True
city                        True
founded_at                  True
dtype: bool

In [52]:
master_frame.isnull().sum()

company_permalink              0
funding_round_permalink        0
funding_round_type             0
funding_round_code         83803
funded_at                      0
raised_amount_usd          19984
permalink                      0
name                           1
homepage_url                6132
category_list               3407
status                         0
country_code                8671
state_code                 10939
region                     10160
city                       10157
founded_at                 20520
dtype: int64

In [53]:
round(100*master_frame.isnull().sum()/len(master_frame.index),2)

company_permalink          0.00
funding_round_permalink    0.00
funding_round_type         0.00
funding_round_code        72.91
funded_at                  0.00
raised_amount_usd         17.39
permalink                  0.00
name                       0.00
homepage_url               5.33
category_list              2.96
status                     0.00
country_code               7.54
state_code                 9.52
region                     8.84
city                       8.84
founded_at                17.85
dtype: float64

In [58]:
# Not used any where - funding_round_code, founded_at, city, state_Code, region, homepage_url

In [54]:
master_frame.drop('funding_round_code', axis=1, inplace=True)

In [55]:
master_frame.drop('founded_at', axis=1, inplace=True)

In [59]:
master_frame.drop('city', axis=1, inplace=True)

In [60]:
master_frame.drop('state_code', axis=1, inplace=True)

In [61]:
master_frame.drop('region', axis=1, inplace=True)

In [62]:
master_frame.drop('homepage_url', axis=1, inplace=True)

In [63]:
master_frame['permalink'].equals(master_frame['company_permalink'])

True

In [64]:
# one can be removed

In [65]:
master_frame.drop('company_permalink', axis=1, inplace=True)

In [66]:
round(100*master_frame.isnull().sum()/len(master_frame.index),2)

funding_round_permalink    0.00
funding_round_type         0.00
funded_at                  0.00
raised_amount_usd         17.39
permalink                  0.00
name                       0.00
category_list              2.96
status                     0.00
country_code               7.54
dtype: float64

In [67]:
# raised_amt = master_frame.raised_amount_usd
# raised_amt.isnull()

In [68]:
# raised_amt[1]

In [69]:
master_frame.dropna(subset=['raised_amount_usd'], inplace=True)

In [70]:
master_frame.head()

Unnamed: 0,funding_round_permalink,funding_round_type,funded_at,raised_amount_usd,permalink,name,category_list,status,country_code
0,/funding-round/9a01d05418af9f794eebff7ace91f638,venture,05-01-2015,10000000.0,/organization/-fame,#fame,Media,operating,IND
2,/funding-round/b44fbb94153f6cdef13083530bb48030,seed,01-03-2014,700000.0,/organization/-qounter,:qounter,Application Platforms|Real Time|Social Network...,operating,USA
3,/funding-round/650b8f704416801069bb178a1418776b,venture,30-01-2014,3406878.0,/organization/-the-one-of-them-inc-,"(the) one of them,inc.",Apps|Games|Mobile,operating,
4,/funding-round/5727accaeaa57461bd22a9bdd945382d,venture,19-03-2008,2000000.0,/organization/0-6-com,0-6.com,Curated Web,operating,CHN
6,/funding-round/7d53696f2b4f607a2f2a8cbb83d01839,undisclosed,01-07-2014,41250.0,/organization/01games-technology,01games technology,Games,operating,HKG


In [71]:
round(100*master_frame.isnull().sum()/len(master_frame.index),2)

funding_round_permalink   0.00
funding_round_type        0.00
funded_at                 0.00
raised_amount_usd         0.00
permalink                 0.00
name                      0.00
category_list             1.10
status                    0.00
country_code              6.16
dtype: float64

In [72]:
master_frame.isnull().sum()

funding_round_permalink       0
funding_round_type            0
funded_at                     0
raised_amount_usd             0
permalink                     0
name                          1
category_list              1044
status                        0
country_code               5850
dtype: int64

In [73]:
master_frame.shape

(94958, 9)

In [100]:
master_frame.to_csv('master.csv')

In [101]:
index_countries = np.where(master_frame['country_code'].isna())

In [102]:
print(index_countries, len(index_countries[0]))

(array([    2,    26,    52, ..., 94862, 94874, 94957], dtype=int64),) 5850


In [103]:
index_cat = np.where(master_frame['category_list'].isna())

In [104]:
print(index_cat, len(index_cat[0]))

(array([   75,   102,   141, ..., 94443, 94539, 94540], dtype=int64),) 1044


In [90]:
bool_cat = pd.isnull(master_frame['category_list'])
print(len(index_cat))

94958


In [107]:
# print(bool_cat)

In [86]:
# categories can have Others as primary_sector; so lets fill the category data

In [106]:
master_frame.loc[bool_cat, ['category_list']] = "unknown_cat"

In [108]:
master_frame.isnull().sum()

funding_round_permalink       0
funding_round_type            0
funded_at                     0
raised_amount_usd             0
permalink                     0
name                          1
category_list                 0
status                        0
country_code               5850
dtype: int64

In [109]:
bool_country = pd.isnull(master_frame['country_code'])

In [110]:
master_frame.loc[bool_country, ['country_code']] = "unknown_country"

In [112]:
round(100*master_frame.isnull().sum()/len(master_frame.index),2)

funding_round_permalink   0.00
funding_round_type        0.00
funded_at                 0.00
raised_amount_usd         0.00
permalink                 0.00
name                      0.00
category_list             0.00
status                    0.00
country_code              0.00
dtype: float64

#### MAster frame is complete.

In [113]:
master_frame.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 94958 entries, 0 to 114941
Data columns (total 9 columns):
funding_round_permalink    94958 non-null object
funding_round_type         94958 non-null object
funded_at                  94958 non-null object
raised_amount_usd          94958 non-null float64
permalink                  94958 non-null object
name                       94957 non-null object
category_list              94958 non-null object
status                     94958 non-null object
country_code               94958 non-null object
dtypes: float64(1), object(8)
memory usage: 9.7+ MB


In [114]:
master_frame.describe()

Unnamed: 0,raised_amount_usd
count,94958.0
mean,10426962.02
std,114821849.03
min,0.0
25%,322500.0
50%,1680521.0
75%,7000000.0
max,21271935000.0


### CheckPoint2

In [133]:
f_mean = master_frame.groupby('funding_round_type').mean()
print(type(f_mean))
f_mean.sort_values(ascending=True, by=['raised_amount_usd'])

<class 'pandas.core.frame.DataFrame'>


Unnamed: 0_level_0,raised_amount_usd
funding_round_type,Unnamed: 1_level_1
non_equity_assistance,411203.05
equity_crowdfunding,538368.21
seed,719779.62
angel,958694.47
product_crowdfunding,1363131.07
convertible_note,1453438.54
grant,4300576.34
venture,11748949.13
debt_financing,17043526.02
undisclosed,19242370.23


In [134]:
f_mean.info()

<class 'pandas.core.frame.DataFrame'>
Index: 14 entries, angel to venture
Data columns (total 1 columns):
raised_amount_usd    14 non-null float64
dtypes: float64(1)
memory usage: 224.0+ bytes


In [135]:
f_mean.columns

Index(['raised_amount_usd'], dtype='object')

In [136]:
print(master_frame.funding_round_type.unique())

['venture' 'seed' 'undisclosed' 'convertible_note' 'private_equity'
 'debt_financing' 'angel' 'grant' 'equity_crowdfunding' 'post_ipo_equity'
 'post_ipo_debt' 'product_crowdfunding' 'secondary_market'
 'non_equity_assistance']


In [138]:
f_venture = master_frame['funding_round_type']=='venture'
f_seed = master_frame['funding_round_type']=='seed'
f_pequity = master_frame['funding_round_type']=='private_equity'
f_angel = master_frame['funding_round_type']=='angel'

In [139]:
df_venture = master_frame[f_venture]
df_seed = master_frame[f_seed]
df_pequity = master_frame[f_pequity]
df_angel = master_frame[f_angel]

### Answers to the Question
Average funding amount of venture type

Average funding amount of angel type

Average funding amount of seed type

Average funding amount of private equity type


In [140]:
print('Venture - ' + str(round(df_venture.raised_amount_usd.mean(),2)))
print('Seed - ' + str(round(df_seed.raised_amount_usd.mean(),2)))
print('Private Equity - ' + str(round(df_pequity.raised_amount_usd.mean(),2)))
print('Angel - ' + str(round(df_angel.raised_amount_usd.mean(),2)))

Venture - 11748949.13
Seed - 719779.62
Private Equity - 73308593.03
Angel - 958694.47


In [141]:
m_venture = df_venture.raised_amount_usd.mean()
m_seed = df_seed.raised_amount_usd.mean()
m_pequity = df_pequity.raised_amount_usd.mean()
m_angel = df_angel.raised_amount_usd.mean()

print(m_venture, m_seed, m_pequity, m_angel)

11748949.129489528 719779.6202016778 73308593.02944215 958694.4697530865


In [142]:
# 5 to 15 million
lower = 5000000
upper = 15000000

In [143]:
m_venture>=lower and m_venture<upper

True

In [144]:
m_seed>=lower and m_seed<=upper

False

In [145]:
m_pequity>=lower and m_pequity<=upper

False

In [146]:
m_angel>=lower and m_angel<=upper

False

### [Answer] Q. Considering that Spark Funds wants to invest between 5 to 15 million USD per  investment round, which investment type is the most suitable for them?


### Checkpoint 3: Country Analysis

In [148]:
# From the previous analysis - venture fund is suitable to invest

In [149]:
df_venture.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 50228 entries, 0 to 114934
Data columns (total 9 columns):
funding_round_permalink    50228 non-null object
funding_round_type         50228 non-null object
funded_at                  50228 non-null object
raised_amount_usd          50228 non-null float64
permalink                  50228 non-null object
name                       50228 non-null object
category_list              50228 non-null object
status                     50228 non-null object
country_code               50228 non-null object
dtypes: float64(1), object(8)
memory usage: 3.8+ MB


In [150]:
df_venture.describe()

Unnamed: 0,raised_amount_usd
count,50228.0
mean,11748949.13
std,86352066.56
min,0.0
25%,1600901.75
50%,5000000.0
75%,12000000.0
max,17600000000.0


In [151]:
df_venture.head(10)

Unnamed: 0,funding_round_permalink,funding_round_type,funded_at,raised_amount_usd,permalink,name,category_list,status,country_code
0,/funding-round/9a01d05418af9f794eebff7ace91f638,venture,05-01-2015,10000000.0,/organization/-fame,#fame,Media,operating,IND
3,/funding-round/650b8f704416801069bb178a1418776b,venture,30-01-2014,3406878.0,/organization/-the-one-of-them-inc-,"(the) one of them,inc.",Apps|Games|Mobile,operating,unknown_country
4,/funding-round/5727accaeaa57461bd22a9bdd945382d,venture,19-03-2008,2000000.0,/organization/0-6-com,0-6.com,Curated Web,operating,CHN
8,/funding-round/954b9499724b946ad8c396a57a5f3b72,venture,21-12-2009,719491.0,/organization/0ndine-biomedical-inc,ondine biomedical inc.,Biotechnology,operating,CAN
10,/funding-round/3bb2ee4a2d89251a10aaa735b1180e44,venture,09-11-2015,20000000.0,/organization/0xdata,h2o.ai,Analytics,operating,USA
11,/funding-round/ae2a174c06517c2394aed45006322a7e,venture,03-01-2013,1700000.0,/organization/0xdata,h2o.ai,Analytics,operating,USA
12,/funding-round/e1cfcbe1bdf4c70277c5f29a3482f24e,venture,19-07-2014,8900000.0,/organization/0xdata,h2o.ai,Analytics,operating,USA
22,/funding-round/b952cbaf401f310927430c97b68162ea,venture,17-03-2015,5000000.0,/organization/1-mainstream,1 mainstream,Apps|Cable|Distribution|Software,acquired,USA
28,/funding-round/0faccbbcc5818dc5326469f13f5a8ac8,venture,09-10-2014,4000000.0,/organization/10-minutes-with,10 minutes with,Education,operating,GBR
34,/funding-round/502bd0e50c27616995e4bdad24605ef8,venture,16-02-2011,2520000.0,/organization/1000memories,1000memories,Curated Web,acquired,USA


In [152]:
df_venture['category_list'] = df_venture['category_list'].astype(str)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  """Entry point for launching an IPython kernel.


In [153]:
# df_venture =df_venture[pd.notnull(df_venture['country_code'])]

In [154]:
df_venture.country_code.value_counts().nlargest(9)

USA                36139
unknown_country     2117
GBR                 2055
CHN                 1564
CAN                 1256
FRA                  917
IND                  824
ISR                  796
DEU                  558
Name: country_code, dtype: int64

In [156]:
df_venture.columns

Index(['funding_round_permalink', 'funding_round_type', 'funded_at',
       'raised_amount_usd', 'permalink', 'name', 'category_list', 'status',
       'country_code'],
      dtype='object')

### 1. Spark Funds wants to see the top nine countries which have received the highest total funding (across ALL sectors for the chosen investment type)

In [168]:
df_venture.groupby('country_code')['raised_amount_usd'].sum().sort_values(ascending=False).head(15)

country_code
USA               422510842796.00
CHN                39835418773.00
unknown_country    25506108414.00
GBR                20245627416.00
IND                14391858718.00
CAN                 9583332317.00
FRA                 7259536732.00
ISR                 6907514579.00
DEU                 6346959822.00
JPN                 3363676611.00
SWE                 3254952563.00
NLD                 2939403619.00
CHE                 2827560264.00
SGP                 2793917856.00
ESP                 1835831452.00
Name: raised_amount_usd, dtype: float64

#### Ignoring the unknown country as we dont know what are those

In [167]:
top9_countries = ['USA','CHN', 'GBR', 'IND', 'CAN','FRA','ISR', 'DEU', 'JPN']

### 2. For the chosen investment type, make a data frame named top9 with the top nine countries (based on the total investment amount each country has received)

In [169]:
top9 = df_venture[df_venture['country_code'].isin(top9_countries)]

In [170]:
top9.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 44383 entries, 0 to 114934
Data columns (total 9 columns):
funding_round_permalink    44383 non-null object
funding_round_type         44383 non-null object
funded_at                  44383 non-null object
raised_amount_usd          44383 non-null float64
permalink                  44383 non-null object
name                       44383 non-null object
category_list              44383 non-null object
status                     44383 non-null object
country_code               44383 non-null object
dtypes: float64(1), object(8)
memory usage: 3.4+ MB


In [171]:
top9.head(15)

Unnamed: 0,funding_round_permalink,funding_round_type,funded_at,raised_amount_usd,permalink,name,category_list,status,country_code
0,/funding-round/9a01d05418af9f794eebff7ace91f638,venture,05-01-2015,10000000.0,/organization/-fame,#fame,Media,operating,IND
4,/funding-round/5727accaeaa57461bd22a9bdd945382d,venture,19-03-2008,2000000.0,/organization/0-6-com,0-6.com,Curated Web,operating,CHN
8,/funding-round/954b9499724b946ad8c396a57a5f3b72,venture,21-12-2009,719491.0,/organization/0ndine-biomedical-inc,ondine biomedical inc.,Biotechnology,operating,CAN
10,/funding-round/3bb2ee4a2d89251a10aaa735b1180e44,venture,09-11-2015,20000000.0,/organization/0xdata,h2o.ai,Analytics,operating,USA
11,/funding-round/ae2a174c06517c2394aed45006322a7e,venture,03-01-2013,1700000.0,/organization/0xdata,h2o.ai,Analytics,operating,USA
12,/funding-round/e1cfcbe1bdf4c70277c5f29a3482f24e,venture,19-07-2014,8900000.0,/organization/0xdata,h2o.ai,Analytics,operating,USA
22,/funding-round/b952cbaf401f310927430c97b68162ea,venture,17-03-2015,5000000.0,/organization/1-mainstream,1 mainstream,Apps|Cable|Distribution|Software,acquired,USA
28,/funding-round/0faccbbcc5818dc5326469f13f5a8ac8,venture,09-10-2014,4000000.0,/organization/10-minutes-with,10 minutes with,Education,operating,GBR
34,/funding-round/502bd0e50c27616995e4bdad24605ef8,venture,16-02-2011,2520000.0,/organization/1000memories,1000memories,Curated Web,acquired,USA
38,/funding-round/13be128d655076a025221d7fddc90d68,venture,14-10-2009,265940.0,/organization/1000museums-com,1000museums.com,Curated Web,operating,USA


In [172]:
top9.describe()

Unnamed: 0,raised_amount_usd
count,44383.0
mean,11951530.27
std,91291415.39
min,0.0
25%,1700000.0
50%,5000000.0
75%,12000000.0
max,17600000000.0


### Identify the top three English-speaking countries in the data frame top9.

In [173]:
# https://en.wikipedia.org/wiki/List_of_ISO_3166_country_codes
list_eng_countries = ['GBR', 'AUS', 'BHS', 'BWA', 'CAN','CYP', 'FJI', 'IND', 
                      'KEN', 'KIR', 'MLT', 'NGA', 'PAK', 'PNG', 'IRL', 'ZAF', 
                      'NZL', 'SGP', 'PHL', 'GMB']
print(len(list_eng_countries))

20


In [174]:
countries = 'ATG,AUS,BHS,BRB,BLZ,BWA,CMR,CAN,DMA,ERI,ETH,FJI,GMB,GHA,GRD,GUY,IND,IRL,JAM,KEN,KIR,NLD,LSO,LBR,MWI,MLT,MHL,MUS,FSM,NAM,NRU,NZL,NGA,PAK,PLW,PNG,PHL,RWA,KNA,LCA,VCT,WSM,SYC,SLE,SGP,SLB,ZAF,SSD,SDN,SWZ,TZA,TON,TTO,TUV,GBR,USA,UGA,VUT,ZMB,ZWE'
eng_countries = countries.split(',')
print(len(eng_countries))

60


In [175]:
list_eng_countries = list(set(list_eng_countries)|set(eng_countries))
print(type(list_eng_countries), len(list_eng_countries), list_eng_countries)

<class 'list'> 61 ['CMR', 'ZWE', 'BWA', 'TZA', 'DMA', 'IRL', 'GMB', 'FSM', 'FJI', 'CAN', 'MWI', 'MHL', 'LCA', 'BHS', 'NGA', 'TTO', 'KNA', 'JAM', 'GBR', 'SLE', 'WSM', 'SWZ', 'TON', 'PLW', 'MLT', 'ZMB', 'BLZ', 'SDN', 'VUT', 'GRD', 'KIR', 'LBR', 'NLD', 'PAK', 'AUS', 'LSO', 'SGP', 'GUY', 'UGA', 'ETH', 'USA', 'ERI', 'ZAF', 'PNG', 'RWA', 'CYP', 'GHA', 'PHL', 'SSD', 'VCT', 'KEN', 'MUS', 'NAM', 'NZL', 'SLB', 'IND', 'SYC', 'NRU', 'ATG', 'TUV', 'BRB']


In [176]:
list_eng_countries.sort()

In [177]:
print(list_eng_countries, len(list_eng_countries))

['ATG', 'AUS', 'BHS', 'BLZ', 'BRB', 'BWA', 'CAN', 'CMR', 'CYP', 'DMA', 'ERI', 'ETH', 'FJI', 'FSM', 'GBR', 'GHA', 'GMB', 'GRD', 'GUY', 'IND', 'IRL', 'JAM', 'KEN', 'KIR', 'KNA', 'LBR', 'LCA', 'LSO', 'MHL', 'MLT', 'MUS', 'MWI', 'NAM', 'NGA', 'NLD', 'NRU', 'NZL', 'PAK', 'PHL', 'PLW', 'PNG', 'RWA', 'SDN', 'SGP', 'SLB', 'SLE', 'SSD', 'SWZ', 'SYC', 'TON', 'TTO', 'TUV', 'TZA', 'UGA', 'USA', 'VCT', 'VUT', 'WSM', 'ZAF', 'ZMB', 'ZWE'] 61


In [178]:
top9_eng = top9[top9['country_code'].isin(list_eng_countries)]

In [180]:
top9_eng.country_code.value_counts()

USA    36139
GBR     2055
CAN     1256
IND      824
Name: country_code, dtype: int64

### Top3 english speaking countries - Investment wise

#### 1.USA
#### 2. GBR
#### 3. CAN

In [246]:
top9_eng.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 40274 entries, 0 to 114929
Data columns (total 9 columns):
funding_round_permalink    40274 non-null object
funding_round_type         40274 non-null object
funded_at                  40274 non-null object
raised_amount_usd          40274 non-null float64
permalink                  40274 non-null object
name                       40274 non-null object
category_list              40274 non-null object
status                     40274 non-null object
country_code               40274 non-null object
dtypes: float64(1), object(8)
memory usage: 3.1+ MB


In [249]:
top3_country_list = ['USA','GBR','CAN']

In [251]:
top3 = top9_eng[top9_eng['country_code'].isin(top3_country_list)]

In [252]:
top3.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 39450 entries, 8 to 114929
Data columns (total 9 columns):
funding_round_permalink    39450 non-null object
funding_round_type         39450 non-null object
funded_at                  39450 non-null object
raised_amount_usd          39450 non-null float64
permalink                  39450 non-null object
name                       39450 non-null object
category_list              39450 non-null object
status                     39450 non-null object
country_code               39450 non-null object
dtypes: float64(1), object(8)
memory usage: 3.0+ MB


### Checkpoint 4: Sector Analysis 1

In [181]:
df_mapping = pd.read_csv('mapping.csv',encoding = "ISO-8859-1")

In [182]:
df_mapping.columns

Index(['category_list', 'Automotive & Sports', 'Blanks',
       'Cleantech / Semiconductors', 'Entertainment', 'Health',
       'Manufacturing', 'News, Search and Messaging', 'Others',
       'Social, Finance, Analytics, Advertising'],
      dtype='object')

In [183]:
df_mapping.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 688 entries, 0 to 687
Data columns (total 10 columns):
category_list                              687 non-null object
Automotive & Sports                        688 non-null int64
Blanks                                     688 non-null int64
Cleantech / Semiconductors                 688 non-null int64
Entertainment                              688 non-null int64
Health                                     688 non-null int64
Manufacturing                              688 non-null int64
News, Search and Messaging                 688 non-null int64
Others                                     688 non-null int64
Social, Finance, Analytics, Advertising    688 non-null int64
dtypes: int64(9), object(1)
memory usage: 53.8+ KB


In [184]:
df_mapping.head(10)

Unnamed: 0,category_list,Automotive & Sports,Blanks,Cleantech / Semiconductors,Entertainment,Health,Manufacturing,"News, Search and Messaging",Others,"Social, Finance, Analytics, Advertising"
0,,0,1,0,0,0,0,0,0,0
1,3D,0,0,0,0,0,1,0,0,0
2,3D Printing,0,0,0,0,0,1,0,0,0
3,3D Technology,0,0,0,0,0,1,0,0,0
4,Accounting,0,0,0,0,0,0,0,0,1
5,Active Lifestyle,0,0,0,0,1,0,0,0,0
6,Ad Targeting,0,0,0,0,0,0,0,0,1
7,Advanced Materials,0,0,0,0,0,1,0,0,0
8,Adventure Travel,1,0,0,0,0,0,0,0,0
9,Advertising,0,0,0,0,0,0,0,0,1


In [186]:
def reCategorize(val):
    splits = val.split('|')
    if(len(splits)>1):
        return splits[0]
    else:
        return val

In [187]:
print(reCategorize('Apps|Cable|Distribution|Software'))

Apps


In [188]:
print(reCategorize('Analytics'))

Analytics


In [189]:
print(reCategorize('Curated Web'))

Curated Web


In [192]:
# df_venture['primary_sector'] = df_venture['category_list'].apply(reCategorize)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  """Entry point for launching an IPython kernel.


In [193]:
master_frame['primary_sector'] = master_frame['category_list'].apply(reCategorize)

In [194]:
master_frame.head()

Unnamed: 0,funding_round_permalink,funding_round_type,funded_at,raised_amount_usd,permalink,name,category_list,status,country_code,primary_sector
0,/funding-round/9a01d05418af9f794eebff7ace91f638,venture,05-01-2015,10000000.0,/organization/-fame,#fame,Media,operating,IND,Media
2,/funding-round/b44fbb94153f6cdef13083530bb48030,seed,01-03-2014,700000.0,/organization/-qounter,:qounter,Application Platforms|Real Time|Social Network...,operating,USA,Application Platforms
3,/funding-round/650b8f704416801069bb178a1418776b,venture,30-01-2014,3406878.0,/organization/-the-one-of-them-inc-,"(the) one of them,inc.",Apps|Games|Mobile,operating,unknown_country,Apps
4,/funding-round/5727accaeaa57461bd22a9bdd945382d,venture,19-03-2008,2000000.0,/organization/0-6-com,0-6.com,Curated Web,operating,CHN,Curated Web
6,/funding-round/7d53696f2b4f607a2f2a8cbb83d01839,undisclosed,01-07-2014,41250.0,/organization/01games-technology,01games technology,Games,operating,HKG,Games


In [195]:
master_frame['primary_sector'] = master_frame['primary_sector'].str.lower()

In [196]:
master_frame.head()

Unnamed: 0,funding_round_permalink,funding_round_type,funded_at,raised_amount_usd,permalink,name,category_list,status,country_code,primary_sector
0,/funding-round/9a01d05418af9f794eebff7ace91f638,venture,05-01-2015,10000000.0,/organization/-fame,#fame,Media,operating,IND,media
2,/funding-round/b44fbb94153f6cdef13083530bb48030,seed,01-03-2014,700000.0,/organization/-qounter,:qounter,Application Platforms|Real Time|Social Network...,operating,USA,application platforms
3,/funding-round/650b8f704416801069bb178a1418776b,venture,30-01-2014,3406878.0,/organization/-the-one-of-them-inc-,"(the) one of them,inc.",Apps|Games|Mobile,operating,unknown_country,apps
4,/funding-round/5727accaeaa57461bd22a9bdd945382d,venture,19-03-2008,2000000.0,/organization/0-6-com,0-6.com,Curated Web,operating,CHN,curated web
6,/funding-round/7d53696f2b4f607a2f2a8cbb83d01839,undisclosed,01-07-2014,41250.0,/organization/01games-technology,01games technology,Games,operating,HKG,games


In [None]:
# primary sector column has been created with first category but not mapped according the mapping.csv; lets do that.

In [197]:
df_mapping.shape

(688, 10)

In [198]:
df_mapping.head(10)

Unnamed: 0,category_list,Automotive & Sports,Blanks,Cleantech / Semiconductors,Entertainment,Health,Manufacturing,"News, Search and Messaging",Others,"Social, Finance, Analytics, Advertising"
0,,0,1,0,0,0,0,0,0,0
1,3D,0,0,0,0,0,1,0,0,0
2,3D Printing,0,0,0,0,0,1,0,0,0
3,3D Technology,0,0,0,0,0,1,0,0,0
4,Accounting,0,0,0,0,0,0,0,0,1
5,Active Lifestyle,0,0,0,0,1,0,0,0,0
6,Ad Targeting,0,0,0,0,0,0,0,0,1
7,Advanced Materials,0,0,0,0,0,1,0,0,0
8,Adventure Travel,1,0,0,0,0,0,0,0,0
9,Advertising,0,0,0,0,0,0,0,0,1


In [199]:
df_mapping_melt = pd.melt(df_mapping, id_vars=['category_list'])

In [200]:
df_mapping_melt.shape

(6192, 3)

In [201]:
type(df_mapping_melt)

pandas.core.frame.DataFrame

In [202]:
df_mapping_melt = df_mapping_melt[df_mapping_melt['value']!=0]

In [203]:
df_mapping_melt.shape

(688, 3)

In [205]:
df_mapping_melt.head()

Unnamed: 0,category_list,variable,value
8,Adventure Travel,Automotive & Sports,1
14,Aerospace,Automotive & Sports,1
45,Auto,Automotive & Sports,1
46,Automated Kiosk,Automotive & Sports,1
47,Automotive,Automotive & Sports,1


In [204]:
df_mapping_melt.columns

Index(['category_list', 'variable', 'value'], dtype='object')

In [207]:
df_mapping_melt.drop('value', axis=1, inplace=True)

In [208]:
df_mapping_melt.columns

Index(['category_list', 'variable'], dtype='object')

In [210]:
df_mapping_melt['category_list'] = df_mapping_melt['category_list'].str.lower()

In [215]:
df_mapping_melt.rename(columns={'category_list':'primary_sector'}, inplace=True)

In [220]:
df_mapping_melt.to_csv('mapping_melt.csv')

There are some spelling mistakes in the mapping file on observation; there are 0 present in case where na should be present. so applying regex and correct them

In [221]:
df_mapping_melt['primary_sector'] = df_mapping_melt.primary_sector.apply(lambda x: re.sub('[0]', 'na', str(x)))

In [222]:
df_mapping_melt.to_csv('mapping_melt_corrected.csv')

In [228]:
# master_frame.drop(columns=['_merge','variable'], inplace=True)

If we dont rename, it creating two columns with category_list_x and category_list_y; so rename and merge

In [226]:
master_frame = pd.merge(master_frame, df_mapping_melt, how='left', on='primary_sector',indicator=True)


In [229]:
master_frame.head()

Unnamed: 0,funding_round_permalink,funding_round_type,funded_at,raised_amount_usd,permalink,name,category_list,status,country_code,primary_sector,variable,_merge
0,/funding-round/9a01d05418af9f794eebff7ace91f638,venture,05-01-2015,10000000.0,/organization/-fame,#fame,Media,operating,IND,media,Entertainment,both
1,/funding-round/b44fbb94153f6cdef13083530bb48030,seed,01-03-2014,700000.0,/organization/-qounter,:qounter,Application Platforms|Real Time|Social Network...,operating,USA,application platforms,"News, Search and Messaging",both
2,/funding-round/650b8f704416801069bb178a1418776b,venture,30-01-2014,3406878.0,/organization/-the-one-of-them-inc-,"(the) one of them,inc.",Apps|Games|Mobile,operating,unknown_country,apps,"News, Search and Messaging",both
3,/funding-round/5727accaeaa57461bd22a9bdd945382d,venture,19-03-2008,2000000.0,/organization/0-6-com,0-6.com,Curated Web,operating,CHN,curated web,"News, Search and Messaging",both
4,/funding-round/7d53696f2b4f607a2f2a8cbb83d01839,undisclosed,01-07-2014,41250.0,/organization/01games-technology,01games technology,Games,operating,HKG,games,Entertainment,both


In [230]:
round(100*master_frame.isnull().sum()/len(master_frame.index),2)

funding_round_permalink   0.00
funding_round_type        0.00
funded_at                 0.00
raised_amount_usd         0.00
permalink                 0.00
name                      0.00
category_list             0.00
status                    0.00
country_code              0.00
primary_sector            0.00
variable                  1.19
_merge                    0.00
dtype: float64

In [231]:
master_frame.variable.unique()

array(['Entertainment', 'News, Search and Messaging',
       'Cleantech / Semiconductors',
       'Social, Finance, Analytics, Advertising', 'Others', 'Health', nan,
       'Manufacturing', 'Automotive & Sports'], dtype=object)

In [234]:
master_frame.rename(columns={'variable':'main_sector'}, inplace=True)

In [235]:
master_frame.to_csv('master_1.csv')

## Checkpoint 5: Sector Analysis 2

In [238]:
master_frame.columns

Index(['funding_round_permalink', 'funding_round_type', 'funded_at',
       'raised_amount_usd', 'permalink', 'name', 'category_list', 'status',
       'country_code', 'primary_sector', 'main_sector', '_merge'],
      dtype='object')

In [240]:
f_venture = master_frame['funding_round_type']=='venture'
df_venture = master_frame[f_venture]

In [253]:
df_top3 = df_venture[df_venture['country_code'].isin(top3_country_list)]

In [254]:
df_top3.head()

Unnamed: 0,funding_round_permalink,funding_round_type,funded_at,raised_amount_usd,permalink,name,category_list,status,country_code,primary_sector,main_sector,_merge
6,/funding-round/954b9499724b946ad8c396a57a5f3b72,venture,21-12-2009,719491.0,/organization/0ndine-biomedical-inc,ondine biomedical inc.,Biotechnology,operating,CAN,biotechnology,Cleantech / Semiconductors,both
8,/funding-round/3bb2ee4a2d89251a10aaa735b1180e44,venture,09-11-2015,20000000.0,/organization/0xdata,h2o.ai,Analytics,operating,USA,analytics,"Social, Finance, Analytics, Advertising",both
9,/funding-round/ae2a174c06517c2394aed45006322a7e,venture,03-01-2013,1700000.0,/organization/0xdata,h2o.ai,Analytics,operating,USA,analytics,"Social, Finance, Analytics, Advertising",both
10,/funding-round/e1cfcbe1bdf4c70277c5f29a3482f24e,venture,19-07-2014,8900000.0,/organization/0xdata,h2o.ai,Analytics,operating,USA,analytics,"Social, Finance, Analytics, Advertising",both
16,/funding-round/b952cbaf401f310927430c97b68162ea,venture,17-03-2015,5000000.0,/organization/1-mainstream,1 mainstream,Apps|Cable|Distribution|Software,acquired,USA,apps,"News, Search and Messaging",both


In [255]:
df_top3.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 39450 entries, 6 to 94946
Data columns (total 12 columns):
funding_round_permalink    39450 non-null object
funding_round_type         39450 non-null object
funded_at                  39450 non-null object
raised_amount_usd          39450 non-null float64
permalink                  39450 non-null object
name                       39450 non-null object
category_list              39450 non-null object
status                     39450 non-null object
country_code               39450 non-null object
primary_sector             39450 non-null object
main_sector                39215 non-null object
_merge                     39450 non-null category
dtypes: category(1), float64(1), object(10)
memory usage: 3.6+ MB


In [256]:
df_top3.shape

(39450, 12)

In [257]:
df_top3.country_code.value_counts()

USA    36139
GBR     2055
CAN     1256
Name: country_code, dtype: int64

In [258]:
df_top3.to_csv('top3.csv')

In [259]:
val2 = 20000000.00
val2>=lower and val2<upper

False

If we observe there are some investment which are not falling in between 5 and 15 millioin, lets filter out

In [263]:
# df_top3.drop(df_top3[(df_top3.raised_amount_usd < 5000000)].index, inplace=True)

In [264]:
 df_top3 = df_top3.drop(df_top3[(df_top3.raised_amount_usd < 5000000)].index)

In [265]:
 df_top3 = df_top3.drop(df_top3[(df_top3.raised_amount_usd > 15000000)].index)

In [266]:
df_top3.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 13204 entries, 10 to 94946
Data columns (total 12 columns):
funding_round_permalink    13204 non-null object
funding_round_type         13204 non-null object
funded_at                  13204 non-null object
raised_amount_usd          13204 non-null float64
permalink                  13204 non-null object
name                       13204 non-null object
category_list              13204 non-null object
status                     13204 non-null object
country_code               13204 non-null object
primary_sector             13204 non-null object
main_sector                13106 non-null object
_merge                     13204 non-null category
dtypes: category(1), float64(1), object(10)
memory usage: 1.2+ MB


In [268]:
df_top3.dropna(subset=['main_sector'], inplace=True)

In [269]:
df_top3.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 13106 entries, 10 to 94946
Data columns (total 12 columns):
funding_round_permalink    13106 non-null object
funding_round_type         13106 non-null object
funded_at                  13106 non-null object
raised_amount_usd          13106 non-null float64
permalink                  13106 non-null object
name                       13106 non-null object
category_list              13106 non-null object
status                     13106 non-null object
country_code               13106 non-null object
primary_sector             13106 non-null object
main_sector                13106 non-null object
_merge                     13106 non-null category
dtypes: category(1), float64(1), object(10)
memory usage: 1.2+ MB


In [271]:
df_top3.drop('_merge', axis=1, inplace=True)

In [272]:
df_top3.head()

Unnamed: 0,funding_round_permalink,funding_round_type,funded_at,raised_amount_usd,permalink,name,category_list,status,country_code,primary_sector,main_sector
10,/funding-round/e1cfcbe1bdf4c70277c5f29a3482f24e,venture,19-07-2014,8900000.0,/organization/0xdata,h2o.ai,Analytics,operating,USA,analytics,"Social, Finance, Analytics, Advertising"
16,/funding-round/b952cbaf401f310927430c97b68162ea,venture,17-03-2015,5000000.0,/organization/1-mainstream,1 mainstream,Apps|Cable|Distribution|Software,acquired,USA,apps,"News, Search and Messaging"
78,/funding-round/fb6216a30cb566ede89e0bee0623a634,venture,16-12-2014,11999347.0,/organization/128-technology,128 technology,Service Providers|Technology,operating,USA,service providers,Others
84,/funding-round/424129ce1235cfab2655ee81305f7c2b,venture,15-10-2013,15000000.0,/organization/1366-technologies,1366 technologies,Manufacturing,operating,USA,manufacturing,Manufacturing
85,/funding-round/6d3f3797371956ece035b8478c1441b2,venture,09-04-2015,5000000.0,/organization/1366-technologies,1366 technologies,Manufacturing,operating,USA,manufacturing,Manufacturing


#### D1, D2, D3 Dataframe
D1 - USA

D2 - GBR

D3 - CAN

In [273]:
d1 = df_top3[df_top3['country_code'] == 'USA']
d2 = df_top3[df_top3['country_code'] == 'GBR']
d3 = df_top3[df_top3['country_code'] == 'CAN']

In [274]:
print(d1.shape)
print(d2.shape)
print(d3.shape)

(12063, 11)
(621, 11)
(422, 11)


In [275]:
# The total number (or count) of investments for each main sector in a separate column

In [276]:
d1.columns

Index(['funding_round_permalink', 'funding_round_type', 'funded_at',
       'raised_amount_usd', 'permalink', 'name', 'category_list', 'status',
       'country_code', 'primary_sector', 'main_sector'],
      dtype='object')

### D1

In [283]:
d1.raised_amount_usd.sum()

107757097294.0

In [279]:
d1.pivot_table(values='raised_amount_usd', index='main_sector',aggfunc=['sum',np.size])

Unnamed: 0_level_0,sum,size
Unnamed: 0_level_1,raised_amount_usd,raised_amount_usd
main_sector,Unnamed: 1_level_2,Unnamed: 2_level_2
Automotive & Sports,1454104361.0,167.0
Cleantech / Semiconductors,21633430822.0,2350.0
Entertainment,5099197982.0,591.0
Health,8211859357.0,909.0
Manufacturing,7258553378.0,799.0
"News, Search and Messaging",13971567428.0,1583.0
Others,26321007002.0,2950.0
"Social, Finance, Analytics, Advertising",23807376964.0,2714.0


In [286]:
f_d1_others = d1['main_sector']=='Others'

In [287]:
d1_others = d1[f_d1_others]

In [288]:
d1_others.head()

Unnamed: 0,funding_round_permalink,funding_round_type,funded_at,raised_amount_usd,permalink,name,category_list,status,country_code,primary_sector,main_sector
78,/funding-round/fb6216a30cb566ede89e0bee0623a634,venture,16-12-2014,11999347.0,/organization/128-technology,128 technology,Service Providers|Technology,operating,USA,service providers,Others
103,/funding-round/b84bb882ca873f5fb96535671981196d,venture,16-04-2002,14000000.0,/organization/170-systems,170 systems,Software,acquired,USA,software,Others
109,/funding-round/69690484f51e15bc27ff52bfe472cd96,venture,01-01-2011,5000000.0,/organization/17zuoye,17zuoye,Education|Language Learning,operating,USA,education,Others
110,/funding-round/8d87f771e938e0f31641bd600abbafca,venture,01-09-2013,10000000.0,/organization/17zuoye,17zuoye,Education|Language Learning,operating,USA,education,Others
147,/funding-round/292b074d073fdd9c7e9d8f372c3aa5f6,venture,24-01-2014,15000000.0,/organization/1stdibs,1stdibs,E-Commerce,operating,USA,e-commerce,Others


In [292]:
d1_others.groupby('permalink')['raised_amount_usd'].sum().sort_values(ascending=False).head()

permalink
/organization/virtustream           64300000.00
/organization/capella               54968051.00
/organization/airtight-networks     54201907.00
/organization/decarta               52100000.00
/organization/black-duck-software   51000000.00
Name: raised_amount_usd, dtype: float64

In [294]:
f_d1_sfaa = d1['main_sector']=='Social, Finance, Analytics, Advertising'
d1_sfaa = d1[f_d1_sfaa]
d1_sfaa.head()

Unnamed: 0,funding_round_permalink,funding_round_type,funded_at,raised_amount_usd,permalink,name,category_list,status,country_code,primary_sector,main_sector
10,/funding-round/e1cfcbe1bdf4c70277c5f29a3482f24e,venture,19-07-2014,8900000.0,/organization/0xdata,h2o.ai,Analytics,operating,USA,analytics,"Social, Finance, Analytics, Advertising"
180,/funding-round/bdf9f5bf67ee51155eae223acac57ec5,venture,17-11-2013,5050000.0,/organization/21e6,21 inc,Big Data|Bitcoin|Hardware + Software|Technology,operating,USA,big data,"Social, Finance, Analytics, Advertising"
288,/funding-round/a45d977b48cb54216eb705d59b24ca19,venture,19-07-2012,13100000.0,/organization/33across,33across,Advertising|Advertising Platforms|Content Disc...,operating,USA,advertising,"Social, Finance, Analytics, Advertising"
290,/funding-round/bfc2233768b7c79ed58ad7561423e555,venture,05-01-2011,9000000.0,/organization/33across,33across,Advertising|Advertising Platforms|Content Disc...,operating,USA,advertising,"Social, Finance, Analytics, Advertising"
293,/funding-round/1c51042e815e96ed2653ae9ced99dfc4,venture,29-10-2003,12000000.0,/organization/360commerce,360commerce,Information Technology|Retail|Software,acquired,USA,information technology,"Social, Finance, Analytics, Advertising"


In [295]:
d1_sfaa.groupby('permalink')['raised_amount_usd'].sum().sort_values(ascending=False).head()

permalink
/organization/shotspotter   67933006.00
/organization/demandbase    63000000.00
/organization/intacct       61800000.00
/organization/netbase       60600000.00
/organization/lotame        59700000.00
Name: raised_amount_usd, dtype: float64

### D2

In [284]:
d2.raised_amount_usd.sum()

5379078691.0

In [280]:
d2.pivot_table(values='raised_amount_usd', index='main_sector',aggfunc=['sum',np.size])

Unnamed: 0_level_0,sum,size
Unnamed: 0_level_1,raised_amount_usd,raised_amount_usd
main_sector,Unnamed: 1_level_2,Unnamed: 2_level_2
Automotive & Sports,167051565.0,16.0
Cleantech / Semiconductors,1163990056.0,130.0
Entertainment,482784687.0,56.0
Health,214537510.0,24.0
Manufacturing,361940335.0,42.0
"News, Search and Messaging",615746235.0,73.0
Others,1283624289.0,147.0
"Social, Finance, Analytics, Advertising",1089404014.0,133.0


In [296]:
f_d2_others = d2['main_sector']=='Others'
d2_others = d2[f_d2_others]
d2_others.head()

Unnamed: 0,funding_round_permalink,funding_round_type,funded_at,raised_amount_usd,permalink,name,category_list,status,country_code,primary_sector,main_sector
1462,/funding-round/064999587157b0ceae7843204a105d6d,venture,28-09-2015,13359377.0,/organization/acs-clothing,acs clothing,E-Commerce,operating,GBR,e-commerce,Others
1463,/funding-round/875b4295af2381558cec9d97f86fe3e4,venture,02-03-2014,12879637.0,/organization/acs-clothing,acs clothing,E-Commerce,operating,GBR,e-commerce,Others
2531,/funding-round/57ca5d05989850fb1109b277cb6ae54e,venture,30-06-2010,10000000.0,/organization/aepona,aepona,Web Hosting,acquired,GBR,web hosting,Others
2532,/funding-round/ad79a10f4c3f231375454da6393ec564,venture,05-02-2007,10000000.0,/organization/aepona,aepona,Web Hosting,acquired,GBR,web hosting,Others
3577,/funding-round/0db504d02c87c2b963283fd3c6d17594,venture,22-01-2008,13000000.0,/organization/alfresco,alfresco,Document Management|Enterprises|Enterprise Sof...,operating,GBR,document management,Others


In [297]:
d2_others.groupby('permalink')['raised_amount_usd'].sum().sort_values(ascending=False).head()

permalink
/organization/electric-cloud            37000000.00
/organization/sensage                   36250000.00
/organization/enigmatic                 32500000.00
/organization/silverrail-technologies   29000000.00
/organization/opencloud                 27972766.00
Name: raised_amount_usd, dtype: float64

In [299]:
f_d2_sfaa = d2['main_sector']=='Social, Finance, Analytics, Advertising'
d2_sfaa = d2[f_d2_sfaa]
d2_sfaa.head()
d2_sfaa.groupby('permalink')['raised_amount_usd'].sum().sort_values(ascending=False).head()

permalink
/organization/celltick-technologies   37500000.00
/organization/mythings                34000000.00
/organization/zopa                    32900000.00
/organization/imagini                 28550000.00
/organization/marketinvoice           25553007.00
Name: raised_amount_usd, dtype: float64

### D3

In [285]:
d3.raised_amount_usd.sum()

3599289960.0

In [281]:
d3.pivot_table(values='raised_amount_usd', index='main_sector',aggfunc=['sum',np.size])

Unnamed: 0_level_0,sum,size
Unnamed: 0_level_1,raised_amount_usd,raised_amount_usd
main_sector,Unnamed: 1_level_2,Unnamed: 2_level_2
Automotive & Sports,15000000.0,2.0
Cleantech / Semiconductors,1015887607.0,112.0
Entertainment,160022068.0,23.0
Health,168345064.0,21.0
Manufacturing,237633071.0,31.0
"News, Search and Messaging",420121113.0,46.0
Others,926137962.0,109.0
"Social, Finance, Analytics, Advertising",656143075.0,78.0


In [301]:
f_d3_cs = d3['main_sector']=='Cleantech / Semiconductors'
d3_cs = d3[f_d3_cs]
d3_cs.head()
d3_cs.groupby('permalink')['raised_amount_usd'].sum().sort_values(ascending=False).head()

permalink
/organization/fresco-microchip      48000000.00
/organization/ostara                43152682.00
/organization/diablo-technologies   35500000.00
/organization/zymeworks             34100000.00
/organization/engene                31150000.00
Name: raised_amount_usd, dtype: float64

In [302]:
f_d3_others = d3['main_sector']=='Others'
d3_others = d3[f_d3_others]
d3_others.head()
d3_others.groupby('permalink')['raised_amount_usd'].sum().sort_values(ascending=False).head()

permalink
/organization/quickplay-media        38700000.00
/organization/newstep                31477853.00
/organization/strangeloop-networks   21500000.00
/organization/morega                 20000000.00
/organization/lxdata                 19310000.00
Name: raised_amount_usd, dtype: float64

## Checkpoint 6: Plots