# Checkpoint 4: Sector Analysis 1

- Find top nine countries which have received the highest total funding 

In [50]:
# Loading libraries and reading data

import numpy as np
import pandas as pd

# Reading delimited file using pd.read_csv(filepath, sep, header)
# Using encoding = "ISO-8859-1"
companies = pd.read_csv("companies.txt", sep="\t", encoding = "ISO-8859-1")
rounds2 = pd.read_csv("rounds2.csv", encoding = "ISO-8859-1")
mapping_df = pd.read_csv("mapping.csv", encoding = "ISO-8859-1")

In [51]:
# rows having at least one missing value
companies.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 66368 entries, 0 to 66367
Data columns (total 10 columns):
permalink        66368 non-null object
name             66367 non-null object
homepage_url     61310 non-null object
category_list    63220 non-null object
status           66368 non-null object
country_code     59410 non-null object
state_code       57821 non-null object
region           58338 non-null object
city             58340 non-null object
founded_at       51147 non-null object
dtypes: object(10)
memory usage: 5.1+ MB


In [52]:
# remove Nan rows for category list
companies['category_list'].isnull().any(axis=0)
companies = companies[~pd.isnull(companies['category_list'])]
companies.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 63220 entries, 0 to 66367
Data columns (total 10 columns):
permalink        63220 non-null object
name             63219 non-null object
homepage_url     59074 non-null object
category_list    63220 non-null object
status           63220 non-null object
country_code     57804 non-null object
state_code       56268 non-null object
region           56765 non-null object
city             56767 non-null object
founded_at       49711 non-null object
dtypes: object(10)
memory usage: 5.3+ MB


In [53]:
# cols having at least one missing value
companies['category_list'].isnull().any(axis=0)

False

In [55]:
# remove Nan rows for mapping_df.category_list
mapping_df['category_list'].isnull().any(axis=0)
mapping_df = mapping_df[~pd.isnull(mapping_df['category_list'])]
mapping_df.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 687 entries, 1 to 687
Data columns (total 10 columns):
category_list                              687 non-null object
Automotive & Sports                        687 non-null int64
Blanks                                     687 non-null int64
Cleantech / Semiconductors                 687 non-null int64
Entertainment                              687 non-null int64
Health                                     687 non-null int64
Manufacturing                              687 non-null int64
News, Search and Messaging                 687 non-null int64
Others                                     687 non-null int64
Social, Finance, Analytics, Advertising    687 non-null int64
dtypes: int64(9), object(1)
memory usage: 59.0+ KB


In [78]:
#mapping_df.loc[(mapping_df['Automotive & Sports'] == 1) | (mapping_df['Blanks']==1), :]
mapping_df.loc[(mapping_df['Automotive & Sports'] == 1) | (mapping_df['Blanks']==1), :]


Index(['category_list', 'Automotive & Sports', 'Blanks',
       'Cleantech / Semiconductors', 'Entertainment', 'Health',
       'Manufacturing', 'News, Search and Messaging', 'Others',
       'Social, Finance, Analytics, Advertising'],
      dtype='object')

In [64]:
# main sector for each category_list
mapping_df.loc[:,mapping_df.loc[:] == 1].columns

SyntaxError: invalid syntax (<ipython-input-64-0e3f46628733>, line 2)

In [56]:
# Merging the dataframes
# company_permalink,permalink is the common column/key, which has to be provided to the 'on' argument
# how = 'inner' makes sure that all the comapnies ids are reflectd in the master_frame

companies['permalink'] = companies['permalink'].astype(str).str.lower()
rounds2['company_permalink'] = rounds2['company_permalink'].astype(str).str.lower()

master_frame = pd.merge(companies, rounds2, how='inner', left_on='permalink', right_on='company_permalink')

In [57]:
# raised_amount_usd has considerable number of missing value.
# So, it will not help in finding average for funding_round_type
# Let's remove such rows from master_frame

# removing NaN Price rows
master_frame = master_frame[~np.isnan(master_frame['raised_amount_usd'])]

In [58]:
# Creating new column primary_sector by spliting '|' the category_list
master_frame['primary_sector'] = master_frame['category_list'].apply(lambda x : x.split('|')[0])

In [59]:
master_frame.head()

Unnamed: 0,permalink,name,homepage_url,category_list,status,country_code,state_code,region,city,founded_at,company_permalink,funding_round_permalink,funding_round_type,funding_round_code,funded_at,raised_amount_usd,primary_sector
0,/organization/-fame,#fame,http://livfame.com,Media,operating,IND,16,Mumbai,Mumbai,,/organization/-fame,/funding-round/9a01d05418af9f794eebff7ace91f638,venture,B,05-01-2015,10000000.0,Media
2,/organization/-qounter,:Qounter,http://www.qounter.com,Application Platforms|Real Time|Social Network...,operating,USA,DE,DE - Other,Delaware City,04-09-2014,/organization/-qounter,/funding-round/b44fbb94153f6cdef13083530bb48030,seed,,01-03-2014,700000.0,Application Platforms
3,/organization/-the-one-of-them-inc-,"(THE) ONE of THEM,Inc.",http://oneofthem.jp,Apps|Games|Mobile,operating,,,,,,/organization/-the-one-of-them-inc-,/funding-round/650b8f704416801069bb178a1418776b,venture,B,30-01-2014,3406878.0,Apps
4,/organization/0-6-com,0-6.com,http://www.0-6.com,Curated Web,operating,CHN,22,Beijing,Beijing,01-01-2007,/organization/0-6-com,/funding-round/5727accaeaa57461bd22a9bdd945382d,venture,A,19-03-2008,2000000.0,Curated Web
6,/organization/01games-technology,01Games Technology,http://www.01games.hk/,Games,operating,HKG,,Hong Kong,Hong Kong,,/organization/01games-technology,/funding-round/7d53696f2b4f607a2f2a8cbb83d01839,undisclosed,,01-07-2014,41250.0,Games


In [60]:
# merge companies and mapping_df for primary to main sector mapping on the category_list
primary_main_sector_df = pd.merge(master_frame, mapping_df, how='inner', on='category_list')

In [61]:
# merged data frame
primary_main_sector_df.head()

Unnamed: 0,permalink,name,homepage_url,category_list,status,country_code,state_code,region,city,founded_at,...,primary_sector,Automotive & Sports,Blanks,Cleantech / Semiconductors,Entertainment,Health,Manufacturing,"News, Search and Messaging",Others,"Social, Finance, Analytics, Advertising"
0,/organization/-fame,#fame,http://livfame.com,Media,operating,IND,16,Mumbai,Mumbai,,...,Media,0,0,0,1,0,0,0,0,0
1,/organization/all-def-digital,All Def Digital,http://alldefdigital.com,Media,operating,USA,CA,Los Angeles,Los Angeles,,...,Media,0,0,0,1,0,0,0,0,0
2,/organization/anthill-magazine,Anthill Magazine,http://anthillonline.com/,Media,operating,AUS,7,Melbourne,Melbourne,23-05-2009,...,Media,0,0,0,1,0,0,0,0,0
3,/organization/basel-switzerland,The Speedel Group,http://www.speedelgroup.com/,Media,closed,CHE,4,Basel,Basel,,...,Media,0,0,0,1,0,0,0,0,0
4,/organization/basel-switzerland,The Speedel Group,http://www.speedelgroup.com/,Media,closed,CHE,4,Basel,Basel,,...,Media,0,0,0,1,0,0,0,0,0
