# importing libraries

In [56]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from numerize import numerize
import plotly.express as px
import plotly.graph_objects as go

# loading data

In [57]:
df = pd.read_csv('data/startup_funding.csv')

# Renaming columns

In [58]:
df.rename(mapper={
        'Date dd/mm/yyyy':'Startup_date',
        'City  Location':'City',
        'Amount in USD':'Amount',
        'Startup Name':'Startupname',
        'Industry Vertical':'Industrytype',
        'Investors Name':'Investorsname'
        },axis=1, inplace=True)


In [59]:
df.isnull().sum()

Sr No                 0
Startup_date          0
Startupname           0
Industrytype        171
SubVertical         936
City                180
Investorsname        24
InvestmentnType       4
Amount              960
Remarks            2625
dtype: int64

In [60]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 3044 entries, 0 to 3043
Data columns (total 10 columns):
 #   Column           Non-Null Count  Dtype 
---  ------           --------------  ----- 
 0   Sr No            3044 non-null   int64 
 1   Startup_date     3044 non-null   object
 2   Startupname      3044 non-null   object
 3   Industrytype     2873 non-null   object
 4   SubVertical      2108 non-null   object
 5   City             2864 non-null   object
 6   Investorsname    3020 non-null   object
 7   InvestmentnType  3040 non-null   object
 8   Amount           2084 non-null   object
 9   Remarks          419 non-null    object
dtypes: int64(1), object(9)
memory usage: 237.9+ KB


In [61]:
df.drop(['Startup_date','Remarks','Sr No'],axis=1,inplace=True)

In [62]:
#df['Startup_date'] = pd.to_datetime(df['Startup_date'])
#print(df.to_string())


Data cleaning

In [63]:
def clean_amount(amt):
    amt = str(amt)
    if ',' in amt:
        amt = amt.replace(',','')
    if amt.isnumeric():
        return float(amt)
    if amt.isalpha() or amt.startswith('\\'):
        return np.nan
    if '.' in amt:
        return float(amt)
    if '+' in amt:
        return float(amt.replace('+',''))

df['Amount'] = df['Amount'].apply(clean_amount)

In [64]:
x=df['Amount'].mean()
df['Amount'].fillna(x,inplace=True)

Categorical coloumn


In [88]:
df.isnull().sum()

Startupname          0
Industrytype       171
SubVertical          0
City               180
Investorsname       24
InvestmentnType      0
Amount               0
dtype: int64

In [87]:
df['SubVertical']

60                               Bike Taxi
651                     Online Marketplace
830     Mobile Wallet & ECommerce platform
966                  ECommerce Marketplace
31                           Mobile Wallet
                       ...                
3020               Online Lending Platform
3017               Online Lending Platform
3018               Online Lending Platform
3019               Online Lending Platform
3021               Online Lending Platform
Name: SubVertical, Length: 3040, dtype: object

In [67]:
df.dropna(subset=['InvestmentnType'],inplace=True)

In [68]:
df.loc[3006:3043,'SubVertical']='Online Lending Platform'

In [69]:
df['SubVertical'].replace(np.nan, df['SubVertical'].value_counts().idxmax(), inplace=True)


In [70]:
df_cat = df.select_dtypes(np.object_)

In [71]:
df_cat

Unnamed: 0,Startupname,Industrytype,SubVertical,City,Investorsname,InvestmentnType
0,BYJU’S,E-Tech,E-learning,Bengaluru,Tiger Global Management,Private Equity Round
1,Shuttl,Transportation,App based shuttle service,Gurgaon,Susquehanna Growth Equity,Series C
2,Mamaearth,E-commerce,Retailer of baby and toddler products,Bengaluru,Sequoia Capital India,Series B
3,https://www.wealthbucket.in/,FinTech,Online Investment,New Delhi,Vinod Khatumal,Pre-series A
4,Fashor,Fashion and Apparel,Embroiled Clothes For Women,Mumbai,Sprout Venture Partners,Seed Round
...,...,...,...,...,...,...
3039,Printvenue,,Online Lending Platform,,Asia Pacific Internet Group,Private Equity
3040,Graphene,,Online Lending Platform,,KARSEMVEN Fund,Private Equity
3041,Mad Street Den,,Online Lending Platform,,"Exfinity Fund, GrowX Ventures.",Private Equity
3042,Simplotel,,Online Lending Platform,,MakeMyTrip,Private Equity


numerical coloumn

In [72]:
df_num = df.select_dtypes(np.number)

In [73]:
df_num

Unnamed: 0,Amount
0,2.000000e+08
1,8.048394e+06
2,1.835886e+07
3,3.000000e+06
4,1.800000e+06
...,...
3039,4.500000e+06
3040,8.250000e+05
3041,1.500000e+06
3042,1.842792e+07


Univairiate Analysis

In [74]:
df['Startupname'].nunique()

2456

In [75]:
df['Amount'].describe()

count    3.040000e+03
mean     1.837781e+07
std      9.997113e+07
min      1.600000e+04
25%      1.000000e+06
50%      7.450000e+06
75%      1.842792e+07
max      3.900000e+09
Name: Amount, dtype: float64

In [76]:
df.describe()

Unnamed: 0,Amount
count,3040.0
mean,18377810.0
std,99971130.0
min,16000.0
25%,1000000.0
50%,7450000.0
75%,18427920.0
max,3900000000.0


In [89]:
df['City'].replace(np.NaN,df['City'].value_counts().idxmax())

60      Bengaluru
651     Bangalore
830     Bangalore
966     Bangalore
31          Noida
          ...    
3020    Bangalore
3017    Bangalore
3018    Bangalore
3019    Bangalore
3021    Bangalore
Name: City, Length: 3040, dtype: object

In [77]:
df['City'].value_counts()

Bangalore             700
Mumbai                566
New Delhi             421
Gurgaon               286
Bengaluru             141
                     ... 
Ahemdabad               1
Kolkatta                1
Delhi & Cambridge       1
Bhubneswar              1
Dallas / Hyderabad      1
Name: City, Length: 110, dtype: int64

Bivariate Analysis

In [78]:
city_df = df['City'].value_counts().reset_index()
city_df.columns = ['City Name','Occurance']
fig = px.bar(city_df.head(10), 'City Name', 'Occurance', title='Startups by City')
fig.show()

In [79]:
df.sort_values(by='Amount', ascending=False, inplace=True)

In [80]:
money_city_df = df.groupby('City')['Amount'].sum().reset_index()
money_city_df.sort_values(by='Amount', ascending=False, inplace=True)
fig=px.bar(money_city_df.head(25), 'City', 'Amount', title='Top Invested Amount for each city')
fig.show()

In [81]:
df['Amount'] = df['Amount'].astype('int64')

In [82]:
df['Industrytype'].value_counts()

Consumer Internet                  941
Technology                         478
eCommerce                          186
Healthcare                          70
Finance                             62
                                  ... 
Speech Recognition Solutions         1
Hyperlocal grocery delivery          1
Railways Information Mobile app      1
Online Lingerie platform             1
Coupon Aggregator Platform           1
Name: Industrytype, Length: 821, dtype: int64

In [83]:
amount_df= df.groupby('Investorsname')['Amount'].sum().reset_index()
amount_df.sort_values(by='Amount', ascending=False, inplace= True)
fig = px.area(amount_df.head(10), x= 'Investorsname',y='Amount',title= ' Investors investment in Startups')
fig.show()

In [84]:
df.isnull().sum()

Startupname          0
Industrytype       171
SubVertical          0
City               180
Investorsname       24
InvestmentnType      0
Amount               0
dtype: int64

In [85]:
df.count()

Startupname        3040
Industrytype       2869
SubVertical        3040
City               2860
Investorsname      3016
InvestmentnType    3040
Amount             3040
dtype: int64

In [86]:
invs=df['Investorsname'].tail(20).value_counts()
invs

Group of Angel Investors                                     6
Hyderabad Angels (at Startup Heroes event)                   5
Multiple investors through Ten Minute Million competition    4
The Ten Minute Million                                       2
Promatus Group                                               1
Undisclosed                                                  1
Not Disclosed                                                1
Name: Investorsname, dtype: int64