# importing libraries

In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from numerize import numerize
import plotly.express as px
import plotly.graph_objects as go

# loading data

In [2]:
# Read the startup-funding CSV into the working DataFrame.
funding_csv = 'data/startup_funding.csv'
df = pd.read_csv(funding_csv)

# Renaming columns

In [3]:
# Map the raw CSV headers to the shorter column names used in later cells.
column_aliases = {
    'Date dd/mm/yyyy': 'Startup_date',
    'City  Location': 'City',
    'Amount in USD': 'Amount',
    'Startup Name': 'Startupname',
    'Industry Vertical': 'Industrytype',
    'Investors Name': 'Investorsname',
}
df.rename(columns=column_aliases, inplace=True)


In [4]:
# Missing-value count for each column.
df.isna().sum()

Sr No                 0
Startup_date          0
Startupname           0
Industrytype        171
SubVertical         936
City                180
Investorsname        24
InvestmentnType       4
Amount              960
Remarks            2625
dtype: int64

In [5]:
# Print each column's dtype and non-null count, plus the frame's memory usage.
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 3044 entries, 0 to 3043
Data columns (total 10 columns):
 #   Column           Non-Null Count  Dtype 
---  ------           --------------  ----- 
 0   Sr No            3044 non-null   int64 
 1   Startup_date     3044 non-null   object
 2   Startupname      3044 non-null   object
 3   Industrytype     2873 non-null   object
 4   SubVertical      2108 non-null   object
 5   City             2864 non-null   object
 6   Investorsname    3020 non-null   object
 7   InvestmentnType  3040 non-null   object
 8   Amount           2084 non-null   object
 9   Remarks          419 non-null    object
dtypes: int64(1), object(9)
memory usage: 237.9+ KB


In [6]:
# Remove the two columns that are not used in the analysis below.
unused_columns = ['Remarks', 'Sr No']
df.drop(columns=unused_columns, inplace=True)

In [7]:
# Disabled cell: both statements are commented out, so Startup_date is not
# converted to datetime here and the full frame is not printed.
# NOTE(review): the raw header is 'Date dd/mm/yyyy', so if the conversion is
# re-enabled pd.to_datetime likely needs dayfirst=True — confirm against the data.
#df['Startup_date'] = pd.to_datetime(df['Startup_date'])
#print(df.to_string())


Data cleaning

In [8]:
def clean_amount(amt):
    """Convert one raw funding-amount cell into a float, or NaN if unusable.

    The value is stringified, surrounding whitespace is stripped, and the
    thousands separator ``,`` and the ``+`` marker are removed before parsing.

    Parameters
    ----------
    amt : object
        Raw cell value; anything ``str()`` accepts (including NaN).

    Returns
    -------
    float
        The parsed amount, or ``np.nan`` when the cleaned text is empty,
        purely alphabetic (e.g. ``"undisclosed"``, ``"nan"``), starts with a
        backslash, is not a finite number, or cannot be parsed at all.
        The function never returns ``None`` and never raises on bad text.
    """
    text = str(amt).strip().replace(',', '').replace('+', '')

    # The alphabetic check runs before float() so that words float() would
    # accept ("nan", "inf", "infinity") are still treated as missing.
    if not text or text.isalpha() or text.startswith('\\'):
        return np.nan

    try:
        value = float(text)
    except ValueError:
        # Such inputs previously either raised (e.g. "1.5+") or fell through
        # every branch and returned None (e.g. "N/A"); both now map to NaN.
        return np.nan

    # Reject overflowed / signed-infinity parses such as "1e999" or "-inf".
    return value if np.isfinite(value) else np.nan

# Run every raw Amount cell through clean_amount and store the result back.
df['Amount'] = df['Amount'].map(clean_amount)

In [9]:
# Impute missing amounts with the column mean.
# The filled Series is assigned back to the column instead of calling
# fillna(inplace=True) on the df['Amount'] selection: an in-place call on a
# selected column may act on a temporary object, and under pandas
# Copy-on-Write it never updates df.
x = df['Amount'].mean()
df['Amount'] = df['Amount'].fillna(x)

Categorical columns


In [34]:
# Missing-value count for each column.
df.isna().sum()

Startup_date         0
Startupname          0
Industrytype       171
SubVertical          0
City                 0
Investorsname        0
InvestmentnType      0
Amount               0
dtype: int64

Dropping rows for the columns with only a few empty cells

In [11]:
# Discard rows that lack an investment type or an investor name.
required_columns = ['InvestmentnType', 'Investorsname']
df.dropna(subset=required_columns, inplace=True)

Replacing NaN Values

In [30]:
# Fill missing sub-verticals with the most frequent sub-vertical.
# The result is assigned back to the column rather than using
# replace(..., inplace=True) on the df['SubVertical'] selection, which may act
# on a temporary object and never updates df under pandas Copy-on-Write.
most_common_subvertical = df['SubVertical'].value_counts().idxmax()
df['SubVertical'] = df['SubVertical'].fillna(most_common_subvertical)


In [35]:
# Fill missing industry labels with the most frequent label.
# The result is assigned back to the column rather than using
# replace(..., inplace=True) on the df['Industrytype'] selection, which may act
# on a temporary object and never updates df under pandas Copy-on-Write.
# Using fillna also avoids the np.NaN alias, which NumPy 2.0 removed.
most_common_industry = df['Industrytype'].value_counts().idxmax()
df['Industrytype'] = df['Industrytype'].fillna(most_common_industry)

In [41]:
# Fill missing cities with the most frequent city.
# The result is assigned back to the column rather than using
# replace(..., inplace=True) on the df['City'] selection, which may act on a
# temporary object and never updates df under pandas Copy-on-Write.
# Using fillna also avoids the np.NaN alias, which NumPy 2.0 removed.
most_common_city = df['City'].value_counts().idxmax()
df['City'] = df['City'].fillna(most_common_city)

In [36]:
# Missing-value count for each column.
df.isna().sum()

Startup_date       0
Startupname        0
Industrytype       0
SubVertical        0
City               0
Investorsname      0
InvestmentnType    0
Amount             0
dtype: int64

All NaN values cleaned

In [38]:
# Keep only the object-dtype columns.
df_cat = df.select_dtypes(include=np.object_)

In [39]:
# Display the object-dtype subset of the frame.
df_cat

Unnamed: 0,Startup_date,Startupname,Industrytype,SubVertical,City,Investorsname,InvestmentnType
60,27/08/2019,Rapido Bike Taxi,Transportation,Bike Taxi,Bengaluru,Westbridge Capital,Series B
651,11/08/2017,Flipkart,eCommerce,Online Marketplace,Bangalore,Softbank,Private Equity
966,21/03/2017,Flipkart,eCommerce,ECommerce Marketplace,Bangalore,"Microsoft, eBay, Tencent Holdings",Private Equity
830,18/05/2017,Paytm,ECommerce,Mobile Wallet & ECommerce platform,Bangalore,SoftBank Group,Private Equity
31,25/11/2019,Paytm,FinTech,Mobile Wallet,Noida,Vijay Shekhar Sharma,Funding Round
...,...,...,...,...,...,...,...
3020,19/01/2015,Enabli,Consumer Internet,Online Lending Platform,Bangalore,Hyderabad Angels (at Startup Heroes event),Seed Funding
3017,19/01/2015,Hostel Dunia,Consumer Internet,Online Lending Platform,Bangalore,Hyderabad Angels (at Startup Heroes event),Seed Funding
3018,19/01/2015,Play your sport,Consumer Internet,Online Lending Platform,Bangalore,Hyderabad Angels (at Startup Heroes event),Seed Funding
3019,19/01/2015,Yo Grad,Consumer Internet,Online Lending Platform,Bangalore,Hyderabad Angels (at Startup Heroes event),Seed Funding


Numerical columns

In [15]:
# Keep only the numeric-dtype columns.
df_num = df.select_dtypes(include=np.number)

In [16]:
# Display the numeric subset of the frame.
df_num

Unnamed: 0,Amount
0,2.000000e+08
1,8.048394e+06
2,1.835886e+07
3,3.000000e+06
4,1.800000e+06
...,...
3039,4.500000e+06
3040,8.250000e+05
3041,1.500000e+06
3042,1.842792e+07


Univariate Analysis

In [17]:
# Number of distinct values in the Startupname column.
df['Startupname'].nunique()

2437

In [18]:
# Summary statistics (count, mean, std, min, quartiles, max) for Amount.
df['Amount'].describe()

count    3.016000e+03
mean     1.829268e+07
std      9.979839e+07
min      1.600000e+04
25%      1.000000e+06
50%      7.500000e+06
75%      1.842792e+07
max      3.900000e+09
Name: Amount, dtype: float64

In [43]:
# Store the amounts as 64-bit integers, then display the column.
amount_as_int = df['Amount'].astype(np.int64)
df['Amount'] = amount_as_int
df['Amount']

count    3.016000e+03
mean     1.829268e+07
std      9.979839e+07
min      1.600000e+04
25%      1.000000e+06
50%      7.500000e+06
75%      1.842792e+07
max      3.900000e+09
Name: Amount, dtype: float64

Bivariate Analysis

Graphs


In [20]:
# Count the rows per city and chart the ten most frequent cities.
city_counts = df['City'].value_counts()
city_df = city_counts.reset_index()
city_df.columns = ['City Name', 'Occurance']
top_cities = city_df.head(10)
fig = px.bar(top_cities, x='City Name', y='Occurance', title='Startups by City')
fig.show()

In [22]:
# Total the amounts per city, order from largest to smallest, and chart the
# first 25 cities.
money_city_df = (
    df.groupby('City')['Amount']
    .sum()
    .reset_index()
    .sort_values(by='Amount', ascending=False)
)
fig = px.bar(
    money_city_df.head(25),
    x='City',
    y='Amount',
    title='Top Invested Amount for each city',
)
fig.show()

In [25]:
# Total the amounts per investor-name string, order from largest to smallest,
# and draw the first 20 as a line plot.
amount_df = (
    df.groupby('Investorsname')['Amount']
    .sum()
    .reset_index()
    .sort_values(by='Amount', ascending=False)
)
fig = px.line(
    amount_df.head(20),
    x='Amount',
    y='Investorsname',
    title='Top Investors investment in Startups',
)
fig.show()

In [32]:
# Missing-value count for each column.
df.isna().sum()

Startup_date         0
Startupname          0
Industrytype       171
SubVertical          0
City                 0
Investorsname        0
InvestmentnType      0
Amount               0
dtype: int64

In [27]:
# Non-null entry count for each column.
df.count()

Startup_date       3016
Startupname        3016
Industrytype       2845
SubVertical        3016
City               3016
Investorsname      3016
InvestmentnType    3016
Amount             3016
dtype: int64

In [28]:
# Tally the investor names found in the last 20 rows of the frame.
# NOTE(review): this counts only the final 20 rows; if the intent was the 20
# most frequent investors overall, value_counts().head(20) would be needed —
# confirm.
last_twenty_investors = df['Investorsname'].tail(20)
invs = last_twenty_investors.value_counts()
invs

Group of Angel Investors                                     6
Hyderabad Angels (at Startup Heroes event)                   5
Multiple investors through Ten Minute Million competition    4
The Ten Minute Million                                       2
Promatus Group                                               1
Undisclosed                                                  1
Not Disclosed                                                1
Name: Investorsname, dtype: int64