# importing libraries

In [2]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from numerize import numerize
import plotly.express as px
import plotly.graph_objects as go

# loading data

In [3]:
df = pd.read_csv('data/startup_funding.csv')

# Renaming columns

In [4]:
df.rename(mapper={
        'Date dd/mm/yyyy':'Startup_date',
        'City  Location':'City',
        'Amount in USD':'Amount',
        'Startup Name':'Startupname',
        'Industry Vertical':'Industrytype',
        'Investors Name':'Investorsname'
        },axis=1, inplace=True)


In [5]:
df.isnull().sum()

Sr No                 0
Startup_date          0
Startupname           0
Industrytype        171
SubVertical         936
City                180
Investorsname        24
InvestmentnType       4
Amount              960
Remarks            2625
dtype: int64

In [6]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 3044 entries, 0 to 3043
Data columns (total 10 columns):
 #   Column           Non-Null Count  Dtype 
---  ------           --------------  ----- 
 0   Sr No            3044 non-null   int64 
 1   Startup_date     3044 non-null   object
 2   Startupname      3044 non-null   object
 3   Industrytype     2873 non-null   object
 4   SubVertical      2108 non-null   object
 5   City             2864 non-null   object
 6   Investorsname    3020 non-null   object
 7   InvestmentnType  3040 non-null   object
 8   Amount           2084 non-null   object
 9   Remarks          419 non-null    object
dtypes: int64(1), object(9)
memory usage: 237.9+ KB


In [7]:
df.drop(['Remarks','Sr No'],axis=1,inplace=True)

In [8]:
#df['Startup_date'] = pd.to_datetime(df['Startup_date'])
#print(df.to_string())


Data cleaning

In [9]:
def clean_amount(amt):
    amt = str(amt)
    if ',' in amt:
        amt = amt.replace(',','')
    if amt.isnumeric():
        return float(amt)
    if amt.isalpha() or amt.startswith('\\'):
        return np.nan
    if '.' in amt:
        return float(amt)
    if '+' in amt:
        return float(amt.replace('+',''))

df['Amount'] = df['Amount'].apply(clean_amount)

In [10]:
x=df['Amount'].mean()
df['Amount'].fillna(x,inplace=True)

Categorical coloumn


In [11]:
df.isnull().sum()

Startup_date         0
Startupname          0
Industrytype       171
SubVertical        936
City               180
Investorsname       24
InvestmentnType      4
Amount               0
dtype: int64

Droping the Small Empty cell

In [12]:
df.dropna(subset=['InvestmentnType','Investorsname'],inplace=True)

Replacing NaN Values

In [13]:
df['SubVertical'].replace(np.nan, df['SubVertical'].value_counts().idxmax(), inplace=True)


In [14]:
df['Industrytype'].replace(np.NaN,df['Industrytype'].value_counts().idxmax(),inplace=True)

In [15]:
df['City'].replace(np.NaN,df['City'].value_counts().idxmax(),inplace= True)

In [16]:
df.isnull().sum()

Startup_date       0
Startupname        0
Industrytype       0
SubVertical        0
City               0
Investorsname      0
InvestmentnType    0
Amount             0
dtype: int64

All Nan values Cleaned

In [17]:
df_cat = df.select_dtypes(np.object_)

In [18]:
df_cat

Unnamed: 0,Startup_date,Startupname,Industrytype,SubVertical,City,Investorsname,InvestmentnType
0,09/01/2020,BYJU’S,E-Tech,E-learning,Bengaluru,Tiger Global Management,Private Equity Round
1,13/01/2020,Shuttl,Transportation,App based shuttle service,Gurgaon,Susquehanna Growth Equity,Series C
2,09/01/2020,Mamaearth,E-commerce,Retailer of baby and toddler products,Bengaluru,Sequoia Capital India,Series B
3,02/01/2020,https://www.wealthbucket.in/,FinTech,Online Investment,New Delhi,Vinod Khatumal,Pre-series A
4,02/01/2020,Fashor,Fashion and Apparel,Embroiled Clothes For Women,Mumbai,Sprout Venture Partners,Seed Round
...,...,...,...,...,...,...,...
3039,29/01/2015,Printvenue,Consumer Internet,Online Lending Platform,Bangalore,Asia Pacific Internet Group,Private Equity
3040,29/01/2015,Graphene,Consumer Internet,Online Lending Platform,Bangalore,KARSEMVEN Fund,Private Equity
3041,30/01/2015,Mad Street Den,Consumer Internet,Online Lending Platform,Bangalore,"Exfinity Fund, GrowX Ventures.",Private Equity
3042,30/01/2015,Simplotel,Consumer Internet,Online Lending Platform,Bangalore,MakeMyTrip,Private Equity


numerical coloumn

In [19]:
df_num = df.select_dtypes(np.number)

In [20]:
df_num

Unnamed: 0,Amount
0,2.000000e+08
1,8.048394e+06
2,1.835886e+07
3,3.000000e+06
4,1.800000e+06
...,...
3039,4.500000e+06
3040,8.250000e+05
3041,1.500000e+06
3042,1.842792e+07


Univairiate Analysis

In [21]:
df['Startupname'].nunique()

2437

In [22]:
df.Startup_date = pd.to_datetime(df.Startup_date)

  df.Startup_date = pd.to_datetime(df.Startup_date)


ParserError: year 72018 is out of range: 05/072018 present at position 119

In [None]:
df['Amount'].describe()

count    3.016000e+03
mean     1.829268e+07
std      9.979839e+07
min      1.600000e+04
25%      1.000000e+06
50%      7.500000e+06
75%      1.842792e+07
max      3.900000e+09
Name: Amount, dtype: float64

In [None]:
df['Amount'] = df['Amount'].astype('int64')
df['Amount']

count    3.016000e+03
mean     1.829268e+07
std      9.979839e+07
min      1.600000e+04
25%      1.000000e+06
50%      7.500000e+06
75%      1.842792e+07
max      3.900000e+09
Name: Amount, dtype: float64

In [None]:
dg_amount= df.groupby('Startup_date')['Amount'].sum().reset_index()
dg_amount.sort_values(by='Amount',ascending=False,inplace=True)
fig = px.line(dg_amount.head(20),x='Amount',y='Startup_date',markers='*')
fig.show()
#dg_amount

Bivariate Analysis

Graphs


In [None]:
city_df = df['City'].value_counts().reset_index()
city_df.columns = ['City Name','Occurance']
fig = px.bar(city_df.head(10), 'City Name', 'Occurance', title='Startups by City')
fig.show()

In [None]:
money_city_df = df.groupby('City')['Amount'].sum().reset_index()
money_city_df.sort_values(by='Amount', ascending=False, inplace=True)
fig=px.bar(money_city_df.head(25), 'City', 'Amount', title='Top Invested Amount for each city')
fig.show()

In [None]:
amount_df= df.groupby('Investorsname')['Amount'].sum().reset_index()
amount_df.sort_values(by='Amount', ascending=False, inplace= True)
fig = px.line(amount_df.head(20), x ='Amount', y='Investorsname',title= 'Top Investors investment in Startups')
fig.show()

In [None]:
df.isnull().sum()

Startup_date         0
Startupname          0
Industrytype       171
SubVertical          0
City                 0
Investorsname        0
InvestmentnType      0
Amount               0
dtype: int64

In [None]:
df.count()

Startup_date       3016
Startupname        3016
Industrytype       2845
SubVertical        3016
City               3016
Investorsname      3016
InvestmentnType    3016
Amount             3016
dtype: int64

In [None]:
invs=df['Investorsname'].tail(20).value_counts()
invs

Group of Angel Investors                                     6
Hyderabad Angels (at Startup Heroes event)                   5
Multiple investors through Ten Minute Million competition    4
The Ten Minute Million                                       2
Promatus Group                                               1
Undisclosed                                                  1
Not Disclosed                                                1
Name: Investorsname, dtype: int64