In [367]:
#importing the usual library
import numpy as np
import pandas as pd
#train_test_split
from sklearn.model_selection import train_test_split
#for scaling
from sklearn.preprocessing import OneHotEncoder,StandardScaler
from sklearn.compose import ColumnTransformer
#Pipeline
from sklearn.pipeline import Pipeline
import tensorflow as tf
from sklearn.metrics import r2_score
from sklearn.ensemble import RandomForestRegressor

# Loading the Dataset

In [356]:
df=pd.read_csv('/kaggle/input/indian-startup-funding/startup_funding.csv')
#loading the dataset
df

Unnamed: 0,Sr No,Date dd/mm/yyyy,Startup Name,Industry Vertical,SubVertical,City Location,Investors Name,InvestmentnType,Amount in USD,Remarks
0,1,09/01/2020,BYJU’S,E-Tech,E-learning,Bengaluru,Tiger Global Management,Private Equity Round,200000000,
1,2,13/01/2020,Shuttl,Transportation,App based shuttle service,Gurgaon,Susquehanna Growth Equity,Series C,8048394,
2,3,09/01/2020,Mamaearth,E-commerce,Retailer of baby and toddler products,Bengaluru,Sequoia Capital India,Series B,18358860,
3,4,02/01/2020,https://www.wealthbucket.in/,FinTech,Online Investment,New Delhi,Vinod Khatumal,Pre-series A,3000000,
4,5,02/01/2020,Fashor,Fashion and Apparel,Embroiled Clothes For Women,Mumbai,Sprout Venture Partners,Seed Round,1800000,
...,...,...,...,...,...,...,...,...,...,...
3039,3040,29/01/2015,Printvenue,,,,Asia Pacific Internet Group,Private Equity,4500000,
3040,3041,29/01/2015,Graphene,,,,KARSEMVEN Fund,Private Equity,825000,Govt backed VC Fund
3041,3042,30/01/2015,Mad Street Den,,,,"Exfinity Fund, GrowX Ventures.",Private Equity,1500000,
3042,3043,30/01/2015,Simplotel,,,,MakeMyTrip,Private Equity,,"Strategic Funding, Minority stake"


# Getting the Preliminary Information about the dataset

In [357]:
def onehot_encode(df,columns):
    df=df.copy()
    for column in columns:
        dummies=pd.get_dummies(df[column],prefix=column)
        df=pd.concat([df,dummies],axis=1)
        df=df.drop(column,axis=1)
    return df

In [384]:
def preprocess_inputs(df):
    df=df.copy()
    #droppin the unnecessary column
    df=df.drop(df.columns[0],axis=1)
    
    #dropping the high cardinality column
    high_cardinality_column=['Startup Name','SubVertical','Investors Name']
    df=df.drop(high_cardinality_column,axis=1)
    #dropping the missing target rows
    df=df.drop(df[df['Amount in USD'].isna()].index,axis=0)
    df['Amount in USD']=df['Amount in USD'].astype(str)
    df['Amount in USD']=df['Amount in USD'].apply(lambda x: x.replace(',',''))
    #dropping column with more then 25% missing values
    df=df.sort_values('Amount in USD',ascending=False).reset_index(drop=True)
    df=df.drop(df[0:18].index,axis=0)
    df['Amount in USD']=df['Amount in USD'].replace({'14342000+':np.NaN})
    df['Amount in USD']=df['Amount in USD'].astype(np.float)
    high_missing_values=['Remarks']
    
    df=df.drop(high_missing_values,axis=1)
    
    column_with_missing_values=['Industry Vertical','City  Location','InvestmentnType']
    for column in column_with_missing_values:
        df[column]=df[column].fillna(df[column].mode()[0])
    df['Date dd/mm/yyyy']=df['Date dd/mm/yyyy'].replace({'05/072018':'05/07/2018'})
    df['Date dd/mm/yyyy']=pd.to_datetime(df['Date dd/mm/yyyy'],errors='coerce')
    df= df.drop(df[df['Date dd/mm/yyyy'].isna()].index,axis=0)
    df['Month']=df['Date dd/mm/yyyy'].dt.month.astype(np.int)
    df['Year']=df['Date dd/mm/yyyy'].dt.year.astype(np.int)
    df['Day']=df['Date dd/mm/yyyy'].dt.day.astype(np.int)
    df=df.drop('Date dd/mm/yyyy',axis=1)
    df=onehot_encode(df,['Industry Vertical','City  Location','InvestmentnType'])
    df=df.dropna(axis=0)
    #train_test_split
    
    y=df['Amount in USD']
    x=df.drop('Amount in USD',axis=1)
    
    x_train,x_test,y_train,y_test=train_test_split(x,y,train_size=0.7)
    
    scaler=StandardScaler()
    scaler.fit(x_train)
    x_train=pd.DataFrame(scaler.transform(x_train),columns=x_train.columns)
    x_test=pd.DataFrame(scaler.transform(x_test),columns=x_test.columns)
    
    
    
    
    return x_train,x_test,y_train,y_test

In [385]:
x_train,x_test,y_train,y_test=preprocess_inputs(df)
print(x_train.shape)
print(x_test.shape)
print(y_train.shape)
print(y_test.shape)

(1444, 721)
(619, 721)
(1444,)
(619,)


Deprecated in NumPy 1.20; for more details and guidance: https://numpy.org/devdocs/release/1.20.0-notes.html#deprecations
  app.launch_new_instance()
Deprecated in NumPy 1.20; for more details and guidance: https://numpy.org/devdocs/release/1.20.0-notes.html#deprecations
Deprecated in NumPy 1.20; for more details and guidance: https://numpy.org/devdocs/release/1.20.0-notes.html#deprecations
Deprecated in NumPy 1.20; for more details and guidance: https://numpy.org/devdocs/release/1.20.0-notes.html#deprecations


In [386]:
y_train.isna().sum().sum()

0

# Building the Pipeline

In [388]:
model=RandomForestRegressor()
model.fit(x_train,y_train)
model.score(x_test,y_test)

-0.3562512598679237

In [None]:
x[0:17].index

# Checking for missing value in the dataset

In [360]:
x.isna().sum()

Industry Vertical    0
City  Location       0
InvestmentnType      0
Amount in USD        1
Month                0
Year                 0
Day                  0
dtype: int64

# Dropping the rows that have missing values in target column

In [361]:
df[df['Amount in USD'].isna()].index

Int64Index([ 144,  155,  157,  165,  189,  197,  210,  219,  227,  228,
            ...
            3006, 3007, 3009, 3015, 3023, 3027, 3030, 3031, 3035, 3042],
           dtype='int64', length=960)

# Checking for high cardinality column

In [362]:
{column:len(x[column].unique()) for column in x.columns}

{'Industry Vertical': 582,
 'City  Location': 86,
 'InvestmentnType': 50,
 'Amount in USD': 454,
 'Month': 12,
 'Year': 6,
 'Day': 31}