# Project Summary:

## Title: Indian Startup Funding Prediction Using SVR

This project aims to predict the funding amounts of Indian startups based on historical funding data. The dataset includes startup information such as industry, investors, city location, and funding details. The following steps are performed in this project:

# Load and Explore the Dataset

In [35]:
import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler, LabelEncoder
from sklearn.impute import KNNImputer
from sklearn.svm import SVR
from sklearn.metrics import mean_squared_error, mean_absolute_error
# Load the dataset
df = pd.read_csv("/kaggle/input/indian-startup-funding/startup_funding.csv")

# Display basic info and check for missing values
print(df.info())
print(df.head())

# Check for missing values
print("Missing Values:\n", df.isnull().sum())


<class 'pandas.core.frame.DataFrame'>
RangeIndex: 3044 entries, 0 to 3043
Data columns (total 10 columns):
 #   Column             Non-Null Count  Dtype 
---  ------             --------------  ----- 
 0   Sr No              3044 non-null   int64 
 1   Date dd/mm/yyyy    3044 non-null   object
 2   Startup Name       3044 non-null   object
 3   Industry Vertical  2873 non-null   object
 4   SubVertical        2108 non-null   object
 5   City  Location     2864 non-null   object
 6   Investors Name     3020 non-null   object
 7   InvestmentnType    3040 non-null   object
 8   Amount in USD      2084 non-null   object
 9   Remarks            419 non-null    object
dtypes: int64(1), object(9)
memory usage: 237.9+ KB
None
   Sr No Date dd/mm/yyyy                  Startup Name    Industry Vertical  \
0      1      09/01/2020                        BYJU’S               E-Tech   
1      2      13/01/2020                        Shuttl       Transportation   
2      3      09/01/2020           

# Data Cleaning and Preprocessing

In [36]:
# Rename columns for consistency
df.rename(columns={'Date dd/mm/yyyy': 'Date'}, inplace=True)

# Convert 'Amount in USD' to numeric and handle commas
df['Amount in USD'] = df['Amount in USD'].astype(str).str.replace(',', '', regex=False)
df['Amount in USD'] = pd.to_numeric(df['Amount in USD'], errors='coerce')
df['Amount in USD'] = df['Amount in USD'].fillna(df['Amount in USD'].median())

# Convert 'Date' column to datetime format and extract Year and Month
df['Date'] = pd.to_datetime(df['Date'], format='%d/%m/%Y', errors='coerce')
df['Year'] = df['Date'].dt.year
df['Month'] = df['Date'].dt.month

# Drop irrelevant columns (e.g., 'Remarks', 'Date') and rows with missing 'Amount in USD'
df.drop(columns=['Remarks', 'Date'], axis=1, inplace=True)
df.dropna(subset=['Amount in USD'], inplace=True)

# Check data again
print(df.info())

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 3044 entries, 0 to 3043
Data columns (total 10 columns):
 #   Column             Non-Null Count  Dtype  
---  ------             --------------  -----  
 0   Sr No              3044 non-null   int64  
 1   Startup Name       3044 non-null   object 
 2   Industry Vertical  2873 non-null   object 
 3   SubVertical        2108 non-null   object 
 4   City  Location     2864 non-null   object 
 5   Investors Name     3020 non-null   object 
 6   InvestmentnType    3040 non-null   object 
 7   Amount in USD      3044 non-null   float64
 8   Year               3036 non-null   float64
 9   Month              3036 non-null   float64
dtypes: float64(3), int64(1), object(6)
memory usage: 237.9+ KB
None


# Handle categorical features using One-Hot Encoding

In [37]:
# Handle categorical columns with one-hot encoding
categorical_columns = [
    'Startup Name', 'Industry Vertical', 'SubVertical', 'City  Location', 
    'InvestmentnType', 'Investors Name'
]
# Keep only valid columns
categorical_columns = [col for col in categorical_columns if col in df.columns]
df = pd.get_dummies(df, columns=categorical_columns, drop_first=True)

# Check the processed data
print("Data after preprocessing:")
print(df.head())

Data after preprocessing:
   Sr No  Amount in USD    Year  Month  Startup Name_#Fame  \
0      1    200000000.0  2020.0    1.0               False   
1      2      8048394.0  2020.0    1.0               False   
2      3     18358860.0  2020.0    1.0               False   
3      4      3000000.0  2020.0    1.0               False   
4      5      1800000.0  2020.0    1.0               False   

   Startup Name_121Policy  Startup Name_19th mile  Startup Name_1Crowd  \
0                   False                   False                False   
1                   False                   False                False   
2                   False                   False                False   
3                   False                   False                False   
4                   False                   False                False   

   Startup Name_1mg  Startup Name_1mg (Healthkartplus)  ...  \
0             False                              False  ...   
1             False           

# Handle Outliers and Scale the Data

In [38]:
# Apply log transformation to 'Amount in USD' to handle outliers
df['Amount in USD'] = df['Amount in USD'].apply(lambda x: np.log1p(x) if x > 0 else 0)

# Scale numeric features (e.g., 'Year', 'Month', and 'Amount in USD')
scaler = StandardScaler()
df[['Amount in USD', 'Year', 'Month']] = scaler.fit_transform(df[['Amount in USD', 'Year', 'Month']])

# Check the data again
print(df.head())


   Sr No  Amount in USD      Year     Month  Startup Name_#Fame  \
0      1       2.821161  3.365424 -1.576338               False   
1      2       0.877788  3.365424 -1.576338               False   
2      3       1.376593  3.365424 -1.576338               False   
3      4       0.280860  3.365424 -1.576338               False   
4      5      -0.028126  3.365424 -1.576338               False   

   Startup Name_121Policy  Startup Name_19th mile  Startup Name_1Crowd  \
0                   False                   False                False   
1                   False                   False                False   
2                   False                   False                False   
3                   False                   False                False   
4                   False                   False                False   

   Startup Name_1mg  Startup Name_1mg (Healthkartplus)  ...  \
0             False                              False  ...   
1             False       

# Train-Test Split

In [39]:
# Define features and target
X = df.drop('Amount in USD', axis=1)
y = df['Amount in USD']

# Train-test split
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)


# Impute missing values using KNNImputer

In [43]:
# Check for missing values in the features (X_train, X_test)
print(X_train.isnull().sum())
print(X_test.isnull().sum())

# Option 1: Impute missing values using KNN Imputer or another imputer
from sklearn.impute import KNNImputer

imputer = KNNImputer(n_neighbors=5)
X_train_imputed = imputer.fit_transform(X_train)
X_test_imputed = imputer.transform(X_test)


Sr No                                                                  0
Year                                                                   6
Month                                                                  6
Startup Name_#Fame                                                     0
Startup Name_121Policy                                                 0
                                                                      ..
Investors Name_ru-Net, Sequoia Capital,\\xc2\\xa0 Lightbox Ventures    0
Investors Name_undisclosed investor                                    0
Investors Name_undisclosed investors                                   0
Investors Name_undisclosed private investors                           0
Investors Name_vCommission                                             0
Length: 7798, dtype: int64
Sr No                                                                  0
Year                                                                   2
Month                   

In [44]:
# Re-apply scaling on the imputed data
scaler = StandardScaler()
X_train_scaled = scaler.fit_transform(X_train_imputed)
X_test_scaled = scaler.transform(X_test_imputed)


# Train the model using Support Vector Regressor

In [45]:
from sklearn.svm import SVR

# Initialize and train the SVR model
svr_model = SVR(kernel='rbf', C=100, epsilon=0.1)
svr_model.fit(X_train_scaled, y_train)


# Make predictions on the test set
y_pred_log = svr_model.predict(X_test_scaled)

# Convert log scale predictions back to the original scale
y_pred = np.expm1(y_pred_log)

# Evaluate model performance
from sklearn.metrics import mean_squared_error, mean_absolute_error
mse = mean_squared_error(y_test, y_pred)
mae = mean_absolute_error(y_test, y_pred)

print(f"Mean Squared Error (SVR): {mse}")
print(f"Mean Absolute Error (SVR): {mae}")


Mean Squared Error (SVR): 0.6545664179711072
Mean Absolute Error (SVR): 0.579328099636681
