## Setup

In [1]:
import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt
from scipy import stats
from sklearn.preprocessing import StandardScaler
from sklearn.preprocessing import LabelEncoder
from lightgbm import LGBMClassifier, LGBMRegressor
from lightgbm import early_stopping, log_evaluation
from sklearn.ensemble import RandomForestClassifier, RandomForestRegressor
from sklearn.model_selection import StratifiedKFold, train_test_split, KFold
from sklearn.metrics import f1_score, accuracy_score, confusion_matrix, classification_report, plot_confusion_matrix, mean_squared_error, mean_absolute_error
import optuna
import pickle
from imblearn.over_sampling import SMOTE

import warnings
warnings.filterwarnings("ignore")
%matplotlib inline 

## Creating Target variables for cleaned data

In [2]:
df = pd.read_csv("cleaned.csv")
# we delete Organization location since it is redundant
df = df.drop("Organization Location", axis = 1)

In [3]:
#fill null values for money raised with zero, we assume no money raised if value is None
df["Money Raised"] = df["Money Raised"].fillna(0)
# Convert dates to datetime object
df["Announced Date"] = pd.to_datetime(df["Announced Date"])

In [4]:
# sort the data based on date of funding
df = df.sort_values(by='Announced Date')

In [5]:
# remove non common funding types
fs = ['Series D', 'Pre-Seed', 'Seed', 'Private Equity', 'Series B', 'Series A', 'Angel', 'Series C', 'Series E', 'Post-IPO Equity', 'Series F', 'Post-IPO Debt', 'Series G', 'Post-IPO Secondary', 'Series H', 'Series J', 'Series I']
df = df[df["Funding Type"].isin(fs)]

In [6]:
# create a new column with the cumulative sum of money raised
df["Total Funding"]= df.groupby('Organization Name')['Money Raised'].transform(lambda x: x.cumsum())

In [7]:
# drop all the rows whose funding type is not pre seed and total funding is 0
df = df[(df["Funding Type"] == "Pre-Seed") | (df["Money Raised"] != 0)]

In [8]:
# creating a new column funded which is 1 if a stratup is funded and zero otherwise
df["Funded"] = df["Money Raised"].apply(lambda x: 0 if x == 0 else 1 )
# Creating a new column which shows if a company is funded in the next round. 1 if a stratup is funded and zero otherwise. 
# This will be our target variable
df["Funded_in_the_next_round"] = df.groupby("Organization Name")['Funded'].shift(-1)

In [9]:
df["days_till_next_funding"] = df.groupby('Organization Name')["Announced Date"].diff().dt.days
df["days_till_next_funding"] = df.groupby("Organization Name")['days_till_next_funding'].shift(-1)

In [10]:
# Drop all rows with no values
df = df.dropna()
# Drop the Funded column since we no longer need it
df = df.drop("Funded", axis = 1)

In [11]:

# merging relevant columns from the exit status dataset
df2 = pd.read_csv("model_data_v1.csv")
cols_to_keep_df = df2[["Organization Name","Company Type","Number of Founders","Number of Employees"]]
df = df.merge(cols_to_keep_df, on = "Organization Name", how = "left").drop_duplicates().dropna()

In [12]:
final_df = df
final_df.to_csv("final.csv", index = False)

## Quick Glance at Final Data

In [13]:
data = pd.read_csv("final.csv")
train = pd.read_csv("final.csv")

In [14]:
# df[df["Organization Name"] == "Neuralink"].sort_values(by='Announced Date')

In [15]:
# remove unecessary columns 
data = data.drop(["Organization Name", "Announced Date", "Equity Only Funding"], axis = 1)
train = train.drop(["Organization Name", "Announced Date", "Equity Only Funding"], axis = 1)

In [16]:
data.head()

Unnamed: 0,Funding Type,Money Raised,Organization Industries,Funding Stage,Region,Country,City,Total Funding,Funded_in_the_next_round,days_till_next_funding,Company Type,Number of Founders,Number of Employees
0,Pre-Seed,120000.0,"Artificial Intelligence (AI), Industrial Autom...",Seed,North America,United States,Seattle,120000.0,1.0,7307.0,For Profit,3.0,1-10
1,Series A,1000000.0,"B2B, B2C, Business Development, Charity, Finan...",Early Stage Venture,North America,United States,New York,1000000.0,1.0,4792.0,For Profit,1.0,501-1000
2,Seed,1400000.0,"Artificial Intelligence (AI), Business Intelli...",Seed,North America,United States,Mountain View,1400000.0,1.0,1520.0,For Profit,2.0,251-500
3,Angel,3000000.0,"Animation, Communities, Graphic Design",Seed,Asia,China,Beijing,3000000.0,1.0,4633.0,For Profit,1.0,11-50
4,Series A,4200000.0,"Contact Management, CRM, Crowdsourcing, Email,...",Early Stage Venture,North America,United States,San Mateo,4200000.0,1.0,478.0,For Profit,4.0,11-50


In [17]:
data.columns

Index(['Funding Type', 'Money Raised', 'Organization Industries',
       'Funding Stage', 'Region', 'Country', 'City', 'Total Funding',
       'Funded_in_the_next_round', 'days_till_next_funding', 'Company Type',
       'Number of Founders', 'Number of Employees'],
      dtype='object')

In [18]:
data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 40496 entries, 0 to 40495
Data columns (total 13 columns):
 #   Column                    Non-Null Count  Dtype  
---  ------                    --------------  -----  
 0   Funding Type              40496 non-null  object 
 1   Money Raised              40496 non-null  float64
 2   Organization Industries   40496 non-null  object 
 3   Funding Stage             40496 non-null  object 
 4   Region                    40496 non-null  object 
 5   Country                   40496 non-null  object 
 6   City                      40496 non-null  object 
 7   Total Funding             40496 non-null  float64
 8   Funded_in_the_next_round  40496 non-null  float64
 9   days_till_next_funding    40496 non-null  float64
 10  Company Type              40496 non-null  object 
 11  Number of Founders        40496 non-null  float64
 12  Number of Employees       40496 non-null  object 
dtypes: float64(5), object(8)
memory usage: 4.0+ MB


In [19]:
#  Check for Nulll values
data.isnull().sum()

Funding Type                0
Money Raised                0
Organization Industries     0
Funding Stage               0
Region                      0
Country                     0
City                        0
Total Funding               0
Funded_in_the_next_round    0
days_till_next_funding      0
Company Type                0
Number of Founders          0
Number of Employees         0
dtype: int64

In [20]:
# Drop duplicates
data = data.drop_duplicates()
data.duplicated().sum()

0

In [21]:
# shape of the data
data.shape

(40496, 13)

## Preprocessing

#### Encode categorical columns

In [22]:
le=LabelEncoder()

In [23]:
train['Funding Type'] = le.fit_transform(train['Funding Type'])
train['Organization Industries'] = le.fit_transform(train['Organization Industries'])
# train['Equity Only Funding'] = le.fit_transform(train['Equity Only Funding'])
train['Funding Stage'] = le.fit_transform(train['Funding Stage'])
train['Region'] = le.fit_transform(train['Region'])
train['Number of Employees'] = le.fit_transform(train['Number of Employees'])
train['Country'] = le.fit_transform(train['Country'])
train['City'] = le.fit_transform(train['City'])
train['Company Type'] = le.fit_transform(train['Company Type'])

#### Get rid of Outliers

In [24]:
cols_to_remove_outliers = ["Money Raised",'days_till_next_funding']

In [25]:
mean_std={}
col =  'days_till_next_funding'
mean_std[col]=(train[col].mean(), train[col].std())

In [26]:
# Apply zscore 
train[cols_to_remove_outliers] = train[cols_to_remove_outliers].apply(stats.zscore)

In [27]:
# Define a threshold for outliers
threshold = 3
# Get a mask for values which are NOT outliers
mask = (np.abs(train[cols_to_remove_outliers]) < threshold).all(axis=1)

train = train[mask]

In [28]:
#  helper fn to reverse scaling by zscore
def reverse_zscore(pandas_series, mean, std):
    return pandas_series*std+mean
    

#### Scale numerical columns

In [29]:
scaler = StandardScaler()

In [30]:
train["Money Raised"] = scaler.fit_transform(train["Money Raised"].to_frame())
train["Total Funding"] = scaler.fit_transform(train["Total Funding"].to_frame())

### Train Test split

In [31]:
X_class = train.drop(['Funded_in_the_next_round', 'days_till_next_funding'], axis=1)
y_class = train['Funded_in_the_next_round']

In [32]:
X_train_class, X_test_class, y_train_class, y_test_class = train_test_split(X_class, y_class, test_size=0.20,random_state=42,shuffle=True)

## Modelling

#### Random Forest Classifier

In [33]:
model_class = RandomForestClassifier(n_estimators=150) 
model_class.fit(X_train_class.values,y_train_class.values)

In [34]:
y_preds_class =  model_class.predict(X_test_class)

#### Classifier evaluation

In [35]:
print(confusion_matrix(y_test_class,y_preds_class))
print('\n')
print(classification_report(y_test_class,y_preds_class))

[[   7  172]
 [  37 7715]]


              precision    recall  f1-score   support

         0.0       0.16      0.04      0.06       179
         1.0       0.98      1.00      0.99      7752

    accuracy                           0.97      7931
   macro avg       0.57      0.52      0.52      7931
weighted avg       0.96      0.97      0.97      7931



In [36]:
print("Classifier Accuracy score: " ,accuracy_score(y_test_class,y_preds_class))

Classifier Accuracy score:  0.9736477115117892


#### Save the model

In [37]:
with open("classifier_model.pkl", "wb") as f:
    pickle.dump(model_class, f)

## Inference

In [39]:
sample = X_test_class.sample()

In [40]:
sample

Unnamed: 0,Funding Type,Money Raised,Organization Industries,Funding Stage,Region,Country,City,Total Funding,Company Type,Number of Founders,Number of Employees
23018,3,-0.125325,8955,3,3,36,88,-0.095286,0,2.0,4


#### Classification

In [41]:
class_label = ["Not Funded", "Funded"]

In [42]:
def classification_prediction(sample):
    out = {}
    pred = model_class.predict_proba(sample).max()
    if pred > 0.5:
        out["Prediction"] = "Funded"
        out["confidence"] = round(pred * 100, 2)
       
    else:
         out["Prediction"] = "Not Funded"
         out["confidence"] = round(pred * 100, 2)
        
    return out

In [43]:
class_preds = classification_prediction(sample)

In [44]:
print('Actual: ', class_label[int(y_test_class.loc[sample.index[0]])] )
print('Prediction: ', class_preds["Prediction"])
print('confidence: ', class_preds["confidence"])

Actual:  Funded
Prediction:  Funded
confidence:  100.0


## Complete Output

In [45]:
print('The startup forecast is:', class_preds["Prediction"], "with a confidence of ", class_preds["confidence"])

The startup forecast is: Funded with a confidence of  100.0
