# Data Modelling for African Startup Deals Data 

Building a model that predicts the size of funding rounds by African tech startups based on a set of selected factors.

__Data__: [Africa: The Big Deal Startup Deals Database, March 2022 release by Max Cuvellier and Maxime Bayen.](https://thebigdeal.gumroad.com/membership)

__Note:__ The database only captures deals of USD 500,000 or more for 2020, and deals of over USD 1 million for 2019.


In [1]:
import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt
import statsmodels.api as sm

from sklearn.model_selection import GridSearchCV, train_test_split, cross_val_score
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer
from sklearn.linear_model import LinearRegression, HuberRegressor
from sklearn.neighbors import KNeighborsRegressor
from sklearn.ensemble import RandomForestRegressor, VotingRegressor
from sklearn.tree import DecisionTreeRegressor
from sklearn.pipeline import make_pipeline, Pipeline
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import OneHotEncoder, StandardScaler
from sklearn.compose import ColumnTransformer, make_column_transformer
from sklearn.metrics import mean_squared_error

import tensorflow as tf
from keras.models import Sequential
from keras.layers import Dense

In [2]:
# Read the cleaned deals workbook (its first column becomes the index), then
# collapse every deal date to a 'YYYY-MM' string.
deals = pd.read_excel("final project/deals_clean.xlsx", index_col=0)
deal_dates = pd.to_datetime(deals['Deal Date'])
deals['Deal Date'] = deal_dates.dt.strftime('%Y-%m')

In [3]:
# Recode 'Woman CEO' as a 0/1 indicator: 'No' -> 0, anything else -> 1.
# Matching on both 'No' and 0 keeps the cell idempotent: re-running it on the
# already-recoded column leaves the values unchanged instead of turning every
# row into 1.
# NOTE(review): missing values are also encoded as 1 here (same as before) --
# confirm the column has no gaps.
is_not_woman_ceo = deals['Woman CEO'].isin(['No', 0])
deals['Woman CEO'] = np.where(is_not_woman_ceo, 0, 1)

In [4]:
# Preview the first rows of the loaded deals table.
deals.head()

Unnamed: 0,index,Deal Month,Deal Year,Deal Date,Start-up name,Website,Country HQ,Launch,Tweet pitch,Sector,...,Founders gender mix,Amount disclosure,Amount raised $M,Comment,Bracket,Type,Valuation $M,1st $M ?,Link to news,Investors
0,1,1,2019,2019-01,Bbox,https://www.bboxx.co.uk/,Overseas,2010,Plug & play systems to power off-grid areas,Energy & Water,...,Male-only founding team,,31.0,,$10M-$50M,Venture Round,,No,https://mercomindia.com/aiim-invest-31-million...,Africa Infrastructure Investment Managers
1,2,1,2019,2019-01,Copia,https://copiaglobal.com/corporate/,Kenya,2010,Consumer catalog order and delivery system for...,Retail,...,Gender-diverse founding team,,2.0,,$2M-$5M,Series A,,No,https://disrupt-africa.com/2019/01/goodwell-in...,Goodwell Investments
2,3,1,2019,2019-01,Flow,https://flow.rent/,South Africa,2018,Online platform targeted at tenants of residen...,Housing,...,Male-only founding team,,1.5,,$1M-$2M,Seed,,Yes,http://disrupt-africa.com/2019/01/sa-property-...,"Kalon Venture Partners, CRE Venture Capital"
3,4,1,2019,2019-01,M-Tiba,https://m-tiba.co.ke/,Kenya,2015,Mobile payment wallet for healthcare services,Healthcare,...,Male-only founding team,,1.5,,$1M-$2M,Seed,,Yes,https://kenyanwallstreet.com/m-tiba-receives-1...,Agence Francaise de Developpement (AFD)
4,5,1,2019,2019-01,Pura Beverage,https://www.purabeverageco.com/,South Africa,2016,"Pura Soda manufactures, distributes, markets a...",Retail,...,Male-only founding team,Estimation,1.0,$1M+ but exact amount not disclosed publicly,$1M-$2M,Series A,,Yes,https://weetracker.com/2019/02/12/south-africa...,Knife Capital


In [5]:
# Predictor columns used by every model below...
feature_columns = [
    'Deal Year',
    'Country HQ',
    'Sector',
    'CEO - University/School - Continent',
    'Years between graduation and startup launch',
    'Woman CEO',
    'Founders gender mix',
    'Type',
    '# of Founders',
]
X = deals[feature_columns]

# ...and the target: the amount raised, in millions of USD.
y = deals['Amount raised $M']

In [6]:
# Preview the selected predictor columns.
X.head()

Unnamed: 0,Deal Year,Country HQ,Sector,CEO - University/School - Continent,Years between graduation and startup launch,Woman CEO,Founders gender mix,Type,# of Founders
0,2019,Overseas,Energy & Water,Europe,0.0,0,Male-only founding team,Venture Round,3
1,2019,Kenya,Retail,North America,5.0,0,Gender-diverse founding team,Series A,2
2,2019,South Africa,Housing,Africa,12.0,0,Male-only founding team,Seed,3
3,2019,Kenya,Healthcare,Europe,18.0,0,Male-only founding team,Seed,1
4,2019,South Africa,Retail,Africa,13.0,0,Male-only founding team,Series A,1


In [7]:
# One-hot encode the categorical predictors. drop_first=True drops one level
# per column, so each dropped level acts as that column's baseline.
categorical_columns = [
    'Deal Year',
    'Country HQ',
    'Sector',
    'CEO - University/School - Continent',
    'Founders gender mix',
    'Type',
]
X_binary = pd.get_dummies(X, columns=categorical_columns, drop_first=True)

In [8]:
# Display the encoded feature matrix (pandas truncates the rendering).
X_binary

Unnamed: 0,Years between graduation and startup launch,Woman CEO,# of Founders,Deal Year_2020,Deal Year_2021,Deal Year_2022,Country HQ_Benin,Country HQ_Botswana,Country HQ_Burkina Faso,Country HQ_Cameroon,...,Type_Pre-Seed,Type_Pre-Series A,Type_Pre-Series B,Type_Pre-Series C,Type_Seed,Type_Series A,Type_Series B,Type_Series C,Type_Series D,Type_Venture Round
0,0.0,0,3,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,1
1,5.0,0,2,0,0,0,0,0,0,0,...,0,0,0,0,0,1,0,0,0,0
2,12.0,0,3,0,0,0,0,0,0,0,...,0,0,0,0,1,0,0,0,0,0
3,18.0,0,1,0,0,0,0,0,0,0,...,0,0,0,0,1,0,0,0,0,0
4,13.0,0,1,0,0,0,0,0,0,0,...,0,0,0,0,0,1,0,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1162,9.0,0,2,0,0,1,0,0,0,0,...,0,0,0,0,1,0,0,0,0,0
1163,5.0,0,4,0,0,1,0,0,0,0,...,0,0,0,0,0,1,0,0,0,0
1164,8.0,1,1,0,0,1,0,0,0,0,...,1,0,0,0,0,0,0,0,0,0
1165,3.0,0,1,0,0,1,0,0,0,0,...,0,0,0,0,0,0,0,0,0,1


In [9]:
# Hold out a test split; the fixed random_state makes the partition repeatable.
split_parts = train_test_split(X_binary, y, random_state=42)
X_train, X_test, y_train, y_test = split_parts

In [10]:
# Model the natural log of the amount raised rather than the raw $M figure.
# NOTE: this overwrites y_train / y_test, so re-running the cell without
# re-running the split would apply the log a second time.
y_train, y_test = np.log(y_train), np.log(y_test)

In [11]:
# Summary statistics of the log-transformed training target.
y_train.describe()

count    875.000000
mean      -0.113475
std        1.703850
min       -2.302585
25%       -1.609438
50%        0.000000
75%        1.098612
max        4.007333
Name: Amount raised $M, dtype: float64

In [12]:
# Summary statistics of the log-transformed test target.
y_test.describe()

count    292.000000
mean      -0.183439
std        1.742307
min       -2.302585
25%       -2.302585
50%        0.000000
75%        1.098612
max        4.174387
Name: Amount raised $M, dtype: float64

In [13]:
# Mean-impute missing feature values. The column means are learned from the
# training split only and then reused for the test split.
si = SimpleImputer(strategy='mean')
si.fit(X_train)
X_train_no_missing = si.transform(X_train)
X_test_no_missing = si.transform(X_test)

In [14]:
# The imputer returns bare arrays; wrap them back into DataFrames that carry
# the original column labels and row index.
X_train_final = pd.DataFrame(
    data=X_train_no_missing,
    index=X_train.index,
    columns=X_train.columns,
)
X_test_final = pd.DataFrame(
    data=X_test_no_missing,
    index=X_test.index,
    columns=X_test.columns,
)

In [15]:
# Baseline ("null") model: predict the training-set mean of the log target for
# every test row, and report its test MSE as the score to beat.
train_mean = np.mean(y_train)
null_model = np.full(y_test.shape, train_mean)
mean_squared_error(y_test, null_model)

3.030131713656713

### Linear regression using statsmodels

In [16]:
# statsmodels' OLS does not add an intercept on its own, so append a constant
# column to the design matrix before fitting.
X_sm = sm.add_constant(X_train_final)
ols_spec = sm.OLS(y_train, X_sm)
model = ols_spec.fit()

  x = pd.concat(x[::order], 1)


In [17]:
# Show the regression table of the fitted OLS model.
model.summary()

0,1,2,3
Dep. Variable:,Amount raised $M,R-squared:,0.545
Model:,OLS,Adj. R-squared:,0.509
Method:,Least Squares,F-statistic:,15.16
Date:,"Mon, 09 May 2022",Prob (F-statistic):,6.250000000000001e-100
Time:,18:28:15,Log-Likelihood:,-1362.9
No. Observations:,875,AIC:,2856.0
Df Residuals:,810,BIC:,3166.0
Df Model:,64,,
Covariance Type:,nonrobust,,

0,1,2,3,4,5,6
,coef,std err,t,P>|t|,[0.025,0.975]
const,0.0243,1.261,0.019,0.985,-2.450,2.499
Years between graduation and startup launch,0.0060,0.007,0.889,0.374,-0.007,0.019
Woman CEO,0.0657,0.194,0.339,0.734,-0.314,0.446
# of Founders,0.1799,0.055,3.247,0.001,0.071,0.289
Deal Year_2020,-0.2627,0.175,-1.497,0.135,-0.607,0.082
Deal Year_2021,-0.6313,0.161,-3.919,0.000,-0.947,-0.315
Deal Year_2022,-0.1196,0.180,-0.665,0.506,-0.473,0.233
Country HQ_Benin,1.6575,1.407,1.178,0.239,-1.105,4.420
Country HQ_Botswana,-0.4550,1.722,-0.264,0.792,-3.834,2.924

0,1,2,3
Omnibus:,14.1,Durbin-Watson:,2.092
Prob(Omnibus):,0.001,Jarque-Bera (JB):,14.587
Skew:,0.314,Prob(JB):,0.00068
Kurtosis:,2.932,Cond. No.,1.32e+16


### Linear regression using sklearn

In [18]:
# Instantiate scikit-learn's ordinary least squares estimator and fit it on
# the imputed training features.
lr = LinearRegression()
lr.fit(X=X_train_final, y=y_train)

LinearRegression()

In [19]:
# Cross-validated MSE on the training split. The scorer returns negated MSE,
# hence the absolute value. Every fold beats the null model.
train_cv_scores = cross_val_score(lr, X_train_final, y_train, scoring='neg_mean_squared_error')
np.abs(train_cv_scores)

array([1.59293022, 1.80928828, 1.56049293, 1.33190586, 1.54890134])

In [20]:
# Cross-validation run on the test split. Note that cross_val_score refits a
# fresh copy of the estimator on folds of the test data, so these numbers do
# not score the model trained above. Note the significant variation in MSE
# across folds.
test_cv_scores = cross_val_score(lr, X_test_final, y_test, scoring='neg_mean_squared_error')
np.abs(test_cv_scores)

array([2.26377034, 1.60945662, 1.83175234, 1.88714319, 1.50047141])

In [21]:
# MSE of the linear regression on the training split (log-target scale).
# The explicit `squared=True` was dropped: it is the default, and the keyword
# is deprecated in recent scikit-learn releases.
mean_squared_error(y_train, lr.predict(X_train_final))

1.3194813254726898

In [22]:
# MSE of the linear regression on the held-out test split (log-target scale).
# The explicit `squared=True` was dropped: it is the default, and the keyword
# is deprecated in recent scikit-learn releases.
mean_squared_error(y_test, lr.predict(X_test_final))

1.6163602210641825

### Huber regression with GridSearch

In [23]:
# Huber regression with default hyper-parameters, fitted on the training
# split. The recorded run stopped at the solver's iteration limit.
hub = HuberRegressor().fit(X=X_train_final, y=y_train)

STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
  self.n_iter_ = _check_optimize_result("lbfgs", opt_res, self.max_iter)


In [24]:
# Test MSE of the Huber model with default parameters.
# The explicit `squared=True` was dropped: it is the default, and the keyword
# is deprecated in recent scikit-learn releases.
mean_squared_error(y_test, hub.predict(X_test_final))

1.5903346706637116

In [25]:
# Search space for the Huber grid search: candidate epsilon values and
# iteration caps for the solver.
params = dict(
    epsilon=[1, 1.1, 1.2, 1.35, 1.5, 1.75, 2, 2.5, 3, 3.25, 3.5, 4],
    max_iter=[100, 1000, 10000],
)

In [26]:
# 5-fold grid search over the Huber hyper-parameters, ranked by (negated) MSE.
grid_huber = GridSearchCV(
    estimator=HuberRegressor(),
    param_grid=params,
    scoring="neg_mean_squared_error",
    cv=5,
)

In [27]:
# The grid search refits HuberRegressor many times, and each fit that hits the
# iteration cap raises a ConvergenceWarning. Silence only that category, so
# that unrelated warnings later in the notebook stay visible.
import warnings
from sklearn.exceptions import ConvergenceWarning

warnings.filterwarnings('ignore', category=ConvergenceWarning)

In [28]:
# Run the grid search on the training split.
grid_huber.fit(X_train_final, y_train)

GridSearchCV(cv=5, estimator=HuberRegressor(),
             param_grid={'epsilon': [1, 1.1, 1.2, 1.35, 1.5, 1.75, 2, 2.5, 3,
                                     3.25, 3.5, 4],
                         'max_iter': [100, 1000, 10000]},
             scoring='neg_mean_squared_error')

In [29]:
# Hyper-parameter combination with the best cross-validated score.
grid_huber.best_params_

{'epsilon': 3.25, 'max_iter': 100}

In [30]:
# Training MSE of the best Huber model found by the grid search.
# The explicit `squared=True` was dropped: it is the default, and the keyword
# is deprecated in recent scikit-learn releases.
mean_squared_error(y_train, grid_huber.predict(X_train_final))

1.3343762179669856

In [31]:
# Test MSE of the tuned Huber model: marginally better on the test set than
# OLS and than Huber with default settings.
# The explicit `squared=True` was dropped: it is the default, and the keyword
# is deprecated in recent scikit-learn releases.
mean_squared_error(y_test, grid_huber.predict(X_test_final))

1.570905245534798

### Building an Ensemble Model

In [32]:
# Stand-alone instances of the other estimators, for trying each one on its
# own (see the commented-out cells that follow).
knn, forest, tree, huber = (
    KNeighborsRegressor(),
    RandomForestRegressor(),
    DecisionTreeRegressor(),
    HuberRegressor(),
)

In [33]:
# knn.fit(X_train_final, y_train)
# mean_squared_error(y_test, knn.predict(X_test_final), squared=True)

In [34]:
# forest.fit(X_train_final, y_train)
# mean_squared_error(y_test, forest.predict(X_test_final), squared=True)

In [35]:
# tree.fit(X_train_final, y_train)
# mean_squared_error(y_test, tree.predict(X_test_final), squared=True)

In [36]:
# Voting ensemble over five base regressors.
# The random forest and the decision tree are stochastic, so both get a fixed
# random_state (the same seed as the train/test split). This makes the
# reported test score and the pickled model repeatable across
# Restart & Run All.
ensemble = VotingRegressor([
    ('knn', KNeighborsRegressor()),
    ('lr', LinearRegression()),
    ('forest', RandomForestRegressor(random_state=42)),
    ('huber', HuberRegressor()),
    ('tree', DecisionTreeRegressor(random_state=42))
])

In [37]:
# Fit the voting ensemble on the training split.
ensemble.fit(X_train_final, y_train)

VotingRegressor(estimators=[('knn', KNeighborsRegressor()),
                            ('lr', LinearRegression()),
                            ('forest', RandomForestRegressor()),
                            ('huber', HuberRegressor()),
                            ('tree', DecisionTreeRegressor())])

In [38]:
# Test MSE of the voting ensemble -- even better performance than the single
# models above.
# The explicit `squared=True` was dropped: it is the default, and the keyword
# is deprecated in recent scikit-learn releases.
ens_preds = ensemble.predict(X_test_final)
mean_squared_error(y_test, ens_preds)

1.3867390475056334

#### Augmenting model performance using GridSearch and weights

In [39]:
# ens_params = {
#    'knn__n_neighbors': [5,6,7,8,9,10],
#    'forest__max_depth': [2,3,4],
#    'tree__max_depth': [2,3,4],
#    'huber__epsilon': [1, 1.5, 2.5, 3.5],
#    'weights': [[.2,.2,.2,.2,.2], [.1, .05,.4,.6, .05], [0.1, 0.15, .35, .3, 0.1]]
# }

In [40]:
# ens_grid = GridSearchCV(ensemble, param_grid=ens_params, cv=5, scoring = 'neg_mean_squared_error')

In [41]:
# ens_grid.fit(X_train_final, y_train)

In [42]:
# ens_grid.best_params_

In [43]:
# ens_grid_preds = ens_grid.predict(X_test_final)  # score the tuned grid-search model, not the untuned ensemble
# mean_squared_error(y_test, ens_grid_preds)

### Building a Neural Net

In [44]:
# Feed-forward net: three 100-unit ReLU hidden layers followed by a single
# linear output unit for the regression target.
network = Sequential([
    Dense(100, activation='relu'),
    Dense(100, activation='relu'),
    Dense(100, activation='relu'),
    Dense(1, activation='linear'),
])

2022-05-09 18:28:54.568789: I tensorflow/core/platform/cpu_feature_guard.cc:151] This TensorFlow binary is optimized with oneAPI Deep Neural Network Library (oneDNN) to use the following CPU instructions in performance-critical operations:  AVX2 FMA
To enable them in other operations, rebuild TensorFlow with the appropriate compiler flags.


In [45]:
# Train against mean absolute error, and track MSE as a metric so that the net
# can be compared with the scikit-learn models above.
network.compile(
    optimizer='adam',
    loss='mae',
    metrics=['mse'],
)

In [46]:
# Train for 20 epochs, with the test split used as validation data.
# Author's observation: the neural net performs worse than the ensemble model.
history = network.fit(
    X_train_final,
    y_train,
    epochs=20,
    validation_data=(X_test_final, y_test),
)

Epoch 1/20
Epoch 2/20
Epoch 3/20
Epoch 4/20
Epoch 5/20
Epoch 6/20
Epoch 7/20
Epoch 8/20
Epoch 9/20
Epoch 10/20
Epoch 11/20
Epoch 12/20
Epoch 13/20
Epoch 14/20
Epoch 15/20
Epoch 16/20
Epoch 17/20
Epoch 18/20
Epoch 19/20
Epoch 20/20


In [47]:
# Persist the fitted voting ensemble to disk.
import pickle

with open('ensemble_model.pkl', 'wb') as pkl_file:
    pickle.dump(ensemble, pkl_file)

### Conclusion

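Our analysis revealed that the ensemble model is the most accurate predictor of deal size: in the recorded run its test-set MSE on the log-transformed amount was about 1.39, compared with 1.62 for linear regression, 1.57 for the tuned Huber regressor and 3.03 for the null model. Surprisingly, the artificial neural network was a weaker estimator.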