# Table of Contents:
* [Modelling](#model)
    * [Logistic Regression](#logregmodel)
    * [Decision Tree](#dtmodel)
    * [Random Forest](#rfmodel)
    * [Support Vector Machine](#svmmodel)

In [None]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns

from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler, MinMaxScaler
from sklearn.model_selection import cross_val_score
from sklearn.pipeline import Pipeline
from sklearn.model_selection import GridSearchCV

from sklearn.linear_model import LogisticRegression
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.svm import LinearSVC
from sklearn.svm import SVC
from sklearn.neighbors import KNeighborsClassifier

# Modelling <a class="anchor" id="model"></a>

In [2]:
# Load the preprocessed companies table; index_col=0 uses the first CSV column as the row index.
df = pd.read_csv('../Data/companies_preprocessed.csv', index_col=0)

In [3]:
# Allow up to 500 columns to be shown when a wide frame is displayed.
pd.options.display.max_columns = 500

In [4]:
# Preview the first rows to check that the columns loaded as expected.
df.head()

Unnamed: 0,funding_rounds,Advertising,Analytics,Apps,Big Data,Biotechnology,Clean Technology,Cloud Computing,Consulting,Curated Web,E-Commerce,Education,Enterprise Software,Entertainment,Fashion,Games,Hardware + Software,Health Care,Health and Wellness,Hospitality,Information Technology,Internet,Manufacturing,Marketplaces,Media,Medical,Messaging,Mobile,Music,Networking,News,Real Estate,Retail,SaaS,Sales and Marketing,Search,Security,Semiconductors,Services,Social Media,Social Network Media,Software,Sports,Startups,Technology,Travel,Video,Web Hosting,Other,company_age_days,country_is_USA,Austin,Beijing,Boston,Cambridge,Chicago,London,Los Angeles,Mountain View,New York,Palo Alto,Paris,San Diego,San Francisco,Seattle,successful,Anything_Finance
0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1878,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0
1,2,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,623,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0
2,1,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,3278,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,1,0
3,2,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,6930,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0
4,4,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1817,1,0,0,0,0,0,0,0,1,0,0,0,0,0,0,1,0


In [5]:
# (rows, columns) of the full dataset, target column included
df.shape

(46961, 67)

In [6]:
# Separate the target column ('successful') from the feature columns.
y = df['successful']
X = df.drop('successful', axis=1)

In [7]:
# Feature matrix: same rows as df, one column fewer (the target was dropped)
X.shape

(46961, 66)

In [8]:
# Target vector: one label per row
y.shape

(46961,)

In [9]:
# Hold out 20% of the rows for testing.
# random_state pins the shuffle so the split, and every score computed from it,
# is reproducible after a kernel restart. stratify=y keeps the proportion of
# each `successful` class the same in the training and test splits.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=42, stratify=y
)

## Logistic Regression Model <a class="anchor" id="logregmodel"></a>

In [10]:
# Baseline model: logistic regression trained on the training split.
# max_iter is set to 10000 (presumably so the solver can converge on these features).
logreg_model = LogisticRegression(max_iter=10000).fit(X_train, y_train)

# Mean accuracy on each split
lr_train_score = logreg_model.score(X_train, y_train)
lr_test_score = logreg_model.score(X_test, y_test)

print(f"Score on training set: {lr_train_score}")
print(f"Score on test set: {lr_test_score}")

[0.7169284  0.71320202 0.70588235 0.71156662 0.71609211]
Score on training set: 0.7155025553662692
Score on test set: 0.7066964760992228


In [11]:
# creating a dataframe of the coefficients
coef = pd.DataFrame(logreg_model.coef_,  columns = X.columns)

In [12]:
# Display the coefficients (one column per feature)
coef

Unnamed: 0,funding_rounds,Advertising,Analytics,Apps,Big Data,Biotechnology,Clean Technology,Cloud Computing,Consulting,Curated Web,E-Commerce,Education,Enterprise Software,Entertainment,Fashion,Games,Hardware + Software,Health Care,Health and Wellness,Hospitality,Information Technology,Internet,Manufacturing,Marketplaces,Media,Medical,Messaging,Mobile,Music,Networking,News,Real Estate,Retail,SaaS,Sales and Marketing,Search,Security,Semiconductors,Services,Social Media,Social Network Media,Software,Sports,Startups,Technology,Travel,Video,Web Hosting,Other,company_age_days,country_is_USA,Austin,Beijing,Boston,Cambridge,Chicago,London,Los Angeles,Mountain View,New York,Palo Alto,Paris,San Diego,San Francisco,Seattle,Anything_Finance
0,0.693935,-0.062138,0.033443,-0.175068,0.02509,0.300542,0.050111,-0.001352,-0.076865,-0.273669,-0.3022,-0.204929,0.13728,-0.06016,-0.070345,-0.164115,-0.032913,0.094753,-0.060203,-0.069946,-0.016144,-0.125353,-0.043431,-0.149207,-0.035455,-0.018125,-0.057693,-0.258516,-0.109287,-0.038986,-0.047736,-0.053649,-0.037044,-0.107775,-0.046087,-0.078685,0.064046,0.077847,-0.062723,-0.372481,-0.135452,-0.131515,-0.11214,-0.076141,-0.048855,-0.088818,-0.032815,0.025352,-0.310522,0.000285,0.259026,0.008059,0.10249,0.048082,0.071261,-0.004142,-0.225841,-0.044188,0.070491,0.112057,0.074645,-0.017921,0.033977,0.250345,0.018317,0.007385


In [13]:
# adding the odds ratios to the dataframe to look at them side by side with the coefficients
coef.loc[1] = np.exp(logreg_model.coef_)[0]
coef.rename({0:'coefficients', 1:'odds_ratios'}, inplace=True)

In [14]:
# sorting by coefficients
coef.sort_values(axis=1, by='coefficients', ascending=False)

Unnamed: 0,funding_rounds,Biotechnology,country_is_USA,San Francisco,Enterprise Software,New York,Beijing,Health Care,Semiconductors,Palo Alto,Cambridge,Mountain View,Security,Clean Technology,Boston,San Diego,Analytics,Web Hosting,Big Data,Seattle,Austin,Anything_Finance,company_age_days,Cloud Computing,Chicago,Information Technology,Paris,Medical,Video,Hardware + Software,Media,Retail,Networking,Manufacturing,Los Angeles,Sales and Marketing,News,Technology,Real Estate,Messaging,Entertainment,Health and Wellness,Advertising,Services,Hospitality,Fashion,Startups,Consulting,Search,Travel,SaaS,Music,Sports,Internet,Software,Social Network Media,Marketplaces,Games,Apps,Education,London,Mobile,Curated Web,E-Commerce,Other,Social Media
coefficients,0.693935,0.300542,0.259026,0.250345,0.13728,0.112057,0.10249,0.094753,0.077847,0.074645,0.071261,0.070491,0.064046,0.050111,0.048082,0.033977,0.033443,0.025352,0.02509,0.018317,0.008059,0.007385,0.000285,-0.001352,-0.004142,-0.016144,-0.017921,-0.018125,-0.032815,-0.032913,-0.035455,-0.037044,-0.038986,-0.043431,-0.044188,-0.046087,-0.047736,-0.048855,-0.053649,-0.057693,-0.06016,-0.060203,-0.062138,-0.062723,-0.069946,-0.070345,-0.076141,-0.076865,-0.078685,-0.088818,-0.107775,-0.109287,-0.11214,-0.125353,-0.131515,-0.135452,-0.149207,-0.164115,-0.175068,-0.204929,-0.225841,-0.258516,-0.273669,-0.3022,-0.310522,-0.372481
odds_ratios,2.001577,1.350591,1.295668,1.284469,1.147149,1.118576,1.107926,1.099388,1.080957,1.077502,1.073861,1.073035,1.066141,1.051388,1.049257,1.034561,1.034009,1.025676,1.025408,1.018486,1.008092,1.007412,1.000285,0.998649,0.995867,0.983986,0.982238,0.982038,0.967718,0.967622,0.965166,0.963633,0.961765,0.957499,0.956774,0.954959,0.953386,0.95232,0.947765,0.94394,0.941614,0.941574,0.939753,0.939204,0.932445,0.932072,0.926685,0.926014,0.924331,0.915012,0.89783,0.896473,0.893919,0.882185,0.876766,0.873321,0.861391,0.848645,0.8394,0.814705,0.797845,0.772196,0.760583,0.73919,0.733064,0.689023


In [15]:
# TODO: add a confusion matrix for the logistic regression model

In [16]:
# TODO: report other evaluation metrics besides accuracy (e.g. precision, recall, F1)

## Decision Tree Model <a class="anchor" id="dtmodel"></a>

In [17]:
# Decision tree with its depth capped at 7.
# random_state is fixed because scikit-learn randomly permutes the candidate
# features at each split, so an unseeded tree can differ from run to run.
dt_model = DecisionTreeClassifier(max_depth=7, random_state=42)

# fit it to training data
dt_model.fit(X_train, y_train)

# mean accuracy on each split
dt_train_score = dt_model.score(X_train, y_train)
dt_test_score = dt_model.score(X_test, y_test)

print(f"Score on training set: {dt_train_score}")
print(f"Score on test set: {dt_test_score}")

Score on training set: 0.7276405451448041
Score on test set: 0.7155328436069414


In [18]:
# Even without any hyperparameter optimization, this tree model scores about 1 percentage point higher on the test set than the logistic regression.

## Random Forest Model <a class="anchor" id="rfmodel"></a>

In [19]:
# Random forest: 50 trees, each limited to depth 9.
# random_state is fixed so the bootstrap samples and per-split feature
# subsets, and therefore the scores, are reproducible between runs.
rf_model = RandomForestClassifier(n_estimators=50, max_depth=9, random_state=42)

# fit
rf_model.fit(X_train, y_train)

# mean accuracy on each split
rf_train_score = rf_model.score(X_train, y_train)
rf_test_score = rf_model.score(X_test, y_test)

print(f"Score on training set: {rf_train_score}")
print(f"Score on test set: {rf_test_score}")

Score on training set: 0.728199531516184
Score on test set: 0.715213456829554


In [20]:
# No better than the decision tree: the test accuracy is essentially the same.

## Support Vector Machine Model <a class="anchor" id="svmmodel"></a>

In [21]:
# Linear support vector classifier; dual='auto' lets scikit-learn choose
# between the primal and dual formulation.
svm_model = LinearSVC(dual='auto').fit(X_train, y_train)

# Mean accuracy on each split
svm_train_score = svm_model.score(X_train, y_train)
svm_test_score = svm_model.score(X_test, y_test)

print(f"Score on training set: {svm_train_score}")
print(f"Score on test set: {svm_test_score}")

Score on training set: 0.714118398637138
Score on test set: 0.7062706270627063


In [11]:
# Min-max scale the features; the scaler is fit on the training split only
# and then applied to both splits.
minmax_scaler = MinMaxScaler()
minmax_scaler.fit(X_train)

# Store the scaled data under its own names instead of overwriting
# X_train / X_test. Overwriting replaced the original DataFrames with scaled
# arrays, so any cell run afterwards silently trained on different data.
X_train_minmax = minmax_scaler.transform(X_train)
X_test_minmax = minmax_scaler.transform(X_test)

# SVC with default settings, trained on the min-max-scaled features
svc_model = SVC()
svc_model.fit(X_train_minmax, y_train)

# mean accuracy on each split
svc_train_score = svc_model.score(X_train_minmax, y_train)
svc_test_score = svc_model.score(X_test_minmax, y_test)

print(f"Score on training set: {svc_train_score}")
print(f"Score on test set: {svc_test_score}")

Score on training set: 0.7284124787052811
Score on test set: 0.6822101564995209


In [10]:
# Standardize the features; the scaler is fit on the training split only
# and then applied to both splits.
standard_scaler = StandardScaler()
standard_scaler.fit(X_train)

# Store the scaled data under its own names instead of overwriting
# X_train / X_test. Overwriting made the result depend on which scaling
# cells had already been run and in what order.
X_train_std = standard_scaler.transform(X_train)
X_test_std = standard_scaler.transform(X_test)

# SVC with default settings, trained on the standardized features
svc_model = SVC()
svc_model.fit(X_train_std, y_train)

# mean accuracy on each split
svc_train_score = svc_model.score(X_train_std, y_train)
svc_test_score = svc_model.score(X_test_std, y_test)

print(f"Score on training set: {svc_train_score}")
print(f"Score on test set: {svc_test_score}")

Score on training set: 0.75037265758092
Score on test set: 0.7056318535079315


In [None]:
# TODO: go through the model evaluation notebook and plot ROC curves

## GridSearch

## Neural Network