In [11]:
# Load all the data and transform it into lists of strings

import pandas as pd

def intoList(seq_string):
    """Split a comma-separated string into a list of its parts.

    The parts are returned exactly as they appear between the commas: no
    whitespace is stripped, and an empty string yields [''].
    """
    parts = seq_string.split(',')
    return parts

def intoConverters(list_of_columns):
    """Map every given column name to the intoList converter.

    The resulting dict is passed as the `converters` argument of pd.read_csv.
    """
    converters = {}
    for column in list_of_columns:
        converters[column] = intoList
    return converters

def successfulAndUnsuccessfulFromSheet(sheet_name, columns):
    """Read the successful and unsuccessful CSV exports of one sheet.

    Parameters:
        sheet_name: suffix of the CSV file names (e.g. 'List', 'Academic').
        columns: names of the columns whose cells are parsed with intoList.

    Returns:
        A (successful, unsuccessful) tuple of dataframes.
    """
    converters = intoConverters(columns)

    successful_path = 'Moneyball, Successful Companies (Internship data) - ' + sheet_name + '.csv'
    unsuccessful_path = 'Moneyball, Unsuccessful Companies (Intership data) - ' + sheet_name + '.csv'

    successful = pd.read_csv(successful_path, converters=converters)
    unsuccessful = pd.read_csv(unsuccessful_path, converters=converters)

    # Every sheet of the successful startups except 'List' ends with two
    # unnecessary rows; drop the rows labelled len-2 and len-1
    if sheet_name != 'List':
        row_count = len(successful)
        successful = successful.drop(index=[row_count - 2, row_count - 1])

    return (successful, unsuccessful)

# Columns of each sheet whose cells are parsed into lists of strings
academic_columns = [
    'universities_of_founders',
    'degrees_of_founders',
    'subject_degrees_of_founders',
    'gender_of_founders',
    'city_of_founders',
]
list_columns = [
    'founded_year',
    'country_code',
    'city',
    'category_list',
    'category_groups_list',
]
work_columns = ['prev_companies_of_founders', 'prev_title_of_founders']
investor_columns = ['investor_name']

# Each call yields the (successful, unsuccessful) dataframes of one sheet
s_list, u_list = successfulAndUnsuccessfulFromSheet('List', list_columns)
s_academic, u_academic = successfulAndUnsuccessfulFromSheet('Academic', academic_columns)
s_work, u_work = successfulAndUnsuccessfulFromSheet('Work', work_columns)
s_investor, u_investor = successfulAndUnsuccessfulFromSheet('Investor', investor_columns)

In [12]:
# Merge dataframes for different sheets in the table

def mergeDataframes(list_of_dfs):
    """Merge all dataframes of the list into one, joining on the 'name' column.

    The first dataframe is the starting point; every following dataframe is
    merged in as the left operand, so the columns of later dataframes come
    before those of earlier ones. A one-element list returns that dataframe
    itself; an empty list raises IndexError.
    """
    merged = list_of_dfs[0]
    for next_df in list_of_dfs[1:]:
        merged = pd.merge(next_df, merged, on='name')
    return merged

# Sheets that take part in the merge, for the successful startups
s_include = [s_investor, s_list, s_work, s_academic]
successful = mergeDataframes(s_include)

# The same sheets, in the same order, for the unsuccessful startups
u_include = [u_investor, u_list, u_work, u_academic]
unsuccessful = mergeDataframes(u_include)

In [13]:
# Combine the successful and unsuccessful startups' data and create labels

# Stack both groups into one frame with a fresh 0..n-1 index
x = pd.concat([successful, unsuccessful], ignore_index=True)

# Labels follow the same row order: successful rows first (True), then unsuccessful (False)
y_successful = [True] * len(successful)
y_unsuccessful = [False] * len(unsuccessful)
y = y_successful + y_unsuccessful

In [14]:
# Fix a problem in the dataset:
#   Some universities have different names in the successful and unsuccessful datasets
#
# Solve this by choosing one of the names and changing all its appearances into the other one

def fixUniversities(universities):
    """Return a copy of the list with known alternative university names unified.

    Names that have an entry in the alias table are replaced by the chosen
    spelling; every other name is kept unchanged.
    """
    # The ' Los Angeles' entries keep their leading space: they are compared
    # against list items exactly as they appear in the input
    aliases = {
        'Massachusetts Institute of Technology - MIT': 'Massachusetts Institute of Technology',
        ' Los Angeles (UCLA)': ' Los Angeles',
        'Caltech - California Institute of Technology': 'Caltech',
        'University of Illinois at Urbana-Champaign (UIUC)': 'University of Illinois at Urbana-Champaign',
    }
    return [aliases.get(university, university) for university in universities]

# The academic sheet may have been left out of the merge, so only fix the column when present
if 'universities_of_founders' in x:
    fixed_universities = x['universities_of_founders'].apply(fixUniversities)
    x['universities_of_founders'] = fixed_universities

In [15]:
# Create new feature 'number_of_founders'
# As the data has a lot of missing values, get values from two sources and then take their maximum
#
# The sources are:
# 1. 'universities_of_founders' - the length of this list should be the same as the number of founders
# 2. 'prev_companies_of_founders' - the founders' own startup is also included in this list, so
# counting its appearances in the list should give the number of founders

# Source 2: how many times the startup's own name appears among the
# previous companies of its founders
founders_from_work = [
    companies.count(startup_name)
    for companies, startup_name in zip(x['prev_companies_of_founders'], x['name'])
]

# Source 1: length of the universities list
# NOTE(review): intoList('') yields [''], so an empty universities cell counts
# as length 1 here - confirm that this is intended
founders_from_university = [len(universities) for universities in x['universities_of_founders']]

# Take the larger of the two estimates for every row
x['number_of_founders'] = [
    max(from_work, from_university)
    for from_work, from_university in zip(founders_from_work, founders_from_university)
]

# Store the feature as a one-element list of strings, bucketing everything above 5
x['number_of_founders'] = x['number_of_founders'].apply(
    lambda num: [str(num)] if num <= 5 else ['More than 5']
)

In [None]:
# Remove certain features

# 'name' was still needed above to count the founders; it is not used as a feature
x = x.drop(columns=['name', 'founded_year', 'country_code'])

In [16]:
# One-hot encode each feature column

# Original feature names; each becomes the prefix of its indicator columns
start_columns = x.columns

encoded_parts = []
for start_column in start_columns:
    # Assumes every cell holds a list of strings: join the items with '|'
    # so that str.get_dummies can split them into indicator columns
    dummies = x[start_column].str.join('|').str.get_dummies()
    # Prefix the new columns directly instead of detecting them by name
    # afterwards: a value that itself starts with a feature name (e.g. a
    # company whose name begins with "city") would otherwise stay unprefixed
    encoded_parts.append(dummies.add_prefix(start_column + ': '))

# Replace the list-valued columns by their indicator columns, in feature order
if encoded_parts:
    x = pd.concat(encoded_parts, axis=1)

In [None]:
# Remove this specific value because it appears only in the unsuccessful dataset

x = x.drop(columns=['category_groups_list: Other'])

In [52]:
# Split the data into a training and a test set

from sklearn.model_selection import train_test_split
# Hold out 20% of the rows for testing; the fixed seed keeps the split repeatable
x_train, x_test, y_train, y_test = train_test_split(
    x,
    y,
    test_size=0.2,
    random_state=170,
)

In [53]:
# k-Nearest Neighbours model

from sklearn.neighbors import KNeighborsClassifier

# Fit a k-Nearest Neighbours classifier with its default settings
neigh = KNeighborsClassifier()
neigh.fit(x_train, y_train)

# Mean accuracy on the training and the held-out data
knn_train_score = neigh.score(x_train, y_train)
knn_test_score = neigh.score(x_test, y_test)

print()
print('Training Dataset Score, k-Nearest Neighbours:', knn_train_score)
print('Test Dataset Score, k-Nearest Neighbours:', knn_test_score)


Training Dataset Score, k-Nearest Neighbours: 0.7105575326215896
Test Dataset Score, k-Nearest Neighbours: 0.7014218009478673


In [54]:
# Random Forest model

from sklearn.ensemble import RandomForestClassifier

# Fit a Random Forest with default hyper-parameters. The forest is built with
# randomness, so seed it (same value as the train/test split) to make the
# printed scores repeatable from run to run.
forest = RandomForestClassifier(random_state=170).fit(x_train, y_train)
print()
# Mean accuracy on the training and the held-out data
print('Training Dataset Score, Random Forest:', forest.score(x_train, y_train))
print('Test Dataset Score, Random Forest:', forest.score(x_test, y_test))


Training Dataset Score, Random Forest: 1.0
Test Dataset Score, Random Forest: 0.7298578199052133


In [57]:
# Logistic Regression model

from sklearn.linear_model import LogisticRegression
import numpy

# Fit a Logistic Regression model with C=0.1
logistic_regression = LogisticRegression(C=0.1)
logistic_regression.fit(x_train, y_train)

# Mean accuracy on the training and the held-out data
lr_train_score = logistic_regression.score(x_train, y_train)
lr_test_score = logistic_regression.score(x_test, y_test)

print()
print('Training Dataset Score, Logistic Regression:', lr_train_score)
print('Test Dataset Score, Logistic Regression:', lr_test_score)

# Output the features sorted by their coefficients, largest first

print()
print('Features ordered by their coefficients in the model:')
with numpy.printoptions(threshold=numpy.inf):
    # First row of coef_ as plain floats, paired by position with the columns of x
    coefficients = logistic_regression.coef_.tolist()[0]
    v = sorted(
        ((val, x.columns[i]) for i, val in enumerate(coefficients)),
        reverse=True,
    )
    print(v)


Training Dataset Score, Logistic Regression: 0.9608540925266904
Test Dataset Score, Logistic Regression: 0.7582938388625592

Features ordered by their coefficients in the model:
[(1.0675928815042581, 'prev_title_of_founders: Board Member'), (0.408939392093113, 'category_list: SaaS'), (0.3723971367794686, 'subject_degrees_of_founders: Computer Science'), (0.3694804776006485, 'investor_name: Accel'), (0.3605489254778517, 'category_list: Enterprise Software'), (0.3383494071452607, 'city_of_founders: San Francisco'), (0.33311908636938314, 'category_list: Financial Services'), (0.3269444662444348, 'prev_title_of_founders: Advisor'), (0.31231556473568745, 'investor_name: GV'), (0.31222885920922483, 'number_of_founders: More than 5'), (0.3083381875614447, 'universities_of_founders: Stanford University'), (0.30015887906773764, 'investor_name: SV Angel'), (0.2981881738426729, 'degrees_of_founders: Bachelor'), (0.29682248155150587, 'category_groups_list: Transportation'), (0.2900363521521912, '