In [1]:
    !pip install pandas
!pip install numpy
!pip install matplotlib
!pip install pyspark



In [2]:
import pandas as pd
import numpy as np
# Load functionality to manipulate dataframes
from pyspark.sql import functions as fn
import matplotlib.pyplot as plt
from pyspark.sql.functions import stddev, mean, col
from pyspark.sql import SQLContext
from pyspark.sql import SparkSession
# Functionality for computing features
from pyspark.ml import feature, regression, classification, Pipeline
from pyspark.ml.classification import LogisticRegression
from pyspark.ml import feature, regression, classification, Pipeline
from pyspark.ml.feature import Tokenizer, VectorAssembler, HashingTF, Word2Vec, StringIndexer, OneHotEncoder
from pyspark.ml import clustering
from itertools import chain
from pyspark.ml.linalg import Vectors, VectorUDT
from pyspark.ml import classification
from pyspark.ml.classification import LogisticRegression, RandomForestClassifier, DecisionTreeClassifier
from pyspark.ml import evaluation
from pyspark.ml.evaluation import BinaryClassificationEvaluator,MulticlassClassificationEvaluator
from pyspark import keyword_only
from pyspark.ml import Transformer
from pyspark.ml.param.shared import HasInputCol, HasOutputCol, Param

In [3]:
from google.colab import files
uploaded = files.upload()

Saving master.csv to master.csv


In [4]:
MAX_MEMORY = "45g"

spark = SparkSession \
    .builder \
    .appName("how to read csv file") \
    .config("spark.executor.memory", MAX_MEMORY) \
    .config("spark.driver.memory", MAX_MEMORY) \
    .getOrCreate()

In [5]:
# load master dataset
dfmaster = spark.read.format("csv").load("master.csv", delimiter = ",", header = True)


Preparations and Understanding before Modeling

In [6]:
# create a 0/1 column for acquistions
dfmaster = dfmaster.\
  withColumn("labelacq", fn.when(col("status") == "acquired","1").otherwise("0"))

In [7]:
# number of rows in master table
print(dfmaster.count())

49445


In [8]:
dfmaster

DataFrame[_c0: string, permalink: string, name: string, market: string, funding_total_usd: string, status: string, country_code: string, city: string, funding_rounds: string, founded_year: string, quarter_new: string, age: string, permalink_agg: string, count_investor: string, time_to_first_funding: string, investor_country_codes: string, funding_round_types: string, total_raised_usd: string, permalink_sub: string, category_final: string, perma: string, investor_country_code: string, permaround: string, funding_round_type: string, labelacq: string]

NAs and market column (with too many levels) handeling

In [9]:
# check for missing values 
dfmaster.toPandas().isnull().sum()

_c0                           0
permalink                     1
name                          1
market                     3967
funding_total_usd             1
status                     1315
country_code               5273
city                       6116
funding_rounds                1
founded_year              10956
quarter_new               10956
age                       10956
permalink_agg                 1
count_investor                1
time_to_first_funding     24731
investor_country_codes    20954
funding_round_types       17110
total_raised_usd          49435
permalink_sub                 1
category_final            45459
perma                     20954
investor_country_code     20954
permaround                17110
funding_round_type        17110
labelacq                      0
dtype: int64

In [10]:
# drop market columns because of too many level and better breakdown with the category_final column
dfmaster1 = dfmaster.drop("market")

In [11]:
dfmaster1 = dfmaster1.toPandas()
dfmaster1

Unnamed: 0,_c0,permalink,name,funding_total_usd,status,country_code,city,funding_rounds,founded_year,quarter_new,age,permalink_agg,count_investor,time_to_first_funding,investor_country_codes,funding_round_types,total_raised_usd,permalink_sub,category_final,perma,investor_country_code,permaround,funding_round_type,labelacq
0,0,/organization/1lay,1Lay,170000,operating,,,1,2013,Q3,1.5,/organization/1lay,0.0,,,,,/organization/1lay,,,,,,0
1,1,/organization/24pagebooks,24PageBooks,50000,closed,USA,Rochester,1,2010,Q1,5.0,/organization/24pagebooks,1.0,1.25,FRA,seed,,/organization/24pagebooks,,/organization/24pagebooks,FRA,/organization/24pagebooks,seed,0
2,2,/organization/5min,5min Media,12800000,acquired,USA,New York,3,2007,Q1,8.0,/organization/5min,3.0,0.75,"USA, USA, USA","venture, venture, venture",,/organization/5min,,/organization/5min,USA,/organization/5min,venture,1
3,3,/organization/abpathfinder,ABPathfinder,960000,operating,USA,Overland Park,3,2010,Q2,4.75,/organization/abpathfinder,4.0,2.5,"USA, USA, USA","venture, venture, venture, venture",,/organization/abpathfinder,,/organization/abpathfinder,USA,/organization/abpathfinder,venture,0
4,4,/organization/acid-labs,Acid Labs,-,operating,USA,Santa Monica,1,2003,Q1,12.0,/organization/acid-labs,0.0,,,,,/organization/acid-labs,,,,,,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
49440,49440,/organization/yieldmo,Yieldmo,22100000,operating,USA,New York,3,2012,Q2,2.75,/organization/yieldmo,12.0,0.25,"USA, USA, USA, USA, USA, ISR, USA, USA, USA, U...","venture, venture, venture, venture, venture, v...",,/organization/yieldmo,,/organization/yieldmo,USA,/organization/yieldmo,venture,0
49441,49441,/organization/youxinpai,Youxinpai,30000000,operating,CHN,Beijing,1,2011,Q3,3.5,/organization/youxinpai,4.0,1.75,"CHN, USA, CHN","venture, venture, venture, venture",,/organization/youxinpai,,/organization/youxinpai,CHN,/organization/youxinpai,venture,0
49442,49442,/organization/zeropercent-us,ZeroPercent.us,-,operating,USA,Chicago,1,2011,Q3,3.5,/organization/zeropercent-us,1.0,2.5,USA,seed,,/organization/zeropercent-us,,/organization/zeropercent-us,USA,/organization/zeropercent-us,seed,0
49443,49443,/organization/zinch,Zinch,5486842,acquired,USA,Orem,3,2007,Q1,8.0,/organization/zinch,3.0,0.75,"USA, USA","venture, venture, angel",,/organization/zinch,,/organization/zinch,USA,/organization/zinch,venture,1


In [12]:
# Replace NaN with mode for categorical variables
dfmaster1['total_raised_usd'] = dfmaster1['total_raised_usd'].fillna(dfmaster1['total_raised_usd'].mode()[0])
dfmaster1['time_to_first_funding'] = dfmaster1['time_to_first_funding'].fillna(dfmaster1['time_to_first_funding'].mode()[0])
dfmaster1['founded_year'] = dfmaster1['founded_year'].fillna(dfmaster1['founded_year'].mode()[0])
dfmaster1['age'] = dfmaster1['age'].fillna(dfmaster1['age'].mode()[0])
dfmaster1['status'] = dfmaster1['status'].fillna(dfmaster1['status'].mode()[0])
dfmaster1['country_code'] = dfmaster1['country_code'].fillna(dfmaster1['country_code'].mode()[0])
dfmaster1['city'] = dfmaster1['city'].fillna(dfmaster1['city'].mode()[0])
dfmaster1['quarter_new'] = dfmaster1['quarter_new'].fillna(dfmaster1['quarter_new'].mode()[0])
dfmaster1['investor_country_codes'] = dfmaster1['investor_country_codes'].fillna(dfmaster1['investor_country_codes'].mode()[0])
dfmaster1['funding_round_types'] = dfmaster1['funding_round_types'].fillna(dfmaster1['funding_round_types'].mode()[0])
dfmaster1['permaround'] = dfmaster1['permaround'].fillna(dfmaster1['permaround'].mode()[0])
dfmaster1['investor_country_code'] = dfmaster1['investor_country_code'].fillna(dfmaster1['investor_country_code'].mode()[0])
dfmaster1['funding_round_type'] = dfmaster1['funding_round_type'].fillna(dfmaster1['funding_round_type'].mode()[0])
dfmaster1['category_final'] = dfmaster1['category_final'].fillna(dfmaster1['category_final'].mode()[0])
dfmaster1['perma'] = dfmaster1['perma'].fillna(dfmaster1['perma'].mode()[0])

In [13]:
# check for missing values 
dfmaster1.isnull().sum()

_c0                       0
permalink                 1
name                      1
funding_total_usd         1
status                    0
country_code              0
city                      0
funding_rounds            1
founded_year              0
quarter_new               0
age                       0
permalink_agg             1
count_investor            1
time_to_first_funding     0
investor_country_codes    0
funding_round_types       0
total_raised_usd          0
permalink_sub             1
category_final            0
perma                     0
investor_country_code     0
permaround                0
funding_round_type        0
labelacq                  0
dtype: int64

In [14]:
# drop rows with missing values
dfmaster1drop = dfmaster1.dropna()

In [15]:
print(dfmaster1drop.count())

_c0                       49444
permalink                 49444
name                      49444
funding_total_usd         49444
status                    49444
country_code              49444
city                      49444
funding_rounds            49444
founded_year              49444
quarter_new               49444
age                       49444
permalink_agg             49444
count_investor            49444
time_to_first_funding     49444
investor_country_codes    49444
funding_round_types       49444
total_raised_usd          49444
permalink_sub             49444
category_final            49444
perma                     49444
investor_country_code     49444
permaround                49444
funding_round_type        49444
labelacq                  49444
dtype: int64


In [16]:
sql = SQLContext(spark)

In [17]:
dfmaster2 = sql.createDataFrame(dfmaster1drop)

In [18]:
display(dfmaster2)

DataFrame[_c0: string, permalink: string, name: string, funding_total_usd: string, status: string, country_code: string, city: string, funding_rounds: string, founded_year: string, quarter_new: string, age: string, permalink_agg: string, count_investor: string, time_to_first_funding: string, investor_country_codes: string, funding_round_types: string, total_raised_usd: string, permalink_sub: string, category_final: string, perma: string, investor_country_code: string, permaround: string, funding_round_type: string, labelacq: string]

String indexer, one hot encoder and casting to numerics

In [19]:
# create index for categorical variables
# use pipline to apply indexer
list1 = ["country_code","city","quarter_new","investor_country_code","funding_round_type","category_final"]
indexers = [StringIndexer(inputCol=column, outputCol=column+"_index").fit(dfmaster2) for column in list1]
pipelineindex = Pipeline(stages=indexers).fit(dfmaster2)
dfmasternew = pipelineindex.transform(dfmaster2)

In [20]:
# convert string to double for numerical variables
dfmasternew = dfmasternew.\
  withColumn("numeric_funding_rounds", dfmasternew["funding_rounds"].cast("int")).\
  withColumn("numeric_age", dfmasternew["age"].cast("int")).\
  withColumn("numeric_count_investor", dfmasternew["count_investor"].cast("int")).\
  withColumn("numeric_time_to_first_funding", dfmasternew["time_to_first_funding"].cast("int")).\
  withColumn("numeric_total_raised_usd", dfmasternew["total_raised_usd"].cast("int")).\
  withColumn("label", dfmasternew["labelacq"].cast("int"))

In [21]:
dfmasternew = dfmasternew.\
  withColumn("funding_round_type", dfmasternew["funding_round_type"].cast("double")).\
  withColumn("country_code_index", dfmasternew["country_code_index"].cast("double")).\
  withColumn("city_index", dfmasternew["city_index"].cast("double")).\
  withColumn("quarter_new_index", dfmasternew["quarter_new_index"].cast("double")).\
  withColumn("labelacq", dfmasternew["labelacq"].cast("double"))

In [22]:
# save
dfone = dfmasternew

In [23]:
display(dfone)

DataFrame[_c0: string, permalink: string, name: string, funding_total_usd: string, status: string, country_code: string, city: string, funding_rounds: string, founded_year: string, quarter_new: string, age: string, permalink_agg: string, count_investor: string, time_to_first_funding: string, investor_country_codes: string, funding_round_types: string, total_raised_usd: string, permalink_sub: string, category_final: string, perma: string, investor_country_code: string, permaround: string, funding_round_type: double, labelacq: double, country_code_index: double, city_index: double, quarter_new_index: double, investor_country_code_index: double, funding_round_type_index: double, category_final_index: double, numeric_funding_rounds: int, numeric_age: int, numeric_count_investor: int, numeric_time_to_first_funding: int, numeric_total_raised_usd: int, label: int]

In [24]:
print(dfone.count())

49444


In [25]:
# list of index columns of categorical variables for the onehotencoder
list2 = dfone.columns[24:30]
list2

['country_code_index',
 'city_index',
 'quarter_new_index',
 'investor_country_code_index',
 'funding_round_type_index',
 'category_final_index']

In [26]:
# create sparse matrix of indexed categorical columns
# use pipline to apply the encoder
onehotencoder_stages = [OneHotEncoder(inputCol=c, outputCol='onehotencoded_' + c) for c in list2]
pipelineonehot = Pipeline(stages=onehotencoder_stages)
pipeline_mode = pipelineonehot.fit(dfone)
df_coded = pipeline_mode.transform(dfone)

In [27]:
df_coded.show()

+---+--------------------+--------------------+-----------------+---------+------------+-------------+--------------+------------+-----------+----+--------------------+--------------+---------------------+----------------------+--------------------+----------------+--------------------+--------------+--------------------+---------------------+--------------------+------------------+--------+------------------+----------+-----------------+---------------------------+------------------------+--------------------+----------------------+-----------+----------------------+-----------------------------+------------------------+-----+--------------------------------+------------------------+-------------------------------+-----------------------------------------+--------------------------------------+----------------------------------+
|_c0|           permalink|                name|funding_total_usd|   status|country_code|         city|funding_rounds|founded_year|quarter_new| age|       perm


Data split, defining vector assemblers & standard scaler and creating labellist

In [28]:
# split dataset into training, validaiton and testing dataset
training_df, validation_df, testing_df = df_coded.randomSplit([0.6, 0.3, 0.1])

In [29]:
training_df.columns[30:35]

['numeric_funding_rounds',
 'numeric_age',
 'numeric_count_investor',
 'numeric_time_to_first_funding',
 'numeric_total_raised_usd']

In [30]:
training_df.columns[36:42]

['onehotencoded_country_code_index',
 'onehotencoded_city_index',
 'onehotencoded_quarter_new_index',
 'onehotencoded_investor_country_code_index',
 'onehotencoded_funding_round_type_index',
 'onehotencoded_category_final_index']

In [31]:
# define vector assembler with the features for the modelling
vanum = VectorAssembler(). \
      setInputCols(training_df.columns[30:35]). \
      setOutputCol('features_nonstd')

In [32]:
# define vector assembler with the features for the modelling
vacate = VectorAssembler(). \
      setInputCols(training_df.columns[36:42]). \
      setOutputCol('featurescate')

In [33]:
va = VectorAssembler(). \
      setInputCols(['featuresnum','featurescate']). \
      setOutputCol('features')

In [34]:
std = feature.StandardScaler(withMean=True, withStd=True).setInputCol('features_nonstd').setOutputCol('featuresnum')

In [35]:
# suffix for investor country code because intersection with county_code of  the companies
invcc = ['{}_{}'.format(a, "investor") for a in indexers[3].labels]

In [36]:
# define labellist by using the indexer stages for displaying the weights & loadings
labellist = training_df.columns[30:35] + indexers[0].labels + indexers[1].labels + indexers[2].labels + invcc + indexers[4].labels + indexers[5].labels

In [37]:
# null dummy for onehotencoded_country_code_index
print("null dummy for onehotencoded_country_code_index")
print(len(indexers[0].labels))
print(indexers[0].labels)
# null dummy for onehotencoded_city_index
print("null dummy for onehotencoded_city_index")
print(indexers[1].labels)
print(len(indexers[1].labels))
# null dummy for onehotencoded_quarter_new_index
print("null dummy for onehotencoded_quarter_new_index")
print(len(indexers[2].labels))
print(indexers[2].labels)
# null dummy for onehotencoded_investor_country_code_index
print("null dummy for onehotencoded_investor_country_code_index")
print(len(invcc))
print(invcc)
# null dummy for onehotencoded_funding_round_type_index
print("null dummy for onehotencoded_funding_round_type_index")
print(len(indexers[4].labels))
print(indexers[4].labels)
# null dummy for onehotencoded_category_final_index
print("null dummy for onehotencoded_category_final_index")
print(len(indexers[5].labels))
print(indexers[5].labels)

null dummy for onehotencoded_country_code_index
115
['USA', 'GBR', 'CAN', 'CHN', 'DEU', 'FRA', 'IND', 'ISR', 'ESP', 'RUS', 'SWE', 'AUS', 'ITA', 'NLD', 'IRL', 'SGP', 'BRA', 'CHL', 'JPN', 'KOR', 'CHE', 'DNK', 'FIN', 'ARG', 'BEL', 'HKG', 'TUR', 'AUT', 'NOR', 'POL', 'MEX', 'PRT', 'BGR', 'ARE', 'NZL', 'IDN', 'ZAF', 'CZE', 'MYS', 'UKR', 'EST', 'HUN', 'TWN', 'THA', 'COL', 'PHL', 'GRC', 'LTU', 'PER', 'NGA', 'KEN', 'EGY', 'LUX', 'ROM', 'VNM', 'DZA', 'JOR', 'PAK', 'ISL', 'SVK', 'LBN', 'CYP', 'LVA', 'URY', 'CYM', 'GHA', 'SVN', 'KHM', 'SRB', 'UGA', 'HRV', 'BGD', 'SAU', 'TZA', 'CRI', 'PAN', 'BMU', 'BWA', 'GTM', 'AZE', 'BHR', 'BLR', 'DOM', 'MAR', 'MLT', 'SLV', 'TUN', 'ARM', 'BHS', 'CMR', 'ECU', 'GIB', 'KWT', 'LAO', 'MDA', 'MKD', 'MMR', 'NIC', 'NPL', 'ALB', 'BRN', 'CIV', 'JAM', 'JEY', 'LIE', 'MAF', 'MCO', 'MOZ', 'MUS', 'OMN', 'SOM', 'SYC', 'TTO', 'UZB', 'ZWE']
null dummy for onehotencoded_city_index
['San Francisco', 'New York', 'London', 'Palo Alto', 'Austin', 'Cambridge', 'Seattle', 'Chicago', 'Los

# Modeling

## DECISION TREE

In [38]:
# define multiclass classification evaluator
mce = MulticlassClassificationEvaluator()

In [39]:
dt = DecisionTreeClassifier(labelCol="label", featuresCol="features")

In [40]:
dt_pipeline = Pipeline(stages=[vanum, std, vacate, va, dt]).fit(training_df)

In [41]:
dfdt = dt_pipeline.transform(validation_df)

In [42]:
# print the areas under the curve for the different random forest pipelines
print("Decision Tree: AUC = {}".format(mce.evaluate(dfdt)))

Decision Tree: AUC = 0.889513358604838


In [43]:
# print the accuracies for the different random forest pipelines
print(dfdt.select(fn.expr('float(label = prediction)').alias('correct')).select(fn.avg('correct').alias("Accuracy for Decision Tree")).show())

+--------------------------+
|Accuracy for Decision Tree|
+--------------------------+
|         0.924978171804688|
+--------------------------+

None


## RANDOM FOREST

In [44]:
# define binary classification evaluator
bce = BinaryClassificationEvaluator()

In [45]:
# define default, 15 trees and 25 trees random forest classifier
rf = RandomForestClassifier(maxBins=10000, featuresCol='features', labelCol='label')
rf15 = RandomForestClassifier(numTrees=15, maxBins=10000, featuresCol='features', labelCol='label')
rf25 = RandomForestClassifier(numTrees=25, maxBins=10000, featuresCol='features', labelCol='label')

In [46]:
# define and fit pipelines with vector assembler and random forest classifier 
rf_pipeline = Pipeline(stages=[vanum, std, vacate, va, rf]).fit(training_df)
rf_pipeline_15 = Pipeline(stages=[vanum, std, vacate, va, rf15]).fit(training_df)
rf_pipeline_25 = Pipeline(stages=[vanum, std, vacate, va, rf25]).fit(training_df)

In [47]:
dfrf = rf_pipeline.transform(validation_df)
dfrf_15 = rf_pipeline_15.transform(validation_df)
dfrf_25 = rf_pipeline_25.transform(validation_df)

In [48]:
dfrf.show()

+-----+--------------------+--------------------+-----------------+---------+------------+--------------+--------------+------------+-----------+----+--------------------+--------------+---------------------+----------------------+--------------------+----------------+--------------------+--------------+--------------------+---------------------+--------------------+------------------+--------+------------------+----------+-----------------+---------------------------+------------------------+--------------------+----------------------+-----------+----------------------+-----------------------------+------------------------+-----+--------------------------------+------------------------+-------------------------------+-----------------------------------------+--------------------------------------+----------------------------------+--------------------+--------------------+--------------------+--------------------+--------------------+--------------------+----------+
|  _c0|           

## Performance

In [49]:
# print the areas under the curve for the different random forest pipelines
print("Random Forest with 20 trees: AUC = {}".format(bce.evaluate(dfrf)))
print("Random Forest 15 trees: AUC = {}".format(bce.evaluate(dfrf_15)))
print("Random Forest 25 trees: AUC = {}".format(bce.evaluate(dfrf_25)))

Random Forest with 20 trees: AUC = 0.7001709984266348
Random Forest 15 trees: AUC = 0.7083893275885205
Random Forest 25 trees: AUC = 0.7107933708197851


In [50]:
# print the accuracies for the different random forest pipelines
print(dfrf.select(fn.expr('float(label = prediction)').alias('correct')).select(fn.avg('correct').alias("Accuracy for Random Forest with 20 trees")).show())
print(dfrf_15.select(fn.expr('float(label = prediction)').alias('correct')).select(fn.avg('correct').alias("Accuracy for Random Forest with 15 trees")).show())
print(dfrf_25.select(fn.expr('float(label = prediction)').alias('correct')).select(fn.avg('correct').alias("Accuracy for Random Forest with 25 trees")).show())

+----------------------------------------+
|Accuracy for Random Forest with 20 trees|
+----------------------------------------+
|                       0.925313990194103|
+----------------------------------------+

None
+----------------------------------------+
|Accuracy for Random Forest with 15 trees|
+----------------------------------------+
|                       0.925313990194103|
+----------------------------------------+

None
+----------------------------------------+
|Accuracy for Random Forest with 25 trees|
+----------------------------------------+
|                       0.925313990194103|
+----------------------------------------+

None


For some reason the accuracy is exactly the same for all three models, probably meaning that the three models do not have a significant difference. However, the values for the importancies (see down below) are different.

## Importancies and Weights

In [51]:
# create spark df with the 20 highest labels and the corresponding importancies + sorting by importancy
rfw = spark.createDataFrame(pd.DataFrame(list(zip(labellist, rf_pipeline.stages[4].featureImportances.toArray())),
            columns = ['column', 'importancy']).sort_values('importancy').tail(20))
display(rfw)
rfw.show()

DataFrame[column: string, importancy: double]

+--------------------+--------------------+
|              column|          importancy|
+--------------------+--------------------+
|        Little River|0.013912700384966507|
|         Culver City|0.014456104494976243|
|         Steinhausen|0.014598402993142856|
|            Prospect|0.014600395745150572|
|                Bray| 0.01646026575387154|
|numeric_time_to_f...| 0.01811280030889253|
|               Orsay|0.019502531047753787|
|            San Jose|0.020545251300477893|
|              Moscow|  0.0221019046042994|
|             Norwich|0.023010138120640407|
|        Philadelphia|0.029166669618492038|
|                 IND|0.031807109735621765|
|     Hoffman Estates| 0.03715019751454071|
|           Cambridge| 0.04002761433179831|
|          Brookfield| 0.04319476912484573|
|numeric_count_inv...|0.048718867112921234|
|         numeric_age|0.056279540059085136|
|numeric_funding_r...| 0.07010327551287102|
|        MAR_investor|  0.0705646712055127|
|            Gembloux| 0.0726204

In [52]:
# create spark df with the labels and the corresponding importancies + sorting by importancy
rf15w = spark.createDataFrame(pd.DataFrame(list(zip(labellist, rf_pipeline_15.stages[4].featureImportances.toArray())),
            columns = ['column', 'importancy']).sort_values('importancy').tail(20))
display(rf15w)
rf15w.show()

DataFrame[column: string, importancy: double]

+--------------------+--------------------+
|              column|          importancy|
+--------------------+--------------------+
|        Temple Hills|0.015479448575504224|
|              Novato| 0.01604121374305548|
|          Springdale| 0.01605554990466834|
| Magdeburg-rothensee|0.016229597699387904|
|           Levittown| 0.01632092090581517|
|         Albertville|0.020518085779776492|
|             Redmond| 0.02105604590999835|
|                Lima| 0.02357283971930415|
|        Agoura Hills|0.024786054705578573|
|               Wayne|0.029811538581815546|
|               Alamo|  0.0307173204799975|
|        KWT_investor| 0.03274477003488468|
|         Los Angeles| 0.04398992693454515|
|             Atlanta| 0.04515962979389391|
|           Cambridge|  0.0553114562448182|
|        MAR_investor|0.056926678881167384|
|         numeric_age|0.061393953422846544|
|numeric_count_inv...| 0.06349339118943091|
|numeric_funding_r...| 0.06892375749720289|
|            Moorpark| 0.0714285

In [53]:
# create spark df with the labels and the corresponding importancies + sorting by importancy
rf25w = spark.createDataFrame(pd.DataFrame(list(zip(labellist, rf_pipeline_25.stages[4].featureImportances.toArray())),
            columns = ['column', 'weight']).sort_values('weight').tail(20))
display(rf25w)
rf25w.show()

DataFrame[column: string, weight: double]

+--------------------+--------------------+
|              column|              weight|
+--------------------+--------------------+
|                Oslo|0.016087455610669932|
|Feasterville Trevose|0.016616446681470994|
|             Hasselt|0.016931581014104125|
|          Wellington|0.016949804804523877|
|            Suresnes| 0.01749804305383187|
|             Cologne|  0.0176284231667193|
|              Irving|0.017726800673899652|
|                 USA|0.020366058015948207|
|               Macon|0.021676877406198666|
|           Haymarket| 0.02237157683328067|
|               Aptos| 0.02335198314336349|
|           Cambridge|0.024275363570069042|
|         Los Angeles|0.024782017501426657|
|        KWT_investor|0.027324344638237987|
|                Bray|0.032049562044388524|
|numeric_count_inv...|0.032840363628898195|
|numeric_funding_r...|0.033872197719853094|
|        MAR_investor|0.035271467241434734|
|         numeric_age| 0.03748722527444741|
|         Schuttrange|0.05131051

# TESTING PERFORMANCE

We tested with the best performing (AUC and Accuracy) Decision Tree and Random Forest model.

In [55]:
# Decision Tree
dfdt_test = dt_pipeline.transform(testing_df)
print("Decision Tree: AUC = {}".format(mce.evaluate(dfdt_test)))
print(dfdt_test.select(fn.expr('float(label = prediction)').alias('correct')).select(fn.avg('correct').alias("Accuracy for Decision Tree")).show())
# Best performing random forest model
dfrf_25_test = rf_pipeline_25.transform(testing_df)
print("Random Forest 25 trees: AUC = {}".format(bce.evaluate(dfrf_25_test)))
print(dfrf_25_test.select(fn.expr('float(label = prediction)').alias('correct')).select(fn.avg('correct').alias("Accuracy for Random Forest with 15 trees")).show())

Decision Tree: AUC = 0.8963869491955121
+--------------------------+
|Accuracy for Decision Tree|
+--------------------------+
|        0.9299442033477991|
+--------------------------+

None
Random Forest 25 trees: AUC = 0.7105225794662331
+----------------------------------------+
|Accuracy for Random Forest with 15 trees|
+----------------------------------------+
|                      0.9301508576152098|
+----------------------------------------+

None
