## Pyspark ANN implementation

In [1]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import pyspark

In [2]:
from pyspark.sql import SparkSession
spark = SparkSession.builder.appName('Project-4').getOrCreate()
df = spark.read.csv('2-years-before.csv', header = True, inferSchema = True)
df.printSchema()

root
 |-- SIZE-3year: double (nullable = true)
 |-- GR-PROFIT-MARGIN-3year: double (nullable = true)
 |-- CAPITAL-EMPLOYED-TURNOVER-3year: double (nullable = true)
 |-- STOCKHOLDERS-EQUITY-TURNOVER-3year: double (nullable = true)
 |-- CAPITAL-EMPLOYED/NET-FIXED-ASSETS-3year: double (nullable = true)
 |-- DEBT/EQUITY-3year: double (nullable = true)
 |-- EQUITY/CAPITAL-EMPLOYED-3year: double (nullable = true)
 |-- WORKING-CAPITAL/TOTAL-ASSETS-3year: double (nullable = true)
 |-- AV-COLLECTION-PERIOD-FOR-RECIEVABLES-3year: double (nullable = true)
 |-- AV-PAYMENT-PERIOD-3year: double (nullable = true)
 |-- AV-TURNOVER-PERIOD-FOR-INVENTORIES-3year: double (nullable = true)
 |-- final: string (nullable = true)



In [3]:
from pyspark.sql import functions as F
df = df.withColumn('final',F.when(df['final']=='Bankrupted',1).otherwise(0))
df.select("final").show(145)

+-----+
|final|
+-----+
|    1|
|    1|
|    1|
|    1|
|    1|
|    1|
|    1|
|    1|
|    1|
|    1|
|    1|
|    1|
|    1|
|    1|
|    1|
|    1|
|    1|
|    1|
|    1|
|    1|
|    1|
|    1|
|    1|
|    1|
|    1|
|    1|
|    1|
|    1|
|    1|
|    1|
|    1|
|    1|
|    1|
|    1|
|    1|
|    1|
|    1|
|    1|
|    1|
|    1|
|    1|
|    1|
|    1|
|    1|
|    1|
|    1|
|    1|
|    1|
|    1|
|    0|
|    0|
|    0|
|    0|
|    0|
|    0|
|    0|
|    0|
|    0|
|    0|
|    0|
|    0|
|    0|
|    0|
|    0|
|    0|
|    0|
|    0|
|    0|
|    0|
|    0|
|    0|
|    0|
|    0|
|    0|
|    0|
|    0|
|    0|
|    0|
|    0|
|    0|
|    0|
|    0|
|    0|
|    0|
|    0|
|    0|
|    0|
|    0|
|    0|
|    0|
|    0|
|    0|
|    0|
|    0|
|    0|
|    0|
|    0|
|    0|
|    0|
|    0|
|    0|
|    0|
|    0|
|    0|
|    0|
|    0|
|    0|
|    0|
|    0|
|    0|
|    0|
|    0|
|    0|
|    0|
|    0|
|    0|
|    0|
|    0|
|    0|
|    0|
|    0|
|    0|


In [4]:
from pyspark.ml.feature import VectorAssembler
assembler = VectorAssembler(inputCols=["SIZE-3year", "GR-PROFIT-MARGIN-3year", "CAPITAL-EMPLOYED-TURNOVER-3year","STOCKHOLDERS-EQUITY-TURNOVER-3year","CAPITAL-EMPLOYED/NET-FIXED-ASSETS-3year","DEBT/EQUITY-3year","EQUITY/CAPITAL-EMPLOYED-3year","WORKING-CAPITAL/TOTAL-ASSETS-3year","AV-COLLECTION-PERIOD-FOR-RECIEVABLES-3year","AV-PAYMENT-PERIOD-3year","AV-TURNOVER-PERIOD-FOR-INVENTORIES-3year"],outputCol="features")

df = assembler.transform(df)

In [5]:
df = df.withColumnRenamed('final', 'label')
df.select("features").show(145)
df.select("label").show(145)
splits = df.randomSplit([0.8, 0.2], 43)
train = splits[0]
test = splits[1]

+--------------------+
|            features|
+--------------------+
|[17.048357,11.394...|
|[17.777371,5.0220...|
|[11.855231,6.3727...|
|[13.911165,16.481...|
|[14.93396,24.8770...|
|[13.742265,38.307...|
|[15.120019,19.436...|
|[15.462392,41.984...|
|[14.943006,33.950...|
|[18.084061,-71.78...|
|[13.230511,35.945...|
|[15.388187,12.896...|
|[14.109318,14.424...|
|[13.071884,32.754...|
|[15.228073,16.067...|
|[15.929209,10.659...|
|[15.71231,35.8495...|
|[14.219207,7.1721...|
|[12.148851,15.448...|
|[14.21983,24.8279...|
|[19.309802,2.9653...|
|[14.715864,14.908...|
|[15.182899,3.2659...|
|[14.363132,30.070...|
|[16.244218,4.9042...|
|[15.919981,27.008...|
|[14.796652,12.759...|
|[11.563343,58.625...|
|[16.029913,95.891...|
|[11.669031,33.575...|
|[12.385001,22.473...|
|[14.997842,26.528...|
|[13.842263,26.339...|
|[15.23283,38.3538...|
|[13.324138,-82.09...|
|[12.920208,66.498...|
|[14.737367,3.0295...|
|[12.571295,48.953...|
|[9.729313,5.29695...|
|[13.465346,21.081...|
|[13.73385,

In [6]:
from pyspark.ml.classification import MultilayerPerceptronClassifier
from pyspark.ml.evaluation import MulticlassClassificationEvaluator

layers = [11, 8, 4, 2]

# create the trainer and set its parameters
trainer = MultilayerPerceptronClassifier(maxIter=1000, layers=layers, blockSize=128, seed=1234)

# train the model
model = trainer.fit(train)

# compute accuracy on the test set
result = model.transform(test)
predictionAndLabels = result.select("prediction", "label")
evaluator = MulticlassClassificationEvaluator(metricName="accuracy")
print("Test set accuracy = " + str(evaluator.evaluate(predictionAndLabels)))

Test set accuracy = 0.7


In [7]:
y_true = result.select(['label']).collect()
y_pred = result.select(['prediction']).collect()

from sklearn.metrics import classification_report, confusion_matrix
print(classification_report(y_true, y_pred))

              precision    recall  f1-score   support

           0       0.72      0.88      0.79        26
           1       0.62      0.36      0.45        14

    accuracy                           0.70        40
   macro avg       0.67      0.62      0.62        40
weighted avg       0.69      0.70      0.67        40



We observe that the accuracy is equal to 0.7 and greater than DDMP accuracy which is equal to 0.627

## Auto-Sklearn

In [8]:
dataset = pd.read_csv('2-years-before.csv')

In [9]:
dataset

Unnamed: 0,SIZE-3year,GR-PROFIT-MARGIN-3year,CAPITAL-EMPLOYED-TURNOVER-3year,STOCKHOLDERS-EQUITY-TURNOVER-3year,CAPITAL-EMPLOYED/NET-FIXED-ASSETS-3year,DEBT/EQUITY-3year,EQUITY/CAPITAL-EMPLOYED-3year,WORKING-CAPITAL/TOTAL-ASSETS-3year,AV-COLLECTION-PERIOD-FOR-RECIEVABLES-3year,AV-PAYMENT-PERIOD-3year,AV-TURNOVER-PERIOD-FOR-INVENTORIES-3year,final
0,17.048357,11.394002,8.553323,9.782489,0.545139,22.955679,0.874350,-0.338667,431.196373,307.433926,59.958526,Bankrupted
1,17.777371,5.022056,0.405121,0.405121,1.335460,0.676128,1.000000,0.073301,61.828454,134.429902,365.416021,Bankrupted
2,11.855231,6.372713,2.939412,2.939412,50.054526,2.089913,1.000000,0.192623,188.645150,29.310724,109.392223,Bankrupted
3,13.911165,16.481641,5.661395,5.661360,13.304319,6.360569,1.000006,0.101952,357.850150,239.631257,0.000000,Bankrupted
4,14.933960,24.877081,6.295984,6.295984,0.855904,5.916377,1.000000,-0.079627,232.387033,173.791784,58.048552,Bankrupted
...,...,...,...,...,...,...,...,...,...,...,...,...
140,16.007338,17.450000,1.050000,1.050000,1.240000,0.840000,1.000000,0.101211,154.000000,254.000000,261.000000,Non-Bankrupted
141,15.875201,28.230000,2.020000,2.020000,1.770000,0.600000,1.000000,0.273218,91.000000,80.000000,133.000000,Non-Bankrupted
142,14.378253,68.390000,2.810000,2.840000,6.520000,1.690000,0.990000,0.308747,112.000000,106.000000,324.000000,Non-Bankrupted
143,14.597912,12.920000,1.010000,1.010000,0.730000,0.830000,1.000000,-0.203044,38.000000,70.000000,142.000000,Non-Bankrupted


In [10]:
#Splitting the data into training and test test
X = dataset.iloc[:,0:11].values
y = dataset.iloc[:,11].values

from sklearn.preprocessing import LabelEncoder
encoder =  LabelEncoder()
y1 = encoder.fit_transform(y)

Y = pd.get_dummies(y1).values

from sklearn.model_selection import train_test_split
X_train,X_test, y_train,y_test = train_test_split(X,Y,test_size=0.2,random_state=43) 


In [11]:
if __name__ == "__main__":
    ############################################################################
    # Define a callback that instantiates SuccessiveHalving
    # =====================================================

    def get_smac_object_callback(budget_type):
        def get_smac_object(
            scenario_dict,
            seed,
            ta,
            ta_kwargs,
            metalearning_configurations,
            n_jobs,
            dask_client,
        ):
            from smac.facade.smac_ac_facade import SMAC4AC
            from smac.intensification.successive_halving import SuccessiveHalving
            from smac.runhistory.runhistory2epm import RunHistory2EPM4LogCost
            from smac.scenario.scenario import Scenario

            if n_jobs > 1 or (dask_client and len(dask_client.nthreads()) > 1):
                raise ValueError("Please make sure to guard the code invoking Auto-sklearn by "
                                 "`if __name__ == '__main__'` and remove this exception.")

            scenario = Scenario(scenario_dict)
            if len(metalearning_configurations) > 0:
                default_config = scenario.cs.get_default_configuration()
                initial_configurations = [default_config] + metalearning_configurations
            else:
                initial_configurations = None
            rh2EPM = RunHistory2EPM4LogCost

            ta_kwargs['budget_type'] = budget_type

            return SMAC4AC(
                scenario=scenario,
                rng=seed,
                runhistory2epm=rh2EPM,
                tae_runner=ta,
                tae_runner_kwargs=ta_kwargs,
                initial_configurations=initial_configurations,
                run_id=seed,
                intensifier=SuccessiveHalving,
                intensifier_kwargs={
                    'initial_budget': 10.0,
                    'max_budget': 100,
                    'eta': 2,
                    'min_chall': 1},
                n_jobs=n_jobs,
                dask_client=dask_client,
            )
        return get_smac_object


In [12]:
import autosklearn.classification
automl = autosklearn.classification.AutoSklearnClassifier(time_left_for_this_task=40,
        per_run_time_limit=10)
automl.fit(X_train, y_train)
y_hat = automl.predict(X_test)
import sklearn
print("Accuracy score", sklearn.metrics.accuracy_score(y_test, y_hat))



Accuracy score 0.6551724137931034


In [13]:
print(classification_report(y_test,y_hat))


              precision    recall  f1-score   support

           0       0.50      0.10      0.17        10
           1       0.67      0.95      0.78        19

   micro avg       0.66      0.66      0.66        29
   macro avg       0.58      0.52      0.47        29
weighted avg       0.61      0.66      0.57        29
 samples avg       0.66      0.66      0.66        29



We observe that the accuracy is equal to 0.6551 and greater than DDMP accuracy which is equal to 0.627

## H2O

In [5]:
import h2o
from h2o.estimators import H2ODeepLearningEstimator
h2o.init()

# Import the insurance dataset into H2O:
df = h2o.upload_file("2-years-before.csv")

df["final"] = (df["final"] == "Bankrupted").ifelse(1, 0)
df


Checking whether there is an H2O instance running at http://localhost:54321 . connected.


0,1
H2O_cluster_uptime:,4 mins 59 secs
H2O_cluster_timezone:,Europe/Athens
H2O_data_parsing_timezone:,UTC
H2O_cluster_version:,3.32.0.2
H2O_cluster_version_age:,1 month and 19 days
H2O_cluster_name:,H2O_from_python_chvou_vxfb1n
H2O_cluster_total_nodes:,1
H2O_cluster_free_memory:,81.1 Mb
H2O_cluster_total_cores:,4
H2O_cluster_allowed_cores:,4


Parse progress: |█████████████████████████████████████████████████████████| 100%


SIZE-3year,GR-PROFIT-MARGIN-3year,CAPITAL-EMPLOYED-TURNOVER-3year,STOCKHOLDERS-EQUITY-TURNOVER-3year,CAPITAL-EMPLOYED/NET-FIXED-ASSETS-3year,DEBT/EQUITY-3year,EQUITY/CAPITAL-EMPLOYED-3year,WORKING-CAPITAL/TOTAL-ASSETS-3year,AV-COLLECTION-PERIOD-FOR-RECIEVABLES-3year,AV-PAYMENT-PERIOD-3year,AV-TURNOVER-PERIOD-FOR-INVENTORIES-3year,final
17.0484,11.394,8.55332,9.78249,0.545139,22.9557,0.87435,-0.338667,431.196,307.434,59.9585,1
17.7774,5.02206,0.405121,0.405121,1.33546,0.676128,1.0,0.073301,61.8285,134.43,365.416,1
11.8552,6.37271,2.93941,2.93941,50.0545,2.08991,1.0,0.192623,188.645,29.3107,109.392,1
13.9112,16.4816,5.66139,5.66136,13.3043,6.36057,1.00001,0.101952,357.85,239.631,0.0,1
14.934,24.8771,6.29598,6.29598,0.855904,5.91638,1.0,-0.079627,232.387,173.792,58.0486,1
13.7423,38.3074,6.12232,6.12232,0.526064,5.63726,1.0,-0.283839,11.8471,245.601,0.0,1
15.12,19.4367,6.70298,6.70298,63.7685,5.35213,1.0,0.14074,311.934,29.9276,32.6707,1
15.4624,41.9848,2.92924,3.44413,8.89938,5.07154,0.850502,0.14212,395.91,202.878,176.986,1
14.943,33.9506,3.96655,4.10605,0.779867,2.93403,0.966026,-0.133341,165.189,26.0325,63.5665,1
18.0841,-71.7875,0.021042,0.038628,2.81772,1.14337,0.544736,0.521513,2089.15,1686.27,771.431,1




In [6]:
train, test = df.split_frame(ratios = [.8], seed = 43)

In [7]:
predictors = ["SIZE-3year", "GR-PROFIT-MARGIN-3year", "CAPITAL-EMPLOYED-TURNOVER-3year","STOCKHOLDERS-EQUITY-TURNOVER-3year","CAPITAL-EMPLOYED/NET-FIXED-ASSETS-3year","DEBT/EQUITY-3year","EQUITY/CAPITAL-EMPLOYED-3year","WORKING-CAPITAL/TOTAL-ASSETS-3year","AV-COLLECTION-PERIOD-FOR-RECIEVABLES-3year","AV-PAYMENT-PERIOD-3year","AV-TURNOVER-PERIOD-FOR-INVENTORIES-3year"]
# Build and train the model:
dl = H2ODeepLearningEstimator(epochs=1000,
                               activation="Tanh",
                               seed=43)
dl.train(x=predictors,
          y="final",
          training_frame=train)

# Eval performance:
perf = dl.model_performance()

# Generate predictions on a test set (if necessary):
pred = dl.predict(test)

deeplearning Model Build progress: |██████████████████████████████████████| 100%
deeplearning prediction progress: |███████████████████████████████████████| 100%


In [8]:
dl.model_performance(test)


ModelMetricsRegression: deeplearning
** Reported on test data. **

MSE: 0.29085219036369653
RMSE: 0.5393071391736777
MAE: 0.4134794865198673
RMSLE: 0.37659638808834983
Mean Residual Deviance: 0.29085219036369653




H2O model has low MSE and we can conclude that the predictions are accurate

## Auto-Weka

![Weka](Weka.png)

We observe that the ROC Area is equal to 0.653 and slightly greater than DDMP accuracy which is equal to 0.627