In [1]:
from __future__ import nested_scopes
from IPython.core.display import display, HTML
display(HTML("<style>.container { width:100% !important; }</style>"))
display(HTML('<style>.CodeMirror{font-family: "Courier New";font-size: 12pt;}</style>'))


In [2]:
import sys
import os
import numpy as np

os.environ['PYSPARK_SUBMIT_ARGS'] = '--jars /home/xgboost/xgboost4j/jars/xgboost4j_2.12-1.3.3.jar,/home/xgboost/xgboost4j/jars/xgboost4j-spark_2.12-1.3.3.jar,/home/xgboost/xgboost4j/jars/spark-arrow-datasource-standard-1.1.0-jar-with-dependencies.jar pyspark-shell'
# os.environ['PYSPARK_SUBMIT_ARGS'] = '--jars /home/xgboost/xgboost4j/jars/xgboost4j_2.12-1.3.3.jar,/home/xgboost/xgboost4j/jars/xgboost4j-spark_2.12-1.3.3.jar,/home/xgboost/xgboost4j/jars/spark-arrow-datasource-0.9.0-jar-with-dependencies.jar pyspark-shell'
# os.environ['PYSPARK_SUBMIT_ARGS'] = '--jars /home/xgboost/jars-for-aws-cpu/1.3/xgboost4j_2.12-1.3.3.jar,/home/xgboost/jars-for-aws-cpu/1.3/xgboost4j-spark_2.12-1.3.3.jar,/home/xgboost/xgboost4j/jars/spark-arrow-datasource-0.9.0-jar-with-dependencies.jar pyspark-shell'
# os.environ['ARROW_LIBHDFS3_DIR'] = '/home/xgboost/miniconda3/lib'
os.environ['LD_LIBRARY_PATH'] = '/home/xgboost/miniconda3/lib'

In [3]:
SPARK_HOME='/home/xgboost/spark-3.0.0-bin-hadoop2.7'
import findspark
findspark.init(SPARK_HOME)

In [4]:
from pyspark import SparkConf, SparkContext
from pyspark.sql import SQLContext
from pyspark.sql import SparkSession
from pyspark.sql.types import FloatType, IntegerType, StructField, StructType
from time import time, sleep
import subprocess
import math

In [5]:
clients = ['sr572']
nrepeat = 1
executors_per_node = 1
nodes=len(clients)
cores_per_executor=8
task_per_core=1

cache_size=30000
arrow_datasource_jar = '/home/xgboost/xgboost4j/jars/spark-arrow-datasource-standard-1.1.0-jar-with-dependencies.jar'
#arrow_datasource_jar = '/home/xgboost/xgboost4j/jars/spark-arrow-datasource-0.9.0-jar-with-dependencies.jar'

def start_cntx():
    conf = SparkConf()\
        .set('spark.default.parallelism', '{:d}'.format(nodes*executors_per_node*cores_per_executor))\
        .set('spark.executor.instances', '{:d}'.format(executors_per_node*nodes))\
        .set('spark.rdd.compress', 'False')\
        .set('spark.files.maxPartitionBytes', '512m')\
        .set('spark.executor.cores','{:d}'.format(cores_per_executor))\
        .set('spark.executor.memory', '20g') \
        .set('spark.executor.memoryOverhead', '4g') \
        .set('spark.task.cpus', '{:d}'.format(task_per_core))\
        .set('spark.driver.memory', '10g')\
        .set('spark.executor.extraJavaOptions',\
            '-XX:+UseParallelGC -XX:+UseParallelOldGC -verbose:gc -XX:+PrintGCDetails')\
        .set('spark.driver.maxResultSize', '0')\
        .set('spark.serializer','org.apache.spark.serializer.KryoSerializer')\
        .set('spark.memory.offHeap.enabled','True')\
        .set('spark.memory.offHeap.size','10g')\
        .set('spark.executorEnv.ARROW_LIBHDFS3_DIR', '/home/xgboost/miniconda3/lib')\
        .set('spark.executorEnv.LD_LIBRARY_PATH', '/home/xgboost/miniconda3/lib')\
        .set('spark.driver.extraClassPath', arrow_datasource_jar) \
        .set('spark.executor.extraClassPath', arrow_datasource_jar) \
        .setAppName('mortgage')

    spark = SparkSession.builder\
                .master('spark://sr572:7077')\
                .config(conf=conf)\
                .getOrCreate()
        
    sc = spark.sparkContext
    sc.setLogLevel('INFO')
    sc.addPyFile('/home/xgboost/xgboost4j/sparkxgb_1.24.zip')
    return sc, spark    

In [6]:
def train(numWorkers, label, features):
    from sparkxgb import XGBoostClassifier
    params = { 
        'labelCol': label,
        #'featuresCols': features,
        'eta': 0.1,
        'gamma': 0.1,
        'missing': 0.0,
        'treeMethod': 'hist',
        'maxDepth': 8, 
        'maxLeaves': 256,
        'alpha':0.9,
        'objective':'reg:squarederror',
        'growPolicy': 'depthwise',
        'minChildWeight': 30.0,
        'reg_lambda': 1.0,
        'scalePosWeight': 2.0,
        'subsample': 1.0,
        'numRound': 100,
        'maxBin': 256,
        'nthread': cores_per_executor,
        'numWorkers': numWorkers,
        'singlePrecisionHistogram': True,
        'verbosity': 3     
    }
        
    #classifier = XGBoostClassifier(**params).setLabelCol(label).setFeaturesCols(features)
    classifier = XGBoostClassifier(**params)
    return classifier

In [7]:
def with_benchmark(phrase, action):
    start = time()
    result = action()
    end = time()
    display(HTML("{} takes <font size=6pt color=red>{} seconds </font>".format(phrase, round(end - start, 2))))
    return result


def train_data_fn(nworker, train_data, label, features):
    classifier=train(nworker, label, features)
    return with_benchmark('Training', lambda: classifier.fit(train_data))

In [8]:
def load_parquet(path):
    label = 'delinquency_12'
    train_data = spark.read.format('arrow').load(path) 
    return (label,'features',train_data)

In [9]:
SPARK_HOME='/home/xgboost/spark-3.0.0-bin-hadoop2.7'
!$SPARK_HOME/sbin/start-master.sh
!$SPARK_HOME/sbin/start-slave.sh spark://sr572:7077 -c 8

/bin/bash: /home/xgboost/miniconda3/lib/libtinfo.so.6: no version information available (required by /bin/bash)
bash: /home/xgboost/miniconda3/lib/libtinfo.so.6: no version information available (required by bash)
bash: /home/xgboost/miniconda3/lib/libtinfo.so.6: no version information available (required by bash)
starting org.apache.spark.deploy.master.Master, logging to /home/xgboost/spark-3.0.0-bin-hadoop2.7/logs/spark-xgboost-org.apache.spark.deploy.master.Master-1-sr572.out
/bin/bash: /home/xgboost/miniconda3/lib/libtinfo.so.6: no version information available (required by /bin/bash)
bash: /home/xgboost/miniconda3/lib/libtinfo.so.6: no version information available (required by bash)
bash: /home/xgboost/miniconda3/lib/libtinfo.so.6: no version information available (required by bash)
starting org.apache.spark.deploy.worker.Worker, logging to /home/xgboost/spark-3.0.0-bin-hadoop2.7/logs/spark-xgboost-org.apache.spark.deploy.worker.Worker-1-sr572.out


In [10]:
sc, spark = start_cntx()
appid = sc.applicationId
        
label,features,df = load_parquet('file:///home/xgboost/data/xgboost_4M_float.dataframe.parquet')
train_data, test_data = df.randomSplit([0.8, 0.2], 123)
model = train_data_fn(executors_per_node*nodes, train_data, label, features)
#model.write().overwrite().save('file:///home/xgboost/model')


Py4JJavaError: An error occurred while calling o119.fit.
: java.lang.IllegalStateException: Internal Error class org.apache.spark.sql.execution.SampleExec has column support mismatch:
Sample 0.0, 0.8, false, 123
+- Sort [interest_rate#0 ASC NULLS FIRST, current_actual_upb#1 ASC NULLS FIRST, loan_age#2 ASC NULLS FIRST, remaining_months_to_legal_maturity#3 ASC NULLS FIRST, adj_remaining_months_to_maturity#4 ASC NULLS FIRST, msa#5 ASC NULLS FIRST, current_loan_delinquency_status#6 ASC NULLS FIRST, foreclosure_costs#7 ASC NULLS FIRST, prop_preservation_and_repair_costs#8 ASC NULLS FIRST, asset_recovery_costs#9 ASC NULLS FIRST, misc_holding_expenses#10 ASC NULLS FIRST, holding_taxes#11 ASC NULLS FIRST, net_sale_proceeds#12 ASC NULLS FIRST, credit_enhancement_proceeds#13 ASC NULLS FIRST, repurchase_make_whole_proceeds#14 ASC NULLS FIRST, other_foreclosure_proceeds#15 ASC NULLS FIRST, non_interest_bearing_upb#16 ASC NULLS FIRST, principal_forgiveness_upb#17 ASC NULLS FIRST, foreclosure_principal_write_off_amount#18 ASC NULLS FIRST, servicer_idx#19 ASC NULLS FIRST, mod_flag_idx#20 ASC NULLS FIRST, zero_balance_code_idx#21 ASC NULLS FIRST, repurchase_make_whole_proceeds_flag_idx#22 ASC NULLS FIRST, servicing_activity_indicator_idx#23 ASC NULLS FIRST, ... 23 more fields], false, 0
   +- BatchScan[interest_rate#0, current_actual_upb#1, loan_age#2, remaining_months_to_legal_maturity#3, adj_remaining_months_to_maturity#4, msa#5, current_loan_delinquency_status#6, foreclosure_costs#7, prop_preservation_and_repair_costs#8, asset_recovery_costs#9, misc_holding_expenses#10, holding_taxes#11, net_sale_proceeds#12, credit_enhancement_proceeds#13, repurchase_make_whole_proceeds#14, other_foreclosure_proceeds#15, non_interest_bearing_upb#16, principal_forgiveness_upb#17, foreclosure_principal_write_off_amount#18, servicer_idx#19, mod_flag_idx#20, zero_balance_code_idx#21, repurchase_make_whole_proceeds_flag_idx#22, servicing_activity_indicator_idx#23, ... 23 more fields] ArrowScan DataFilters: [], Location: InMemoryFileIndex[file:/home/xgboost/data/xgboost_4M_float.dataframe.parquet], PartitionFilters: [], ReadSchema: struct<interest_rate:float,current_actual_upb:float,loan_age:float,remaining_months_to_legal_matu...

	at org.apache.spark.sql.execution.SparkPlan.doExecuteColumnar(SparkPlan.scala:303)
	at org.apache.spark.sql.execution.SparkPlan.$anonfun$executeColumnar$1(SparkPlan.scala:202)
	at org.apache.spark.sql.execution.SparkPlan.$anonfun$executeQuery$1(SparkPlan.scala:213)
	at org.apache.spark.rdd.RDDOperationScope$.withScope(RDDOperationScope.scala:151)
	at org.apache.spark.sql.execution.SparkPlan.executeQuery(SparkPlan.scala:210)
	at org.apache.spark.sql.execution.SparkPlan.executeColumnar(SparkPlan.scala:198)
	at org.apache.spark.sql.execution.WholeStageCodegenExec.doExecuteColumnar(WholeStageCodegenExec.scala:688)
	at org.apache.spark.sql.execution.SparkPlan.$anonfun$executeColumnar$1(SparkPlan.scala:202)
	at org.apache.spark.sql.execution.SparkPlan.$anonfun$executeQuery$1(SparkPlan.scala:213)
	at org.apache.spark.rdd.RDDOperationScope$.withScope(RDDOperationScope.scala:151)
	at org.apache.spark.sql.execution.SparkPlan.executeQuery(SparkPlan.scala:210)
	at org.apache.spark.sql.execution.SparkPlan.executeColumnar(SparkPlan.scala:198)
	at ml.dmlc.xgboost4j.scala.spark.DataUtils$.$anonfun$convertDataFrameToArrowRecordBatchRDDs$1(DataUtils.scala:201)
	at scala.collection.TraversableLike.$anonfun$map$1(TraversableLike.scala:238)
	at scala.collection.IndexedSeqOptimized.foreach(IndexedSeqOptimized.scala:36)
	at scala.collection.IndexedSeqOptimized.foreach$(IndexedSeqOptimized.scala:33)
	at scala.collection.mutable.ArrayOps$ofRef.foreach(ArrayOps.scala:198)
	at scala.collection.TraversableLike.map(TraversableLike.scala:238)
	at scala.collection.TraversableLike.map$(TraversableLike.scala:231)
	at scala.collection.mutable.ArrayOps$ofRef.map(ArrayOps.scala:198)
	at ml.dmlc.xgboost4j.scala.spark.DataUtils$.convertDataFrameToArrowRecordBatchRDDs(DataUtils.scala:169)
	at ml.dmlc.xgboost4j.scala.spark.XGBoostClassifier.train(XGBoostClassifier.scala:224)
	at ml.dmlc.xgboost4j.scala.spark.XGBoostClassifier.fit(XGBoostClassifier.scala:184)
	at ml.dmlc.xgboost4j.scala.spark.XGBoostClassifier.fit(XGBoostClassifier.scala:46)
	at java.base/jdk.internal.reflect.NativeMethodAccessorImpl.invoke0(Native Method)
	at java.base/jdk.internal.reflect.NativeMethodAccessorImpl.invoke(NativeMethodAccessorImpl.java:62)
	at java.base/jdk.internal.reflect.DelegatingMethodAccessorImpl.invoke(DelegatingMethodAccessorImpl.java:43)
	at java.base/java.lang.reflect.Method.invoke(Method.java:566)
	at py4j.reflection.MethodInvoker.invoke(MethodInvoker.java:244)
	at py4j.reflection.ReflectionEngine.invoke(ReflectionEngine.java:357)
	at py4j.Gateway.invoke(Gateway.java:282)
	at py4j.commands.AbstractCommand.invokeMethod(AbstractCommand.java:132)
	at py4j.commands.CallCommand.execute(CallCommand.java:79)
	at py4j.GatewayConnection.run(GatewayConnection.java:238)
	at java.base/java.lang.Thread.run(Thread.java:834)


In [None]:
#from pyspark.ml.evaluation import BinaryClassificationEvaluator
#pred_df = model.transform(test_data)
#accuracy = BinaryClassificationEvaluator().setLabelCol("delinquency_12").evaluate(pred_df)
#print(accuracy)

In [11]:
sc.stop()

In [12]:
!$SPARK_HOME/sbin/stop-slave.sh
!$SPARK_HOME/sbin/stop-master.sh

/bin/bash: /home/xgboost/miniconda3/lib/libtinfo.so.6: no version information available (required by /bin/bash)
bash: /home/xgboost/miniconda3/lib/libtinfo.so.6: no version information available (required by bash)
bash: /home/xgboost/miniconda3/lib/libtinfo.so.6: no version information available (required by bash)
stopping org.apache.spark.deploy.worker.Worker
/bin/bash: /home/xgboost/miniconda3/lib/libtinfo.so.6: no version information available (required by /bin/bash)
bash: /home/xgboost/miniconda3/lib/libtinfo.so.6: no version information available (required by bash)
bash: /home/xgboost/miniconda3/lib/libtinfo.so.6: no version information available (required by bash)
stopping org.apache.spark.deploy.master.Master
