In [1]:
from __future__ import nested_scopes

# Import from the public ``IPython.display`` module: importing ``display`` from
# ``IPython.core.display`` has been deprecated since IPython 7.14.
from IPython.display import display, HTML

# Stretch the notebook container to the full browser width.
display(HTML("<style>.container { width:100% !important; }</style>"))
# Show code cells in a 12pt Courier New font.
display(HTML('<style>.CodeMirror{font-family: "Courier New";font-size: 12pt;}</style>'))


In [2]:
import sys
import os
import numpy as np

# Arguments PySpark passes on when it launches the JVM: the XGBoost4J,
# XGBoost4J-Spark and Arrow data source jars.  Adjacent string literals are
# joined by the parser, so the value is a single comma-separated ``--jars`` list
# followed by ``pyspark-shell``.
os.environ['PYSPARK_SUBMIT_ARGS'] = (
    '--jars '
    '/home/xgboost/xgboost4j/jars/xgboost4j_2.12-1.3.3.jar,'
    '/home/xgboost/xgboost4j/jars/xgboost4j-spark_2.12-1.3.3.jar,'
    '/home/xgboost/xgboost4j/jars/spark-arrow-datasource-standard-1.1.0-jar-with-dependencies.jar'
    ' pyspark-shell'
)
# Alternative jar sets that were tried before (kept for reference):
#   * the same XGBoost jars with
#     /home/xgboost/xgboost4j/jars/spark-arrow-datasource-0.9.0-jar-with-dependencies.jar
#   * XGBoost jars from /home/xgboost/jars-for-aws-cpu/1.3/ with the 0.9.0
#     Arrow data source jar
# os.environ['ARROW_LIBHDFS3_DIR'] = '/home/xgboost/miniconda3/lib'
os.environ['LD_LIBRARY_PATH'] = '/home/xgboost/miniconda3/lib'

In [3]:
import findspark

# Spark distribution that findspark is pointed at for this kernel.
SPARK_HOME = '/home/xgboost/spark-3.0.0-bin-hadoop2.7'
findspark.init(SPARK_HOME)

In [4]:
from pyspark import SparkConf, SparkContext
from pyspark.sql import SQLContext
from pyspark.sql import SparkSession
from pyspark.sql.types import FloatType, IntegerType, StructField, StructType
from time import time, sleep
import subprocess
import math

In [5]:
# --- Benchmark / cluster sizing knobs (read as globals by the cells below) ---
nrepeat = 1              # not referenced by any cell visible in this notebook
nodes = 1
executors_per_node = 1
cores_per_executor = 4   # also forwarded to XGBoost as `nthread`
task_per_core = 1        # value used for spark.task.cpus

cache_size = 30000       # not referenced by any cell visible in this notebook

# Arrow data source jar put on the driver and executor class paths.
arrow_datasource_jar = '/home/xgboost/xgboost4j/jars/spark-arrow-datasource-standard-1.1.0-jar-with-dependencies.jar'
# Older build, kept for reference:
#arrow_datasource_jar = '/home/xgboost/xgboost4j/jars/spark-arrow-datasource-0.9.0-jar-with-dependencies.jar'

def start_cntx():
    """Create the Spark session for the benchmark and return ``(sc, spark)``.

    Executor count, cores per executor and CPUs per task are taken from the
    module-level sizing variables (``nodes``, ``executors_per_node``,
    ``cores_per_executor``, ``task_per_core``).  The jar named by
    ``arrow_datasource_jar`` is added to both the driver and the executor
    class path.

    Returns:
        tuple: ``(sc, spark)`` -- the session's ``sparkContext`` and the
        ``SparkSession`` itself.
    """
    # Every setting is applied in this order; all values are strings.
    settings = [
        ('spark.default.parallelism', format(nodes * executors_per_node * cores_per_executor, 'd')),
        ('spark.executor.instances', format(executors_per_node * nodes, 'd')),
        ('spark.rdd.compress', 'False'),
        ('spark.files.maxPartitionBytes', '512m'),
        ('spark.executor.cores', format(cores_per_executor, 'd')),
        ('spark.executor.memory', '20g'),
        ('spark.executor.memoryOverhead', '4g'),
        ('spark.task.cpus', format(task_per_core, 'd')),
        ('spark.driver.memory', '10g'),
        ('spark.executor.extraJavaOptions',
         '-XX:+UseParallelGC -XX:+UseParallelOldGC -verbose:gc -XX:+PrintGCDetails'),
        ('spark.driver.maxResultSize', '0'),
        ('spark.serializer', 'org.apache.spark.serializer.KryoSerializer'),
        ('spark.memory.offHeap.enabled', 'True'),
        ('spark.memory.offHeap.size', '10g'),
        ('spark.executorEnv.ARROW_LIBHDFS3_DIR', '/home/xgboost/miniconda3/lib'),
        ('spark.executorEnv.LD_LIBRARY_PATH', '/home/xgboost/miniconda3/lib'),
        ('spark.driver.extraClassPath', arrow_datasource_jar),
        ('spark.executor.extraClassPath', arrow_datasource_jar),
    ]

    conf = SparkConf()
    for key, value in settings:
        conf.set(key, value)
    conf.setAppName('mortgage')

    session_builder = SparkSession.builder.master('spark://sr572:7077').config(conf=conf)
    spark = session_builder.getOrCreate()

    sc = spark.sparkContext
    sc.setLogLevel('INFO')
    # Ship the sparkxgb Python wrapper so it can be imported later on.
    sc.addPyFile('/home/xgboost/xgboost4j/sparkxgb_1.24.zip')
    return sc, spark

In [6]:
def train(numWorkers, label, features):
    """Build the (unfitted) XGBoost estimator used by the benchmark.

    Args:
        numWorkers: forwarded as the ``numWorkers`` parameter.
        label: name of the label column, forwarded as ``labelCol``.
        features: currently unused -- the ``featuresCols`` setting is
            commented out.

    Returns:
        A configured ``sparkxgb.XGBoostClassifier`` instance.
    """
    # Function-level import; presumably required because the sparkxgb archive
    # is only registered through sc.addPyFile() once the context exists.
    from sparkxgb import XGBoostClassifier

    # NOTE(review): a regression objective ('reg:squarederror') is configured
    # on a classifier -- confirm this is intended for the benchmark.
    # Alternative wiring kept for reference:
    #   XGBoostClassifier(**params).setLabelCol(label).setFeaturesCols(features)
    return XGBoostClassifier(
        labelCol=label,
        # featuresCols=features,
        eta=0.1,
        gamma=0.1,
        missing=0.0,
        treeMethod='hist',
        maxDepth=8,
        maxLeaves=256,
        alpha=0.9,
        objective='reg:squarederror',
        growPolicy='depthwise',
        minChildWeight=30.0,
        reg_lambda=1.0,
        scalePosWeight=2.0,
        subsample=1.0,
        numRound=100,
        maxBin=256,
        nthread=cores_per_executor,  # module-level sizing variable
        numWorkers=numWorkers,
        singlePrecisionHistogram=True,
        verbosity=3,
    )

In [7]:
def with_benchmark(phrase, action):
    """Run ``action`` once, display how long it took, and return its result.

    Args:
        phrase: label placed at the start of the displayed message.
        action: zero-argument callable to execute and time.

    Returns:
        Whatever ``action()`` returns.  If ``action`` raises, the exception
        propagates and no timing is displayed.
    """
    # perf_counter() is monotonic, so the measured interval cannot be skewed
    # by system clock adjustments the way a time.time() difference can.
    from time import perf_counter

    start = perf_counter()
    result = action()
    elapsed = perf_counter() - start
    display(HTML("{} takes <font size=6pt color=red>{} seconds </font>".format(phrase, round(elapsed, 2))))
    return result


def train_data_fn(nworker, train_data, label, features):
    """Fit the benchmark estimator on ``train_data`` and report the fit time.

    Args:
        nworker: worker count handed to ``train``.
        train_data: data set passed to the estimator's ``fit``.
        label: label column name handed to ``train``.
        features: forwarded to ``train`` unchanged.

    Returns:
        The value returned by ``fit``, as passed back by ``with_benchmark``.
    """
    estimator = train(nworker, label, features)

    def fit_once():
        return estimator.fit(train_data)

    return with_benchmark('Training', fit_once)
    

In [8]:
def load_parquet(path):
    """Load the training set at ``path`` through the ``arrow`` data source.

    Uses the module-level ``spark`` session and prints the schema and the
    column names of the loaded DataFrame.

    Args:
        path: location handed to the reader's ``load``.

    Returns:
        tuple: ``('label', 'features', DataFrame)`` -- the label column name,
        the fixed string ``'features'`` and the loaded DataFrame.
    """
    frame = spark.read.format('arrow').load(path)
    # The "schma" spelling is kept as-is so the printed text does not change.
    print("DataFrame schma: ", frame.schema)
    print("DataFrame columns: ", frame.columns)
    # Optional repartitioning that is currently disabled:
    #   frame = frame.coalesce(executors_per_node*nodes)
    return 'label', 'features', frame

In [9]:
# Start the Spark session; `spark` is also read as a global by load_parquet().
sc, spark = start_cntx()
# Application id; only the commented-out profiling snippet further down uses it.
appid = sc.applicationId

# Load the data set and fit the model.  The worker count passed to training
# is executors_per_node*nodes, the same product start_cntx() uses for
# spark.executor.instances.
label,features,train_data = load_parquet('file:///home/xgboost/data/HiBench10Kx50.dataframe.float.parquet')
model = train_data_fn(executors_per_node*nodes, train_data, label, features)
# Saving the fitted model is currently disabled:
#model.write().overwrite().save('file:///home/xgboost/model')

#acc_prof = '/home/xgboost/profile/'+appid      
#xgbtck = acc_prof + '/*/xgbtck.txt'
#dmat_create_times = !grep dmat_time --no-filename $xgbtck | cut -d' ' -f3
#train_loop_times = !grep train_time --no-filename $xgbtck | cut -d' ' -f3
#dmat_create_time = np.array([float(t) for t in dmat_create_times]).max()
#train_loop_time = np.array([float(t) for t in train_loop_times]).max()
#display(HTML(('Time spent in building DMatrix (sec): <font size=6pt color=red>{:f}</font>'.format(dmat_create_time))))
#display(HTML(('Time spent in train loops (sec): <font size=6pt color=red>{:f}</font>'.format(train_loop_time))))
    

DataFrame schma:  StructType(List(StructField(f0,FloatType,true),StructField(f1,FloatType,true),StructField(f2,FloatType,true),StructField(f3,FloatType,true),StructField(f4,FloatType,true),StructField(f5,FloatType,true),StructField(f6,FloatType,true),StructField(f7,FloatType,true),StructField(f8,FloatType,true),StructField(f9,FloatType,true),StructField(f10,FloatType,true),StructField(f11,FloatType,true),StructField(f12,FloatType,true),StructField(f13,FloatType,true),StructField(f14,FloatType,true),StructField(f15,FloatType,true),StructField(f16,FloatType,true),StructField(f17,FloatType,true),StructField(f18,FloatType,true),StructField(f19,FloatType,true),StructField(f20,FloatType,true),StructField(f21,FloatType,true),StructField(f22,FloatType,true),StructField(f23,FloatType,true),StructField(f24,FloatType,true),StructField(f25,FloatType,true),StructField(f26,FloatType,true),StructField(f27,FloatType,true),StructField(f28,FloatType,true),StructField(f29,FloatType,true),StructField(f30,

In [10]:
# Shut down the SparkContext that was created by start_cntx().
sc.stop()