# LightGBM on Spark

In [1]:
import os
import sys

spark_home = os.environ.get('SPARK_HOME', None)
sys.path.insert(0, spark_home + "/python")
sys.path.insert(0, os.path.join(spark_home, "/home/spark/python/lib/py4j*"))

In [2]:
import pyspark
from pyspark.sql import SparkSession
from pyspark.ml.feature import PCA, VectorAssembler, StandardScaler

In [3]:
# Initialize SparkSession
spark = (SparkSession
         .builder
         .appName("news")
#          .config("spark.driver.extraJavaOptions","-Dlog4j.configuration=file:/home/spark/conf/log4j.properties")
         .config("spark.jars.packages","com.microsoft.ml.spark:mmlspark_2.11:0.18.1")
         .enableHiveSupport()
         .getOrCreate())

In [4]:
from mmlspark.lightgbm import LightGBMRegressor

In [5]:
# Read raw data
df = spark.read.csv('/home/worker/data/Data7602.csv', header=True, inferSchema=True, mode="DROPMALFORMED", encoding='UTF-8').drop("Area")

print("==== 生データ ====")
df.show(truncate=False)

==== 生データ ====
+--------+----+---------+--------+
|anzsic06|year|geo_count|ec_count|
+--------+----+---------+--------+
|A       |2000|96       |130     |
|A       |2000|198      |110     |
|A       |2000|42       |25      |
|A       |2000|66       |40      |
|A       |2000|63       |40      |
|A       |2000|21       |12      |
|A       |2000|45       |60      |
|A       |2000|36       |60      |
|A       |2000|78       |18      |
|A       |2000|42       |9       |
|A       |2000|39       |35      |
|A       |2000|105      |20      |
|A       |2000|99       |30      |
|A       |2000|42       |12      |
|A       |2000|57       |9       |
|A       |2000|54       |15      |
|A       |2000|81       |25      |
|A       |2000|63       |50      |
|A       |2000|75       |50      |
|A       |2000|123      |30      |
+--------+----+---------+--------+
only showing top 20 rows



In [6]:
assembler = VectorAssembler(inputCols=df.columns[1:], outputCol="変量")
feature_vectors = assembler.transform(df)
feature_vectors.show()

+--------+----+---------+--------+--------------------+
|anzsic06|year|geo_count|ec_count|                変量|
+--------+----+---------+--------+--------------------+
|       A|2000|       96|     130| [2000.0,96.0,130.0]|
|       A|2000|      198|     110|[2000.0,198.0,110.0]|
|       A|2000|       42|      25|  [2000.0,42.0,25.0]|
|       A|2000|       66|      40|  [2000.0,66.0,40.0]|
|       A|2000|       63|      40|  [2000.0,63.0,40.0]|
|       A|2000|       21|      12|  [2000.0,21.0,12.0]|
|       A|2000|       45|      60|  [2000.0,45.0,60.0]|
|       A|2000|       36|      60|  [2000.0,36.0,60.0]|
|       A|2000|       78|      18|  [2000.0,78.0,18.0]|
|       A|2000|       42|       9|   [2000.0,42.0,9.0]|
|       A|2000|       39|      35|  [2000.0,39.0,35.0]|
|       A|2000|      105|      20| [2000.0,105.0,20.0]|
|       A|2000|       99|      30|  [2000.0,99.0,30.0]|
|       A|2000|       42|      12|  [2000.0,42.0,12.0]|
|       A|2000|       57|       9|   [2000.0,57.0,

In [7]:
print("==== LightGBMの学習 ====")
model = LightGBMRegressor(alpha=0.3,
                          learningRate=0.3,
                          numIterations=100,
                          numLeaves=31,
                          featuresCol='変量',
                          labelCol='geo_count')

==== LightGBMの学習 ====


In [8]:
model.fit(feature_vectors)

ERROR:root:Exception while sending command.
Traceback (most recent call last):
  File "/usr/local/lib/python3.7/site-packages/py4j/java_gateway.py", line 1159, in send_command
    raise Py4JNetworkError("Answer from Java side is empty")
py4j.protocol.Py4JNetworkError: Answer from Java side is empty

During handling of the above exception, another exception occurred:

Traceback (most recent call last):
  File "/usr/local/lib/python3.7/site-packages/py4j/java_gateway.py", line 985, in send_command
    response = connection.send_command(command)
  File "/usr/local/lib/python3.7/site-packages/py4j/java_gateway.py", line 1164, in send_command
    "Error while receiving", e, proto.ERROR_ON_RECEIVE)
py4j.protocol.Py4JNetworkError: Error while receiving
ERROR:py4j.java_gateway:An error occurred while trying to connect to the Java server (127.0.0.1:43038)
Traceback (most recent call last):
  File "/usr/local/lib/python3.7/site-packages/IPython/core/interactiveshell.py", line 2961, in run_code
 

Py4JError: An error occurred while calling o69.fit

In [None]:
print("==== 元のデータフレーム行数 ====")
print((df.count(), len(df.columns)))