### pySpark array, csv, vector 型態轉換

In [1]:
# Set the Spark home used for the Spark session.
# For details see: ?findspark.init
import findspark
findspark.init('/usr/local/spark')
# Load the required modules.
from pyspark.sql import SparkSession
import pyspark.sql.functions as fn
from pyspark.sql import types, Row
from pyspark.ml.linalg import Vectors

# Create (or reuse) the session object; the following cells refer to it
# as `ss` for createDataFrame() and stop().
ss = SparkSession.builder.getOrCreate()

### array to other

In [2]:
# Sample rows: a city name plus a plain Python list of temperatures.
source_data = [
    Row(city=name, temp_array=temps)
    for name, temps in (
        ("Chicago", [-1.0, -2.0, -3.0]),
        ("New York", [-7.0, -7.0, -5.0]),
    )
]
# Build the DataFrame from the Row objects and display it.
df_array = ss.createDataFrame(source_data)
df_array.show()

+--------+------------------+
|    city|        temp_array|
+--------+------------------+
| Chicago|[-1.0, -2.0, -3.0]|
|New York|[-7.0, -7.0, -5.0]|
+--------+------------------+



In [3]:
# array to vector
# Convert every row in a single pass: keep the city and wrap the list of
# temperatures with Vectors.dense(). Fields are read by name, so the
# conversion does not rely on the column order of df_array.
new_df = (
    df_array.rdd
    .map(lambda row: (row['city'], Vectors.dense(row['temp_array'])))
    .toDF(schema=['city', 'temp_vector'])
)
new_df.printSchema()

root
 |-- city: string (nullable = true)
 |-- temp_vector: vector (nullable = true)



In [4]:
# array to csv
def _city_and_temps(row):
    """Return the city followed by each array element as its own tuple item."""
    return tuple([row['city']] + list(row['temp_array']))

# Only the first column is given a name here.
new_df = df_array.rdd.map(_city_and_temps).toDF(['city'])

In [5]:
# Print the schema of the flattened DataFrame; only `city` was named in
# toDF(), so the remaining columns show up with generated names.
new_df.printSchema()

root
 |-- city: string (nullable = true)
 |-- _2: double (nullable = true)
 |-- _3: double (nullable = true)
 |-- _4: double (nullable = true)



### vector to other

In [6]:
# Sample rows: a city name plus a dense vector of temperatures.
source_data = [
    Row(city=name, temp_vector=Vectors.dense(temps))
    for name, temps in (
        ("Chicago", [1.0, -2.0, -3.0]),
        ("New York", [-7.0, -7.0, -5.0]),
    )
]
# Build the DataFrame from the Row objects and display it.
df_vector = ss.createDataFrame(source_data)
df_vector.show()

+--------+----------------+
|    city|     temp_vector|
+--------+----------------+
| Chicago| [1.0,-2.0,-3.0]|
|New York|[-7.0,-7.0,-5.0]|
+--------+----------------+



In [7]:
# vector to array
# Convert every row in a single pass: keep the city and turn the vector
# into a plain Python list via toArray().tolist(). Fields are read by
# name, so the conversion does not rely on the column order of df_vector.
new_df = (
    df_vector.rdd
    .map(lambda row: (row['city'], row['temp_vector'].toArray().tolist()))
    .toDF(schema=['city', 'temp_array'])
)
new_df.printSchema()

root
 |-- city: string (nullable = true)
 |-- temp_array: array (nullable = true)
 |    |-- element: double (containsNull = true)



In [8]:
# vector to csv
def _city_and_components(row):
    """Return the city followed by each vector component as its own tuple item."""
    components = row['temp_vector'].toArray().tolist()
    return (row['city'], ) + tuple(components)

# Only the first column is given a name here.
new_df = df_vector.rdd.map(_city_and_components).toDF(['city'])
new_df.printSchema()

root
 |-- city: string (nullable = true)
 |-- _2: double (nullable = true)
 |-- _3: double (nullable = true)
 |-- _4: double (nullable = true)



In [9]:
# Stop `ss`, the session object the cells above used to build DataFrames.
ss.stop()