In [1]:
import pyspark
from pyspark import SparkContext
from pyspark.sql import SQLContext

In [2]:
# Spark master URL: 'local' runs Spark in-process on this machine.
master = 'local'
# Use getOrCreate so that re-running this cell in a live kernel reuses the
# already-running context instead of raising "Cannot run multiple
# SparkContexts at once". NOTE: when a context already exists it is returned
# as-is and the master / app name configured here are ignored.
sc = SparkContext.getOrCreate(
    pyspark.SparkConf().setMaster(master).setAppName('Sum')
)

In [3]:
# Train / test CSV locations, relative to the notebook's working directory.
data_path = '../../data/small_car_price_train.201908.csv'
data_path1 = '../../data/small_car_price_test.201908.csv'

sqlContext = SQLContext(sc)


def _load_csv(path):
    """Load one CSV file through `sqlContext` with the header and schema-inference options set."""
    reader = sqlContext.read.format('com.databricks.spark.csv')
    configured = reader.options(header='true', inferschema='true')
    return configured.load(path)


# Both splits are read with exactly the same reader configuration.
traindata = _load_csv(data_path)
testdata = _load_csv(data_path1)

# 回归
### 线性回归

In [4]:
from pyspark.ml.regression import LinearRegression

In [9]:
# Text file listing one feature (column) name per line.
feat_path = '../../running/gbdt/car_price_feat.txt'

# Read the file inside a `with` block so the handle is closed deterministically
# (the bare `open(...)` used previously was never closed).
with open(feat_path) as feat_file:
    feats = [line.strip() for line in feat_file]

# Drop lines whose first non-blank character is '#', and skip blank lines:
# the former `x[0] != '#'` test raised IndexError on an empty line.
feats = [name for name in feats if name and not name.startswith('#')]
feats

['bumper_change',
 'bumper_chromatic',
 'bumper_flaw',
 'bumper_metal',
 'bumper_spray',
 'car_box_type',
 'car_height',
 'car_length',
 'car_width',
 'chromatic_num',
 'deal_city',
 'displacement',
 'door_chromatic',
 'door_flaw',
 'door_metal',
 'door_spray',
 'driving_seat_abnormal',
 'driving_seat_abrasion',
 'driving_seat_blister',
 'driving_seat_damage',
 'driving_seat_disassembly',
 'driving_seat_flaw',
 'driving_seat_smudginess',
 'edition',
 'emission',
 'engine_intake_form',
 'fender_change',
 'fender_chromatic',
 'fender_flaw',
 'fender_metal',
 'fender_spray',
 'front_light_change',
 'front_light_damage',
 'front_light_flaw',
 'gdp_level',
 'gearbox_type',
 'manufacturer',
 'max_horse_power',
 'metal_num',
 'model_age',
 'model_brand',
 'model_series',
 'model_year',
 'pillar_change',
 'pillar_chromatic',
 'pillar_flaw',
 'pillar_metal',
 'pillar_spray',
 'power_displacement_cartesian',
 'region',
 'seat_flaw',
 'spray_num',
 'steering_wheel_flaw',
 'tire_flaw',
 'transmiss

In [10]:
# Linear regression estimator (not fitted yet); hyper-parameters listed one
# per line so they are easy to tweak.
lr = LinearRegression(
    maxIter=10,           # cap on optimizer iterations
    regParam=0.3,         # regularization strength
    elasticNetParam=0.8,  # elastic-net mixing parameter
)

In [12]:
# Project the training frame onto the selected feature columns; as the last
# expression of the cell, the resulting DataFrame's schema repr is displayed.
traindata.select(*feats)

DataFrame[bumper_change: int, bumper_chromatic: int, bumper_flaw: int, bumper_metal: int, bumper_spray: int, car_box_type: string, car_height: double, car_length: double, car_width: double, chromatic_num: int, deal_city: string, displacement: double, door_chromatic: int, door_flaw: int, door_metal: int, door_spray: int, driving_seat_abnormal: int, driving_seat_abrasion: int, driving_seat_blister: int, driving_seat_damage: int, driving_seat_disassembly: int, driving_seat_flaw: int, driving_seat_smudginess: int, edition: string, emission: int, engine_intake_form: string, fender_change: int, fender_chromatic: int, fender_flaw: int, fender_metal: int, fender_spray: int, front_light_change: int, front_light_damage: int, front_light_flaw: int, gdp_level: int, gearbox_type: string, manufacturer: string, max_horse_power: int, metal_num: int, model_age: int, model_brand: string, model_series: string, model_year: int, pillar_change: int, pillar_chromatic: int, pillar_flaw: int, pillar_metal: int

In [27]:
# Spark 3.0 renamed OneHotEncoderEstimator to OneHotEncoder and removed the old
# name, so fall back to the new class when the 2.3/2.4 name is unavailable.
try:
    from pyspark.ml.feature import OneHotEncoderEstimator
except ImportError:
    from pyspark.ml.feature import OneHotEncoder as OneHotEncoderEstimator

# Toy data set: two columns of category indices (already numeric), spread over
# two partitions.
rdd = sc.parallelize([
    (0.0, 1.0),
    (1.0, 0.0),
    (2.0, 1.0),
    (0.0, 2.0),
    (0.0, 1.0),
    (4.0, 3.0),
    (4.0, 1.0),
    (2.0, 0.0)
], 2)
df = rdd.toDF(["categoryIndex1", "categoryIndex2"])

# dropLast=False keeps a vector slot for every category index, so the encoded
# vectors have size max_index + 1 (5 and 4 for the two columns here).
encoder = OneHotEncoderEstimator(inputCols=["categoryIndex1", "categoryIndex2"],
                                 outputCols=["categoryVec1", "categoryVec2"],
                                 dropLast=False)
# fit() learns the category sizes from the data; transform() appends the
# one-hot vector columns next to the original index columns.
model = encoder.fit(df)
encoded = model.transform(df)
encoded.show()

+--------------+--------------+-------------+-------------+
|categoryIndex1|categoryIndex2| categoryVec1| categoryVec2|
+--------------+--------------+-------------+-------------+
|           0.0|           1.0|(5,[0],[1.0])|(4,[1],[1.0])|
|           1.0|           0.0|(5,[1],[1.0])|(4,[0],[1.0])|
|           2.0|           1.0|(5,[2],[1.0])|(4,[1],[1.0])|
|           0.0|           2.0|(5,[0],[1.0])|(4,[2],[1.0])|
|           0.0|           1.0|(5,[0],[1.0])|(4,[1],[1.0])|
|           4.0|           3.0|(5,[4],[1.0])|(4,[3],[1.0])|
|           4.0|           1.0|(5,[4],[1.0])|(4,[1],[1.0])|
|           2.0|           0.0|(5,[2],[1.0])|(4,[0],[1.0])|
+--------------+--------------+-------------+-------------+



In [28]:
# Pull the first five encoded rows back to the driver as a list of Row objects.
encoded.take(5)

[Row(categoryIndex1=0.0, categoryIndex2=1.0, categoryVec1=SparseVector(5, {0: 1.0}), categoryVec2=SparseVector(4, {1: 1.0})),
 Row(categoryIndex1=1.0, categoryIndex2=0.0, categoryVec1=SparseVector(5, {1: 1.0}), categoryVec2=SparseVector(4, {0: 1.0})),
 Row(categoryIndex1=2.0, categoryIndex2=1.0, categoryVec1=SparseVector(5, {2: 1.0}), categoryVec2=SparseVector(4, {1: 1.0})),
 Row(categoryIndex1=0.0, categoryIndex2=2.0, categoryVec1=SparseVector(5, {0: 1.0}), categoryVec2=SparseVector(4, {2: 1.0})),
 Row(categoryIndex1=0.0, categoryIndex2=1.0, categoryVec1=SparseVector(5, {0: 1.0}), categoryVec2=SparseVector(4, {1: 1.0}))]