利用Spark进行机器学习建模，要求在train.csv数据上建模，并利用模型在test.csv数据上进行预测，并最终输出预测结果，将结果单独设置一列（取名为predict_result）补充在test.csv文件最后，最终将根据模型实际AUC值判定模型效果。

In [1]:
# Load the training and test data.
from pyspark.sql import SparkSession

spark = SparkSession.builder.appName("dataframe").getOrCreate()
# Take the context from the session: `SparkContext` is never imported in this
# notebook, so `SparkContext.getOrCreate()` raises NameError in a fresh kernel.
sc = spark.sparkContext
# header=True uses the first CSV line as column names; inferSchema=True lets
# Spark infer the column types instead of reading everything as strings.
train = spark.read.csv(r"data/train.csv", header=True, inferSchema=True)
test = spark.read.csv(r"data/test.csv", header=True, inferSchema=True)


  "You are passing in an insecure Py4j gateway.  This "


In [2]:
# Preview the first five rows of the training data.
train.show(n=5)

+-------+---------+------+-----------+-----+
|user_id|age_range|gender|merchant_id|label|
+-------+---------+------+-----------+-----+
|  34176|        6|     0|        944|   -1|
|  34176|        6|     0|        412|   -1|
|  34176|        6|     0|       1945|   -1|
|  34176|        6|     0|       4752|   -1|
|  34176|        6|     0|        643|   -1|
+-------+---------+------+-----------+-----+
only showing top 5 rows



In [14]:
# Print the column names and inferred types of the training DataFrame.
train.printSchema()

root
 |-- user_id: integer (nullable = true)
 |-- age_range: integer (nullable = true)
 |-- gender: integer (nullable = true)
 |-- merchant_id: integer (nullable = true)
 |-- label: integer (nullable = true)



In [4]:
# Row count for each age_range value (null shows up as its own group).
age_range_counts = train.groupBy('age_range').count()
age_range_counts.show()

+---------+-------+
|age_range|  count|
+---------+-------+
|     null|  19380|
|        1|    286|
|        6| 655922|
|        3|1913722|
|        5| 752927|
|        4|1459923|
|        8|  20290|
|        7| 124493|
|        2| 731938|
|        0|1351842|
+---------+-------+


In [3]:
# Row count for each gender value (null shows up as its own group).
gender_counts = train.groupBy('gender').count()
gender_counts.show()

+------+-------+
|gender|  count|
+------+-------+
|  null|  61712|
|     1|1618110|
|     2| 249171|
|     0|5101730|
+------+-------+



In [15]:
# Row count for each merchant_id; show() prints only the first 20 groups.
merchant_id_counts = train.groupBy('merchant_id').count()
merchant_id_counts.show()

+-----------+-----+
|merchant_id|count|
+-----------+-----+
|       4818| 3707|
|        496| 1149|
|       3749| 1157|
|       1580| 3298|
|       1645|  432|
|        471| 3524|
|       4519|  828|
|       3918| 1894|
|        148| 3058|
|       1238|  967|
|       3997| 3762|
|       1342| 1262|
|       3794|  553|
|       1088| 1032|
|       2122| 1162|
|       1591|  724|
|        833|  299|
|       3175| 1312|
|       2366| 1022|
|        463|  851|
+-----------+-----+
only showing top 20 rows



In [16]:
# Row count for each label value, to see how the classes are distributed.
label_counts = train.groupBy('label').count()
label_counts.show()

+-----+-------+
|label|  count|
+-----+-------+
|   -1|6769859|
|    1|  15952|
|    0| 244912|
+-----+-------+



In [13]:
# Show the rows whose merchant_id is null. Column.isNull() is used because
# the `isnull` function is never imported from pyspark.sql.functions in this
# notebook, so the bare call raises NameError in a fresh kernel.
train.filter(train['merchant_id'].isNull()).show()

+-------+---------+------+-----------+-----+
|user_id|age_range|gender|merchant_id|label|
+-------+---------+------+-----------+-----+
+-------+---------+------+-----------+-----+



从上面的结果看，数据基本已经处理好。为了简化，age_range 和 gender 只保留取值确定的记录，空值不使用；merchant_id 相当于已经分好类的特征，直接编码即可。
label 的分布很不平衡，这里不做采样，直接用逻辑回归跑一个模型。


In [28]:
# Number of rows with age_range > 0; rows with a null age_range fail the
# comparison and are excluded as well.
train.filter(train['age_range'] > 0).count()

5659501

In [29]:
# Keep only rows with age_range > 0 (this also drops null age_range rows).
train1 = train.filter(train['age_range'] > 0)

In [30]:
# Keep only rows with gender 0 or 1 (this also drops null gender rows).
train2 = train1.filter(train1['gender'] < 2)

In [33]:
# Drop the rows labelled -1 so only labels 0 and 1 remain for training.
train3 = train2.filter(train2['label'] != -1)

In [37]:
# Feature columns: every column except the label and the user identifier.
features = list(set(train.columns).difference({'label', 'user_id'}))
features


['gender', 'age_range', 'merchant_id']

In [39]:
from pyspark.ml.feature import OneHotEncoder
from pyspark.ml import Pipeline

# One encoder stage per categorical column; each writes a "<column>_vec" column.
categoryFeaturesIndex = ['age_range', 'merchant_id']
encoder_stages = []
for column_name in categoryFeaturesIndex:
    encoder_stages.append(
        OneHotEncoder(inputCol=column_name, outputCol=column_name + '_vec')
    )
pipeline = Pipeline(stages=encoder_stages)

# Fit the stages on the filtered training data and add the encoded columns.
onehot = pipeline.fit(train3)
encodeData = onehot.transform(train3)

In [40]:
# Display the encoded rows, including the new *_vec one-hot columns.
encodeData.show()


+-------+---------+------+-----------+-----+-------------+-------------------+
|user_id|age_range|gender|merchant_id|label|age_range_vec|    merchant_id_vec|
+-------+---------+------+-----------+-----+-------------+-------------------+
|  34176|        6|     0|       3906|    0|(8,[6],[1.0])|(4993,[3906],[1.0])|
|  34176|        6|     0|        121|    0|(8,[6],[1.0])| (4993,[121],[1.0])|
|  34176|        6|     0|       4356|    1|(8,[6],[1.0])|(4993,[4356],[1.0])|
|  34176|        6|     0|       2217|    0|(8,[6],[1.0])|(4993,[2217],[1.0])|
| 362112|        4|     1|       2618|    0|(8,[4],[1.0])|(4993,[2618],[1.0])|
|  34944|        5|     0|       2051|    0|(8,[5],[1.0])|(4993,[2051],[1.0])|
| 231552|        5|     0|       3828|    1|(8,[5],[1.0])|(4993,[3828],[1.0])|
| 231552|        5|     0|       2124|    0|(8,[5],[1.0])|(4993,[2124],[1.0])|
| 232320|        4|     1|       1168|    0|(8,[4],[1.0])|(4993,[1168],[1.0])|
| 232320|        4|     1|       4270|    0|(8,[4],[

In [44]:
# Drop the columns the model does not need.
print(encodeData.columns)
kept_columns = ['age_range_vec', 'merchant_id_vec', 'gender', 'label']
use_data = encodeData.select(*kept_columns)
use_data.printSchema()

['user_id', 'age_range', 'gender', 'merchant_id', 'label', 'age_range_vec', 'merchant_id_vec']
root
 |-- age_range_vec: vector (nullable = true)
 |-- merchant_id_vec: vector (nullable = true)
 |-- gender: integer (nullable = true)
 |-- label: integer (nullable = true)



In [46]:
# Merge the feature columns into a single vector column.
# VectorAssembler is a feature transformer that combines several columns
# into one vector column.
from pyspark.ml.feature import VectorAssembler

input_columns = ['age_range_vec', 'merchant_id_vec', 'gender']
assembler = VectorAssembler(inputCols=input_columns, outputCol="features")
trainset = assembler.transform(use_data)
trainset.printSchema()

# Print ten rows without truncating the wide vector columns.
trainset.show(n=10, truncate=False)

root
 |-- age_range_vec: vector (nullable = true)
 |-- merchant_id_vec: vector (nullable = true)
 |-- gender: integer (nullable = true)
 |-- label: integer (nullable = true)
 |-- features: vector (nullable = true)

+-------------+-------------------+------+-----+----------------------------------+
|age_range_vec|merchant_id_vec    |gender|label|features                          |
+-------------+-------------------+------+-----+----------------------------------+
|(8,[6],[1.0])|(4993,[3906],[1.0])|0     |0    |(5002,[6,3914],[1.0,1.0])         |
|(8,[6],[1.0])|(4993,[121],[1.0]) |0     |0    |(5002,[6,129],[1.0,1.0])          |
|(8,[6],[1.0])|(4993,[4356],[1.0])|0     |1    |(5002,[6,4364],[1.0,1.0])         |
|(8,[6],[1.0])|(4993,[2217],[1.0])|0     |0    |(5002,[6,2225],[1.0,1.0])         |
|(8,[4],[1.0])|(4993,[2618],[1.0])|1     |0    |(5002,[4,2626,5001],[1.0,1.0,1.0])|
|(8,[5],[1.0])|(4993,[2051],[1.0])|0     |0    |(5002,[5,2059],[1.0,1.0])         |
|(8,[5],[1.0])|(4993,[3828],[

## Logistic 回归建模

In [57]:
# Create the logistic regression estimator.
import pyspark.ml.classification as cl

lr_params = dict(maxIter=10, regParam=0.01, labelCol='label')
logistic = cl.LogisticRegression(**lr_params)
print('logistic:', logistic)

logistic: LogisticRegression_0bb482445d3e


In [56]:
# Split the labelled data, fit the logistic regression and score the hold-out split.
from pyspark.ml import Pipeline  # kept from the original cell; not used below

# 70/30 split with a fixed seed.
traindata, testdata = trainset.randomSplit([0.7, 0.3], seed=666)

# Fit on the training split, then add the rawPrediction / probability /
# prediction columns to the hold-out split.
lr = logistic.fit(traindata.select(['features', 'label']))
test_model = lr.transform(testdata)

print('test_model:', test_model)

# Fetch the sample row once and reuse it: the original cell called take(1)
# twice and discarded the first result, which cost an extra Spark job.
first_row = test_model.take(1)
print('test_model.take(1):', first_row)
 


test_model: DataFrame[age_range_vec: vector, merchant_id_vec: vector, gender: int, label: int, features: vector, rawPrediction: vector, probability: vector, prediction: double]
test_model.take(1): [Row(age_range_vec=SparseVector(8, {}), merchant_id_vec=SparseVector(4993, {67: 1.0}), gender=0, label=0, features=SparseVector(5002, {75: 1.0}), rawPrediction=DenseVector([3.1645, -3.1645]), probability=DenseVector([0.9595, 0.0405]), prediction=0.0)]


In [62]:
 
# Evaluate the model on the hold-out split; hyper-parameter search and tuning are skipped here.
import pyspark.ml.evaluation as ev

evaluator = ev.BinaryClassificationEvaluator(
    labelCol='label',
    rawPredictionCol='probability',
)

# Print area under the ROC curve first, then area under the PR curve.
for metric in ('areaUnderROC', 'areaUnderPR'):
    print(evaluator.evaluate(test_model, {evaluator.metricName: metric}))

0.6416230017778205
0.109574965741648
