## 复购建模改进版
当时只做了baseline，后面再进一步做点改进

###  数据预处理

In [1]:
#读取数据
from pyspark.sql import SparkSession 
spark=SparkSession.builder.appName("dataframe").getOrCreate()
sc = SparkContext.getOrCreate()
train = spark.read.csv(r"data/train.csv",header=True,inferSchema=True)
test  = spark.read.csv(r"data/test.csv",header=True,inferSchema=True)


  "You are passing in an insecure Py4j gateway.  This "


In [2]:
#查看需要数据
train.groupBy('label').count().show()

+-----+-------+
|label|  count|
+-----+-------+
|   -1|6769859|
|    1|  15952|
|    0| 244912|
+-----+-------+



In [3]:
train1 = train.where(~(train['label']==-1))  #使用的数据

In [4]:
# 一般100M内转为pandas的DataFrame做处理，减少内存使用
train2 = train1.toPandas() 
test1 = test.toPandas()

In [5]:
train2['age_range'].fillna('0',inplace=True)
test1['age_range'].fillna('0',inplace=True)


In [6]:
train2['gender'].fillna('2',inplace=True)
test1['gender'].fillna('2',inplace=True)
print(train2['age_range'].value_counts())

3.0    69369
0.0    55809
4.0    51235
2.0    31026
5.0    25618
6.0    21701
7.0     4120
0       1253
8.0      720
1.0       13
Name: age_range, dtype: int64


In [7]:
train2['age_range'] = train2['age_range'].astype(int)
train2['gender'] = train2['gender'].astype(int)
test1['age_range'] = test1['age_range'].astype(int)
test1['gender'] = test1['gender'].astype(int)

In [8]:
print(train2['age_range'].value_counts())

3    69369
0    57062
4    51235
2    31026
5    25618
6    21701
7     4120
8      720
1       13
Name: age_range, dtype: int64


In [9]:
print(train2['gender'].value_counts())

0    176414
1     73756
2     10694
Name: gender, dtype: int64


In [10]:
#转为spark的dataframe
from pyspark.sql import SQLContext
sqlContext = SQLContext(sc)
spark_train = sqlContext.createDataFrame(train2)
spark_test = sqlContext.createDataFrame(test1)

In [11]:
spark_train.printSchema()

root
 |-- user_id: long (nullable = true)
 |-- age_range: long (nullable = true)
 |-- gender: long (nullable = true)
 |-- merchant_id: long (nullable = true)
 |-- label: long (nullable = true)



In [13]:
from pyspark.sql.types import IntegerType
spark_train = spark_train.withColumn("user_id", spark_train["user_id"].cast(IntegerType()))
spark_train = spark_train.withColumn("age_range", spark_train["age_range"].cast(IntegerType()))
spark_train = spark_train.withColumn("gender", spark_train["gender"].cast(IntegerType()))
spark_train = spark_train.withColumn("merchant_id", spark_train["merchant_id"].cast(IntegerType()))
spark_train = spark_train.withColumn("label", spark_train["label"].cast(IntegerType()))

In [14]:
#特征
features = list(set(spark_train.columns)-set(['label','user_id']))
features

['gender', 'age_range', 'merchant_id']

In [15]:
#onehot
from pyspark.ml.feature import OneHotEncoder 
from pyspark.ml import Pipeline
##创建OneHotEncoder对象，设定输入输出参数
categoryFeaturesIndex = ['age_range','merchant_id']
pipeline = Pipeline(stages=[
    OneHotEncoder(inputCol=c, outputCol='{}_vec'.format(c))
    for c in categoryFeaturesIndex
])

onehot = pipeline.fit(spark_train)
encodeData = onehot.transform(spark_train)

In [16]:
encodeData.show()

+-------+---------+------+-----------+-----+-------------+-------------------+
|user_id|age_range|gender|merchant_id|label|age_range_vec|    merchant_id_vec|
+-------+---------+------+-----------+-----+-------------+-------------------+
|  34176|        6|     0|       3906|    0|(8,[6],[1.0])|(4993,[3906],[1.0])|
|  34176|        6|     0|        121|    0|(8,[6],[1.0])| (4993,[121],[1.0])|
|  34176|        6|     0|       4356|    1|(8,[6],[1.0])|(4993,[4356],[1.0])|
|  34176|        6|     0|       2217|    0|(8,[6],[1.0])|(4993,[2217],[1.0])|
| 230784|        0|     0|       4818|    0|(8,[0],[1.0])|(4993,[4818],[1.0])|
| 362112|        4|     1|       2618|    0|(8,[4],[1.0])|(4993,[2618],[1.0])|
|  34944|        5|     0|       2051|    0|(8,[5],[1.0])|(4993,[2051],[1.0])|
| 231552|        5|     0|       3828|    1|(8,[5],[1.0])|(4993,[3828],[1.0])|
| 231552|        5|     0|       2124|    0|(8,[5],[1.0])|(4993,[2124],[1.0])|
| 232320|        4|     1|       1168|    0|(8,[4],[

In [17]:
#特征归并到一列
from pyspark.ml.feature import VectorAssembler  
#一个 导入VerctorAssembler 将多个列合并成向量列的特征转换器,即将表中各列用一个类似list表示，输出预测列为单独一列。

assembler = VectorAssembler(inputCols=['age_range_vec',  'merchant_id_vec',  'gender'],outputCol="features")
trainset = assembler.transform(encodeData)
trainset.printSchema()

trainset.show(10,False)

root
 |-- user_id: integer (nullable = true)
 |-- age_range: integer (nullable = true)
 |-- gender: integer (nullable = true)
 |-- merchant_id: integer (nullable = true)
 |-- label: integer (nullable = true)
 |-- age_range_vec: vector (nullable = true)
 |-- merchant_id_vec: vector (nullable = true)
 |-- features: vector (nullable = true)

+-------+---------+------+-----------+-----+-------------+-------------------+----------------------------------+
|user_id|age_range|gender|merchant_id|label|age_range_vec|merchant_id_vec    |features                          |
+-------+---------+------+-----------+-----+-------------+-------------------+----------------------------------+
|34176  |6        |0     |3906       |0    |(8,[6],[1.0])|(4993,[3906],[1.0])|(5002,[6,3914],[1.0,1.0])         |
|34176  |6        |0     |121        |0    |(8,[6],[1.0])|(4993,[121],[1.0]) |(5002,[6,129],[1.0,1.0])          |
|34176  |6        |0     |4356       |1    |(8,[6],[1.0])|(4993,[4356],[1.0])|(5002,[6,43

In [18]:
#模型调参，类似GridSearchCV
import pyspark.ml.classification as cl
import pyspark.ml.tuning as tune
logistic = cl.LogisticRegression(labelCol='label')

grid = tune.ParamGridBuilder().addGrid(logistic.maxIter,[2,10,50]).addGrid(logistic.regParam,[0.01,0.05,0.3]).build()

In [19]:
#结果评价器
import pyspark.ml.evaluation as ev
evaluator = ev.BinaryClassificationEvaluator(
    rawPredictionCol='probability', 
    labelCol='label')
#estimator 评估器，estimatorParamMaps为网格，evaluator比较性能，numFolds进行几折交叉验证    
cv = tune.CrossValidator(estimator=logistic,estimatorParamMaps=grid,evaluator=evaluator,numFolds=5)

In [20]:
cvmodel = cv.fit(trainset)

In [21]:
results = cvmodel.transform(trainset)
print(evaluator.evaluate(results, {evaluator.metricName: 'areaUnderROC'}))
print(evaluator.evaluate(results, {evaluator.metricName: 'areaUnderPR'})) 

0.6998704463737617
0.13252691056217156


In [22]:
#最优参数
results = [
    (
         [
             {key.name : paramValue}
             for key,paramValue in zip(params.keys(),params.values())
   
         ],metric)
for params,metric in zip(cvmodel.getEstimatorParamMaps(),cvmodel.avgMetrics)
]
sorted(results,key=lambda e1 : e1[1],reverse=True)[0]

([{'maxIter': 2}, {'regParam': 0.05}], 0.6544724424373507)

In [27]:
#预测
spark_test = spark_test.withColumn("user_id", spark_test["user_id"].cast(IntegerType()))
spark_test = spark_test.withColumn("age_range", spark_test["age_range"].cast(IntegerType()))
spark_test = spark_test.withColumn("gender", spark_test["gender"].cast(IntegerType()))
spark_test = spark_test.withColumn("merchant_id", spark_test["merchant_id"].cast(IntegerType()))
 

onehot1 = pipeline.fit(spark_test)
encodeData1 = onehot1.transform(spark_test)


In [28]:
assembler1 = VectorAssembler(inputCols=['age_range_vec',  'merchant_id_vec',  'gender'],outputCol="features")
testset = assembler1.transform(encodeData1)

testset.show(10,False)

+-------+---------+------+-----------+-----+--------------+-------------+-------------------+-------------------------+
|user_id|age_range|gender|merchant_id|label|predict_result|age_range_vec|merchant_id_vec    |features                 |
+-------+---------+------+-----------+-----+--------------+-------------+-------------------+-------------------------+
|163968 |0        |0     |4378       |-1.0 |0             |(8,[0],[1.0])|(4995,[4378],[1.0])|(5004,[0,4386],[1.0,1.0])|
|163968 |0        |0     |2300       |-1.0 |0             |(8,[0],[1.0])|(4995,[2300],[1.0])|(5004,[0,2308],[1.0,1.0])|
|163968 |0        |0     |1551       |-1.0 |0             |(8,[0],[1.0])|(4995,[1551],[1.0])|(5004,[0,1559],[1.0,1.0])|
|163968 |0        |0     |4343       |-1.0 |0             |(8,[0],[1.0])|(4995,[4343],[1.0])|(5004,[0,4351],[1.0,1.0])|
|163968 |0        |0     |4911       |-1.0 |0             |(8,[0],[1.0])|(4995,[4911],[1.0])|(5004,[0,4919],[1.0,1.0])|
|163968 |0        |0     |4043       |-1

In [29]:
lr = cl.LogisticRegression(
    maxIter=2, 
    regParam=0.05, 
    labelCol='label')
lr1 = lr.fit(trainset.select(['features','label']))
test_model = lr1.transform(testset)

In [35]:
result = test_model.select("user_id","prediction") 
print(test_model)

DataFrame[user_id: int, age_range: int, gender: int, merchant_id: int, label: double, predict_result: bigint, age_range_vec: vector, merchant_id_vec: vector, features: vector, rawPrediction: vector, probability: vector, prediction: double]


Name: org.apache.toree.interpreter.broker.BrokerException
Message: Py4JJavaError: An error occurred while calling o10476.save.
: org.apache.spark.SparkException: Job aborted.
	at org.apache.spark.sql.execution.datasources.FileFormatWriter$.write(FileFormatWriter.scala:198)
	at org.apache.spark.sql.execution.datasources.InsertIntoHadoopFsRelationCommand.run(InsertIntoHadoopFsRelationCommand.scala:159)
	at org.apache.spark.sql.execution.command.DataWritingCommandExec.sideEffectResult$lzycompute(commands.scala:104)
	at org.apache.spark.sql.execution.command.DataWritingCommandExec.sideEffectResult(commands.scala:102)
	at org.apache.spark.sql.execution.command.DataWritingCommandExec.doExecute(commands.scala:122)
	at org.apache.spark.sql.execution.SparkPlan$$anonfun$execute$1.apply(SparkPlan.scala:131)
	at org.apache.spark.sql.execution.SparkPlan$$anonfun$execute$1.apply(SparkPlan.scala:127)
	at org.apache.spark.sql.execution.SparkPlan$$anonfun$executeQuery$1.apply(SparkPlan.scala:155)
	at o