In [1]:
%matplotlib inline
import findspark
import os

In [2]:
findspark.init(os.environ['SPARK_HOME'])

In [113]:
%matplotlib inline
import pyspark 
from pyspark.sql.session import SparkSession
from pyspark.ml.linalg import *
from pyspark.sql.functions import *  
from pyspark.ml.classification import LogisticRegression
from pyspark.ml.feature import OneHotEncoder, StringIndexer
from pyspark.ml import Pipeline
from pyspark.ml.feature import SQLTransformer
from pyspark.ml.linalg import Vectors
from pyspark.ml.feature import Normalizer
from pyspark.ml.linalg import Vectors
from pyspark.ml.feature import VectorAssembler
from pyspark.ml.regression import RandomForestRegressor
from pyspark.ml.feature import VectorIndexer
from pyspark.ml.evaluation import RegressionEvaluator
from pyspark.ml.tuning import ParamGridBuilder, TrainValidationSplit


import time
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from pyspark.sql.types import *

In [4]:
# creamos la sesión de spark
spark = SparkSession\
.builder\
.appName("spark_Parte2")\
.getOrCreate()

In [5]:
profecoSchema = StructType([StructField("producto", StringType(), True), \
                     StructField("presentacion", StringType(), True), \
                     StructField("marca", StringType(), True), \
                     StructField("categoria", StringType(), True), \
                     StructField("catalogo", StringType(), True), \
                     StructField("precio", DoubleType(), True), \
                     StructField("fecharegistro", TimestampType(), True), \
                     StructField("cadenacomercial", StringType(), True), \
                     StructField("giro", StringType(), True), \
                     StructField("nombrecomercial", StringType(), True), \
                     StructField("direccion", StringType(), True), \
                     StructField("estado", StringType(), True), \
                     StructField("municipio", StringType(), True), \
                     StructField("latitud", DoubleType(), True), \
                     StructField("longitud", DoubleType(), True)] )

In [7]:
profecoDf = spark.read.format("csv")\
        .option("delimiter", "|")\
        .option("header","true")\
        .schema(profecoSchema) \
        .option("inferSchema", "true")\
        .load("profeco_final_bash.csv")

In [6]:
profecoDF = spark.read.format('parquet')\
        .load("profecoFinal.parquet")

In [8]:
profecoDf.printSchema()

root
 |-- producto: string (nullable = true)
 |-- presentacion: string (nullable = true)
 |-- marca: string (nullable = true)
 |-- categoria: string (nullable = true)
 |-- catalogo: string (nullable = true)
 |-- precio: double (nullable = true)
 |-- fecharegistro: timestamp (nullable = true)
 |-- cadenacomercial: string (nullable = true)
 |-- giro: string (nullable = true)
 |-- nombrecomercial: string (nullable = true)
 |-- direccion: string (nullable = true)
 |-- estado: string (nullable = true)
 |-- municipio: string (nullable = true)
 |-- latitud: double (nullable = true)
 |-- longitud: double (nullable = true)



In [132]:
# Manipulación de variables
sqlTrans = SQLTransformer( \
    statement="SELECT *,DAY(fecharegistro) AS dia FROM  __THIS__ \
          WHERE categoria LIKE '%medicamentos%' \
          AND estado='distrito federal' \
          AND MONTH(fecharegistro)=4 \
          AND YEAR(fecharegistro)=2016")

In [133]:
# Formato de label / predictor
#assemblerPrecio = VectorAssembler(inputCols=["precio"],outputCol="precioVec")

In [134]:
# Transformación de variables
#scalerPrecio = Normalizer(inputCol='precioVec', outputCol='scaled_precio')

In [135]:
# Selección de variables
formula = SQLTransformer( \
    statement="SELECT producto, marca, precio AS label, dia ,cadenacomercial, municipio , latitud,longitud FROM  __THIS__ ")

In [136]:
# Formato de String a categótico
productoIndexer = StringIndexer(inputCol="producto", outputCol="productoIndex")
marcaIndexer = StringIndexer(inputCol="marca", outputCol="marcaIndex")
cadenacomercialIndexer = StringIndexer(inputCol="cadenacomercial", outputCol="cadenacomercialIndex")
municipioIndexer = StringIndexer(inputCol="municipio", outputCol="municipioIndex")

In [137]:
assembler = VectorAssembler(
    inputCols=["productoIndex", "marcaIndex", "dia","cadenacomercialIndex","municipioIndex","latitud","longitud"],
    outputCol="Features")

In [138]:
# Train a random forest model
rf = RandomForestRegressor(labelCol='label',featuresCol='Features',maxBins=400)

In [139]:
pipeline = Pipeline(stages=[sqlTrans,formula,productoIndexer,marcaIndexer,cadenacomercialIndexer,municipioIndexer,assembler,rf])

In [140]:
paramGrid = ParamGridBuilder()\
    .addGrid(rf.maxDepth, [2,5,7])\
    .addGrid(rf.numTrees, [64,96,128])\
    .build()

In [141]:
tvs = TrainValidationSplit(estimator=pipeline,
                           estimatorParamMaps=paramGrid,
                           evaluator=RegressionEvaluator(),
                           # 80% of the data will be used for training, 20% for validation.
                           trainRatio=0.8)

In [142]:
model = tvs.fit(profecoDF)

In [147]:
model.estimator

Param(parent='TrainValidationSplitModel_e0b5af93dd08', name='estimator', doc='estimator to be cross-validated')

In [156]:
model.uid

'TrainValidationSplitModel_e0b5af93dd08'

In [157]:
model.bestModel

PipelineModel_86e67f8e1978

In [181]:
model.explainParams()

"estimator: estimator to be cross-validated (current: Pipeline_a9c622451ec0)\nestimatorParamMaps: estimator param maps (current: [{Param(parent='RandomForestRegressor_4b2e9af478b9', name='maxDepth', doc='Maximum depth of the tree. (>= 0) E.g., depth 0 means 1 leaf node; depth 1 means 1 internal node + 2 leaf nodes.'): 2, Param(parent='RandomForestRegressor_4b2e9af478b9', name='numTrees', doc='Number of trees to train (>= 1).'): 64}, {Param(parent='RandomForestRegressor_4b2e9af478b9', name='maxDepth', doc='Maximum depth of the tree. (>= 0) E.g., depth 0 means 1 leaf node; depth 1 means 1 internal node + 2 leaf nodes.'): 2, Param(parent='RandomForestRegressor_4b2e9af478b9', name='numTrees', doc='Number of trees to train (>= 1).'): 96}, {Param(parent='RandomForestRegressor_4b2e9af478b9', name='maxDepth', doc='Maximum depth of the tree. (>= 0) E.g., depth 0 means 1 leaf node; depth 1 means 1 internal node + 2 leaf nodes.'): 2, Param(parent='RandomForestRegressor_4b2e9af478b9', name='numTre

In [183]:
model.extractParamMap()

{Param(parent='TrainValidationSplitModel_e0b5af93dd08', name='seed', doc='random seed.'): 3376962289120859831,
 Param(parent='TrainValidationSplitModel_e0b5af93dd08', name='estimator', doc='estimator to be cross-validated'): Pipeline_a9c622451ec0,
 Param(parent='TrainValidationSplitModel_e0b5af93dd08', name='estimatorParamMaps', doc='estimator param maps'): [{Param(parent='RandomForestRegressor_4b2e9af478b9', name='maxDepth', doc='Maximum depth of the tree. (>= 0) E.g., depth 0 means 1 leaf node; depth 1 means 1 internal node + 2 leaf nodes.'): 2,
   Param(parent='RandomForestRegressor_4b2e9af478b9', name='numTrees', doc='Number of trees to train (>= 1).'): 64},
  {Param(parent='RandomForestRegressor_4b2e9af478b9', name='maxDepth', doc='Maximum depth of the tree. (>= 0) E.g., depth 0 means 1 leaf node; depth 1 means 1 internal node + 2 leaf nodes.'): 2,
   Param(parent='RandomForestRegressor_4b2e9af478b9', name='numTrees', doc='Number of trees to train (>= 1).'): 96},
  {Param(parent='

In [194]:
best_model.uid

'PipelineModel_86e67f8e1978'

In [214]:
model.explainParams('numTrees')

TypeError: explainParams() takes 1 positional argument but 2 were given

In [212]:
model.getParam()

TypeError: getParam() missing 1 required positional argument: 'paramName'

In [None]:
# Model Fit
model = pipeline.fit(profecoDF)

In [112]:
# Select (prediction, true label) and compute test error
evaluator = RegressionEvaluator(
    labelCol="precio", predictionCol="prediction", metricName="rmse")
rmse = evaluator.evaluate(prediction)
print("Root Mean Squared Error (RMSE) on test data = %g" % rmse)

rfModel = model.stages[1]
print(rfModel)  # summary only

Root Mean Squared Error (RMSE) on test data = 166.484
SQLTransformer_e9dabaa24c40


In [109]:
prediction.select("precio","prediction").show(5)

+------+------------------+
|precio|        prediction|
+------+------------------+
|1203.9| 993.4524419686411|
|179.68|166.25676723535088|
|111.55|158.21065861855678|
|1008.9| 993.4524419686411|
|  80.7|129.40895074936634|
+------+------------------+
only showing top 5 rows



In [217]:
spark.stop()

In [220]:
spark_cluster = SparkSession\
.builder\
.master('spark://ec2-34-212-226-93.us-west-2.compute.amazonaws.com:7077')\
.appName("spark_cluster")\
.getOrCreate()

Py4JJavaError: An error occurred while calling None.org.apache.spark.api.java.JavaSparkContext.
: java.nio.channels.UnresolvedAddressException
	at sun.nio.ch.Net.checkAddress(Net.java:101)
	at sun.nio.ch.ServerSocketChannelImpl.bind(ServerSocketChannelImpl.java:218)
	at io.netty.channel.socket.nio.NioServerSocketChannel.doBind(NioServerSocketChannel.java:128)
	at io.netty.channel.AbstractChannel$AbstractUnsafe.bind(AbstractChannel.java:558)
	at io.netty.channel.DefaultChannelPipeline$HeadContext.bind(DefaultChannelPipeline.java:1283)
	at io.netty.channel.AbstractChannelHandlerContext.invokeBind(AbstractChannelHandlerContext.java:501)
	at io.netty.channel.AbstractChannelHandlerContext.bind(AbstractChannelHandlerContext.java:486)
	at io.netty.channel.DefaultChannelPipeline.bind(DefaultChannelPipeline.java:989)
	at io.netty.channel.AbstractChannel.bind(AbstractChannel.java:254)
	at io.netty.bootstrap.AbstractBootstrap$2.run(AbstractBootstrap.java:364)
	at io.netty.util.concurrent.AbstractEventExecutor.safeExecute(AbstractEventExecutor.java:163)
	at io.netty.util.concurrent.SingleThreadEventExecutor.runAllTasks(SingleThreadEventExecutor.java:403)
	at io.netty.channel.nio.NioEventLoop.run(NioEventLoop.java:463)
	at io.netty.util.concurrent.SingleThreadEventExecutor$5.run(SingleThreadEventExecutor.java:858)
	at io.netty.util.concurrent.DefaultThreadFactory$DefaultRunnableDecorator.run(DefaultThreadFactory.java:138)
	at java.lang.Thread.run(Thread.java:748)


In [105]:
prediction=model.transform(profecoDF)