In [1]:
%matplotlib inline
import findspark
import os

In [2]:
findspark.init(os.environ['SPARK_HOME'])

In [111]:
%matplotlib inline
import pyspark 
from pyspark.sql.session import SparkSession
from pyspark.ml.linalg import *
from pyspark.sql.functions import *  
from pyspark.ml.classification import LogisticRegression
from pyspark.ml.feature import OneHotEncoder, StringIndexer
from pyspark.ml import Pipeline
from pyspark.ml.feature import SQLTransformer
from pyspark.ml.linalg import Vectors
from pyspark.ml.feature import Normalizer
from pyspark.ml.linalg import Vectors
from pyspark.ml.feature import VectorAssembler
from pyspark.ml.regression import RandomForestRegressor
from pyspark.ml.feature import VectorIndexer
from pyspark.ml.evaluation import RegressionEvaluator


import time
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from pyspark.sql.types import *

In [4]:
# creamos la sesión de spark
spark = SparkSession\
.builder\
.appName("spark_Parte2")\
.getOrCreate()

In [5]:
profecoSchema = StructType([StructField("producto", StringType(), True), \
                     StructField("presentacion", StringType(), True), \
                     StructField("marca", StringType(), True), \
                     StructField("categoria", StringType(), True), \
                     StructField("catalogo", StringType(), True), \
                     StructField("precio", DoubleType(), True), \
                     StructField("fecharegistro", TimestampType(), True), \
                     StructField("cadenacomercial", StringType(), True), \
                     StructField("giro", StringType(), True), \
                     StructField("nombrecomercial", StringType(), True), \
                     StructField("direccion", StringType(), True), \
                     StructField("estado", StringType(), True), \
                     StructField("municipio", StringType(), True), \
                     StructField("latitud", DoubleType(), True), \
                     StructField("longitud", DoubleType(), True)] )

In [7]:
profecoDf = spark.read.format("csv")\
        .option("delimiter", "|")\
        .option("header","true")\
        .schema(profecoSchema) \
        .option("inferSchema", "true")\
        .load("profeco_final_bash.csv")

In [6]:
profecoDF = spark.read.format('parquet')\
        .load("profecoFinal.parquet")

In [8]:
profecoDf.printSchema()

root
 |-- producto: string (nullable = true)
 |-- presentacion: string (nullable = true)
 |-- marca: string (nullable = true)
 |-- categoria: string (nullable = true)
 |-- catalogo: string (nullable = true)
 |-- precio: double (nullable = true)
 |-- fecharegistro: timestamp (nullable = true)
 |-- cadenacomercial: string (nullable = true)
 |-- giro: string (nullable = true)
 |-- nombrecomercial: string (nullable = true)
 |-- direccion: string (nullable = true)
 |-- estado: string (nullable = true)
 |-- municipio: string (nullable = true)
 |-- latitud: double (nullable = true)
 |-- longitud: double (nullable = true)



In [76]:
# Manipulación de variables
sqlTrans = SQLTransformer( \
    statement="SELECT *,DAY(fecharegistro) AS dia FROM  __THIS__ \
          WHERE categoria LIKE '%medicamentos%' \
          AND estado='distrito federal' \
          AND MONTH(fecharegistro)=4 \
          AND YEAR(fecharegistro)=2016")

In [77]:
# Formato de label / predictor
#assemblerPrecio = VectorAssembler(inputCols=["precio"],outputCol="precioVec")

In [78]:
# Transformación de variables
#scalerPrecio = Normalizer(inputCol='precioVec', outputCol='scaled_precio')

In [79]:
# Selección de variables
formula = SQLTransformer( \
    statement="SELECT producto, marca, precio, dia ,cadenacomercial, municipio , latitud,longitud FROM  __THIS__ ")

In [80]:
# Formato de String a categótico
productoIndexer = StringIndexer(inputCol="producto", outputCol="productoIndex")
marcaIndexer = StringIndexer(inputCol="marca", outputCol="marcaIndex")
cadenacomercialIndexer = StringIndexer(inputCol="cadenacomercial", outputCol="cadenacomercialIndex")
municipioIndexer = StringIndexer(inputCol="municipio", outputCol="municipioIndex")

In [81]:
assembler = VectorAssembler(
    inputCols=["productoIndex", "marcaIndex", "dia","cadenacomercialIndex","municipioIndex","latitud","longitud"],
    outputCol="Features")

In [88]:
# Train a random forest model
rf = RandomForestRegressor(labelCol='precio',featuresCol='Features',maxBins=400)

In [89]:
pipeline = Pipeline(stages=[sqlTrans,formula,productoIndexer,marcaIndexer,cadenacomercialIndexer,municipioIndexer,assembler,rf])

In [103]:
# Model Fit
model = pipeline.fit(profecoDF)

In [105]:
prediction=model.transform(profecoDF)

In [109]:
prediction.select("precio","prediction").show(5)

+------+------------------+
|precio|        prediction|
+------+------------------+
|1203.9| 993.4524419686411|
|179.68|166.25676723535088|
|111.55|158.21065861855678|
|1008.9| 993.4524419686411|
|  80.7|129.40895074936634|
+------+------------------+
only showing top 5 rows



In [112]:
# Select (prediction, true label) and compute test error
evaluator = RegressionEvaluator(
    labelCol="precio", predictionCol="prediction", metricName="rmse")
rmse = evaluator.evaluate(prediction)
print("Root Mean Squared Error (RMSE) on test data = %g" % rmse)

rfModel = model.stages[1]
print(rfModel)  # summary only

Root Mean Squared Error (RMSE) on test data = 166.484
SQLTransformer_e9dabaa24c40
