# Loading data

In [1]:
import pyspark
from pyspark.sql import SparkSession
from pyspark.sql.types import StringType, StructType, StructField


# Start (or reuse) the Spark session used by every cell in this notebook.
session_builder = SparkSession.builder.appName("logistic_prediction")
spark = session_builder.getOrCreate()

# Load the raw CSV: the first row is the header, column types are inferred.
df = spark.read.csv(
    '../data/dataset/DataCoSupplyChainDataset.csv',
    header=True,
    inferSchema=True,
)

df.printSchema()


root
 |-- Type: string (nullable = true)
 |-- Days for shipping (real): integer (nullable = true)
 |-- Days for shipment (scheduled): integer (nullable = true)
 |-- Benefit per order: double (nullable = true)
 |-- Sales per customer: double (nullable = true)
 |-- Delivery Status: string (nullable = true)
 |-- Late_delivery_risk: integer (nullable = true)
 |-- Category Id: integer (nullable = true)
 |-- Category Name: string (nullable = true)
 |-- Customer City: string (nullable = true)
 |-- Customer Country: string (nullable = true)
 |-- Customer Email: string (nullable = true)
 |-- Customer Fname: string (nullable = true)
 |-- Customer Id: integer (nullable = true)
 |-- Customer Lname: string (nullable = true)
 |-- Customer Password: string (nullable = true)
 |-- Customer Segment: string (nullable = true)
 |-- Customer State: string (nullable = true)
 |-- Customer Street: string (nullable = true)
 |-- Customer Zipcode: integer (nullable = true)
 |-- Department Id: integer (nullable = 

In [2]:
from pyspark.sql import functions as f
# df.dtypes

# Per-column NULL count: `when` yields a non-null value only for NULL cells,
# and `count` ignores nulls, so each aggregate counts the NULL rows.
null_counts = [
    f.count(f.when(f.col(name).isNull(), name)).alias(name)
    for name in df.columns
]
df.select(null_counts).show()

# Product Description and Order Zipcode have null values; they are dropped
# here together with the customer email and password columns.
df = df.drop(
    'Order Zipcode',
    'Product Description',
    'Customer Email',
    'Customer Password',
)

+----+------------------------+-----------------------------+-----------------+------------------+---------------+------------------+-----------+-------------+-------------+----------------+--------------+--------------+-----------+--------------+-----------------+----------------+--------------+---------------+----------------+-------------+---------------+--------+---------+------+----------+-------------+-----------------+-----------------------+--------+----------------------+-------------------+------------------------+-------------+------------------------+-----------------------+-------------------+-----+----------------+----------------------+------------+-----------+------------+-------------+---------------+-------------------+-------------------+-------------+------------+-------------+--------------+--------------------------+-------------+
|Type|Days for shipping (real)|Days for shipment (scheduled)|Benefit per order|Sales per customer|Delivery Status|Late_delivery_risk|Ca

In [3]:
# Split the columns of `df` into numeric and categorical feature lists and
# derive the output column names used by the StringIndexer / OneHotEncoder.
#
# - The label column is excluded by name, not by list position.
# - The "(indexed)" / "(encoded)" names are generated from `cat_cols`, so the
#   three lists always have the same length and order (the indexer and the
#   encoder pair input and output columns by position).
# - Loop variables are named so they do not shadow the builtin `type` in the
#   notebook namespace.
LABEL_COL = 'Late_delivery_risk'

num_cols = []
cat_cols = []

for col_name, col_type in df.dtypes:
    if col_name == LABEL_COL:
        # The prediction target must not be part of the feature set.
        continue
    if col_type in ['int', 'double']:
        num_cols.append(col_name)
    else:
        cat_cols.append(col_name)

cat_indexed = [f"{name}(indexed)" for name in cat_cols]
cat_encoded = [f"{name}(encoded)" for name in cat_cols]

num_cols

['Days for shipping (real)',
 'Days for shipment (scheduled)',
 'Benefit per order',
 'Sales per customer',
 'Category Id',
 'Customer Id',
 'Customer Zipcode',
 'Department Id',
 'Latitude',
 'Longitude',
 'Order Customer Id',
 'Order Id',
 'Order Item Cardprod Id',
 'Order Item Discount',
 'Order Item Discount Rate',
 'Order Item Id',
 'Order Item Product Price',
 'Order Item Profit Ratio',
 'Order Item Quantity',
 'Sales',
 'Order Item Total',
 'Order Profit Per Order',
 'Product Card Id',
 'Product Category Id',
 'Product Price',
 'Product Status']

In [4]:
# print(len(cat_indexed), len(cat_cols))
# Sanity check: the indexed and encoded name lists must have the same length.
indexed_count = len(cat_indexed)
encoded_count = len(cat_encoded)
print(indexed_count, encoded_count)

22 22


In [5]:
from pyspark.ml.classification import RandomForestClassifier
from pyspark.ml.feature import VectorAssembler
from pyspark.ml.feature import StringIndexer, OneHotEncoder

# Step 1: replace each categorical column by a numeric label index.
indexer = StringIndexer(
    inputCols=cat_cols,
    outputCols=cat_indexed,
)
indexer_model = indexer.fit(df)
df_indexed = indexer_model.transform(df)

# Step 2: one-hot encode the label indices.
encoder = OneHotEncoder(
    inputCols=cat_indexed,
    outputCols=cat_encoded,
)
encoder_model = encoder.fit(df_indexed)
df_encoded = encoder_model.transform(df_indexed)
df_encoded.show()

# Step 3: define (but do not apply yet) the assembler that concatenates the
# numeric columns and the encoded vectors into a single 'features' vector.
assembler_inputs = [*num_cols, *cat_encoded]

assembler = VectorAssembler(
    inputCols=assembler_inputs,
    outputCol='features',
)



+--------+------------------------+-----------------------------+-----------------+------------------+-----------------+------------------+-----------+--------------+--------------+----------------+--------------+-----------+--------------+----------------+--------------+--------------------+----------------+-------------+---------------+-----------+------------+------------+----------+-------------+-----------------+-----------------------+--------+----------------------+-------------------+------------------------+-------------+------------------------+-----------------------+-------------------+------+----------------+----------------------+--------------+--------------------+---------------+---------------+-------------------+--------------------+------------+-------------+--------------+--------------------------+--------------+-------------+------------------------+----------------------+----------------------+-------------------------+-----------------------+--------------------

In [6]:
# df_final = assembler.transform(df_encoded)
# df_final.show()

In [7]:
from pyspark.ml import Pipeline

# Fit indexer -> encoder -> assembler -> random forest as one pipeline and
# report which features the forest relied on.
#
# NOTE(review): the recorded run of this cell ended with a Py4J
# "Answer from Java side is empty" / connection-refused error. One-hot
# encoding every categorical column (names, streets, dates) is a plausible
# cause of the JVM dying -- confirm and prune the inputs if so.
rf = RandomForestClassifier(labelCol='Late_delivery_risk', featuresCol='features')

pipeline = Pipeline(stages=[indexer, encoder, assembler, rf])

model = pipeline.fit(df)

rf_model = model.stages[-1]
importances = rf_model.featureImportances.toArray()

# `featureImportances` has one entry per slot of the assembled vector, and a
# one-hot encoded column occupies many slots. The assembler's input column
# names therefore do not line up with it. The per-slot names are stored in the
# ML attribute metadata of the 'features' column; only the schema is read here.
features_meta = model.transform(df).schema['features'].metadata
slot_names = {
    attr['idx']: attr['name']
    for attr_group in features_meta['ml_attr']['attrs'].values()
    for attr in attr_group
}
feature_names = [
    slot_names.get(idx, f"slot_{idx}") for idx in range(len(importances))
]

# Print only the slots the forest actually used, most important first.
ranked = sorted(
    (
        (name, float(imp))
        for name, imp in zip(feature_names, importances)
        if imp > 0
    ),
    key=lambda pair: pair[1],
    reverse=True,
)
for name, imp in ranked:
    print(f"{name}: {imp}")

ERROR:root:Exception while sending command.
Traceback (most recent call last):
  File "/usr/local/spark/python/lib/py4j-0.10.9.7-src.zip/py4j/clientserver.py", line 516, in send_command
    raise Py4JNetworkError("Answer from Java side is empty")
py4j.protocol.Py4JNetworkError: Answer from Java side is empty

During handling of the above exception, another exception occurred:

Traceback (most recent call last):
  File "/usr/local/spark/python/lib/py4j-0.10.9.7-src.zip/py4j/java_gateway.py", line 1038, in send_command
    response = connection.send_command(command)
  File "/usr/local/spark/python/lib/py4j-0.10.9.7-src.zip/py4j/clientserver.py", line 539, in send_command
    raise Py4JNetworkError(
py4j.protocol.Py4JNetworkError: Error while sending or receiving
ERROR:root:Exception while sending command.
Traceback (most recent call last):
  File "/opt/conda/lib/python3.10/site-packages/IPython/core/interactiveshell.py", line 3508, in run_code
    exec(code_obj, self.user_global_ns, self

ConnectionRefusedError: [Errno 111] Connection refused

In [None]:
# Columns removed before the cleaned dataset is built. Each name is listed
# once, spelled and cased exactly as in the DataFrame schema.
# `DataFrame.drop` ignores names that are not present, so columns already
# dropped in an earlier cell are harmless here.
columns_redondantes = [
    'Type',
    'Days for shipping (real)',
    'Delivery Status',
    'Customer Fname',
    'Customer Lname',
    'Customer Email',
    'Customer Password',
    'Order Id',
    'Customer Id',
    'Order Item Id',
    'Order Customer Id',
    'Order Item Cardprod Id',
    'Product Card Id',
    'Product Category Id',
    'Department Id',
    'Product Description',
    'Product Image',
    'Product Name',
    'Order Item Total',
    'Order Profit Per Order',
    'Benefit per order',
    'Sales per customer',
    'Order Item Profit Ratio',
    'Customer Country',
    'Order Zipcode',
    'Product Status',
    'Customer State',
    'Customer Street',
    'Customer Zipcode',
    'Department Name',
    'Latitude',
    'Longitude',
    'Market',
    'Order City',
    'Order Country',
    'Order Item Discount',
    'Order Item Discount Rate',
    'Order Item Product Price',
    'Order Item Quantity',
    'Sales',
    'Order Status',
    'Product Price',
    'shipping date (DateOrders)',
    'Shipping Mode',
    'Category Name',
    'Customer City',
    'Order State',
]

# Legacy misspelled name, kept so any cell still referencing it keeps working.
olumns_redondantes = columns_redondantes

df_columns_cleaned = df.drop(*columns_redondantes)


In [None]:
from pyspark.sql import functions as f

# Show the rows whose 'Product Status' is non-zero.
product_status_is_set = f.col('Product Status') != 0
df.where(product_status_is_set).show()

# df.select('Product Category Id').filter(f.col('Product Category Id') != 73).show()



In [None]:
# Replace the raw order-date string by the month number only:
# parse it into a timestamp, extract the month, then drop the raw column and
# the intermediate timestamp.
df_columns_cleaned = (
    df_columns_cleaned
    .withColumn(
        'order_date',
        f.to_timestamp('order date (DateOrders)', "M/d/yyyy H:mm"),
    )
    .withColumn('order_month', f.month('order_date'))
    .drop('order date (DateOrders)', 'order_date')
)

df_columns_cleaned.show()

In [None]:
# Write the cleaned frame as CSV with a header row, replacing any previous
# output; coalesce(1) collapses the data to a single partition first.
# NOTE(review): the input is read from '../data/...' but this writes under
# './data/...' -- confirm the output location is intended.
single_partition = df_columns_cleaned.coalesce(1)
single_partition.write.mode("overwrite").csv(
    "./data/data_cleaned",
    header=True,
)

In [None]:
# Number of rows with a non-null order_month, per month.
rows_by_month = df_columns_cleaned.groupBy('order_month')
rows_by_month.agg(f.count('order_month')).show()