## Instalando las dependencias necesarias

In [1]:
!apt-get install openjdk-8-jdk-headless -qq > /dev/null
!wget -q http://archive.apache.org/dist/spark/spark-3.1.1/spark-3.1.1-bin-hadoop3.2.tgz
!tar xf spark-3.1.1-bin-hadoop3.2.tgz
!pip install -q findspark

#### Definiendo variables de entorno

In [2]:
import os
# Register the Java 8 JDK and the extracted Spark distribution in the
# process environment in a single call.
os.environ.update({
    "JAVA_HOME": "/usr/lib/jvm/java-8-openjdk-amd64",
    "SPARK_HOME": "/content/spark-3.1.1-bin-hadoop3.2",
})

In [3]:
# List the working directory to check that the Spark archive was downloaded and extracted.
!ls

sample_data  spark-3.1.1-bin-hadoop3.2	spark-3.1.1-bin-hadoop3.2.tgz


In [4]:
! pip install pyspark

Collecting pyspark
  Downloading pyspark-3.4.1.tar.gz (310.8 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m310.8/310.8 MB[0m [31m4.4 MB/s[0m eta [36m0:00:00[0m
[?25h  Preparing metadata (setup.py) ... [?25l[?25hdone
Building wheels for collected packages: pyspark
  Building wheel for pyspark (setup.py) ... [?25l[?25hdone
  Created wheel for pyspark: filename=pyspark-3.4.1-py2.py3-none-any.whl size=311285388 sha256=a00c610701a8b6e79af750aec457751cefd2ab95aa4d5b4eaa9c04c8ea0b8f6e
  Stored in directory: /root/.cache/pip/wheels/0d/77/a3/ff2f74cc9ab41f8f594dabf0579c2a7c6de920d584206e0834
Successfully built pyspark
Installing collected packages: pyspark
Successfully installed pyspark-3.4.1


In [5]:
import findspark

# Must run before the pyspark import below.
findspark.init()

from pyspark.sql import SparkSession

# Local-mode session using every available core ("local[*]").
spark = (
    SparkSession.builder
    .master("local[*]")
    .getOrCreate()
)

# Property used to format output tables better
spark.conf.set("spark.sql.repl.eagerEval.enabled", True)

# Last expression of the cell: display the session summary.
spark

## Trabajar con datos

In [6]:
# Show the built-in documentation for a PySpark function
help(spark.read.text)

Help on method text in module pyspark.sql.readwriter:

text(paths, wholetext=False, lineSep=None, pathGlobFilter=None, recursiveFileLookup=None, modifiedBefore=None, modifiedAfter=None) method of pyspark.sql.readwriter.DataFrameReader instance
    Loads text files and returns a :class:`DataFrame` whose schema starts with a
    string column named "value", and followed by partitioned columns if there
    are any.
    The text files must be encoded as UTF-8.
    
    By default, each line in the text file is a new row in the resulting DataFrame.
    
    .. versionadded:: 1.6.0
    
    Parameters
    ----------
    paths : str or list
        string, or list of strings, for input path(s).
    wholetext : str or bool, optional
        if true, read each file from input path(s) as a single row.
    lineSep : str, optional
        defines the line separator that should be used for parsing. If None is
        set, it covers all ``\r``, ``\r\n`` and ``\n``.
    pathGlobFilter : str or bool, 

In [9]:
# Mount Google Drive at /content/drive; the retail_db files read later live under MyDrive.
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


In [10]:
# Display the file on screen without truncating it.
# wholetext=True reads the file as a single row instead of one row per line.
spark.read.text('/content/drive/MyDrive/retail_db/schemas.json', wholetext=True).show(truncate=False)

+---------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------

In [13]:
# Take the first record of the file and keep its `value` column in a text (str) variable
schema_text = spark.read.text('/content/drive/MyDrive/retail_db/schemas.json', wholetext=True).first().value

schema_text

'{\r\n    "departments": [\r\n        {\r\n            "column_name": "department_id",\r\n            "data_type": "integer",\r\n            "column_position": 1\r\n        },\r\n        {\r\n            "column_name": "department_name",\r\n            "data_type": "string",\r\n            "column_position": 2\r\n        }\r\n    ],\r\n    "categories": [\r\n        {\r\n            "column_name": "category_id",\r\n            "data_type": "integer",\r\n            "column_position": 1\r\n        },\r\n        {\r\n            "column_name": "category_department_id",\r\n            "data_type": "integer",\r\n            "column_position": 2\r\n        },\r\n        {\r\n            "column_name": "category_name",\r\n            "data_type": "string",\r\n            "column_position": 3\r\n        }\r\n    ],\r\n    "orders": [\r\n        {\r\n            "column_name": "order_id",\r\n            "data_type": "integer",\r\n            "column_position": 1\r\n        },\r\n        {\r\n 

In [14]:
# Confirm that the file content was captured as a plain Python string.
type(schema_text)

str

In [15]:
# Libreria de python usada para manejar los archivos json
import json

In [16]:
# Parse the JSON text into Python objects and keep the entry stored under the
# 'orders' key (a list with one dict per column, as the displayed output shows).
column_details = json.loads(schema_text)['orders']
column_details

[{'column_name': 'order_id', 'data_type': 'integer', 'column_position': 1},
 {'column_name': 'order_date', 'data_type': 'string', 'column_position': 2},
 {'column_name': 'order_customer_id',
  'data_type': 'timestamp',
  'column_position': 3},
 {'column_name': 'order_status', 'data_type': 'string', 'column_position': 4}]

In [17]:
# Preview the column entries ordered by their 'column_position' value.
# sorted() returns a new list; column_details itself is left unchanged.
sorted(column_details, key=lambda col: col['column_position'])

[{'column_name': 'order_id', 'data_type': 'integer', 'column_position': 1},
 {'column_name': 'order_date', 'data_type': 'string', 'column_position': 2},
 {'column_name': 'order_customer_id',
  'data_type': 'timestamp',
  'column_position': 3},
 {'column_name': 'order_status', 'data_type': 'string', 'column_position': 4}]

In [18]:
# Build the list of column names, ordered by each entry's 'column_position'.
columns = [
    entry['column_name']
    for entry in sorted(column_details, key=lambda item: item['column_position'])
]

In [19]:
# Show the built-in documentation for the CSV reader and its options.
help(spark.read.csv)

Help on method csv in module pyspark.sql.readwriter:

csv(path, schema=None, sep=None, encoding=None, quote=None, escape=None, comment=None, header=None, inferSchema=None, ignoreLeadingWhiteSpace=None, ignoreTrailingWhiteSpace=None, nullValue=None, nanValue=None, positiveInf=None, negativeInf=None, dateFormat=None, timestampFormat=None, maxColumns=None, maxCharsPerColumn=None, maxMalformedLogPerPartition=None, mode=None, columnNameOfCorruptRecord=None, multiLine=None, charToEscapeQuoteEscaping=None, samplingRatio=None, enforceSchema=None, emptyValue=None, locale=None, lineSep=None, pathGlobFilter=None, recursiveFileLookup=None, modifiedBefore=None, modifiedAfter=None, unescapedQuoteHandling=None) method of pyspark.sql.readwriter.DataFrameReader instance
    Loads a CSV file and returns the result as a  :class:`DataFrame`.
    
    This function will go through the input once to determine the input schema if
    ``inferSchema`` is enabled. To avoid going through the entire data once, di

In [20]:
# Read the CSV data, let Spark infer the column types, and then rename the
# columns with the names extracted from the schema file.
orders = (
    spark.read
    .csv('/content/drive/MyDrive/retail_db/orders', inferSchema=True)
    .toDF(*columns)
)

In [21]:
from pyspark.sql.functions import count, col

In [22]:
# Aggregation in PySpark: count the orders per status and list the statuses
# from most to least frequent.
(
    orders
    .groupBy('order_status')
    .agg(count('*').alias('order_count'))
    .orderBy(col('order_count').desc())
    .show()
)

+---------------+-----------+
|   order_status|order_count|
+---------------+-----------+
|       COMPLETE|      22899|
|PENDING_PAYMENT|      15030|
|     PROCESSING|       8275|
|        PENDING|       7610|
|         CLOSED|       7556|
|        ON_HOLD|       3798|
|SUSPECTED_FRAUD|       1558|
|       CANCELED|       1428|
| PAYMENT_REVIEW|        729|
+---------------+-----------+



# Test