# Exemplos de consultas

## Linguagens

### SQL
`Calcule a média móvel do faturamento a cada 3 meses`

In [0]:
%sql
-- Trailing moving average of sales_amount over the current row and the two
-- rows before it, in month order.
-- NOTE(review): this equals a 3-month average only if the table holds exactly
-- one row per month - confirm.
SELECT
  month,
  AVG(sales_amount) OVER (
    ORDER BY month
    ROWS BETWEEN 2 PRECEDING AND CURRENT ROW
  ) AS moving_average
FROM vr_demo.crisp.forecast
ORDER BY month

### R
`Use o dataframe abaixo para fazer a previsão do sales_amount usando o ARIMA. Use a biblioteca forecast`

In [0]:
%r
# Forecast sales_amount with an automatically selected ARIMA model.

# Load the forecast library
library(forecast)

# Read the table into a Spark DataFrame
df <- SparkR::tableToDF('vr_demo.crisp.forecast')

# Convert the Spark DataFrame to a local R dataframe
df_local <- SparkR::collect(df)

# Reading a Spark table gives no row-order guarantee, and ts() takes the
# observations in the order they are supplied, so sort by month first.
# NOTE(review): assumes `month` sorts chronologically (a date type or
# ISO-formatted text) - confirm.
df_local <- df_local[order(df_local$month), ]

# Convert the sales_amount column to a time series object
# NOTE(review): no frequency is passed to ts(), so no seasonal period is
# declared; if the data is monthly, frequency = 12 may be wanted - confirm.
sales_ts <- ts(df_local$sales_amount)

# Fit an ARIMA model to the time series data
arima_model <- auto.arima(sales_ts)

# Make a forecast using the ARIMA model.
# The result is stored under its own name so the variable does not shadow
# the forecast() function.
sales_forecast <- forecast(arima_model)

# Print the forecasted values
print(sales_forecast)

### Python

In [0]:
# Look up the vr_demo.crisp.forecast table through the Spark session and
# render it with the notebook's display().
display(spark.table('vr_demo.crisp.forecast'))

## Formatos de armazenamento

### CSV

In [0]:
# List the entries found under the CSV folder.
display(dbutils.fs.ls("s3://one-env/vr/crisp/forecast/csv"))

In [0]:
# Show the beginning of one specific CSV part file.
# NOTE(review): the part-file name is hard-coded; it presumably changes
# whenever the data is rewritten - confirm it still exists before running.
display(dbutils.fs.head("s3://one-env/vr/crisp/forecast/csv/part-00000-tid-8981858333522022334-7ffb5121-a2b2-44bf-9e8d-536af44bb998-252-1-c000.csv"))

In [0]:
%sql
-- Inspect the metadata of the forecast_csv table.
DESCRIBE EXTENDED vr_demo.crisp.forecast_csv

In [0]:
# Look up the vr_demo.crisp.forecast_csv table and render it.
display(spark.table('vr_demo.crisp.forecast_csv'))

### JSON

In [0]:
# List the entries found under the JSON folder.
display(dbutils.fs.ls("s3://one-env/vr/crisp/forecast/json"))

In [0]:
# Show the beginning of one specific JSON part file.
# NOTE(review): the part-file name is hard-coded; it presumably changes
# whenever the data is rewritten - confirm it still exists before running.
display(dbutils.fs.head("s3://one-env/vr/crisp/forecast/json/part-00000-tid-4916521996223329909-04b97eaf-a990-491e-ac8e-d5bf14557ac5-253-1-c000.json"))

In [0]:
%sql
-- Inspect the metadata of the forecast_json table.
DESCRIBE EXTENDED vr_demo.crisp.forecast_json

In [0]:
# Look up the vr_demo.crisp.forecast_json table and render it.
display(spark.table('vr_demo.crisp.forecast_json'))

### Parquet

In [0]:
# List the entries found under the Parquet folder.
display(dbutils.fs.ls("s3://one-env/vr/crisp/forecast/parquet"))

In [0]:
# Show the beginning of one specific Parquet part file.
# NOTE(review): Parquet is a binary format, so the output is presumably not
# human-readable; this looks like a deliberate contrast with the CSV and JSON
# cells - confirm.
# NOTE(review): the part-file name is hard-coded; it presumably changes
# whenever the data is rewritten - confirm it still exists before running.
display(dbutils.fs.head("s3://one-env/vr/crisp/forecast/parquet/part-00000-tid-1972469917384408746-5bfac72e-16b7-454e-bed6-0b75cd8adc29-250-1.c000.snappy.parquet"))

In [0]:
%sql
-- Inspect the metadata of the forecast_parquet table.
DESCRIBE EXTENDED vr_demo.crisp.forecast_parquet

In [0]:
# Look up the vr_demo.crisp.forecast_parquet table and render it.
display(spark.table('vr_demo.crisp.forecast_parquet'))

Databricks data profile. Run in Databricks to view.

### PDF
`Escreva um código para ler arquivos PDF em /Volumes/vr_demo/chatbot3/pdfs para um Spark DataFrame`

In [0]:
# List the entries found in the PDF volume folder.
display(dbutils.fs.ls("/Volumes/vr_demo/chatbot3/pdfs"))

In [0]:
import io
import warnings
from pypdf import PdfReader
from pyspark.sql.functions import pandas_udf
from typing import Iterator
import pandas as pd

def parse_bytes_pypdf(raw_doc_contents_bytes: bytes):
    """Extract the text of a PDF supplied as raw bytes.

    Args:
        raw_doc_contents_bytes: The bytes of the PDF document.

    Returns:
        The text extracted from every page, with pages separated by a
        newline, or None if any exception is raised while reading the
        document or extracting its text (a warning is emitted in that case).
    """
    try:
        # Wrap the bytes in an in-memory stream for PdfReader.
        reader = PdfReader(io.BytesIO(raw_doc_contents_bytes))
        page_texts = []
        for page in reader.pages:
            page_texts.append(page.extract_text())
        return "\n".join(page_texts)
    except Exception as e:
        # Best effort: one unreadable document must not fail the whole batch.
        warnings.warn(f"Exception {e} has been thrown during parsing")
        return None
    
@pandas_udf("string")
def parse_pdf(content: pd.Series) -> pd.Series:
    """Pandas UDF (string result) that runs parse_bytes_pypdf on each element.

    Args:
        content: Series whose elements are passed one by one to
            parse_bytes_pypdf.

    Returns:
        A Series holding the result of parse_bytes_pypdf for each element.
    """
    parsed_texts = content.apply(parse_bytes_pypdf)
    return parsed_texts

# Load the files under the volume with the binaryFile reader, then add a
# parsed_content column computed by parse_pdf from the content column.
df = (
    spark.read
    .format('binaryFile')
    .load('/Volumes/vr_demo/chatbot3/pdfs')
    .withColumn("parsed_content", parse_pdf("content"))
)

# Render the resulting DataFrame in the notebook.
display(df)

In [0]:
import io
import warnings
import pandas as pd
from pypdf import PdfReader
from pyspark.sql.functions import pandas_udf
from typing import Iterator

# NOTE(review): a function with this same name is also defined in an earlier
# cell of this notebook; this definition replaces it when the cell runs.
def parse_bytes_pypdf(raw_doc_contents_bytes: bytes):
    """Extract the text of a PDF supplied as raw bytes.

    Args:
        raw_doc_contents_bytes: The bytes of the PDF document.

    Returns:
        The text extracted from every page, with pages separated by a
        newline, or None if any exception is raised while reading the
        document or extracting its text (a warning is emitted in that case).
    """
    try:
        # Wrap the bytes in an in-memory stream for PdfReader.
        reader = PdfReader(io.BytesIO(raw_doc_contents_bytes))
        page_texts = []
        for page in reader.pages:
            page_texts.append(page.extract_text())
        return "\n".join(page_texts)
    except Exception as e:
        # Best effort: one unreadable document must not fail the whole batch.
        warnings.warn(f"Exception {e} has been thrown during parsing")
        return None
    
# NOTE(review): a UDF with this same name is also defined in an earlier cell
# of this notebook; this definition replaces it when the cell runs.
@pandas_udf("string")
def parse_pdf(content: pd.Series) -> pd.Series:
    """Pandas UDF (string result) that runs parse_bytes_pypdf on each element.

    Args:
        content: Series whose elements are passed one by one to
            parse_bytes_pypdf.

    Returns:
        A Series holding the result of parse_bytes_pypdf for each element.
    """
    parsed_texts = content.apply(parse_bytes_pypdf)
    return parsed_texts

# Load the files under the documentation volume with the binary-file reader,
# then add a parsed_content column computed by parse_pdf from the content
# column.
df = (
    spark.read
    .format('BINARYFILE')
    .load('/Volumes/main/dbdemos_rag_chatbot/volume_databricks_documentation/databricks-pdf')
    .withColumn("parsed_content", parse_pdf("content"))
)

# Render the resulting DataFrame in the notebook.
display(df)