In [1]:
df = spark.read.format("csv")\
  .option("header", "true")\
  .option("inferSchema", "true")\
  .load("/data/retail-data/all/*.csv")\
  .coalesce(5)

                                                                                

In [7]:
df.count()

541909

In [8]:
df.columns

['InvoiceNo',
 'StockCode',
 'Description',
 'Quantity',
 'InvoiceDate',
 'UnitPrice',
 'CustomerID',
 'Country']

In [2]:
# 컬럼 count 
from pyspark.sql.functions import count
df.select(count("StockCode")).show()



+----------------+
|count(StockCode)|
+----------------+
|          541909|
+----------------+



                                                                                

## countDistinct

In [3]:
# countDistinct
from pyspark.sql.functions import countDistinct
df.select(countDistinct("StockCode")).show()



+-------------------------+
|count(DISTINCT StockCode)|
+-------------------------+
|                     4070|
+-------------------------+



                                                                                

## approx_count_distinct

In [4]:
# 근사치 확인하기
# 사용 용도 -> 데이터셋이 크면... 전체 스캔하는데 시간 오래걸리기 때문에 
# 대략적인 크기 파악(속도... )
from pyspark.sql.functions import approx_count_distinct
df.select(approx_count_distinct("StockCode",0.1) ).show()

+--------------------------------+
|approx_count_distinct(StockCode)|
+--------------------------------+
|                            3364|
+--------------------------------+



## first, last

In [5]:
# 컬럼별 first, last 데이터 확인 
from pyspark.sql.functions import first, last
df.select(first('invoiceno'), last('StockCode')).show()



+----------------+---------------+
|first(invoiceno)|last(StockCode)|
+----------------+---------------+
|          571103|         85099C|
+----------------+---------------+



                                                                                

In [9]:
df.schema

StructType(List(StructField(InvoiceNo,StringType,true),StructField(StockCode,StringType,true),StructField(Description,StringType,true),StructField(Quantity,IntegerType,true),StructField(InvoiceDate,StringType,true),StructField(UnitPrice,DoubleType,true),StructField(CustomerID,IntegerType,true),StructField(Country,StringType,true)))

In [10]:
from pyspark.sql.functions import first, last
df.select(first('stockcode')).show()

+----------------+
|first(stockcode)|
+----------------+
|          85123A|
+----------------+



In [38]:
from pyspark.sql.functions import first, last
df.sort(col('StockCode').desc()).select(first('StockCode'), last('StockCode')).show() 

NameError: name 'col' is not defined

## min, max

In [13]:
from pyspark.sql.functions import min, max

In [14]:
df.select(min("Quantity"), max("Quantity")).show()

+-------------+-------------+
|min(Quantity)|max(Quantity)|
+-------------+-------------+
|       -80995|        80995|
+-------------+-------------+



## sum 

In [15]:
from pyspark.sql.functions import sum
df.select(sum('Quantity')).show()

+-------------+
|sum(Quantity)|
+-------------+
|      5176450|
+-------------+



## sumDistinct 

In [16]:
from pyspark.sql.functions import sum_distinct 
df.select(sum_distinct ('Quantity')).show()



+----------------------+
|sum(DISTINCT Quantity)|
+----------------------+
|                 29310|
+----------------------+



                                                                                

## avg

In [17]:
from pyspark.sql.functions import sum, count, avg, expr

df.select(
    count("Quantity").alias("total_transactions"),
    sum("Quantity").alias("total_purchases"),
    avg("Quantity").alias("avg_purchases"),
    expr("mean(Quantity)").alias("mean_purchases"))\
  .selectExpr(
    "total_purchases/total_transactions",
    "avg_purchases",
    "mean_purchases").show()

+--------------------------------------+----------------+----------------+
|(total_purchases / total_transactions)|   avg_purchases|  mean_purchases|
+--------------------------------------+----------------+----------------+
|                      9.55224954743324|9.55224954743324|9.55224954743324|
+--------------------------------------+----------------+----------------+



## 분산, 표준편차 

In [19]:
from pyspark.sql.functions import var_pop, stddev_pop
from pyspark.sql.functions import var_samp, stddev_samp
df.select(var_pop("Quantity"), var_samp("Quantity"),
  stddev_pop("Quantity"), stddev_samp("Quantity")).show()

+-----------------+------------------+--------------------+---------------------+
|var_pop(Quantity)|var_samp(Quantity)|stddev_pop(Quantity)|stddev_samp(Quantity)|
+-----------------+------------------+--------------------+---------------------+
|  47559.303646609| 47559.39140929869|  218.08095663447781|   218.08115785023404|
+-----------------+------------------+--------------------+---------------------+



In [None]:
# 참고 -> 불편분산

## 상관관계, 공분산 

In [18]:
from pyspark.sql.functions import corr, covar_pop, covar_samp
df.select(corr("InvoiceNo", "Quantity"), covar_samp("InvoiceNo", "Quantity"),
    covar_pop("InvoiceNo", "Quantity")).show()



+-------------------------+-------------------------------+------------------------------+
|corr(InvoiceNo, Quantity)|covar_samp(InvoiceNo, Quantity)|covar_pop(InvoiceNo, Quantity)|
+-------------------------+-------------------------------+------------------------------+
|     4.912186085648426E-4|              1052.728054393167|            1052.7260778770628|
+-------------------------+-------------------------------+------------------------------+



                                                                                

In [24]:
pip install numpy

Collecting numpy
  Downloading numpy-1.22.4-cp38-cp38-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (16.9 MB)
[K     |████████████████████████████████| 16.9 MB 16.3 MB/s eta 0:00:01K     |███                             | 1.6 MB 16.3 MB/s eta 0:00:01███                   | 6.9 MB 16.3 MB/s eta 0:00:010:01
[?25hInstalling collected packages: numpy
Successfully installed numpy-1.22.4
Note: you may need to restart the kernel to use updated packages.


In [30]:
pip install pandas

Collecting pandas
  Downloading pandas-1.4.2-cp38-cp38-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (11.7 MB)
[K     |████████████████████████████████| 11.7 MB 17.9 MB/s eta 0:00:01
[?25hCollecting pytz>=2020.1
  Downloading pytz-2022.1-py2.py3-none-any.whl (503 kB)
[K     |████████████████████████████████| 503 kB 64.9 MB/s eta 0:00:01
Installing collected packages: pytz, pandas
Successfully installed pandas-1.4.2 pytz-2022.1
Note: you may need to restart the kernel to use updated packages.


In [25]:
import numpy as np

In [31]:
import pandas as pd

In [27]:
help(np.var)
# ddof = 0

Help on function var in module numpy:

var(a, axis=None, dtype=None, out=None, ddof=0, keepdims=<no value>, *, where=<no value>)
    Compute the variance along the specified axis.
    
    Returns the variance of the array elements, a measure of the spread of a
    distribution.  The variance is computed for the flattened array by
    default, otherwise over the specified axis.
    
    Parameters
    ----------
    a : array_like
        Array containing numbers whose variance is desired.  If `a` is not an
        array, a conversion is attempted.
    axis : None or int or tuple of ints, optional
        Axis or axes along which the variance is computed.  The default is to
        compute the variance of the flattened array.
    
        .. versionadded:: 1.7.0
    
        If this is a tuple of ints, a variance is performed over multiple axes,
        instead of a single axis or all the axes as before.
    dtype : data-type, optional
        Type to use in computing the variance.  Fo

In [32]:
help(pd.DataFrame.var)
# ddof = 1

Help on function var in module pandas.core.generic:

var(self, axis=None, skipna=True, level=None, ddof=1, numeric_only=None, **kwargs)
    Return unbiased variance over requested axis.
    
    Normalized by N-1 by default. This can be changed using the ddof argument.
    
    Parameters
    ----------
    axis : {index (0), columns (1)}
    skipna : bool, default True
        Exclude NA/null values. If an entire row/column is NA, the result
        will be NA.
    level : int or level name, default None
        If the axis is a MultiIndex (hierarchical), count along a
        particular level, collapsing into a Series.
    ddof : int, default 1
        Delta Degrees of Freedom. The divisor used in calculations is N - ddof,
        where N represents the number of elements.
    numeric_only : bool, default None
        Include only float, int, boolean columns. If None, will attempt to use
        everything, then use only numeric data. Not implemented for Series.
    
    Returns
    

## 그룹화

In [34]:
df.groupBy("InvoiceNo").agg(
    count("Quantity").alias("quan"),
    expr("count(Quantity)")).show()

+---------+----+---------------+
|InvoiceNo|quan|count(Quantity)|
+---------+----+---------------+
|   571906|   1|              1|
|   572049|  20|             20|
|   572458|  26|             26|
|   573020|  55|             55|
|   573256|   1|              1|
|   573409|   1|              1|
|   573726|   1|              1|
|   574592|   8|              8|
|   574844|  13|             13|
|   574966|   8|              8|
|   575091|  38|             38|
|   575671|  20|             20|
|   575948|   4|              4|
|   575961|  13|             13|
|   576059|  44|             44|
|   576112|  20|             20|
|  C576393|   2|              2|
|   577022|  38|             38|
|  C577362|   1|              1|
|   577511|  46|             46|
+---------+----+---------------+
only showing top 20 rows



In [35]:
df.groupBy("InvoiceNo").agg(expr("avg(Quantity)"),expr("stddev_pop(Quantity)"))\
  .show()

+---------+------------------+--------------------+
|InvoiceNo|     avg(Quantity)|stddev_pop(Quantity)|
+---------+------------------+--------------------+
|   571906|               3.0|                 0.0|
|   572049|              8.05|   7.559596550081228|
|   572458|14.038461538461538|   10.04022972933147|
|   573020| 8.272727272727273|   4.726923063991264|
|   573256|              12.0|                 0.0|
|   573409|               4.0|                 0.0|
|   573726|             -67.0|                 0.0|
|   574592|              7.25|  4.4651427748729375|
|   574844| 6.846153846153846|   5.418574235494254|
|   574966|               6.0|   3.640054944640259|
|   575091|11.552631578947368|   5.008925551458656|
|   575671|             16.65|   12.14197265686264|
|   575948|              8.75|    8.98262211161084|
|   575961| 2.769230769230769|  1.5268794800984005|
|   576059|2.8181818181818183|   5.223516436936152|
|   576112|              10.9|  7.4959989327640635|
|  C576393| 