In [2]:
from pyspark import SparkContext, SparkConf
from pyspark.sql import SparkSession

# Reuse a running SparkContext if one exists, otherwise start one with the
# "local[*]" master; the SQL session is then obtained (or created) the same way.
local_conf = SparkConf().setMaster("local[*]")
sc = SparkContext.getOrCreate(conf=local_conf)
spark = SparkSession.builder.getOrCreate()

In [3]:
# Load the washing dataset from its parquet file into a DataFrame.
df = spark.read.format('parquet').load('washing.parquet')

In [4]:
### using SQL ###
df.createOrReplaceTempView('washing')

# Every aggregate Spark SQL provides directly is fetched in a single query
# (one scan of the table instead of one scan per statistic).
basic_stats = spark.sql("""select
            min(temperature) as mntp,
            max(temperature) as mxtp,
            avg(temperature) as avtp,
            stddev_pop(temperature) as stdtp,
            corr(temperature,hardness) as corrtp
            from washing""").first()
min_temp, max_temp = basic_stats.mntp, basic_stats.mxtp
temp_mean, std_temp = basic_stats.avtp, basic_stats.stdtp
corr_temp_hard = basic_stats.corrtp

# The standardized 3rd and 4th moments need the mean and the population std,
# so they are computed in a second query with those values interpolated.
# The interpolated numbers are wrapped in parentheses: without them a negative
# mean would render as "temperature--5.0", and "--" starts a SQL line comment.
# NOTE: kurt_temp is the raw fourth standardized moment (about 3 for a normal
# distribution), not excess kurtosis.
moments = spark.sql("""select
            1/count(temperature)*sum(pow(temperature-(%s),3))/pow((%s),3) as skwtp,
            1/count(temperature)*sum(pow(temperature-(%s),4))/pow((%s),4) as kurttp
            from washing"""%(temp_mean,std_temp,temp_mean,std_temp)).first()
skew_temp, kurt_temp = moments.skwtp, moments.kurttp

print('min temp = %3i \nmax temp = %3i \nmean temp = %3.2f\nstdev = %2.2f'%(min_temp,max_temp,temp_mean,std_temp))
print('skewness = %1.3f \nkurtosis = %1.3f \ncorr_temp_hard = %1.3f'%(skew_temp,kurt_temp,corr_temp_hard))

min temp =  80 
max temp = 100 
mean temp = 90.04
stdev = 6.10
skewness = 0.010 
kurtosis = 1.776 
corr_temp_hard = 0.018


In [32]:
### using rdd  ###
# Narrow the frame to the temperature column and peek at the first five rows.
temp = df.select(df['temperature'])
temp.take(5)

[Row(temperature=100),
 Row(temperature=None),
 Row(temperature=None),
 Row(temperature=86),
 Row(temperature=None)]

In [33]:
# Discard rows whose temperature is missing, then unwrap each Row
# so the RDD holds the bare temperature values.
tmp = temp.na.drop().rdd.map(lambda record: record['temperature'])
tmp.take(5)

[100, 86, 84, 84, 96]

In [34]:
# Summary statistics of the temperature RDD, each via its own RDD action.
mean, std = tmp.mean(), tmp.stdev()
mn, mx = tmp.min(), tmp.max()
summary_template = 'min temp = %3i \nmax temp = %3i \nmean temp = %3.2f\nstdev = %2.2f'
print(summary_template % (mn, mx, mean, std))

min temp =  80 
max temp = 100 
mean temp = 90.04
stdev = 6.10


In [35]:
# Alternatively, a single stats() call reports count, mean, stdev, max and min together
tmp.stats()

(count: 1342, mean: 90.03800298062592, stdev: 6.098487624200338, max: 100.0, min: 80.0)

In [58]:
from pyspark.sql import functions as f

# Built-in DataFrame aggregates for the higher moments, shown one table each.
# NOTE(review): the kurtosis recorded below is 3 lower than the hand-computed
# fourth standardized moment in this notebook, which suggests Spark reports
# excess kurtosis — confirm against the Spark documentation.
for moment_fn in (f.skewness, f.kurtosis):
    df.select(moment_fn('temperature')).show()

+---------------------+
|skewness(temperature)|
+---------------------+
| 0.010410008042945581|
+---------------------+

+---------------------+
|kurtosis(temperature)|
+---------------------+
|  -1.2239269305786886|
+---------------------+



In [74]:
# Alternatively, build the standardized moments from the ground up.
n = tmp.count()


def _standardized_moment(order):
    """Sum over tmp of (x - mean)**order / (n * std**order), term by term."""
    # reduce(lambda a,b: a+b ) also works in place of sum()
    return tmp.map(lambda x: pow(x-mean,order)*1/n*1/pow(std,order)).sum()


skew = _standardized_moment(3)
kurt = _standardized_moment(4)
print('skewness = %1.3f \nkurtosis = %1.3f'%(skew,kurt))

skewness = 0.010 
kurtosis = 1.776


In [84]:
# correlation
from pyspark.mllib.stat import Statistics

# Non-null hardness values on their own; kept under this name because later
# cells read `hard`.
hard = df.select('hardness').na.drop().rdd.map(lambda row: row.hardness)

# Statistics.corr pairs the two RDDs element by element, so both inputs must
# come from the SAME rows. Dropping nulls per column independently only lines
# up if both columns are null in exactly the same rows; filtering the two
# columns together guarantees the pairing.
paired_rows = df.select('temperature', 'hardness').na.drop().rdd
paired_temp = paired_rows.map(lambda row: row.temperature)
paired_hard = paired_rows.map(lambda row: row.hardness)

corr = Statistics.corr(paired_temp, paired_hard, method='pearson')
print('temp_hardness_corr = %1.3f'%(corr))

temp_hardness_corr = 0.018


In [90]:
# Alternatively, compute the Pearson correlation by hand.
# The (temperature, hardness) pairs are built from rows where BOTH values are
# present, so each pair really belongs to one record; zipping two RDDs that
# were null-filtered independently would misalign (or fail) as soon as the
# columns are not null in exactly the same rows.
tmp_hrd = (df.select('temperature', 'hardness')
             .na.drop()
             .rdd.map(lambda row: (row.temperature, row.hardness)))

# All statistics come from the paired rows themselves, so this cell does not
# depend on n / mean / std left over from earlier cells.
temp_stats = tmp_hrd.map(lambda pair: pair[0]).stats()
hard_stats = tmp_hrd.map(lambda pair: pair[1]).stats()
n_pairs = temp_stats.count()
mean1, std1 = temp_stats.mean(), temp_stats.stdev()   # population stdev
mean2, std2 = hard_stats.mean(), hard_stats.stdev()

# corr = sum((x - mean_x) * (y - mean_y)) / (n * std_x * std_y)
cross_sum = tmp_hrd.map(lambda x: (x[0]-mean1)*(x[1]-mean2)).sum()
denominator = n_pairs*std1*std2
# A constant column (std of 0) makes the correlation undefined: report NaN
# instead of raising ZeroDivisionError.
corr = cross_sum/denominator if denominator else float('nan')
print('temp_hardness_corr = %1.3f'%(corr))

temp_hardness_corr = 0.018
