# Correlation

计算两个数据系列之间的相关性是统计学中的常见操作。在 spark.ml 中，我们可以灵活地计算多个系列之间的成对相关性。目前支持的相关方法是 Pearson 相关和 Spearman 相关。

In [1]:
from pyspark.ml.linalg import Vectors

In [2]:
from pyspark.ml.stat import Correlation

In [7]:
from pyspark.sql import SparkSession

In [8]:
# Obtain the SparkSession used by every cell below. The chain is wrapped in
# parentheses so that no backslash line continuations are needed.
spark = (
    SparkSession.builder
    .appName("python spark sql example")
    .config("spark.some.config.option", "some-value")
    .getOrCreate()
)

In [59]:
# Four rows, each a 1-tuple holding one 4-dimensional feature vector.
# Sparse vectors are given as parallel index / value lists.
data = [
    (Vectors.sparse(4, [0, 3], [1.0, -2.0]),),
    (Vectors.dense([4.0, 5.0, 6.0, 7.0]),),
    (Vectors.dense([6.0, 7.0, 8.0, 9.0]),),
    (Vectors.sparse(4, [0, 3], [9.0, 1.0]),),
]

In [60]:
# Echo the raw rows so the notebook shows the constructed vectors.
data

[(SparseVector(4, {0: 1.0, 3: -2.0}),),
 (DenseVector([4.0, 5.0, 6.0, 7.0]),),
 (DenseVector([6.0, 7.0, 8.0, 9.0]),),
 (SparseVector(4, {0: 9.0, 3: 1.0}),)]

In [61]:
# Wrap the rows in a DataFrame with a single column named "features".
df = spark.createDataFrame(data, schema=["features"])

In [63]:
# Convert to a pandas DataFrame so the notebook renders the rows as a table.
df.toPandas()

Unnamed: 0,features
0,"(1.0, 0.0, 0.0, -2.0)"
1,"[4.0, 5.0, 6.0, 7.0]"
2,"[6.0, 7.0, 8.0, 9.0]"
3,"(9.0, 0.0, 0.0, 1.0)"


### Pearson相关系数

In [29]:
# Pearson is the default method; it is named explicitly for readability.
# head() keeps the first Row of the result DataFrame.
r1 = Correlation.corr(df, "features", method="pearson").head()

In [30]:
# asDict is a method: it must be called to get the Row's fields as a dict.
# Without the parentheses the cell only echoes the bound-method object.
r1.asDict()

{'pearson(features)': DenseMatrix(4, 4, [1.0, 0.0556, 0.048, 0.2706, 0.0556, 1.0, 0.9995, 0.9689, 0.048, 0.9995, 1.0, 0.9704, 0.2706, 0.9689, 0.9704, 1.0], False)}

In [31]:
# Field 0 of the result Row holds the correlation matrix.
pearson_matrix = r1[0]
print("Pearson correlation matrix:\n" + str(pearson_matrix))

Pearson correlation matrix:
DenseMatrix([[1.        , 0.05564149, 0.04802921, 0.27055982],
             [0.05564149, 1.        , 0.99948387, 0.96885501],
             [0.04802921, 0.99948387, 1.        , 0.97043119],
             [0.27055982, 0.96885501, 0.97043119, 1.        ]])


### Spearman相关系数

In [32]:
# Same computation as above, using the Spearman method instead.
# head() keeps the first Row of the result DataFrame.
r2 = Correlation.corr(df, "features", method="spearman").head()

In [33]:
# Field 0 of the result Row holds the correlation matrix.
spearman_matrix = r2[0]
print("Spearman correlation matrix:\n" + str(spearman_matrix))

Spearman correlation matrix:
DenseMatrix([[1.        , 0.10540926, 0.10540926, 0.4       ],
             [0.10540926, 1.        , 1.        , 0.9486833 ],
             [0.10540926, 1.        , 1.        , 0.9486833 ],
             [0.4       , 0.9486833 , 0.9486833 , 1.        ]])


# Hypothesis testing

### ChiSquareTest

In [34]:
from pyspark.ml.linalg import Vectors

In [35]:
from pyspark.ml.stat import ChiSquareTest

In [52]:
# (label, features) rows: a float label and a 2-dimensional dense vector.
data = [
    (0.0, Vectors.dense([0.5, 10.0])),
    (0.0, Vectors.dense([1.5, 20.0])),
    (1.0, Vectors.dense([1.5, 30.0])),
    (0.0, Vectors.dense([3.5, 30.0])),
    (0.0, Vectors.dense([3.5, 40.0])),
    (1.0, Vectors.dense([3.5, 40.0])),
]

In [53]:
# Rebind df to a two-column DataFrame: "label" and "features".
df = spark.createDataFrame(data, schema=["label", "features"])

In [54]:
# Print the DataFrame as a text table to check the label/features columns.
df.show()

+-----+----------+
|label|  features|
+-----+----------+
|  0.0|[0.5,10.0]|
|  0.0|[1.5,20.0]|
|  1.0|[1.5,30.0]|
|  0.0|[3.5,30.0]|
|  0.0|[3.5,40.0]|
|  1.0|[3.5,40.0]|
+-----+----------+



In [55]:
# Run ChiSquareTest on the "features" vector column against the "label"
# column; head() keeps the first Row of the result DataFrame.
r = ChiSquareTest.test(df, featuresCol="features", labelCol="label").head()

In [56]:
# The pValues field has one entry per feature in the vector.
p_values = r.pValues
print("pValues: " + str(p_values))

pValues: [0.6872892787909721,0.6822703303362126]


In [57]:
# The degreesOfFreedom field has one entry per feature in the vector.
degrees_of_freedom = r.degreesOfFreedom
print("degreesOfFreedom: " + str(degrees_of_freedom))

degreesOfFreedom: [2, 3]


In [58]:
# The statistics field has one entry per feature in the vector.
test_statistics = r.statistics
print("statistics: " + str(test_statistics))

statistics: [0.75,1.5]
