In [1]:
import pyspark
import pandas as pd
import numpy as np
from pyspark.sql import SparkSession
from sklearn import datasets
from pyspark.sql.functions import pandas_udf, PandasUDFType
from pyspark.sql import Window
from pyspark.ml.feature import Normalizer
from pyspark.mllib.linalg.distributed import *
from pyspark.sql.types import *
from pyspark.sql.functions import *
import os
# Arrow compatibility switch, set before the Spark session is created.
# NOTE(review): presumably needed because the installed PyArrow is >= 0.15 while
# this Spark version still expects the older Arrow IPC format used by pandas
# UDFs — confirm against the environment's Spark/PyArrow versions.
os.environ['ARROW_PRE_0_15_IPC_FORMAT']='1'

In [2]:
# Reuse the running Spark session or start a new one.
# Note: despite its name, `sc` is a SparkSession, not a SparkContext.
sc = SparkSession.builder.getOrCreate()
# Turn on Arrow for Spark SQL execution (the pandas UDF cells below rely on Arrow).
sc.conf.set("spark.sql.execution.arrow.enabled", "true")

These are some methods I came up with for computing the cosine similarity matrix of each group in a grouped dataset. For these methods I only computed the average of the pairwise similarities, but once we have the similarity matrix there are other ways to condense it to a single number (singular values seem promising). These methods could also be adapted with small changes to compute pairwise covariance or Pearson correlation instead.

In [4]:
# The iris dataset serves as a small, labelled test bed for these functions.
iris = datasets.load_iris()
X_class = pd.DataFrame(iris.data, columns=iris.feature_names)
y_class = pd.DataFrame(iris.target, columns=['label'])

# Features and label side by side, plus the row position as an explicit id.
iris_df = pd.concat([X_class, y_class], axis=1)
iris_df = iris_df.assign(id=iris_df.index)

# Short column names so the Spark code can refer to the features as A-D.
iris_df.columns = ['A', 'B', 'C', 'D', 'label', 'id']

dfclust = sc.createDataFrame(iris_df)

In [5]:
@pandas_udf("float", PandasUDFType.GROUPED_AGG)
def avgcosine(a,b,c,d):
    """Average pairwise cosine similarity between the rows of one group.

    Args:
        a, b, c, d: The group's four feature columns, all of the same length.
            Position ``i`` across the four columns forms observation ``i``.

    Returns:
        The mean cosine similarity over all ordered pairs ``(i, j)`` with
        ``i != j``, as a Python float. NaN when the group has fewer than two
        rows (no pair exists). A row whose features are all zero has no
        direction, so it turns the result into NaN as well.
    """
    num_rows = len(a)
    if num_rows < 2:
        # No off-diagonal entries exist; avoid the 0/0 division below.
        return float("nan")
    num_entries = num_rows * (num_rows - 1)

    # Stacking the columns gives shape (4, num_rows): each matrix column is one
    # of the original rows.
    features = np.array([a, b, c, d], dtype=float)

    # L2-normalise every original row (i.e. every column of `features`).
    norms = np.sqrt(np.square(features).sum(axis=0))
    normed = features / norms

    # Gram matrix of the unit vectors = all pairwise cosine similarities.
    cosines = normed.T @ normed

    # Drop the self-similarities. Subtracting the actual diagonal (rather than
    # assuming it is exactly 1.0) keeps rounding error out of the average.
    off_diagonal_total = cosines.sum() - np.trace(cosines)

    return float(off_diagonal_total / num_entries)

In [6]:
# Reduce every label group to its mean pairwise cosine similarity.
sum2 = (
    dfclust
    .groupby('label')
    .agg(avgcosine(*(dfclust[name] for name in ('A', 'B', 'C', 'D'))))
)

In [7]:
# Print the per-label averages computed by the pandas UDF.
sum2.show()

+-----+---------------------+
|label|avgcosine(A, B, C, D)|
+-----+---------------------+
|    0|           0.99776626|
|    1|           0.99767566|
|    2|            0.9977042|
+-----+---------------------+



This code filters the dataframe on a label, then takes the filtered dataframe and computes its cosine similarity matrix using Spark MLlib distributed-matrix operations.

(Since the DIMSUM method used by `columnSimilarities` returns an upper triangular matrix, the denominator for the average is half that of the other method: $n(n-1)/2$ instead of $n(n-1)$.)

In [11]:
def cosine_sim(dfclust, label):
    """Average pairwise cosine similarity of the rows carrying ``label``.

    Uses Spark MLlib distributed matrices instead of pandas/NumPy.

    Args:
        dfclust: Spark DataFrame with feature columns ``A``-``D``, a ``label``
            column and an integer ``id`` column used as the matrix row index.
        label: Value of ``label`` selecting the group to evaluate.

    Returns:
        The mean cosine similarity over all unordered row pairs of the group,
        or NaN when the group has fewer than two rows (no pair exists).
    """
    label_group = dfclust.filter(dfclust['label'] == label)
    rows = label_group.count()
    if rows < 2:
        # Nothing to compare; also avoids dividing by zero below.
        return float("nan")

    # columnSimilarities() only yields the upper triangle, so every unordered
    # pair is counted once.
    entries = (rows * (rows - 1)) / 2

    # Build the matrix with one data row per matrix row, then transpose it so
    # that each data row becomes a column (columnSimilarities compares columns).
    # NOTE(review): `id` is the global row index rather than 0..rows-1, so the
    # matrix presumably spans up to the largest id; verify that the padding
    # columns stay all-zero and therefore add nothing to the sum.
    irm = IndexedRowMatrix(label_group.rdd.map(lambda x: (x.id, [x.A, x.B, x.C, x.D])))
    irmt = irm.toCoordinateMatrix().transpose()

    # Pairwise similarities, summed straight from the sparse entries so the
    # matrix never has to be densified row by row.
    cosine_matrix = irmt.toRowMatrix().columnSimilarities()
    total_sim = cosine_matrix.entries.map(lambda entry: entry.value).sum()

    return total_sim / entries
    

In [12]:
# Report the MLlib-based average for each of the three iris labels.
for x in range(3):
    print(f" Label: {x}, avgcosine: {cosine_sim(dfclust, x)}")

 Label: 0, avgcosine: 0.9977662646586422
 Label: 1, avgcosine: 0.9976756432421382
 Label: 2, avgcosine: 0.9977042012503872


I ran into a problem here: it seems that PySpark does not allow the creation of user-defined aggregation functions other than through pandas. This is something to explore once I have a better grasp of Scala. I'm using a workaround described here: https://stackoverflow.com/questions/46187630/how-to-write-pyspark-udaf-on-multiple-columns

Another problem I ran into is that I need the columns to be in an RDD to create a row matrix, but you are not allowed to create RDDs inside a UDF.

Until I learn some Scala and can dig into UDFs, I think this means that I either have to manually select groups and then use the Spark matrices to compute cosine similarity, or use something like the pandas UDF, which uses Spark's groupby and agg functions but then relies on pandas and NumPy to compute the cosine similarity.

In [106]:
@udf(returnType=StringType())
def cosine_sim_udf(a,b,c,d):
    """Attempted per-group UDF built on MLlib matrices (does not run).

    The body needs the driver-side Spark session to build an RDD. Spark cannot
    serialize a UDF that references the session/SparkContext, so applying this
    UDF raises a PicklingError (SPARK-5063). It is kept to document why this
    approach does not work.

    Args:
        a, b, c, d: The collected value lists of one group's feature columns,
            all of the same length.

    Returns:
        The mean cosine similarity over all unordered pairs of observations.
        NOTE(review): the declared return type is StringType, so the number
        would presumably arrive as text; consider DoubleType if a working
        variant is ever written.
    """
    # One matrix row per feature, one column per observation, so that
    # columnSimilarities() compares observations.
    rows = [a, b, c, d]
    # `sc` is a SparkSession, so the RDD has to be built via its sparkContext.
    rdd_rows = sc.sparkContext.parallelize(rows)
    rmatrix = RowMatrix(rdd_rows)
    cosine_matrix = rmatrix.columnSimilarities()

    # Number of unordered pairs, derived from the group size instead of a
    # constant that is only right for groups of exactly 50 rows.
    num_obs = len(a)
    num_pairs = num_obs * (num_obs - 1) / 2

    avg_sim = cosine_matrix.toRowMatrix().rows.map(lambda x: x.toArray()).sum().sum() / num_pairs
    # Plain Python float rather than a NumPy scalar for the UDF result.
    return float(avg_sim)


In [103]:
# Collapse each label group into a single row holding one list per feature column.
grouped = dfclust.groupBy("label").agg(
    *[collect_list(dfclust[name]).alias(name) for name in ('A', 'B', 'C', 'D')]
)

In [104]:
# Inspect the collected lists (show() truncates the long list cells).
grouped.show()

+-----+--------------------+--------------------+--------------------+--------------------+
|label|                   A|                   B|                   C|                   D|
+-----+--------------------+--------------------+--------------------+--------------------+
|    0|[5.1, 4.9, 4.7, 4...|[3.5, 3.0, 3.2, 3...|[1.4, 1.4, 1.3, 1...|[0.2, 0.2, 0.2, 0...|
|    1|[7.0, 6.4, 6.9, 5...|[3.2, 3.2, 3.1, 2...|[4.7, 4.5, 4.9, 4...|[1.4, 1.5, 1.5, 1...|
|    2|[6.3, 5.8, 7.1, 6...|[3.3, 2.7, 3.0, 2...|[6.0, 5.1, 5.9, 5...|[2.5, 1.9, 2.1, 1...|
+-----+--------------------+--------------------+--------------------+--------------------+



In [108]:
# Apply the UDF to the collected per-label lists. This call is expected to fail:
# Spark cannot serialize a UDF that references the Spark session, which raises
# the PicklingError shown below (SPARK-5063).
grouped.select( grouped['label'], cosine_sim_udf(grouped['A'],grouped['B'],grouped['C'],grouped['D']) ).show()

Traceback (most recent call last):
  File "/usr/local/spark/python/pyspark/serializers.py", line 597, in dumps
    return cloudpickle.dumps(obj, 2)
  File "/usr/local/spark/python/pyspark/cloudpickle.py", line 863, in dumps
    cp.dump(obj)
  File "/usr/local/spark/python/pyspark/cloudpickle.py", line 260, in dump
    return Pickler.dump(self, obj)
  File "/opt/conda/lib/python3.7/pickle.py", line 437, in dump
    self.save(obj)
  File "/opt/conda/lib/python3.7/pickle.py", line 504, in save
    f(self, obj) # Call unbound method with explicit self
  File "/opt/conda/lib/python3.7/pickle.py", line 774, in save_tuple
    save(element)
  File "/opt/conda/lib/python3.7/pickle.py", line 504, in save
    f(self, obj) # Call unbound method with explicit self
  File "/usr/local/spark/python/pyspark/cloudpickle.py", line 400, in save_function
    self.save_function_tuple(obj)
  File "/usr/local/spark/python/pyspark/cloudpickle.py", line 549, in save_function_tuple
    save(state)
  File "/opt/c

PicklingError: Could not serialize object: Exception: It appears that you are attempting to reference SparkContext from a broadcast variable, action, or transformation. SparkContext can only be used on the driver, not in code that it run on workers. For more information, see SPARK-5063.