In [1]:
import pyspark
import pandas as pd
import numpy as np
from pyspark.sql import SparkSession
from sklearn import datasets
from pyspark.sql.functions import pandas_udf, PandasUDFType
from pyspark.sql import Window
from pyspark.ml.feature import Normalizer
from pyspark.mllib.linalg.distributed import *
from pyspark.sql.types import *
import os
os.environ['ARROW_PRE_0_15_IPC_FORMAT']='1'

In [2]:
sc = SparkSession.builder.getOrCreate()
sc.conf.set("spark.sql.execution.arrow.enabled", "true")

In [3]:
iris = datasets.load_iris()
X_class = pd.DataFrame(iris.data, columns=iris.feature_names)
y_class = pd.DataFrame(iris.target, columns=['label'])
classs = pd.concat([X_class,y_class],axis=1)
classs['id']=classs.index
classs.columns=['A','B','C','D','label','id']
num_obs = classs.shape[0]
dfclust = sc.createDataFrame(classs)

In [4]:
dfclust.show()

+---+---+---+---+-----+---+
|  A|  B|  C|  D|label| id|
+---+---+---+---+-----+---+
|5.1|3.5|1.4|0.2|    0|  0|
|4.9|3.0|1.4|0.2|    0|  1|
|4.7|3.2|1.3|0.2|    0|  2|
|4.6|3.1|1.5|0.2|    0|  3|
|5.0|3.6|1.4|0.2|    0|  4|
|5.4|3.9|1.7|0.4|    0|  5|
|4.6|3.4|1.4|0.3|    0|  6|
|5.0|3.4|1.5|0.2|    0|  7|
|4.4|2.9|1.4|0.2|    0|  8|
|4.9|3.1|1.5|0.1|    0|  9|
|5.4|3.7|1.5|0.2|    0| 10|
|4.8|3.4|1.6|0.2|    0| 11|
|4.8|3.0|1.4|0.1|    0| 12|
|4.3|3.0|1.1|0.1|    0| 13|
|5.8|4.0|1.2|0.2|    0| 14|
|5.7|4.4|1.5|0.4|    0| 15|
|5.4|3.9|1.3|0.4|    0| 16|
|5.1|3.5|1.4|0.3|    0| 17|
|5.7|3.8|1.7|0.3|    0| 18|
|5.1|3.8|1.5|0.3|    0| 19|
+---+---+---+---+-----+---+
only showing top 20 rows



In [5]:
sum_by_clust = dfclust.groupby('label').sum()

In [6]:
sum_by_clust.show()

+-----+------------------+------------------+------------------+------------------+----------+-------+
|label|            sum(A)|            sum(B)|            sum(C)|            sum(D)|sum(label)|sum(id)|
+-----+------------------+------------------+------------------+------------------+----------+-------+
|    0|250.29999999999998|171.40000000000003| 73.10000000000001|12.299999999999995|         0|   1225|
|    1|296.79999999999995|             138.5|             213.0| 66.30000000000001|        50|   3725|
|    2| 329.3999999999999|             148.7|277.59999999999997|101.29999999999998|       100|   6225|
+-----+------------------+------------------+------------------+------------------+----------+-------+



In [7]:
sum2 = dfclust.groupby('label').agg({'C':'sum'})

In [8]:
sum2.show()

+-----+------------------+
|label|            sum(C)|
+-----+------------------+
|    0| 73.10000000000001|
|    1|             213.0|
|    2|277.59999999999997|
+-----+------------------+



In [9]:
@pandas_udf("double", PandasUDFType.GROUPED_AGG)  
def mean_udf(v):
    return v.mean()

In [106]:
@pandas_udf("double", PandasUDFType.GROUPED_AGG)  
def means_udf2(v,u):
    a = v.mean()
    b = u.mean()
    return a + b

In [107]:
dfclust.groupby('label').agg(means_udf2(dfclust.A, dfclust.B)).show()

+-----+-----------------+
|label| means_udf2(A, B)|
+-----+-----------------+
|    0|8.434000000000001|
|    1|            8.706|
|    2|9.561999999999998|
+-----+-----------------+



In [114]:
@pandas_udf("float", PandasUDFType.GROUPED_AGG)  
def avgcosine(a,b,c,d):
    pdf = np.array([a,b,c,d])
    sum_squares = np.square(pdf).sum(axis=0)
    normed= pdf/np.sqrt(sum_squares)
    cosines = normed.T @ normed
    
    return cosines.mean()

In [115]:
sum2 = dfclust.groupby('label').agg(avgcosine(dfclust.A, dfclust.B, dfclust.C, dfclust.D))

In [116]:
sum2.show()

+-----+---------------------+
|label|avgcosine(A, B, C, D)|
+-----+---------------------+
|    0|          0.021071898|
|    1|          0.015095801|
|    2|          0.010939049|
+-----+---------------------+



In [88]:
cols = [classs[col] for col in ['A','B','C','D']]
temp_arr = np.array(cols)

sum_squares = np.square(arr).sum(axis=0)

normed= arr/np.sqrt(sum_squares)

cosines = normed.T @ normed
cosines.mean()

0.95549779660021

In [13]:
# @pandas_udf("label long, A double, B double, C double, D double, id long", PandasUDFType.GROUPED_MAP)
# def normalize(pdf):
#     # pdf is a pandas.DataFrame
#     pdf_temp = pdf.loc[:,['A','B','C','D']].apply(lambda x: x*x)
#     sum_squares = pdf.sum(axis=0)
#     normed = pdf_temp/sum_squares
#     normed['label'] = pdf['label']
#     normed['id'] = pdf['id']
#     return normed
# dfclust.groupby("label").apply(normalize).show()

+-----+-------------------+--------------------+--------------------+--------------------+---+
|label|                  A|                   B|                   C|                   D| id|
+-----+-------------------+--------------------+--------------------+--------------------+---+
|    0|0.10391530163803436| 0.07147024504084012|0.026812585499315998|0.003252032520325205|  0|
|    0|0.09592489013184181| 0.05250875145857642|0.026812585499315998|0.003252032520325205|  1|
|    0|0.08825409508589695| 0.05974329054842474|0.023119015047879617|0.003252032520325205|  2|
|    0|0.08453855373551737| 0.05606767794632438|  0.0307797537619699|0.003252032520325205|  3|
|    0|0.09988014382740712| 0.07561260210035005|0.026812585499315998|0.003252032520325205|  4|
|    0|0.11650019976028768| 0.08873978996499414|0.039534883720930225| 0.01300813008130082|  5|
|    0|0.08453855373551737|  0.0674445740956826|0.026812585499315998|0.007317073170731...|  6|
|    0|0.09988014382740712|  0.0674445740956826|  

In [None]:
classs

In [14]:
@pandas_udf("label long, A double, B double, C double, D double, id long", PandasUDFType.GROUPED_MAP)
def normalize(pdf):
    # pdf is a pandas.DataFrame
    pdf_temp = pdf.loc[:,['A','B','C','D']].apply(lambda x: x*x)
    sum_squares = pdf_temp.sum(axis=0)
    normed = pdf_temp.div(np.sqrt(sum_squares))
    normed['label'] = pdf['label']
    normed['id'] = pdf['id']
    return normed
dfclust.groupby("label").apply(normalize).show()

+-----+------------------+------------------+-------------------+--------------------+---+
|label|                 A|                 B|                  C|                   D| id|
+-----+------------------+------------------+-------------------+--------------------+---+
|    0|0.7330134859742016|0.5023699300176514|0.18829622510682084| 0.02117024496099853|  0|
|    0|0.6766495116586153|0.3690881118497031|0.18829622510682084| 0.02117024496099853|  1|
|    0|0.6225400963156523|0.4199402517045512|0.16235745940333024| 0.02117024496099853|  2|
|    0|0.5963308482589044|0.3941040838750719|0.21615638086242192| 0.02117024496099853|  3|
|    0|0.7045496789448304|0.5314868810635724|0.18829622510682084| 0.02117024496099853|  4|
|    0|0.8217867455212503|0.6237589090259982|0.27764086252995523| 0.08468097984399411|  5|
|    0|0.5963308482589044|0.4740731747758408|0.18829622510682084| 0.04763305116224668|  6|
|    0|0.7045496789448304|0.4740731747758408|0.21615638086242192| 0.02117024496099853|  7|

In [49]:
schema_list1 = [StructField(str(name), FloatType(), True) for name in list(range(50))]
#schema_list1.append(StructField('label', DoubleType(),True))
cos_schema=StructType(schema_list1)

In [50]:
@pandas_udf(cos_schema, PandasUDFType.GROUPED_MAP)
def dot_prods(pdf):
    #pdf is a pandas.DataFrame
    pdf_temp = pdf.loc[:,['A','B','C','D']]
    pdf_matrix = pdf_temp.to_numpy()
    dot_prods = pdf_matrix @ pdf_matrix.T
    return_df = pd.DataFrame(dot_prods)
    #return_df['label'] = pdf['label']
    return return_df
    
normed = dfclust.groupby("label").apply(normalize)
dots = normed.groupby("label").apply(dot_prods)

In [51]:
dots.count()

150

In [None]:
# @pandas_udf(, PandasUDFType.GROUPED_MAP)
# def cosine_matrix(pdf):
#     irm = IndexedRowMatrix(pdf.rdd.map(lambda x: (x.id, [x.A, x.B, x.C, x.D]) ) )
#     irmt = irm.toCoordinateMatrix().transpose()
#     cosines = irmt.toRowMatrix().columnSimilarities()
#     return pdf.assign(v=v - v.mean())

In [None]:
classs.T

In [None]:
# only needed if returning very large dataframe
# schema_list = [StructField(str(name), FloatType(), True) for name in  classs.id]
# cos_schema=StructType(schema_list)

In [None]:
# @pandas_udf("avg_cos double", PandasUDFType.GROUPED_MAP)
# def cosine_matrix(pdf):
#     irm = IndexedRowMatrix(pdf.rdd.map(lambda x: (x.id, [x.A, x.B, x.C, x.D]) ) )
#     irmt = irm.toCoordinateMatrix().transpose()
#     cosines = irmt.toRowMatrix().columnSimilarities()
    
#     # to test this function, lets try to find the average
#     avg_cos = [sum(rows.map(sum).collect())/(150**2)]
#     return sc.cereateDataFrame(pd.dataframe(avg_cos, columns=['avg_cos']))

In [None]:
# @pandas_udf("avg_cos double", PandasUDFType.GROUPED_MAP)
# def cosine_matrix(pdf):
#     irm = IndexedRowMatrix(pdf.rdd.map(lambda x: (x.id, [x.A, x.B, x.C, x.D]) ) )
#     irmt = irm.toCoordinateMatrix().transpose()
#     cosines = irmt.toRowMatrix().columnSimilarities()
    
#     # to test this function, lets try to find the average
#     avg_cos = [sum(rows.map(sum).collect())/(150**2)]
#     return sc.cereateDataFrame(pd.dataframe(avg_cos, columns=['avg_cos']))

In [None]:
sc.createDataFrame(pd.DataFrame([55]))

In [None]:
similarity_df = sc.createDataFrame(classs)
rdd_df=similarity_df.rdd.map(list)
df_matrix = RowMatrix(rdd_df)

In [None]:
df_matrix.computeGramianMatrix()

In [None]:
irm = IndexedRowMatrix(dfclust.rdd.map(lambda x: (x.id, [x.A, x.B, x.C, x.D]) ) )

irmt = irm.toCoordinateMatrix().transpose()

In [None]:
cosines = irmt.toRowMatrix().columnSimilarities()

In [None]:
rows = cosines.toRowMatrix().rows

In [None]:
sum(rows.map(sum).collect())/(num_obs**2)

In [None]:
@pandas_udf("double", PandasUDFType.GROUPED_AGG)  
def means_udf2(A, B, C, D, index):
    
    
    irmt = irm.toCoordinateMatrix().transpose()
    cosines = irmt.toRowMatrix().columnSimilarities()
    
    # to test this function, lets try to find the average
    avg_cos = [sum(rows.map(sum).collect())/(150**2)]
    return avg_cos
