# Répliquer le calcul du cosinus en Spark

# Objective(s)

Répliquer en Spark le code qui calcule la distance (similarité cosinus) entre deux listes de mots.

In [1]:
import pandas as pd
import numpy as np
import json
from pyspark.sql import SparkSession

from pyspark.sql.types import StructType, ArrayType, StringType, FloatType, MapType
import pyspark.sql.functions as F
from pyspark.sql.types import IntegerType
from pyspark.mllib.linalg import DenseVector, Vectors, VectorUDT

# Single Spark session shared by every cell below; getOrCreate() reuses a
# session that is already running instead of starting a second one.
spark = (
    SparkSession.builder
    .appName("Python Spark SQL basic example")
    .config('spark.executor.memory', '4G')
    .getOrCreate()
)

In [2]:
# Session-level settings: switch on the Arrow option for PySpark ...
spark.conf.set("spark.sql.execution.arrow.pyspark.enabled", "true")
# ... and raise the debug "maxToStringFields" limit to 1000.
spark.conf.set("spark.sql.debug.maxToStringFields", 1000)

In [3]:
# Load the sample records (row_id + the two word lists to compare).
# JSON text is UTF-8 by specification, so the encoding is passed explicitly:
# without it open() falls back to the platform's locale encoding, which can
# corrupt accented words on some systems.
with open('test_cosine_inpi_insee_clean.json', encoding='utf-8') as f:
    data = json.load(f)
data

[{'row_id': 5,
  'inpi_except': ['RUE', 'CHARLES', 'GILLE'],
  'insee_except': ['BOULEVARD', 'PREUILLY']},
 {'row_id': 7, 'inpi_except': ['JB'], 'insee_except': ['JEAN', 'BAPTISTE']},
 {'row_id': 8, 'inpi_except': ['JB'], 'insee_except': ['JEAN', 'BAPTISTE']},
 {'row_id': 10,
  'inpi_except': ['MARCELIN', 'BERTHELOT', 'CENTRE', 'D', 'ENTREPRISES'],
  'insee_except': ['PROSPER', 'LEGOUTE']},
 {'row_id': 12,
  'inpi_except': ['CHEMIN', 'BEL', 'AIR'],
  'insee_except': ['RUE', 'VICTOR', 'HUGO']},
 {'row_id': 19,
  'inpi_except': ['A', 'E'],
  'insee_except': ['AIME', 'EUGENIE', 'ZI', 'NORD']},
 {'row_id': 21, 'inpi_except': ['ST'], 'insee_except': ['SAINT']},
 {'row_id': 23,
  'inpi_except': ['LOTISSEMENT', 'N'],
  'insee_except': ['BOULEVARD', 'RAYMOND', 'POINCARE', 'PALAIS', 'ORIENTAL']},
 {'row_id': 24,
  'inpi_except': ['LOTISSEMENT', 'N'],
  'insee_except': ['PLACE', 'AMIRAL', 'ORTOLI']},
 {'row_id': 25,
  'inpi_except': ['RUE', 'FURSANNES'],
  'insee_except': ['COLLINE']}]

Récupération du premier ID

In [None]:
# row_id of the first JSON record, used as the single test case.
test_id = data[0]['row_id']

In [None]:
# Build the Spark DataFrame with an explicit schema instead of letting Spark
# infer it from Python dicts (dict-based inference is deprecated and emits a
# warning). Columns are listed in the same alphabetical order, with the same
# types, as the inferred schema so downstream cells see an identical frame.
df = spark.createDataFrame(
    data,
    schema="inpi_except array<string>, insee_except array<string>, row_id bigint",
)
df.printSchema()

In [None]:
# Display the first row of df as a quick sanity check.
df.first()

## Load weights

In [4]:
# Path of the CSV file holding the word2vec weights
# (a single path string, despite the "list" in the name).
path_list = 'word2vec_weights_100_v2.csv'

In [16]:
# Flatten every word of both lists, record by record (INPI words first, then
# INSEE words). Duplicates are kept on purpose: the list is only used as a
# membership filter.
words_to_keep = [
    word
    for record in data
    for field in ('inpi_except', 'insee_except')
    for word in record[field]
]
words_to_keep

['RUE',
 'CHARLES',
 'GILLE',
 'BOULEVARD',
 'PREUILLY',
 'JB',
 'JEAN',
 'BAPTISTE',
 'JB',
 'JEAN',
 'BAPTISTE',
 'MARCELIN',
 'BERTHELOT',
 'CENTRE',
 'D',
 'ENTREPRISES',
 'PROSPER',
 'LEGOUTE',
 'CHEMIN',
 'BEL',
 'AIR',
 'RUE',
 'VICTOR',
 'HUGO',
 'A',
 'E',
 'AIME',
 'EUGENIE',
 'ZI',
 'NORD',
 'ST',
 'SAINT',
 'LOTISSEMENT',
 'N',
 'BOULEVARD',
 'RAYMOND',
 'POINCARE',
 'PALAIS',
 'ORIENTAL',
 'LOTISSEMENT',
 'N',
 'PLACE',
 'AMIRAL',
 'ORTOLI',
 'RUE',
 'FURSANNES',
 'COLLINE']

In [20]:
# Keep only the weight rows whose word (CSV column '0') appears in the sample.
test = (
    pd.read_csv(path_list)
    .loc[lambda all_weights: all_weights['0'].isin(words_to_keep)]
)

In [22]:
# (rows, columns) of the filtered weights table.
test.shape

(39, 101)

In [23]:
# Export the filtered weights to JSON, using the "records" orientation.
test.to_json('word2vec_weights_100_v2.json',orient="records")

In [None]:

# Expected layout of the weights table: one word plus its embedding stored as
# an array of floats. Kept as a reference only; the DataFrame below gets its
# schema from the select/cast. (Field name fixed: it used to read
# "list_weigths", which did not match the 'list_weights' column produced.)
schema = (
    StructType()
    .add('words', StringType(), True)
    .add('list_weights', ArrayType(FloatType(), True))
)

# The CSV stores the word in column '0' and the embedding in columns '1'..'100'.
cols = [str(i) for i in range(1, 101)]
weights = (
    spark.read.csv(path_list, header=True)
    .select(
        '0',
        # Pack the 100 embedding columns into a single array column and cast
        # its elements to float.
        F.array(cols).cast(ArrayType(FloatType(), True)).alias('list_weights'),
    )
    .withColumnRenamed('0', 'words')
)
weights.dtypes

In [None]:
# Preview the weights DataFrame.
weights.show()

## Calcul Cosine depuis deux listes en Spark 3.0

Comme le calcul du cosinus est assez simple, il n'est pas nécessaire de définir une fonction nommée (avec son décorateur) : une fonction lambda suffit amplement.

In [None]:
# Cosine similarity between two embedding vectors, exposed as a Spark UDF.
#
# Returns null instead of failing or producing NaN when the similarity is
# undefined:
#   * either vector is null (a left join found no embedding for the word);
#   * either vector is empty or all zeros (its norm is 0, so the ratio is 0/0).
# np.any() is False exactly for empty and all-zero vectors, so it guards the
# division without computing the norms twice.
cosine = F.udf(
    lambda x, y: (
        float(np.dot(x, y) / (np.linalg.norm(x) * np.linalg.norm(y)))
        if x is not None and y is not None and np.any(x) and np.any(y)
        else None
    ),
    FloatType(),
)

In [None]:
# Cosine similarity for every (INPI word, INSEE word) pair of the test row.
test = (
    df
    # Column comparison instead of a str.format-built SQL string: the value is
    # passed as a literal, so no manual quoting or escaping is involved.
    .filter(F.col('row_id') == test_id)
    .select(
        'row_id',
        # Pair each INPI word with the whole INSEE list as a map
        # {inpi_word: insee_except}, then explode the map into (inpi, value)
        # rows.
        # NOTE(review): map_from_entries needs unique, non-null keys, so this
        # presumably relies on inpi_except being non-empty and free of repeated
        # words. Confirm against the real data.
        F.expr(
            """
            explode(
              map_from_entries(
                arrays_zip(
                  inpi_except,
                  transform(
                    sequence(1, size(inpi_except)),
                    x -> insee_except
                  )
                )
              )
            )
            """
        ).alias("inpi", "value"),
    )
    # One row per (inpi, insee) pair; explode_outer keeps the INPI word (with a
    # null insee) when the INSEE list is empty or null.
    .select(
        'row_id',
        'inpi',
        F.explode_outer('value').alias('insee'),
    )
    # Attach the embedding of each word. Left joins keep the pairs whose word
    # has no entry in weights (their vector is then null).
    .join(weights.withColumnRenamed('words', 'inpi'), on=['inpi'], how='left')
    .withColumnRenamed('list_weights', 'list_weights_inpi')
    .join(weights.withColumnRenamed('words', 'insee'), on=['insee'], how='left')
    .withColumnRenamed('list_weights', 'list_weights_insee')
    .select(
        'row_id',
        'inpi',
        'insee',
        'list_weights_inpi',
        'list_weights_insee',
        cosine('list_weights_inpi', 'list_weights_insee').alias('cosine'),
    )
)

In [None]:
# Print the resulting rows, with truncation of the displayed values enabled.
test.show(truncate = True)

# Calcul Cosine depuis deux listes en Spark, version < 2.2

In [None]:
# Plain-Python illustration of the pairing done later by the UDF: every word
# of list_a becomes the key of a one-entry dict whose value is the whole of
# list_b.
list_a = ["RUE", "CHARLES", "GILLE"]
list_b = ["BOULEVARD", "PREUILLY"]

test_list = [{word: list_b} for word in list_a]
test_list

UDF pour remplacer `explode(map_from_entries(arrays_zip(...)))`

In [None]:
# UDF that pairs every word of the first list with the whole second list:
# returns one single-entry map {word: other_words} per word.
zip_except = F.udf(
    lambda words, other_words: [{word: other_words} for word in words],
    ArrayType(MapType(StringType(), ArrayType(StringType()))),
)

In [None]:
# Cosine similarity between two embedding vectors, exposed as a Spark UDF.
#
# Returns null instead of failing or producing NaN when the similarity is
# undefined:
#   * either vector is null (a left join found no embedding for the word);
#   * either vector is empty or all zeros (its norm is 0, so the ratio is 0/0).
# np.any() is False exactly for empty and all-zero vectors, so it guards the
# division without computing the norms twice.
cosine = F.udf(
    lambda x, y: (
        float(np.dot(x, y) / (np.linalg.norm(x) * np.linalg.norm(y)))
        if x is not None and y is not None and np.any(x) and np.any(y)
        else None
    ),
    FloatType(),
)

In [None]:
# Same pairwise cosine computation as the higher-order-function version, but
# the (inpi word -> insee list) pairing is produced by the zip_except UDF.
test = (
    df
    # Column comparison instead of a str.format-built SQL string: the value is
    # passed as a literal, so no manual quoting or escaping is involved.
    .filter(F.col('row_id') == test_id)
    # zip_except yields one {inpi_word: insee_except} map per INPI word;
    # the first explode produces one row per map. The original word lists are
    # not carried along because the final select never uses them.
    .select(
        'row_id',
        F.explode(zip_except('inpi_except', 'insee_except')).alias('zip_except'),
    )
    # Exploding a map column gives its key and value as two columns.
    .select(
        'row_id',
        F.explode('zip_except').alias('inpi', 'value'),
    )
    # One row per (inpi, insee) pair.
    .select(
        'row_id',
        'inpi',
        F.explode('value').alias('insee'),
    )
    # Attach the embedding of each word. Left joins keep the pairs whose word
    # has no entry in weights (their vector is then null).
    .join(weights.withColumnRenamed('words', 'inpi'), on=['inpi'], how='left')
    .withColumnRenamed('list_weights', 'list_weights_inpi')
    .join(weights.withColumnRenamed('words', 'insee'), on=['insee'], how='left')
    .withColumnRenamed('list_weights', 'list_weights_insee')
    .select(
        'row_id',
        'inpi',
        'insee',
        'list_weights_inpi',
        'list_weights_insee',
        cosine('list_weights_inpi', 'list_weights_insee').alias('cosine'),
    )
)


In [None]:
# Print the resulting rows, with truncation of the displayed values enabled.
test.show(truncate =True)