# Répliquer le calcul du cosine en Spark

# Objective(s)

Répliquer en Spark le code du calcul de la distance (similarité cosinus) entre 2 listes de mots.

In [4]:
import pandas as pd
import numpy as np
import json
from pyspark.sql import SparkSession

from pyspark.sql.types import StructType, ArrayType, StringType, FloatType, MapType
import pyspark.sql.functions as F
from pyspark.sql.types import IntegerType
from pyspark.mllib.linalg import DenseVector, Vectors, VectorUDT

# Create the Spark session used by every cell of the notebook (or reuse the
# one that is already active); executors are requested with 4G of memory.
spark = (
    SparkSession.builder
    .appName("Python Spark SQL basic example")
    .config("spark.executor.memory", "4G")
    .getOrCreate()
)

In [2]:
# Session-level SQL options.
# Arrow option: presumably speeds up Spark <-> pandas conversions -- confirm
# against the Spark configuration reference for the version in use.
spark.conf.set(
    "spark.sql.execution.arrow.pyspark.enabled",
    "true",
)
# Allow up to 1000 fields when Spark renders wide rows/plans as strings.
spark.conf.set(
    "spark.sql.debug.maxToStringFields",
    1000,
)

In [6]:
# Load the sample records: each one holds a row_id and the two word lists
# (inpi_except / insee_except) to compare.
# The file is opened explicitly as UTF-8 so that parsing does not depend on
# the platform's default locale encoding.
with open('test_cosine_inpi_insee_clean.json', encoding='utf-8') as f:
    data = json.load(f)
data

[{'row_id': 5,
  'inpi_except': ['RUE', 'CHARLES', 'GILLE'],
  'insee_except': ['BOULEVARD', 'PREUILLY']},
 {'row_id': 7, 'inpi_except': ['JB'], 'insee_except': ['JEAN', 'BAPTISTE']},
 {'row_id': 8, 'inpi_except': ['JB'], 'insee_except': ['JEAN', 'BAPTISTE']},
 {'row_id': 10,
  'inpi_except': ['MARCELIN', 'BERTHELOT', 'CENTRE', 'D', 'ENTREPRISES'],
  'insee_except': ['PROSPER', 'LEGOUTE']},
 {'row_id': 12,
  'inpi_except': ['CHEMIN', 'BEL', 'AIR'],
  'insee_except': ['RUE', 'VICTOR', 'HUGO']},
 {'row_id': 19,
  'inpi_except': ['A', 'E'],
  'insee_except': ['AIME', 'EUGENIE', 'ZI', 'NORD']},
 {'row_id': 21, 'inpi_except': ['ST'], 'insee_except': ['SAINT']},
 {'row_id': 23,
  'inpi_except': ['LOTISSEMENT', 'N'],
  'insee_except': ['BOULEVARD', 'RAYMOND', 'POINCARE', 'PALAIS', 'ORIENTAL']},
 {'row_id': 24,
  'inpi_except': ['LOTISSEMENT', 'N'],
  'insee_except': ['PLACE', 'AMIRAL', 'ORTOLI']},
 {'row_id': 25,
  'inpi_except': ['RUE', 'FURSANNES'],
  'insee_except': ['COLLINE']}]

Récupération du premier ID

In [15]:
# row_id of the first record; the pipelines are tried on this single row.
test_id = data[0]["row_id"]

In [8]:
# Build the Spark DataFrame from the list of dict records.
# The schema is given explicitly (as a DDL string) instead of being inferred
# from the dicts: inference depends on the sampled rows and cannot type a
# column whose sampled lists are all empty. The declared layout matches what
# inference produced for this sample (two string arrays and a long id).
df = spark.createDataFrame(
    data,
    schema="inpi_except array<string>, insee_except array<string>, row_id bigint",
)
df.printSchema()



root
 |-- inpi_except: array (nullable = true)
 |    |-- element: string (containsNull = true)
 |-- insee_except: array (nullable = true)
 |    |-- element: string (containsNull = true)
 |-- row_id: long (nullable = true)



In [9]:
# Display the first Row of the DataFrame to check that the records were loaded.
df.first()

Row(inpi_except=['RUE', 'CHARLES', 'GILLE'], insee_except=['BOULEVARD', 'PREUILLY'], row_id=5)

## Load weights

In [10]:
path_list = 'word2vec_weights_100_v2.csv'

# Layout of the `weights` table built below: one word and its vector of
# weights. It is not passed to the CSV reader (a CSV column cannot hold an
# array); it only documents the result. The field name now matches the real
# column name `list_weights` (it was misspelled `list_weigths`).
schema  = (
    StructType()
    .add('words', StringType(), True)
    .add("list_weights", ArrayType(FloatType(), True))
)

# In the CSV header, column '0' holds the word and columns '1'..'100' hold the
# 100 components of its vector.
cols = [str(i) for i in range(1, 101)]
weights = (
    spark.read.csv(path_list, header=True)
    .select(
        F.col('0').alias('words'),
        # CSV values are read as strings: pack the 100 columns into a single
        # array and cast its elements to float.
        F.array(cols).cast(ArrayType(FloatType(), True)).alias('list_weights'),
    )
)
weights.dtypes

[('words', 'string'), ('list_weights', 'array<float>')]

In [11]:
# Preview the weights table (show() prints the top 20 rows and truncates long values).
weights.show()

+---------+--------------------+
|    words|        list_weights|
+---------+--------------------+
|      RUE|[-0.86694837, -0....|
|   AVENUE|[-2.206969, -0.84...|
|    ROUTE|[0.75751305, -0.2...|
|   CHEMIN|[-0.33908606, 0.7...|
|        D|[1.1837989, -1.35...|
|        L|[-1.9471866, -0.5...|
|BOULEVARD|[-1.4547851, 0.14...|
|    PLACE|[-0.21757227, 0.1...|
|      BIS|[0.76434, -0.1137...|
|    SAINT|[0.7002688, -2.22...|
|     LIEU|[-0.5478822, 1.27...|
|      DIT|[-1.040085, -0.21...|
|        A|[-1.2238628, -0.0...|
|     ZONE|[-1.985939, -1.25...|
|        B|[-0.30842575, 0.0...|
|    ALLEE|[-0.8832805, -2.1...|
|     JEAN|[-2.4126835, 1.26...|
|   CENTRE|[-3.6606867, -4.1...|
|RESIDENCE|[-0.3581616, -0.9...|
|      BAT|[-1.2430978, 0.25...|
+---------+--------------------+
only showing top 20 rows



## Calcul Cosine depuis deux listes en Spark 3.0

Comme le calcul du cosine est assez simple, il n'y a pas besoin de définir une fonction nommée (avec le décorateur `udf`) : une fonction lambda est amplement suffisante.

In [12]:
# Spark UDF returning the cosine similarity of two weight vectors:
#     dot(x, y) / (||x|| * ||y||)
# Returns null (None) instead of failing when the similarity is undefined:
#   - a vector is null, which the left joins on the weights table produce for
#     any word that has no entry in that table;
#   - a vector is empty or all zeros, which would make the denominator zero.
# `.item()` converts the numpy scalar into a plain Python float for FloatType.
cosine = F.udf(
    lambda x, y: (
        None
        if x is None or y is None or not (np.any(x) and np.any(y))
        else (np.dot(x, y) / (np.linalg.norm(x) * np.linalg.norm(y))).item()
    ),
    FloatType(),
)

In [16]:
# Pair every inpi word with every insee word of the selected row, attach the
# weight vector of each word, and compute the cosine of each pair.
test = (
    df
    # Compare the column with the Python value directly instead of formatting
    # the value into a SQL string.
    .filter(F.col("row_id") == test_id)
    # One output row per inpi word, keeping the full insee list next to it.
    # This replaces explode(map_from_entries(arrays_zip(...))): building a map
    # keyed by the inpi words fails when a word is repeated (duplicate map
    # key) or when the inpi list is empty (null map key).
    .select(
        "row_id",
        F.explode("inpi_except").alias("inpi"),
        "insee_except",
    )
    # One output row per (inpi word, insee word) pair; explode_outer keeps the
    # inpi word, with a null insee, when the insee list is null or empty.
    .select(
        "row_id",
        "inpi",
        F.explode_outer("insee_except").alias("insee"),
    )
    # Left joins: a word absent from the weights table gets a null vector.
    .join(
        weights.select(
            F.col("words").alias("inpi"),
            F.col("list_weights").alias("list_weights_inpi"),
        ),
        on=["inpi"],
        how="left",
    )
    .join(
        weights.select(
            F.col("words").alias("insee"),
            F.col("list_weights").alias("list_weights_insee"),
        ),
        on=["insee"],
        how="left",
    )
    .select(
        "row_id",
        "inpi",
        "insee",
        "list_weights_inpi",
        "list_weights_insee",
        cosine("list_weights_inpi", "list_weights_insee").alias("cosine"),
    )
)

In [17]:
# Print the word pairs with their cosine; truncate=True shortens the long weight arrays.
test.show(truncate = True)

+------+-------+---------+--------------------+--------------------+-----------+
|row_id|   inpi|    insee|   list_weights_inpi|  list_weights_insee|     cosine|
+------+-------+---------+--------------------+--------------------+-----------+
|     5|    RUE|BOULEVARD|[-0.86694837, -0....|[-1.4547851, 0.14...| 0.40306154|
|     5|    RUE| PREUILLY|[-0.86694837, -0....|[0.026656773, -0....|0.096528575|
|     5|CHARLES|BOULEVARD|[-1.1762805, -0.5...|[-1.4547851, 0.14...| 0.09133629|
|     5|CHARLES| PREUILLY|[-1.1762805, -0.5...|[0.026656773, -0....| 0.10189664|
|     5|  GILLE|BOULEVARD|[0.34494784, -0.2...|[-1.4547851, 0.14...| 0.03590281|
|     5|  GILLE| PREUILLY|[0.34494784, -0.2...|[0.026656773, -0....| 0.22824264|
+------+-------+---------+--------------------+--------------------+-----------+



## Calcul Cosine depuis deux listes en Spark, version < 2.2

In [20]:
# Plain-Python illustration of the structure the UDF has to build:
# one single-entry dict per word of list_a, each mapping that word to the
# whole of list_b.
list_a = ["RUE", "CHARLES", "GILLE"]
list_b = ["BOULEVARD", "PREUILLY"]

test_list = [{word: list_b} for word in list_a]
test_list

[{'RUE': ['BOULEVARD', 'PREUILLY']},
 {'CHARLES': ['BOULEVARD', 'PREUILLY']},
 {'GILLE': ['BOULEVARD', 'PREUILLY']}]

UDF pour remplacer `explode(map_from_entries(arrays_zip(...)))`, dont les fonctions ne sont pas disponibles dans les anciennes versions de Spark.

In [21]:
# Spark UDF pairing every word of the first list with the whole second list:
# it returns one single-entry map {word: y} per word of x.
# A null first list returns null instead of raising a TypeError when the
# lambda tries to iterate over None.
zip_except = F.udf(
    lambda x, y: None if x is None else [{word: y} for word in x],
    ArrayType(MapType(StringType(), ArrayType(StringType()))),
)

In [22]:
# Spark UDF returning the cosine similarity of two weight vectors:
#     dot(x, y) / (||x|| * ||y||)
# Null (None) is returned when the similarity is undefined, rather than
# raising or dividing by zero: a null vector (word without weights after a
# left join) or an empty / all-zero vector.
# `.item()` converts the numpy scalar into a plain Python float for FloatType.
cosine = F.udf(
    lambda x, y: (
        None
        if x is None or y is None or not (np.any(x) and np.any(y))
        else (np.dot(x, y) / (np.linalg.norm(x) * np.linalg.norm(y))).item()
    ),
    FloatType(),
)

In [23]:
# Same pairing as a chain of explodes driven by the zip_except UDF: every inpi
# word of the selected row is paired with every insee word, the weight vector
# of each word is attached, and the cosine of each pair is computed.
test = (
    df
    # Compare the column with the Python value directly instead of formatting
    # the value into a SQL string.
    .filter(F.col("row_id") == test_id)
    # zip_except yields an array of single-entry maps {inpi word: insee list};
    # the first explode gives one row per map.
    .select(
        "row_id",
        F.explode(zip_except("inpi_except", "insee_except")).alias("zip_except"),
    )
    # Exploding a map yields its key and value as two columns.
    .select(
        "row_id",
        F.explode("zip_except").alias("inpi", "value"),
    )
    # One row per (inpi word, insee word) pair.
    .select(
        "row_id",
        "inpi",
        F.explode("value").alias("insee"),
    )
    # Left joins: a word absent from the weights table gets a null vector.
    .join(
        weights.select(
            F.col("words").alias("inpi"),
            F.col("list_weights").alias("list_weights_inpi"),
        ),
        on=["inpi"],
        how="left",
    )
    .join(
        weights.select(
            F.col("words").alias("insee"),
            F.col("list_weights").alias("list_weights_insee"),
        ),
        on=["insee"],
        how="left",
    )
    .select(
        "row_id",
        "inpi",
        "insee",
        "list_weights_inpi",
        "list_weights_insee",
        cosine("list_weights_inpi", "list_weights_insee").alias("cosine"),
    )
)


In [24]:
# Print the word pairs with their cosine; truncate=True shortens the long weight arrays.
test.show(truncate =True)

+------+-------+---------+--------------------+--------------------+-----------+
|row_id|   inpi|    insee|   list_weights_inpi|  list_weights_insee|     cosine|
+------+-------+---------+--------------------+--------------------+-----------+
|     5|    RUE|BOULEVARD|[-0.86694837, -0....|[-1.4547851, 0.14...| 0.40306154|
|     5|    RUE| PREUILLY|[-0.86694837, -0....|[0.026656773, -0....|0.096528575|
|     5|CHARLES|BOULEVARD|[-1.1762805, -0.5...|[-1.4547851, 0.14...| 0.09133629|
|     5|CHARLES| PREUILLY|[-1.1762805, -0.5...|[0.026656773, -0....| 0.10189664|
|     5|  GILLE|BOULEVARD|[0.34494784, -0.2...|[-1.4547851, 0.14...| 0.03590281|
|     5|  GILLE| PREUILLY|[0.34494784, -0.2...|[0.026656773, -0....| 0.22824264|
+------+-------+---------+--------------------+--------------------+-----------+

