# Repliquer calcul cosine en Spark

# Objective(s)

Repliquer le code du calcul de la distance entre 2 listes de mots en Spark.

In [1]:
from datum import spark

Successfully imported Spark Modules
/opt/mapr/spark/spark-2.2.1/python/pyspark/shell.py
Welcome to
      ____              __
     / __/__  ___ _____/ /__
    _\ \/ _ \/ _ `/ __/  '_/
   /__ / .__/\_,_/_/ /_/\_\   version 2.2.1-mapr-1901
      /_/

Using Python version 2.7.15 (default, Mar  5 2020 14:56:06)
SparkSession available as 'spark'.


In [2]:
#import pandas as pd
#import numpy as np
import json
from pyspark.sql import SparkSession

from pyspark.sql.types import StructType, ArrayType, StringType, FloatType, MapType
import pyspark.sql.functions as F
from pyspark.sql.types import IntegerType
#from pyspark.mllib.linalg import DenseVector, Vectors, VectorUDT

#spark = (SparkSession 
#    .builder 
#    .appName("Python Spark SQL basic example") 
#    .config('spark.executor.memory', '4G') 
#    .getOrCreate()
#        )

In [3]:
spark.conf.set("spark.sql.execution.arrow.pyspark.enabled", "true")
spark.conf.set("spark.sql.debug.maxToStringFields", 1000)

In [4]:
with open('test_cosine_inpi_insee_clean.json') as f:
    data = json.load(f)
data

[{u'inpi_except': [u'RUE', u'CHARLES', u'GILLE'],
  u'insee_except': [u'BOULEVARD', u'PREUILLY'],
  u'row_id': 5},
 {u'inpi_except': [u'JB'],
  u'insee_except': [u'JEAN', u'BAPTISTE'],
  u'row_id': 7},
 {u'inpi_except': [u'JB'],
  u'insee_except': [u'JEAN', u'BAPTISTE'],
  u'row_id': 8},
 {u'inpi_except': [u'MARCELIN', u'BERTHELOT', u'CENTRE', u'D', u'ENTREPRISES'],
  u'insee_except': [u'PROSPER', u'LEGOUTE'],
  u'row_id': 10},
 {u'inpi_except': [u'CHEMIN', u'BEL', u'AIR'],
  u'insee_except': [u'RUE', u'VICTOR', u'HUGO'],
  u'row_id': 12},
 {u'inpi_except': [u'A', u'E'],
  u'insee_except': [u'AIME', u'EUGENIE', u'ZI', u'NORD'],
  u'row_id': 19},
 {u'inpi_except': [u'ST'], u'insee_except': [u'SAINT'], u'row_id': 21},
 {u'inpi_except': [u'LOTISSEMENT', u'N'],
  u'insee_except': [u'BOULEVARD',
   u'RAYMOND',
   u'POINCARE',
   u'PALAIS',
   u'ORIENTAL'],
  u'row_id': 23},
 {u'inpi_except': [u'LOTISSEMENT', u'N'],
  u'insee_except': [u'PLACE', u'AMIRAL', u'ORTOLI'],
  u'row_id': 24},
 {u'i

Recupération premier ID

In [5]:
test_id = data[0]['row_id']

In [6]:
df = spark.createDataFrame(data)
df.printSchema()



root
 |-- inpi_except: array (nullable = true)
 |    |-- element: string (containsNull = true)
 |-- insee_except: array (nullable = true)
 |    |-- element: string (containsNull = true)
 |-- row_id: long (nullable = true)



In [7]:
df.first()

Row(inpi_except=[u'RUE', u'CHARLES', u'GILLE'], insee_except=[u'BOULEVARD', u'PREUILLY'], row_id=5)

## Load weights

In [8]:
import os
os.getcwd()

'/opt/notebooks'

In [16]:
with open('word2vec_weights_100_v2.json') as f:
    weights_json = json.load(f)

In [17]:
cols = [str(i) for i in range(1, 101)]
weights = (
    spark.createDataFrame(weights_json)
    .select('0',(F.array(cols)).cast(ArrayType(FloatType(), True)).alias('list_weights'))
    .withColumnRenamed("0","words")
)
weights.printSchema()

root
 |-- words: string (nullable = true)
 |-- list_weights: array (nullable = false)
 |    |-- element: float (containsNull = true)



In [18]:
weights.show()

+-----------+--------------------+
|      words|        list_weights|
+-----------+--------------------+
|        RUE|[-0.86694837, -0....|
|     CHEMIN|[-0.33908606, 0.7...|
|          D|[1.1837989, -1.35...|
|  BOULEVARD|[-1.4547851, 0.14...|
|      PLACE|[-0.21757227, 0.1...|
|      SAINT|[0.7002688, -2.22...|
|          A|[-1.2238628, -0.0...|
|       JEAN|[-2.4126835, 1.26...|
|     CENTRE|[-3.6606867, -4.1...|
|         ZI|[0.13045992, 1.63...|
|         ST|[-2.0008852, -1.1...|
|          N|[-1.7969333, -2.7...|
|LOTISSEMENT|[-1.0849817, -3.4...|
|    CHARLES|[-1.1762805, -0.5...|
|          E|[-2.364159, 0.654...|
|     VICTOR|[2.3670592, 1.435...|
|       NORD|[-1.9177388, 0.50...|
|       HUGO|[-0.50103456, -2....|
|        BEL|[-2.2877157, -1.2...|
|    RAYMOND|[-0.6991833, -0.0...|
+-----------+--------------------+
only showing top 20 rows



# Calcul Cosine depuis deux listes en Spark, version < 2.2

In [None]:
list_a = ["RUE", "CHARLES", "GILLE"]
list_b = ["BOULEVARD", "PREUILLY"]

test_list =[dict(zip([i], [list_b])) for i in list_a]
test_list

udf pour remplacer `explode(map_from_entries(arrays_zip`

In [20]:
zip_except = F.udf(lambda x, y: [dict(zip([i], [y])) for i in x],
                   ArrayType(MapType(StringType(), ArrayType(StringType()))))

In [None]:
#cosine = F.udf(lambda x, y: 
#               (np.dot(x, y)/ (np.linalg.norm(x) * np.linalg.norm(y))).item(),
#               FloatType())

In [70]:
import math

In [72]:
list_a = [1,2,3]
list_b = [4,5,6]
def cosine(x, y):
    up = 0
    x_down = 0
    y_down = 0
    for i, j in zip(x, y):
        up += i*j
        x_down += i**2
        y_down += j**2
    cosine = up/ math.sqrt(x_down * y_down)    
    return cosine
dot_product(list_a, list_b)

0.9746318461970762

In [74]:
@F.udf
def cosine(x, y):
    up = 0
    x_down = 0
    y_down = 0
    for i, j in zip(x, y):
        up += i*j
        x_down += i**2
        y_down += j**2
    cosine = up/ math.sqrt(x_down * y_down)    
    return cosine
    #return sum(i[0] * i[1] for i in zip((x, y)))

In [78]:
test = (
    df
    .filter("row_id = {}".format(test_id))
    .select(
        'row_id',
        'inpi_except',
        'insee_except',
        F.explode(zip_except("inpi_except","insee_except")).alias("zip_except")
    )
    .select(
    'row_id',
        'inpi_except',
        'insee_except',
        F.explode("zip_except").alias("inpi", "value")
    )
    .select(
    'row_id',
        'inpi_except',
        'insee_except',
        'inpi',
         F.explode("value")
        .alias("insee")
    )
    .join((weights.withColumnRenamed("words","inpi")),
        on = ['inpi'], how = 'left')
    .withColumnRenamed("list_weights","list_weights_inpi")
    .join((weights.withColumnRenamed("words","insee")),
       on = ['insee'], how = 'left')
    .withColumnRenamed("list_weights","list_weights_insee")
    .select('row_id',
            'inpi',
            'insee',
            "list_weights_inpi",
            "list_weights_insee",
            cosine("list_weights_inpi", "list_weights_insee").alias('cosine')
    #        cosine("list_weights_inpi", "list_weights_insee").alias("cosine"),
           )
)


In [76]:
test.dtypes

[('row_id', 'bigint'),
 ('inpi', 'string'),
 ('insee', 'string'),
 ('list_weights_inpi', 'array<float>'),
 ('list_weights_insee', 'array<float>'),
 ('dot_product(list_weights_inpi, list_weights_insee)', 'string')]

In [79]:
test.show(truncate =True)

+------+-------+---------+--------------------+--------------------+-------------------+
|row_id|   inpi|    insee|   list_weights_inpi|  list_weights_insee|             cosine|
+------+-------+---------+--------------------+--------------------+-------------------+
|     5|  GILLE| PREUILLY|[0.34494784, -0.2...|[0.026656773, -0....|0.22824263771406134|
|     5|CHARLES| PREUILLY|[-1.1762805, -0.5...|[0.026656773, -0....|0.10189664729146147|
|     5|    RUE| PREUILLY|[-0.86694837, -0....|[0.026656773, -0....|0.09652857130042741|
|     5|  GILLE|BOULEVARD|[0.34494784, -0.2...|[-1.4547851, 0.14...|0.03590281117414814|
|     5|CHARLES|BOULEVARD|[-1.1762805, -0.5...|[-1.4547851, 0.14...|0.09133628399871992|
|     5|    RUE|BOULEVARD|[-0.86694837, -0....|[-1.4547851, 0.14...|0.40306154624654084|
+------+-------+---------+--------------------+--------------------+-------------------+

