<a href="https://colab.research.google.com/github/guylerme/abstraction-pagerank/blob/main/Abstraction_PageRank_Final.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

Install packages

In [1]:
pip install pyspark

Collecting pyspark
  Downloading pyspark-3.2.0.tar.gz (281.3 MB)
[K     |████████████████████████████████| 281.3 MB 40 kB/s 
[?25hCollecting py4j==0.10.9.2
  Downloading py4j-0.10.9.2-py2.py3-none-any.whl (198 kB)
[K     |████████████████████████████████| 198 kB 49.2 MB/s 
[?25hBuilding wheels for collected packages: pyspark
  Building wheel for pyspark (setup.py) ... [?25l[?25hdone
  Created wheel for pyspark: filename=pyspark-3.2.0-py2.py3-none-any.whl size=281805912 sha256=d09e7650dc3cfec635fc5e4798ef08f335010dcce29396a091456e6997a4b42f
  Stored in directory: /root/.cache/pip/wheels/0b/de/d2/9be5d59d7331c6c2a7c1b6d1a4f463ce107332b1ecd4e80718
Successfully built pyspark
Installing collected packages: py4j, pyspark
Successfully installed py4j-0.10.9.2 pyspark-3.2.0


Import Libraries

In [2]:
from google.colab import files
import os

import json

In [3]:
from pyspark.sql import *
from pyspark import SparkContext
from pyspark import SparkConf
from pyspark.sql.functions import col,lit,when,concat_ws
from pyspark.sql.types import *
import pyspark.sql.functions as F
import math

Load model

In [4]:
# Ask for the exported model through the Colab upload widget.
uploaded = files.upload()

for fn, payload in uploaded.items():
  print('User uploaded file "{name}" with length {length} bytes'.format(
      name=fn, length=len(payload)))
  # Each uploaded file is renamed to the fixed name that the loading cell reads.
  os.rename(fn, 'model.json')

Saving maestro--permissoes.json to maestro--permissoes.json
User uploaded file "maestro--permissoes.json" with length 787 bytes


In [6]:
# Opening JSON file
f = open('model.json')
 
# returns JSON object as
# a dictionary
model = json.load(f)

JSONDecodeError: ignored

Define the diagram

In [8]:
diagram_name = "PageRank-Remove Material Relations" #@param {type:"string"}
# Fall back to the default diagram when the form field was left empty.
if not diagram_name:
  diagram_name = 'PageRank-Remove Material Relations'

Get all Classes

In [None]:
def getModelClassElement(model, id):
  """Return the name of the class with the given id, or None when not found.

  Only elements stored directly inside a named entry of ``model['contents']``
  whose ``type`` is ``'Package'`` are searched.

  Args:
    model: Dictionary with a ``contents`` list of package dictionaries.
    id: Identifier compared with each element's ``id``.

  Returns:
    The ``name`` of the first element whose ``id`` matches and whose ``type``
    is ``'Class'``; ``None`` if there is no such element.
  """
  # A null `contents` (at either level) is treated as an empty list.
  for package in model['contents'] or []:
    if package['name'] is None or package['type'] != 'Package':
      continue
    for element in package['contents'] or []:
      if element['id'] == id and element['type'] == 'Class':
        return element['name']
  return None

In [None]:
# Collect the name of every class shown on the selected diagram.
classes = []
for diagram in model['diagrams']:
  if diagram['name'] != diagram_name:
    continue
  for view in diagram['contents']:
    view_type = view['type']
    if view_type == 'ClassView':
      # A view only references its class by id; look the name up in the model.
      classes.append(getModelClassElement(model['model'], view['modelElement']['id']))
    elif view_type == 'Class':
      classes.append(view['name'])

Get all Generalizations

In [None]:
def getModelGeneralizationElement(model, id):
  """Resolve a generalization id to the names of its two classes.

  Only elements stored directly inside a named entry of ``model['contents']``
  whose ``type`` is ``'Package'`` are searched.

  Args:
    model: Dictionary with a ``contents`` list of package dictionaries.
    id: Identifier compared with each element's ``id``.

  Returns:
    A ``(general, specific)`` tuple of class names as returned by
    ``getModelClassElement`` (either may be ``None``), or ``None`` when no
    element of type ``'Generalization'`` has the given id. Callers that
    unpack the result must handle the ``None`` case.
  """
  # A null `contents` (at either level) is treated as an empty list.
  for package in model['contents'] or []:
    if package['name'] is None or package['type'] != 'Package':
      continue
    for element in package['contents'] or []:
      if element['id'] == id and element['type'] == 'Generalization':
        general = getModelClassElement(model, element['general']['id'])
        specific = getModelClassElement(model, element['specific']['id'])
        return general, specific
  return None

In [None]:
# Collect each generalization on the selected diagram in both directions:
# (general, specific, 'GL') and (specific, general, 'GLr').
generalizations = []
for diagram in model['diagrams']:
  if diagram['name'] != diagram_name:
    continue
  for view in diagram['contents']:
    if view['type'] not in ('GeneralizationView', 'Generalization'):
      continue
    resolved = getModelGeneralizationElement(model['model'], view['modelElement']['id'])
    if resolved is None:
      # The lookup returns None when the id is not found; unpacking that
      # value would raise TypeError, so report the view and move on.
      print('Skipping generalization without a model element:', view['modelElement']['id'])
      continue
    general, specific = resolved
    generalizations.append((general, specific, 'GL'))
    generalizations.append((specific, general, 'GLr'))

Get all Relations

In [None]:
def getModelRelationElement(model, id):
  """Resolve a relation id to the names of the classes at its two ends.

  Only elements stored directly inside a named entry of ``model['contents']``
  whose ``type`` is ``'Package'`` are searched.

  Args:
    model: Dictionary with a ``contents`` list of package dictionaries.
    id: Identifier compared with each element's ``id``.

  Returns:
    A ``(source, target)`` tuple of class names, taken from the
    ``propertyType`` of ``properties[0]`` and ``properties[1]`` and resolved
    with ``getModelClassElement`` (either may be ``None``), or ``None`` when
    no element of type ``'Relation'`` has the given id. Callers that unpack
    the result must handle the ``None`` case.
  """
  # A null `contents` (at either level) is treated as an empty list.
  for package in model['contents'] or []:
    if package['name'] is None or package['type'] != 'Package':
      continue
    for element in package['contents'] or []:
      if element['id'] == id and element['type'] == 'Relation':
        source = getModelClassElement(model, element['properties'][0]['propertyType']['id'])
        target = getModelClassElement(model, element['properties'][1]['propertyType']['id'])
        return source, target
  return None

In [None]:
# Collect each relation on the selected diagram as an association ('AS') in
# both directions.
relations = []
for diagram in model['diagrams']:
  if diagram['name'] != diagram_name:
    continue
  for view in diagram['contents']:
    if view['type'] not in ('RelationView', 'Relation'):
      continue
    resolved = getModelRelationElement(model['model'], view['modelElement']['id'])
    if resolved is None:
      # The lookup returns None when the id is not found; unpacking that
      # value would raise TypeError, so report the view and move on.
      print('Skipping relation without a model element:', view['modelElement']['id'])
      continue
    source, target = resolved
    relations.append((source, target, 'AS'))
    relations.append((target, source, 'AS'))

Define Spark Context

In [None]:
# getOrCreate() reuses a session that is already running, so re-executing
# this cell does not try to start a second SparkContext in the same kernel.
spark = SparkSession.builder.master('local').getOrCreate()
sc = spark.sparkContext

Create Class Dataframe

In [None]:
classRow = Row("id", "name", "rc")

# One row per class: running index, class name and an equal share of 1000.
classSeq = []
for i, c in enumerate(classes):
  classSeq.append(classRow(i, c, 1000/len(classes)))

In [None]:
# Turn the class rows (id, name, rc) into a Spark DataFrame.
df = spark.createDataFrame(classSeq)

In [None]:
# Preview the class DataFrame.
df.show()

+---+--------------------+------------------+
| id|                name|                rc|
+---+--------------------+------------------+
|  0|              Parent|14.925373134328359|
|  1|               Woman|14.925373134328359|
|  2|          Car Agency|14.925373134328359|
|  3|           Cargo Car|14.925373134328359|
|  4|Closed Car Rental...|14.925373134328359|
|  5|Car Needing Maint...|14.925373134328359|
|  6|Extented Car Rent...|14.925373134328359|
|  7|                Wife|14.925373134328359|
|  8|Maintenance Appoi...|14.925373134328359|
|  9|       Sender Branch|14.925373134328359|
| 10|               Child|14.925373134328359|
| 11|  Corporate Customer|14.925373134328359|
| 12|            Customer|14.925373134328359|
| 13|    Car Manufacturer|14.925373134328359|
| 14|       Passenger Car|14.925373134328359|
| 15|      Insured Driver|14.925373134328359|
| 16|             Teenage|14.925373134328359|
| 17|   Organization Unit|14.925373134328359|
| 18|   Insurance Company|14.92537

Creating a Relationship Type Data Frame

In [None]:
relationshipTypes = Row("cd_type", "nm_type", "rr")

# Each entry is (type code, display name, rr value).
# NOTE(review): the label 'Agreggation Reverse' is misspelled; kept unchanged.
(relationshipType1, relationshipType2, relationshipType3, relationshipType4,
 relationshipType5, relationshipType6, relationshipType7) = (
    relationshipTypes(*spec) for spec in [
        ('AG', 'Aggregation', 7),
        ('DP', 'Dependency', 8),
        ('GL', 'Generalization', 10),
        ('AS', 'Association', 5),
        ('AGr', 'Agreggation Reverse', 7),
        ('DPr', 'Dependency Reverse', 8),
        ('GLr', 'Generalization Reverse', 10),
    ])

In [None]:
# Gather the seven relationship-type rows into one list for createDataFrame.
relationTypeSeq=[relationshipType1,relationshipType2,relationshipType3,relationshipType4,relationshipType5,relationshipType6,relationshipType7]

In [None]:
# Relationship-type lookup table with columns cd_type, nm_type and rr.
dfRt = spark.createDataFrame(relationTypeSeq)

In [None]:
# Preview the relationship-type table.
dfRt.show()

+-------+--------------------+---+
|cd_type|             nm_type| rr|
+-------+--------------------+---+
|     AG|         Aggregation|  7|
|     DP|          Dependency|  8|
|     GL|      Generalization| 10|
|     AS|         Association|  5|
|    AGr| Agreggation Reverse|  7|
|    DPr|  Dependency Reverse|  8|
|    GLr|Generalization Re...| 10|
+-------+--------------------+---+



Creating a Relationship Data Frame

In [None]:
relationshipRow = Row("id", "source", "target", "cd_type")

# Associations and generalizations are numbered with one running counter so
# that every row gets a distinct id; the counter used to restart at 0 for the
# generalizations, which duplicated ids between the two groups.
relationshipSeq = [
    relationshipRow(i, r[0], r[1], r[2])
    for i, r in enumerate(relations + generalizations)
]

In [None]:
# Relationship table with columns id, source, target and cd_type.
dfR = spark.createDataFrame(relationshipSeq)

In [None]:
# Preview the relationship table.
dfR.show()

+---+--------------------+--------------------+-------+
| id|              source|              target|cd_type|
+---+--------------------+--------------------+-------+
|  0|Car Rental Agreement|      Functional Car|     AS|
|  1|      Functional Car|Car Rental Agreement|     AS|
|  2|       Car Ownership|                 Car|     AS|
|  3|                 Car|       Car Ownership|     AS|
|  4| Employment Contract|            Employee|     AS|
|  5|            Employee| Employment Contract|     AS|
|  6| Employment Contract|        Organization|     AS|
|  7|        Organization| Employment Contract|     AS|
|  8|   Website Ownership|        Organization|     AS|
|  9|        Organization|   Website Ownership|     AS|
| 10|       Damage Report|         Damaged Car|     AS|
| 11|         Damaged Car|       Damage Report|     AS|
| 12|             Husband|            Marriage|     AS|
| 13|            Marriage|             Husband|     AS|
| 14|    Rental Insurance|      Insured Driver| 

Function to check whether a relationship exists between two classes

In [None]:
def existsRelationship(sourceClass, targetClass):
  """Tell whether dfR holds at least one row from sourceClass to targetClass.

  Args:
    sourceClass: Class name matched against the ``source`` column.
    targetClass: Class name matched against the ``target`` column.

  Returns:
    True if at least one matching row exists, otherwise False.
  """
  # Column expressions compare against the literal values, so class names
  # containing quotes no longer break a hand-built SQL string.
  matches = dfR.filter((col('source') == sourceClass) & (col('target') == targetClass))
  return matches.count() > 0
  

Function that returns the rank of a relationship (rr).
Initially, constant values are used, as defined in the paper.

In [None]:
def rr(sourceClass, targetClass):
  """Return the rr value of the relationship from sourceClass to targetClass.

  The relationship row in dfR is joined with dfRt on ``cd_type`` and the
  ``rr`` column of the first match is returned.

  Args:
    sourceClass: Class name matched against the ``source`` column.
    targetClass: Class name matched against the ``target`` column.

  Returns:
    The ``rr`` value of the first matching row, or 0 when no row matches
    (no relationship, or a ``cd_type`` that is missing from dfRt).
  """
  # Column expressions avoid building SQL from class names. One query now
  # answers both "does it exist" and "what is its rr"; a missing join row
  # yields 0 instead of indexing None.
  # NOTE(review): with several relationships between the same pair only the
  # first row is used, while allRr sums all of them - confirm this is intended.
  row = (dfR.join(dfRt, dfR.cd_type == dfRt.cd_type)
            .where((col('source') == sourceClass) & (col('target') == targetClass))
            .select('rr')
            .first())
  return row[0] if row is not None else 0


Function to return the sum of all Ranks of Relationships

In [None]:
def allRr(sourceClass):
  """Sum the rr values of every relationship that starts at sourceClass.

  Args:
    sourceClass: Class name matched against the ``source`` column of dfR.

  Returns:
    The sum of ``rr`` over the matching rows of dfR joined with dfRt, or
    ``None`` when no row matches (the aggregate of an empty set is null).
  """
  # Column expression instead of a concatenated SQL string, so class names
  # containing quotes are handled.
  return (dfR.join(dfRt, dfR.cd_type == dfRt.cd_type)
             .filter(col('source') == sourceClass)
             .groupBy().sum('rr')
             .first()[0])

Function to calculate the Transition Probability

In [None]:
def tp(sourceClass, targetClass):
  """Transition probability from sourceClass to targetClass.

  Returns rr(sourceClass, targetClass) divided by allRr(sourceClass) when a
  relationship exists between the two classes, and 0 otherwise.
  """
  if not existsRelationship(sourceClass, targetClass):
    return 0
  weight = rr(sourceClass, targetClass)
  total_weight = allRr(sourceClass)
  return weight / total_weight

Function to get all Relationships where the class is source

In [None]:
def getRelationshipTargets(sourceClass):
  """Collect the targets of every relationship that starts at sourceClass.

  Args:
    sourceClass: Either a class name (str) or an object whose ``target``
      attribute holds the class name.

  Returns:
    A list of rows with a single ``target`` field, one per row of dfR whose
    ``source`` equals the class name.
  """
  name = sourceClass if isinstance(sourceClass, str) else sourceClass.target
  # Column expression instead of a concatenated SQL string, so class names
  # containing quotes are handled.
  return dfR.filter(col('source') == name).select('target').collect()

Initializing the Class Rank Vector with starting value

In [None]:
classRankVector = Row("class", "rc")
totalOfClasses = df.count()

# Every class starts with the same rank: an equal share of 1000.
classRcSeq = [
    classRankVector(c['name'], 1000/totalOfClasses)
    for c in df.collect()
]

dfCRc = spark.createDataFrame(classRcSeq)

In [None]:
# Preview the initial class ranks.
dfCRc.show()

+--------------------+------------------+
|               class|                rc|
+--------------------+------------------+
|              Parent|14.925373134328359|
|               Woman|14.925373134328359|
|          Car Agency|14.925373134328359|
|           Cargo Car|14.925373134328359|
|Closed Car Rental...|14.925373134328359|
|Car Needing Maint...|14.925373134328359|
|Extented Car Rent...|14.925373134328359|
|                Wife|14.925373134328359|
|Maintenance Appoi...|14.925373134328359|
|       Sender Branch|14.925373134328359|
|               Child|14.925373134328359|
|  Corporate Customer|14.925373134328359|
|            Customer|14.925373134328359|
|    Car Manufacturer|14.925373134328359|
|       Passenger Car|14.925373134328359|
|      Insured Driver|14.925373134328359|
|             Teenage|14.925373134328359|
|   Organization Unit|14.925373134328359|
|   Insurance Company|14.925373134328359|
|        Organization|14.925373134328359|
+--------------------+------------

Creating the Transition Probability Matrix

In [None]:
# Enumerate every ordered (source, target) pair of classes. crossJoin states
# the cartesian product explicitly instead of relying on a join without a
# condition being accepted by the Spark configuration.
dfTpMatrixTemp = (dfCRc.withColumnRenamed('class', 'source')
                       .crossJoin(dfCRc.withColumnRenamed('class', 'target'))
                       .select('source', 'target'))

dfTpMR = Row("source", "target", "tp")
# tp() queries Spark for every pair, so this step gets slow as the number of
# classes grows.
dfTpMSeq = [
    dfTpMR(c['source'], c['target'], float(tp(c['source'], c['target'])))
    for c in dfTpMatrixTemp.collect()
]

df_schema = StructType([
    StructField("source", StringType(), True),
    StructField("target", StringType(), True),
    StructField("tp", DoubleType(), True),
])

dfTpMatrix = spark.createDataFrame(dfTpMSeq, df_schema)

Function to get the Class Rank

In [None]:
def getRc(classname):
  """Return the current rank (``rc``) of the class named classname.

  Reads the module-level dfCRc. Raises TypeError when no row matches,
  because ``first()`` then returns None.
  """
  # Column expression instead of a concatenated SQL string, so class names
  # containing quotes are handled.
  return dfCRc.filter(col('class') == classname).select('rc').first()[0]

Function to get the Transition Probability

In [None]:
def getTpMatrix(source, target):
  """Return the transition probability stored in dfTpMatrix for a pair.

  Raises TypeError when no row matches, because ``first()`` then returns
  None.
  """
  # Column expressions instead of a concatenated SQL string, so class names
  # containing quotes are handled.
  return (dfTpMatrix
          .filter((col('source') == source) & (col('target') == target))
          .select('tp')
          .first()[0])

Function to Calculate Class Rank

In [None]:
 def calculateRc(dfCRc):
   """Run one rank-update step and return the new class ranks.

   For every class, the new rank is the sum, over each class it has a
   positive transition probability to in dfTpMatrix, of that neighbour's
   current rank multiplied by the transition probability from the neighbour
   back to the class.

   Args:
     dfCRc: DataFrame with columns ``class`` and ``rc`` holding the current
       ranks. The ranks are read from this argument; previously the helper
       lookups read the module-level variable of the same name.

   Returns:
     A new DataFrame with columns ``class`` and ``rc``.
   """
   rows = dfCRc.collect()

   # Collect both tables once instead of issuing Spark queries per lookup.
   # setdefault keeps the first occurrence, like the previous first() calls.
   rank_by_class = {}
   for row in rows:
     rank_by_class.setdefault(row['class'], row['rc'])

   tp_by_pair = {}
   neighbours = {}
   for edge in dfTpMatrix.collect():
     tp_by_pair.setdefault((edge['source'], edge['target']), edge['tp'])
     if edge['tp'] is not None and edge['tp'] > 0:
       neighbours.setdefault(edge['source'], []).append(edge['target'])

   new_classseq = []
   for row in rows:
     # Start from a float so a class without neighbours still yields a double
     # and schema inference does not see mixed integer/double values.
     new_rc = 0.0
     for neighbour in neighbours.get(row['class'], []):
       new_rc = new_rc + (rank_by_class[neighbour] * tp_by_pair[(neighbour, row['class'])])
     new_classseq.append(classRankVector(row['class'], new_rc))

   return spark.createDataFrame(new_classseq)



Calculating the Class Rank over several iterations

In [None]:
# Apply ten rank-update steps, feeding each result into the next step.
for _ in range(10):
  dfCRc = calculateRc(dfCRc)

dfCRc.show()


+--------------------+------------------+
|               class|                rc|
+--------------------+------------------+
|              Parent|10.291568398706499|
|               Woman|14.496725631379643|
|          Car Agency|11.508631541613926|
|           Cargo Car|  8.66752489961041|
|Closed Car Rental...|16.745878187909636|
|Car Needing Maint...|17.844912100590797|
|Extented Car Rent...| 8.061979651739907|
|                Wife|11.847585933209292|
|Maintenance Appoi...| 9.101235046774953|
|       Sender Branch|13.376511224137477|
|               Child|10.291568398706499|
|  Corporate Customer|15.203910646067952|
|            Customer|  16.5639324821803|
|    Car Manufacturer|11.508631541613926|
|       Passenger Car|  8.66752489961041|
|      Insured Driver| 10.34833348425134|
|             Teenage| 8.115759892207357|
|   Organization Unit|26.114888295788738|
|   Insurance Company|12.106193871610426|
|        Organization|48.883246545845864|
+--------------------+------------

In [None]:
# Ten further rank-update steps on top of the current ranks.
for _ in range(10):
  dfCRc = calculateRc(dfCRc)

dfCRc.show()

+--------------------+------------------+
|               class|                rc|
+--------------------+------------------+
|              Parent| 9.890153958986945|
|               Woman|13.364492835087617|
|          Car Agency| 11.48508958763571|
|           Cargo Car| 8.137992903418738|
|Closed Car Rental...|15.883551699114841|
|Car Needing Maint...|16.583174423346634|
|Extented Car Rent...| 7.810991227246802|
|                Wife|12.470495287992355|
|Maintenance Appoi...| 8.470969378150667|
|       Sender Branch|12.827104700100001|
|               Child| 9.890153958986945|
|  Corporate Customer|15.298230978713248|
|            Customer|17.439986021479044|
|    Car Manufacturer| 11.48508958763571|
|       Passenger Car| 8.137992903418738|
|      Insured Driver|10.857173973712811|
|             Teenage| 8.083914258423764|
|   Organization Unit| 25.07594440807118|
|   Insurance Company|11.903213240398719|
|        Organization| 52.14204358609324|
+--------------------+------------

In [None]:
total = dfCRc.count()

# Keep the top 30% of the classes, ordered by rank (highest first).
ranked = dfCRc.sort(dfCRc.rc.desc())
ranked.limit(math.trunc(total*0.3)).show()

+--------------------+------------------+
|               class|                rc|
+--------------------+------------------+
|              Person| 58.56621301707885|
|        Organization| 52.14204358609324|
|                 Car|50.172404483491434|
|      Insurable Item|35.408932797412895|
|Car Rental Agreement| 35.26405946969564|
|            Employee|34.637064045866346|
|              Driver|30.305651953127715|
|       Living Person| 26.37007244587739|
|   Organization Unit| 25.07594440807118|
|      Driver License|22.550770254850256|
|      Functional Car|20.225400168337686|
|              Branch|  18.0271611581797|
|            Customer|17.439986021479044|
|Car Needing Maint...|16.583174423346634|
|  Car Needing Repair|16.583174423346634|
|Closed Car Rental...|15.883551699114841|
| Governmental Agency|15.756622447157344|
|               Adult|15.640649684772406|
|  Corporate Customer|15.298230978713248|
|    Rental Insurance|15.175947753076006|
+--------------------+------------

In [None]:
# Names of the top 30% classes by rank; these become the nodes of the graph.
graph_nodes = [
    c["class"]
    for c in dfCRc.sort(dfCRc.rc.desc()).limit(math.trunc(total*0.3)).select('class').collect()
]

In [None]:
strenghRelationshipRow = Row("type", "strength")

# Each entry is (relationship type code, strength).
(relationshipStrength1, relationshipStrength2, relationshipStrength3,
 relationshipStrength4, relationshipStrength5, relationshipStrength6,
 relationshipStrength7) = (
    strenghRelationshipRow(*pair) for pair in [
        ('AG', 0.7),
        ('DP', 0.8),
        ('GL', 1.0),
        ('AS', 0.5),
        ('AGr', 0.7),
        ('DPr', 0.8),
        ('GLr', 1.0),
    ])

sRSeq = [relationshipStrength1, relationshipStrength2, relationshipStrength3,
         relationshipStrength4, relationshipStrength5, relationshipStrength6,
         relationshipStrength7]
dfRS = spark.createDataFrame(sRSeq)

In [None]:
# Preview the relationship-strength table.
dfRS.show()

+----+--------+
|type|strength|
+----+--------+
|  AG|     0.7|
|  DP|     0.8|
|  GL|     1.0|
|  AS|     0.5|
| AGr|     0.7|
| DPr|     0.8|
| GLr|     1.0|
+----+--------+



In [None]:
def strength(sourceClass, targetClass):
  """Return the strength of the relationship from sourceClass to targetClass.

  The relationship row in dfR is joined with dfRS (``cd_type`` against
  ``type``) and the ``strength`` of the first match is returned.

  Args:
    sourceClass: Class name matched against the ``source`` column.
    targetClass: Class name matched against the ``target`` column.

  Returns:
    The ``strength`` of the first matching row, or 0 when no row matches
    (no relationship, or a type that is missing from dfRS).
  """
  # Column expressions avoid building SQL from class names. A missing join
  # row yields 0 instead of indexing None.
  row = (dfR.join(dfRS, dfR.cd_type == dfRS.type)
            .where((col('source') == sourceClass) & (col('target') == targetClass))
            .select('strength')
            .first())
  return row[0] if row is not None else 0

In [None]:
rowCorr = Row("source", "target", "correlation", "path")

# Rows accumulated by the correlation functions; starts out empty.
sCorrSeq = []

# Explicit schema, needed because the initial DataFrame has no rows.
dfCorr_schema = (StructType()
                 .add("source", StringType(), True)
                 .add("target", StringType(), True)
                 .add("correlation", DoubleType(), True)
                 .add("path", StringType(), True))

dfCorr = spark.createDataFrame(sCorrSeq, dfCorr_schema)

In [None]:
def getCorrelations(source, target):
  """Collect every correlation row stored for the (source, target) pair.

  Reads the module-level dfCorr and returns a list of rows with the fields
  ``source``, ``target``, ``correlation`` and ``path``.
  """
  # Column expressions instead of a concatenated SQL string, so class names
  # containing quotes are handled.
  return (dfCorr
          .filter((col('source') == source) & (col('target') == target))
          .select('source', 'target', 'correlation', 'path')
          .collect())

In [None]:
def existsCorrelation(source, target):
  """Tell whether the module-level dfCorr has a row for (source, target)."""
  # Column expressions instead of a concatenated SQL string, so class names
  # containing quotes are handled.
  matches = dfCorr.filter((col('source') == source) & (col('target') == target))
  return matches.count() > 0


In [None]:
def calculateDirectCorrelations(dfCorr):
  """Add one correlation row per relationship in dfR.

  The correlation of a direct relationship is the strength of its type
  (dfR joined with dfRS) and its path is ``source-target``. Rows are
  appended to the module-level sCorrSeq; the ``dfCorr`` argument is not read.

  Returns:
    A new DataFrame built from all rows accumulated in sCorrSeq.
  """
  direct = (dfR.join(dfRS, dfRS.type == dfR.cd_type, "inner")
               .select('source', 'target', 'strength')
               .withColumnRenamed('strength', 'correlation')
               .withColumn('path', concat_ws('-', 'source', 'target')))
  sCorrSeq.extend(
      rowCorr(l.source, l.target, l.correlation, l.path)
      for l in direct.collect())
  return spark.createDataFrame(sCorrSeq, dfCorr_schema)

In [None]:
def calculateOnLevalCorrelations(dfCorr):
  """Add correlation rows for pairs connected through one intermediate class.

  Relationships are joined with themselves on the shared middle class; the
  correlation is the product of the two strengths and the path is
  ``source-middle-target``. Rows are appended to the module-level sCorrSeq;
  the ``dfCorr`` argument is not read.

  Returns:
    A new DataFrame built from all rows accumulated in sCorrSeq.
  """
  with_strength = dfR.join(dfRS, dfRS.type == dfR.cd_type, "inner")
  # First hop: source -> middle.
  d1 = (with_strength
        .withColumnRenamed('target', 'middlea')
        .withColumnRenamed('strength', 'strengtha'))
  # Second hop: middle -> target.
  d2 = (dfR.join(dfRS, dfRS.type == dfR.cd_type, "inner")
        .withColumnRenamed('source', 'middleb')
        .withColumnRenamed('strength', 'strengthb'))

  d3 = d1.join(d2, d1.middlea == d2.middleb)

  two_hop = d3.select(
      'source',
      'target',
      F.col('strengtha')*F.col('strengthb'),
      concat_ws('-', 'source', 'middlea', 'target'),
  ).toDF('source', 'target', 'correlation', 'path')

  sCorrSeq.extend(
      rowCorr(l.source, l.target, l.correlation, l.path)
      for l in two_hop.collect())

  return spark.createDataFrame(sCorrSeq, dfCorr_schema)

In [None]:
def correlation(source, target, dfCorr):
  """Add a correlation row for (source, target) when none is stored yet.

  Args:
    source: Class name, or an object whose ``source`` attribute holds it.
    target: Class name, or an object whose ``target`` attribute holds it.
    dfCorr: Current correlation DataFrame; returned unchanged when no row
      is added.

  Returns:
    A DataFrame in every case: a new one built from sCorrSeq when a row was
    appended, otherwise the ``dfCorr`` argument. Several paths used to fall
    through and return None, which the caller then stored as its DataFrame.
  """
  if not isinstance(source, str):
    source = source.source

  if not isinstance(target, str):
    target = target.target

  # existsCorrelation / getCorrelations read the module-level dfCorr.
  if existsCorrelation(source, target):
    return dfCorr

  if existsRelationship(source, target):
    sCorrSeq.append(rowCorr(source, target, strength(source, target), source + ' - ' + target))
    return spark.createDataFrame(sCorrSeq, dfCorr_schema)

  for next_target in getRelationshipTargets(target):
    if not isinstance(next_target, str):
      next_target = next_target.target
    # NOTE(review): the result of this recursive call is discarded, so only
    # rows it appended to sCorrSeq survive - confirm this is intended.
    correlation(next_target, target, dfCorr)
    for loc in getCorrelations(next_target, target):
      n_corr = strength(source, next_target) * loc.correlation
      n_path = source + '-' + loc.path
      sCorrSeq.append(rowCorr(source, target, n_corr, n_path))
      # NOTE(review): returns after the first correlation found, as before;
      # confirm whether all paths were meant to be recorded.
      return spark.createDataFrame(sCorrSeq, dfCorr_schema)

  # Nothing could be derived: hand the input back instead of None.
  return dfCorr

In [None]:
dfCorr = calculateDirectCorrelations(dfCorr)

dfCorr = calculateOnLevalCorrelations(dfCorr)

# Every ordered pair of classes that has no correlation row yet. crossJoin
# states the cartesian product explicitly.
dfCorrTemp = (dfCRc.withColumnRenamed('class', 'source')
                   .crossJoin(dfCRc.withColumnRenamed('class', 'target'))
                   .select('source', 'target')
                   .exceptAll(dfCorr.select('source', 'target')))

for c in dfCorrTemp.collect():
  # correlation() accepts plain class names, which is what these string
  # columns hold, so no per-value type dispatch is needed here.
  updated = correlation(c['source'], c['target'], dfCorr)
  # Keep the previous DataFrame when correlation() has nothing to return, so
  # dfCorr is never replaced by None.
  if updated is not None:
    dfCorr = updated
  



In [None]:
# Display the selected node names.
graph_nodes

['Person',
 'Organization',
 'Car',
 'Insurable Item',
 'Car Rental Agreement',
 'Employee',
 'Driver',
 'Living Person',
 'Organization Unit',
 'Driver License',
 'Functional Car',
 'Branch',
 'Customer',
 'Car Needing Repair',
 'Car Needing Maintenance',
 'Closed Car Rental Agreement',
 'Governmental Agency',
 'Adult',
 'Corporate Customer',
 'Rental Insurance']

In [None]:
# Show correlations above 0.4 whose source and target are both selected nodes.
dfCorr.filter(dfCorr.source.isin(graph_nodes)).filter(dfCorr.target.isin(graph_nodes)).filter('correlation > 0.4').show()


+--------------------+--------------------+-----------+--------------------+
|              source|              target|correlation|                path|
+--------------------+--------------------+-----------+--------------------+
|Car Rental Agreement|      Functional Car|        0.5|Car Rental Agreem...|
|      Functional Car|Car Rental Agreement|        0.5|Functional Car-Ca...|
|    Rental Insurance|Car Rental Agreement|        0.5|Rental Insurance-...|
|Car Rental Agreement|    Rental Insurance|        0.5|Car Rental Agreem...|
|            Customer|Car Rental Agreement|        0.5|Customer-Car Rent...|
|Car Rental Agreement|            Customer|        0.5|Car Rental Agreem...|
|              Driver|      Driver License|        0.5|Driver-Driver Lic...|
|      Driver License|              Driver|        0.5|Driver License-Dr...|
|       Living Person|               Adult|        1.0| Living Person-Adult|
|                 Car|Car Needing Maint...|        1.0|Car-Car Needing M...|

In [None]:
# Export correlations above 0.4 between selected nodes. mode('overwrite')
# lets the cell be re-run; the default mode fails when the path exists.
(dfCorr
 .filter(dfCorr.source.isin(graph_nodes))
 .filter(dfCorr.target.isin(graph_nodes))
 .filter('correlation > 0.4')
 .write.mode('overwrite').csv('relation.csv'))

In [None]:
# Export the top 30% classes by rank. mode('overwrite') lets the cell be
# re-run; the default mode fails when the path exists.
(dfCRc
 .sort(dfCRc.rc.desc())
 .limit(math.trunc(total*0.3))
 .write.mode('overwrite').csv('classes.csv'))