In [None]:
from pyspark import SparkContext, SparkConf
from pyspark.sql import SparkSession
from graphframes import GraphFrame

# Application-level Spark configuration for this exercise.
conf = SparkConf().setAppName("ex53")
# getOrCreate() reuses an already-running context, so re-executing this cell in
# the same kernel no longer fails because a SparkContext already exists.
sc = SparkContext.getOrCreate(conf=conf)
# The session builder picks up the active SparkContext created above.
ssql = SparkSession.builder.getOrCreate()

In [19]:
# Relative locations of the input CSV files describing the graph.
edgesPath = "data/Ex53/data/edges.csv"
vertexesPath = "data/Ex53/data/vertexes.csv"
# Destination path handed to the CSV writer in the last cell.
outputPath = "out53/"

In [4]:
# Edges: CSV with a header row; column types are inferred by Spark.
eDF = (
    ssql.read
    .format("csv")
    .options(header=True, inferSchema=True)
    .load(edgesPath)
)

# Vertexes: read with the same CSV settings as the edges file.
vDF = (
    ssql.read
    .format("csv")
    .options(header=True, inferSchema=True)
    .load(vertexesPath)
)

In [5]:
# Preview both input DataFrames together with their inferred schemas.
# Each call is its own statement: show() and printSchema() return None, so
# joining them with a comma only built a throwaway tuple that the notebook
# echoed as a stray "(None, None)" cell result.
eDF.show()
eDF.printSchema()
vDF.show()
vDF.printSchema()

+---+---+--------+
|src|dst|linktype|
+---+---+--------+
| u1| u2|  friend|
| u1| u4|  friend|
| u1| u5|  friend|
| u2| u1|  friend|
| u2| u3|  follow|
| u3| u2|  follow|
| u4| u1|  friend|
| u4| u5|  friend|
| u5| u1|  friend|
| u5| u4|  friend|
| u5| u6|  follow|
| u6| u3|  follow|
| u7| u6|  follow|
+---+---+--------+

root
 |-- src: string (nullable = true)
 |-- dst: string (nullable = true)
 |-- linktype: string (nullable = true)

+---+-----+---+
| id| name|age|
+---+-----+---+
| u1|Alice| 34|
| u2|  Bob| 36|
| u3| John| 30|
| u4|David| 29|
| u5| Paul| 32|
| u6| Adel| 36|
| u7| Eddy| 60|
+---+-----+---+

root
 |-- id: string (nullable = true)
 |-- name: string (nullable = true)
 |-- age: integer (nullable = true)



(None, None)

In [6]:
# Keep only the "follow" links; every other link type is dropped.
filteredEDF = eDF.filter(eDF["linktype"] == "follow")

In [None]:
# Graph over all vertexes, connected only by the filtered "follow" edges.
g = GraphFrame(v=vDF, e=filteredEDF)

In [9]:
# The in-degree on the follow-only graph is the number of followers of each
# user; rename the column to make that meaning explicit.
nFollowerDF = g.inDegrees.withColumnRenamed("inDegree", "NFollowers")
# Separate statements instead of a comma-joined tuple, which made the cell
# echo a meaningless "(None, None)" result after the printed output.
nFollowerDF.printSchema()
nFollowerDF.show()



root
 |-- id: string (nullable = true)
 |-- NFollowers: integer (nullable = false)

+---+----------+
| id|NFollowers|
+---+----------+
| u3|         2|
| u6|         2|
| u2|         1|
+---+----------+



(None, None)

In [10]:
# Single-row DataFrame holding the highest follower count.
# A groupBy() with no keys aggregates over the whole DataFrame.
maxFollowersDF = (
    nFollowerDF
    .groupBy()
    .max("NFollowers")
    .withColumnRenamed("max(NFollowers)", "MaxNFollowers")
)

In [15]:
# Extract the scalar from the single aggregated row: without selecting the
# column, maxNFollowers would hold ONLY a Row object wrapping that value.
maxNFollowers = maxFollowersDF.first()["MaxNFollowers"]
maxNFollowers

2

In [16]:
# Users whose follower count equals the maximum (ties are all kept).
nFollowerDFfinal = nFollowerDF.filter(
    nFollowerDF["NFollowers"] == maxNFollowers
)

In [17]:
# Display the users whose follower count matches the maximum.
nFollowerDFfinal.show()

+---+----------+
| id|NFollowers|
+---+----------+
| u3|         2|
| u6|         2|
+---+----------+



In [20]:
# Persist the result as CSV with a header row.
# mode="overwrite" replaces any previous output at outputPath; the default
# save mode raises an error when the path already exists, which made the
# notebook fail on every run after the first.
nFollowerDFfinal.write.csv(outputPath, mode="overwrite", header=True)