In [1]:
from pyspark.sql import SparkSession
from pyspark.sql import functions as sf
from pyspark.sql.types import StructType, StructField, IntegerType, StringType

spark = SparkSession.builder.appName('SuperheroPopstar').getOrCreate()

In [2]:
import os
pwd = os.path.abspath('')

names = spark.read.option("sep", " ").schema(StructType([
    StructField("id", IntegerType(), True),
    StructField("name", StringType(), True),
])).csv(f"file:///{pwd}/../data/Marvel-names.txt")

lines = spark.read.text(f"file:///{pwd}/../data/Marvel-graph.txt")

In [3]:
names.show(5)
lines.show(5)

+---+--------------------+
| id|                name|
+---+--------------------+
|  1|24-HOUR MAN/EMMANUEL|
|  2|3-D MAN/CHARLES CHAN|
|  3|    4-D MAN/MERCURIO|
|  4|             8-BALL/|
|  5|                   A|
+---+--------------------+
only showing top 5 rows

+--------------------+
|               value|
+--------------------+
|5988 748 1722 375...|
|5989 4080 4264 44...|
|5982 217 595 1194...|
|5983 1165 3836 43...|
|5980 2731 3712 15...|
+--------------------+
only showing top 5 rows



In [13]:
herosPopularity = lines\
    .withColumn(
        'heros', sf.split(sf.trim(sf.col('value')), pattern=' ')
    ).withColumn(
        'id', sf.col('heros')[0]
    ).withColumn(
        'connections', sf.size(sf.col('heros')) - 1
    ).groupBy('id').agg(
        sf.sum('connections').alias('connections')
    ).sort('connections', ascending=False)

In [17]:
popstar = herosPopularity.first()
popstar

Row(id='859', connections=1933)

In [18]:
popstarName = names\
    .where(sf.col('id')==popstar.id)\
    .first()['name']

print(f"{popstarName} is the most popular superhero with {popstar.connections} co-appearances.")


CAPTAIN AMERICA is the most popular superhero with 1933 co-appearances.


In [35]:
minConnections = herosPopularity\
    .agg({'connections': 'min'})\
    .first()['min(connections)']
minConnections

0

In [38]:

obscureHeros = herosPopularity\
    .where(sf.col('connections')==minConnections)\
    .join(names, on='id')\
    .collect()

print(
    f'The most obscure heros (with {minConnections} connections) are:'
    + ''.join(f'\n   * {hero.name}' for hero in obscureHeros)
)

The most obscure heros (with 0 connections) are:
   * BERSERKER II
   * BLARE/
   * MARVEL BOY II/MARTIN
   * MARVEL BOY/MARTIN BU
   * GIURESCU, RADU
   * CLUMSY FOULUP
   * FENRIS
   * RANDAK
   * SHARKSKIN
   * CALLAHAN, DANNY
   * DEATHCHARGE
   * RUNE
   * SEA LEOPARD
   * RED WOLF II
   * ZANTOR
   * JOHNSON, LYNDON BAIN
   * LUNATIK II
   * KULL
   * GERVASE, LADY ALYSSA


In [40]:
# spark.stop()