# Import & Read DataFrame

In [0]:
import pandas as pd
from functools import reduce
from pyspark.sql.functions import col, lit, when, monotonically_increasing_id
from graphframes import *

# Where the raw casting table lives and which reader format to use.
file_location = "/FileStore/tables/AllMoviesCastingRaw.csv"
file_type = "csv"

# Reader settings. They only apply to CSV input; other formats ignore them.
infer_schema = "false"
first_row_is_header = "true"
delimiter = ";"

# Collect the reader options in one place and hand them over in a single call.
csv_options = {
  "inferSchema": infer_schema,
  "header": first_row_is_header,
  "sep": delimiter,
}
df = spark.read.format(file_type).options(**csv_options).load(file_location)

In [0]:
display(df)

id,actor1_name,actor1_gender,actor2_name,actor2_gender,actor3_name,actor3_gender,actor4_name,actor4_gender,actor5_name,actor5_gender,actor_number,director_name,director_gender,director_number,producer_name,producer_number,screeplay_name,editor_name
2,Turo Pajala,0,Susanna Haavisto,0,Matti Pellonpää,2,Eetu Hilkamo,0,none,0,4,Aki Kaurismäki,0,1,none,0,Aki Kaurismäki,Raija Talvio
3,Matti Pellonpää,2,Kati Outinen,1,Sakari Kuosmanen,2,Esko Nikkari,2,Kylli Köngäs,0,7,Aki Kaurismäki,0,1,Mika Kaurismäki,1,Aki Kaurismäki,Raija Talvio
5,Tim Roth,2,Antonio Banderas,2,Jennifer Beals,1,Madonna,1,Marisa Tomei,1,24,Allison Anders,1,4,Lawrence Bender,1,none,Margaret Goodspeed
6,Emilio Estevez,2,Cuba Gooding Jr.,2,Denis Leary,2,Jeremy Piven,2,Peter Greene,2,15,Stephen Hopkins,2,1,Gene Levy,1,Lewis Colick,Tim Wellburn
8,none,0,none,0,none,0,none,0,none,0,0,Timo Novotny,0,1,Timo Novotny,2,Michael Glawogger,Timo Novotny
9,Rita Lengyel,1,Milton Welsh,2,none,0,none,0,none,0,2,Marc Meyer,0,2,Marc Meyer,1,none,Marc Meyer
11,Mark Hamill,2,Harrison Ford,2,Carrie Fisher,1,Peter Cushing,2,Alec Guinness,2,106,George Lucas,2,1,Gary Kurtz,2,none,Marcia Lucas
12,Albert Brooks,2,Ellen DeGeneres,1,Alexander Gould,2,Willem Dafoe,2,Brad Garrett,2,24,Andrew Stanton,2,1,Graham Walters,1,Andrew Stanton,David Ian Salter
13,Tom Hanks,2,Robin Wright,1,Gary Sinise,2,Mykelti Williamson,2,Sally Field,1,67,Robert Zemeckis,2,1,Wendy Finerman,3,Eric Roth,Arthur Schmidt
14,Kevin Spacey,2,Annette Bening,1,Thora Birch,1,Wes Bentley,2,Mena Suvari,1,41,Sam Mendes,2,1,Bruce Cohen,2,Alan Ball,Christopher Greenbury


In [0]:
# Role flags. They are distinct powers of ten so that adding up the flags of
# one person gives a code showing every role held (e.g. 11 = actor + director).
ACTOR, DIRECTOR, PRODUCER = 1,10,100

# Positional index of the actor1..actor5 name columns in df; the matching
# gender column sits immediately to the right of each name column.
actor_name_idx = [1, 3, 5, 7, 9]
actor_slots = [df.select(df[i].alias('name'), df[i + 1].alias('gender'))
               for i in actor_name_idx]

# Stack the five billing slots into a single (name, gender) frame and drop the
# 'none' placeholder that marks an empty slot.
actor = reduce(lambda stacked, slot: stacked.union(slot), actor_slots)\
          .filter("name != 'none'")
# actor.count():851041
actor = actor.select('*', lit(ACTOR).alias('duty'))
display(actor)

name,gender,duty
Turo Pajala,0,1
Matti Pellonpää,2,1
Tim Roth,2,1
Emilio Estevez,2,1
Rita Lengyel,1,1
Mark Hamill,2,1
Albert Brooks,2,1
Tom Hanks,2,1
Kevin Spacey,2,1
Orson Welles,2,1


In [0]:
# Keep only the rows that actually name a director, then project them onto the
# shared (name, gender, duty) vertex layout.
director = df.filter("director_name != 'none'")\
             .select(df[12].alias('name'), df[13].alias('gender'))\
             .withColumn('duty', lit(DIRECTOR))
# director.count():25231
display(director)

name,gender,duty
Aki Kaurismäki,0,10
Aki Kaurismäki,0,10
Allison Anders,1,10
Stephen Hopkins,2,10
Timo Novotny,0,10
Marc Meyer,0,10
George Lucas,2,10
Andrew Stanton,2,10
Robert Zemeckis,2,10
Sam Mendes,2,10


In [0]:
# Keep only the rows that name a producer. The input has no producer gender
# column, so every producer gets the constant 0 in `gender`
# (presumably "unknown" -- TODO confirm against the dataset description).
producer = df.filter("producer_name != 'none'")\
             .select(df[15].alias('name'))\
             .withColumn('gender', lit(0))\
             .withColumn('duty', lit(PRODUCER))
# producer.count():63293
display(producer)

name,gender,duty
Mika Kaurismäki,0,100
Lawrence Bender,0,100
Gene Levy,0,100
Timo Novotny,0,100
Marc Meyer,0,100
Gary Kurtz,0,100
Graham Walters,0,100
Wendy Finerman,0,100
Bruce Cohen,0,100
Orson Welles,0,100


In [0]:
# Put all three roles in one frame, keep a single row per (person, role), and
# expose the person's name under the column name 'id' (the vertex key column
# GraphFrames expects).
staff = actor.union(director).union(producer)\
             .dropDuplicates(subset=['name', 'duty'])\
             .select(col('name').alias('id'), col('gender'), col('duty'))
# staff.count():378499

In [0]:
display(staff.filter("id='Adam Williams'"))

id,gender,duty
Adam Williams,0,10
Adam Williams,2,1


In [0]:
# Per person, add up the role flags (1 / 10 / 100) into one combined role code.
duty = staff.groupBy('id').sum('duty')

# One person can appear in `staff` with different gender values (for example 0
# from the producer rows and 2 from the actor rows). Picking a row with
# dropDuplicates(['id']) keeps an arbitrary one, so the vertex gender could
# change between runs. Take the largest value per id instead, which is
# deterministic and prefers a non-zero code over 0.
# NOTE(review): this assumes 0 means "unknown" -- confirm against the dataset.
gender_by_id = staff.groupBy('id')\
                    .agg({'gender': 'max'})\
                    .withColumnRenamed('max(gender)', 'gender')

# Exactly one row per id, laid out as (id, gender, sum(duty)).
v = gender_by_id.join(duty, 'id')
# v.count():348953
display(v)

id,gender,sum(duty)
Alex Chalmers,0,1
Amanda Phillips,0,1
Anders Hove,0,1
Aneeta Meet,0,1
Antanas Barčas,0,1
Bill Bunten,0,1
Craig DeSilva,0,1
Ilene Hamann,0,1
Jeon Hyung-jun,0,10
Julius Cotter,0,1


In [0]:
v.printSchema()

# Extract Edges DataFrame

In [0]:
# Positional index of the actor1..actor5 name columns in df.
actor_name_idx = [1, 3, 5, 7, 9]

# For every unordered pair of billing slots, emit the co-star edge in both
# directions, skipping rows where either slot holds the 'none' placeholder.
# Pair order (1-2, 1-3, 1-4, 1-5, 2-3, ...) and the forward-then-reverse order
# inside each pair match the order in which the frames are unioned below.
partner_frames = []
for pos, first in enumerate(actor_name_idx):
  for second in actor_name_idx[pos + 1:]:
    forward = df.select(df[first].alias('src'), df[second].alias('dst'))\
                .filter("src != 'none'")\
                .filter("dst != 'none'")
    backward = forward.select(forward['dst'].alias('src'), forward['src'].alias('dst'))
    partner_frames.extend([forward, backward])

partner = reduce(lambda stacked, part: stacked.union(part), partner_frames)
partner = partner.select('*', lit('partner').alias('relationship'))

In [0]:
display(partner)

src,dst,relationship
Turo Pajala,Susanna Haavisto,partner
Matti Pellonpää,Kati Outinen,partner
Tim Roth,Antonio Banderas,partner
Emilio Estevez,Cuba Gooding Jr.,partner
Rita Lengyel,Milton Welsh,partner
Mark Hamill,Harrison Ford,partner
Albert Brooks,Ellen DeGeneres,partner
Tom Hanks,Robin Wright,partner
Kevin Spacey,Annette Bening,partner
Orson Welles,Joseph Cotten,partner


In [0]:
# One director -> actor edge per billing slot. Column 12 is the director name;
# columns 1, 3, 5, 7, 9 are the actor1..actor5 names. Rows where either side is
# the 'none' placeholder are dropped. The frames are laid out as (dst, src).
direct_frames = [
  df.select(df[i].alias('dst'), df[12].alias('src'))
    .filter("src != 'none'")
    .filter("dst != 'none'")
  for i in [1, 3, 5, 7, 9]
]
direct = reduce(lambda stacked, part: stacked.union(part), direct_frames)
direct = direct.select('*', lit('direct').alias('relationship'))
display(direct)

dst,src,relationship
Turo Pajala,Aki Kaurismäki,direct
Matti Pellonpää,Aki Kaurismäki,direct
Tim Roth,Allison Anders,direct
Emilio Estevez,Stephen Hopkins,direct
Rita Lengyel,Marc Meyer,direct
Mark Hamill,George Lucas,direct
Albert Brooks,Andrew Stanton,direct
Tom Hanks,Robert Zemeckis,direct
Kevin Spacey,Sam Mendes,direct
Orson Welles,Orson Welles,direct


In [0]:
# One producer -> actor edge per billing slot. Column 15 is the producer name;
# columns 1, 3, 5, 7, 9 are the actor1..actor5 names. Rows where either side is
# the 'none' placeholder are dropped. The frames are laid out as (dst, src).
produce_frames = [
  df.select(df[i].alias('dst'), df[15].alias('src'))
    .filter("src != 'none'")
    .filter("dst != 'none'")
  for i in [1, 3, 5, 7, 9]
]
produce = reduce(lambda stacked, part: stacked.union(part), produce_frames)
produce = produce.select('*', lit('produce').alias('relationship'))

In [0]:
display(produce)

dst,src,relationship
Matti Pellonpää,Mika Kaurismäki,produce
Tim Roth,Lawrence Bender,produce
Emilio Estevez,Gene Levy,produce
Rita Lengyel,Marc Meyer,produce
Mark Hamill,Gary Kurtz,produce
Albert Brooks,Graham Walters,produce
Tom Hanks,Wendy Finerman,produce
Kevin Spacey,Bruce Cohen,produce
Orson Welles,Orson Welles,produce
Björk,Vibeke Windeløv,produce


In [0]:
# Stack the three edge sets. unionByName lines columns up by name, which is
# needed here: `partner` is laid out (src, dst, relationship) while `direct`
# and `produce` are laid out (dst, src, relationship).
edges = partner
for extra in (direct, produce):
  edges = edges.unionByName(extra)
display(edges)

src,dst,relationship
Turo Pajala,Susanna Haavisto,partner
Matti Pellonpää,Kati Outinen,partner
Tim Roth,Antonio Banderas,partner
Emilio Estevez,Cuba Gooding Jr.,partner
Rita Lengyel,Milton Welsh,partner
Mark Hamill,Harrison Ford,partner
Albert Brooks,Ellen DeGeneres,partner
Tom Hanks,Robin Wright,partner
Kevin Spacey,Annette Bening,partner
Orson Welles,Joseph Cotten,partner


In [0]:
# edges.count():3790287
# edges.distinct().count():3370858
# Fewer distinct rows than total rows: some pairs worked together more than once.

In [0]:
# Collapse repeated (src, dst, relationship) rows into a single edge; the
# added `count` column records how many times that row occurred.
e = edges.groupBy(['src', 'dst', 'relationship']).count()
display(e)

src,dst,relationship,count
Lea Baastrup Rønne,Kristian Halken,partner,1
Jaromír Nohavica,Pavla Kovalová,partner,1
Roméo Botzaris,Isabelle Carré,partner,1
Arve Opsahl,Aud Schønemann,partner,17
Karine Vanasse,Pierre Lebeau,partner,1
Nam Sang-mi,Ryu Seung-ryong,partner,1
Margherita Buy,Gaetano Bruno,partner,1
Alexandra Wilson,Stacy Haiduk,partner,1
Chandler Canterbury,Willie Nelson,partner,1
Emmanuel Mouret,Judith Godrèche,partner,1


In [0]:
# v.write.format('csv').option('header',True).mode('overwrite').save('vertices.csv')
# e.write.format('csv').option('header',True).mode('overwrite').save('edges.csv')

In [0]:
e.printSchema()

# Create a Graph

In [0]:
g = GraphFrame(v, e)
display(g.vertices)

id,gender,sum(duty)
Alex Chalmers,0,1
Amanda Phillips,0,1
Anders Hove,0,1
Aneeta Meet,0,1
Antanas Barčas,0,1
Bill Bunten,0,1
Craig DeSilva,0,1
Ilene Hamann,0,1
Jeon Hyung-jun,0,10
Julius Cotter,0,1


In [0]:
display(g.edges)

src,dst,relationship,count
Peter Gantzler,Sara Indrio Jensen,partner,1
Michael Keaton,Danny DeVito,partner,2
Jennifer Beals,Michael Nouri,partner,1
Jodie Foster,Matthew McConaughey,partner,1
Heinz Rühmann,Karin Himboldt,partner,3
Kiefer Sutherland,Julia Roberts,partner,1
Franka Potente,Benno Fürmann,partner,2
Judd Nelson,Peter Cullen,partner,1
Julie Christie,Michael Murphy,partner,1
Keanu Reeves,Charlize Theron,partner,2


In [0]:
display(g.inDegrees)

id,inDegree
Ivan Okhlobystin,79
Cory Monteith,42
Eijirô Tôno,78
Bárbara Lennie,70
Stéphane Freiss,119
Glazova Ludmila,1
Yoo Yeon-seok,39
Yveline Cery,6
Stina Ekblad,69
Steve Rudzinski,11


In [0]:
display(g.outDegrees)

id,outDegree
Manuel Aguilera,5
Lex Barker,159
Michael Dobson,30
Jason Chang,7
Enn Kraam,4
Evgeniya Dobrovolskaya,48
Longjun Li,4
Miharu Shima,11
Ed Marinaro,55
Anthony Andrews,80


In [0]:
display(g.triplets)

src,edge,dst
"List(Denice Duff, 1, 11)","List(Denice Duff, Anders Hove, partner, 1)","List( Anders Hove, 0, 1)"
"List(Ted Nicolaou, 2, 10)","List(Ted Nicolaou, Anders Hove, direct, 1)","List( Anders Hove, 0, 1)"
"List(Iona Abur, 0, 1)","List(Iona Abur, Anders Hove, partner, 1)","List( Anders Hove, 0, 1)"
"List(Julie Michaels, 1, 1)","List(Julie Michaels, Anders Hove, partner, 1)","List( Anders Hove, 0, 1)"
"List(Manjeet Brar, 0, 1)","List(Manjeet Brar, Aneeta Meet, partner, 1)","List( Aneeta Meet, 0, 1)"
"List(Mehar Mittal, 2, 1)","List(Mehar Mittal, Aneeta Meet, partner, 1)","List( Aneeta Meet, 0, 1)"
"List(Manjit Guddu, 0, 1)","List(Manjit Guddu, Aneeta Meet, partner, 1)","List( Aneeta Meet, 0, 1)"
"List(Sahib Singh, 0, 1)","List(Sahib Singh, Aneeta Meet, partner, 1)","List( Aneeta Meet, 0, 1)"
"List(Shyamoli Verma, 0, 1)","List(Shyamoli Verma, Ilene Hamann, partner, 1)","List( Ilene Hamann, 0, 1)"
"List(Suhel Seth, 0, 1)","List(Suhel Seth, Ilene Hamann, partner, 1)","List( Ilene Hamann, 0, 1)"


# Extract information

In [0]:
display(g.vertices.filter("id = 'Turo Pajala'"))
#only actor

id,gender,sum(duty)
Turo Pajala,0,1


In [0]:
# Find every relationship that points at this actor (incoming edges).
display(g.edges.filter("dst = 'Turo Pajala'"))

src,dst,relationship,count
Susanna Haavisto,Turo Pajala,partner,1
Matti Pellonpää,Turo Pajala,partner,1
Aki Kaurismäki,Turo Pajala,direct,1
Eetu Hilkamo,Turo Pajala,partner,1


In [0]:
# Outgoing edges only. The one-directional relationships (direct / produce)
# that point at this actor are not included here.
display(g.edges.filter("src = 'Turo Pajala'"))

src,dst,relationship,count
Turo Pajala,Matti Pellonpää,partner,1
Turo Pajala,Susanna Haavisto,partner,1
Turo Pajala,Eetu Hilkamo,partner,1


In [0]:
display(g.edges.filter("src = 'Luciano Curreli'"))

src,dst,relationship,count
Luciano Curreli,Davide Manuli,partner,1
Luciano Curreli,Sarah Boberg,partner,1
Luciano Curreli,Simona Caramelli,partner,1


In [0]:
display(g.edges.filter("dst = 'Luciano Curreli'"))

src,dst,relationship,count
Davide Manuli,Luciano Curreli,partner,1
Davide Manuli,Luciano Curreli,produce,1
Sarah Boberg,Luciano Curreli,partner,1
Simona Caramelli,Luciano Curreli,partner,1
Davide Manuli,Luciano Curreli,direct,1


In [0]:
# NOTE: the id literal below begins with a space and is matched verbatim.
display(g.vertices.filter("id = ' Davide Manuli'"))
# sum(duty) = 111: this one person holds all three roles,
# actor (1) + director (10) + producer (100).

id,gender,sum(duty)
Davide Manuli,0,111


In [0]:
display(g.edges.filter("src = ' Davide Manuli'"))

src,dst,relationship,count
Davide Manuli,Vincent Gallo,direct,1
Davide Manuli,Elisa Sednaoui,direct,1
Davide Manuli,Fabrizio Gifuni,direct,1
Davide Manuli,Luciano Curreli,direct,1
Davide Manuli,Silvia Calderoni,direct,1
Davide Manuli,Sarah Boberg,produce,1
Davide Manuli,Simona Caramelli,produce,1
Davide Manuli,Davide Manuli,produce,1
Davide Manuli,Simona Caramelli,direct,1
Davide Manuli,Davide Manuli,direct,1


In [0]:
# Search for pairs of vertices with edges in both directions between them:
# an a -> b edge labelled 'partner' together with a b -> a edge labelled
# 'direct'.
ow = g.find("(a)-[e]->(b);(b)-[e2]->(a)").where("e.relationship = 'partner' AND e2.relationship = 'direct'")
display(ow)

a,e,b,e2
"List(Luciano Curreli, 0, 1)","List(Luciano Curreli, Davide Manuli, partner, 1)","List( Davide Manuli, 0, 111)","List( Davide Manuli, Luciano Curreli, direct, 1)"
"List(Art Evans, 2, 1)","List(Art Evans, A.J. Jamal, partner, 1)","List(A.J. Jamal, 0, 11)","List(A.J. Jamal, Art Evans, direct, 1)"
"List(Crane Manohar, 0, 1)","List(Crane Manohar, Aadharsh, partner, 1)","List(Aadharsh, 0, 11)","List(Aadharsh, Crane Manohar, direct, 1)"
"List(Matt Conrad, 0, 1)","List(Matt Conrad, Aaron C. Peer, partner, 1)","List(Aaron C. Peer, 0, 11)","List(Aaron C. Peer, Matt Conrad, direct, 1)"
"List(Billy Blair, 2, 1)","List(Billy Blair, Abel Berry, partner, 1)","List(Abel Berry, 0, 11)","List(Abel Berry, Billy Blair, direct, 2)"
"List(Nicholas St. John, 2, 1)","List(Nicholas St. John, Abel Ferrara, partner, 1)","List(Abel Ferrara, 2, 111)","List(Abel Ferrara, Nicholas St. John, direct, 1)"
"List(John Blyth Barrymore, 2, 1)","List(John Blyth Barrymore, Actually Huizenga, partner, 1)","List(Actually Huizenga, 0, 111)","List(Actually Huizenga, John Blyth Barrymore, direct, 1)"
"List(Kari Wuhrer, 1, 1)","List(Kari Wuhrer, Adam Baratta, partner, 1)","List(Adam Baratta, 0, 11)","List(Adam Baratta, Kari Wuhrer, direct, 1)"
"List(Amanda Plummer, 1, 1)","List(Amanda Plummer, Adam Coleman Howard, partner, 1)","List(Adam Coleman Howard, 2, 11)","List(Adam Coleman Howard, Amanda Plummer, direct, 1)"
"List(Alia Shawkat, 1, 1)","List(Alia Shawkat, Adam Green, partner, 1)","List(Adam Green, 2, 111)","List(Adam Green, Alia Shawkat, direct, 1)"


# Subgraph

actor-actor

In [0]:
g_par = g.filterEdges("relationship = 'partner'").dropIsolatedVertices()

display(g_par.vertices)

id,gender,sum(duty)
Anders Hove,0,1
Aneeta Meet,0,1
Ilene Hamann,0,1
Michael Chambers,2,1
Nicole Muñoz,1,1
"""Bernd """"Bernemann"""" Kost""",0,1
"""Carroll """"Poke"""" Runyon""",0,1
"""Christine'a """"Mama Quest"""" Rainey""",0,1
"""Floyd I. """"Bud"""" Gaugh IV""",0,1
"""Juan Ricardo Lozano """"Alerta""""""",0,1


In [0]:
display(g_par.edges.filter("src = ' Davide Manuli'"))

src,dst,relationship,count
Davide Manuli,Sarah Boberg,partner,1
Davide Manuli,Simona Caramelli,partner,1
Davide Manuli,Luciano Curreli,partner,1


In [0]:
display(g_par.edges)

src,dst,relationship,count
Lea Baastrup Rønne,Kristian Halken,partner,1
Jaromír Nohavica,Pavla Kovalová,partner,1
Roméo Botzaris,Isabelle Carré,partner,1
Arve Opsahl,Aud Schønemann,partner,17
Karine Vanasse,Pierre Lebeau,partner,1
Nam Sang-mi,Ryu Seung-ryong,partner,1
Margherita Buy,Gaetano Bruno,partner,1
Alexandra Wilson,Stacy Haiduk,partner,1
Chandler Canterbury,Willie Nelson,partner,1
Emmanuel Mouret,Judith Godrèche,partner,1


director-actor

In [0]:
g_dir = g.filterEdges("relationship = 'direct'").dropIsolatedVertices()

display(g_dir.vertices)

id,gender,sum(duty)
Alex Chalmers,0,1
Amanda Phillips,0,1
Anders Hove,0,1
Antanas Barčas,0,1
Bill Bunten,0,1
Craig DeSilva,0,1
Ilene Hamann,0,1
Jeon Hyung-jun,0,10
Julius Cotter,0,1
Michael Chambers,2,1


In [0]:
display(g_dir.edges)

src,dst,relationship,count
Arno Dierickx,Terence Schreurs,direct,1
John Huff,Ray Wise,direct,1
Mahamat-Saleh Haroun,Youssouf Djaoro,direct,3
Kazuyuki Morosawa,Keiko Kitagawa,direct,1
Tai Katô,Hashizo Okawa,direct,2
Mario Bonnard,Alberto Sordi,direct,2
Ivo Novák,Daniela Kolářová,direct,1
Toshio Masuda,Yûjirô Ishihara,direct,6
Raphael Alvarez,Cláudia Raia,direct,1
Eric Stanze,Emily Haack,direct,3


producer-actor

In [0]:
g_pro = g.filterEdges("relationship = 'produce'").dropIsolatedVertices()

display(g_pro.vertices)
ver_size_pro = g_pro.vertices.count()

id,gender,sum(duty)
Alex Chalmers,0,1
Amanda Phillips,0,1
Ilene Hamann,0,1
Julius Cotter,0,1
Nicole Muñoz,1,1
Radu Teordescu,0,1
"""Alan """"Boston"""" Dvorkis""",0,1
"""Arthur """"Peg Leg Sam"""" Jackson""",0,1
"""Christine'a """"Mama Quest"""" Rainey""",0,1
"""Lowell """"Sly"""" Dunbar""",0,1


In [0]:
display(g_pro.edges)
ed_size_pro = g_pro.edges.count()

src,dst,relationship,count
Rene Bastian,Felicity Huffman,produce,1
Laurie MacDonald,Naomi Watts,produce,2
Robert De Niro,Ben Stiller,produce,3
Clark L. Paylow,Richard Dreyfuss,produce,1
Tim Bevan,Rowan Atkinson,produce,2
Otto Preminger,Robert Mitchum,produce,1
Alain Glasberg,Abdellah Didane,produce,1
Irving Thalberg,John Gilbert,produce,5
John Boorman,Sean Connery,produce,1
David Moreton,Chris Stafford,produce,1


# Visualization

matplotlib

In [0]:
from matplotlib import pyplot as plt
# toPandas() pulls the complete vertex and edge tables onto the driver as
# pandas DataFrames, so both must fit in driver memory.
df_staff = g.vertices.toPandas()
df_relation = g.edges.toPandas()

In [0]:
df_staff

In [0]:
df_relation

In [0]:
# Plot each edge's `count` against its row position in df_relation, with axis
# labels and a title so the figure can be read on its own.
fig, ax = plt.subplots()
ax.plot(df_relation['count'])
ax.set_xlabel('edge (row index in df_relation)')
ax.set_ylabel('count')
ax.set_title('Occurrences per edge')
plt.show()

networkx

In [0]:
g1 = g.filterEdges("src = ' Davide Manuli'").dropIsolatedVertices()

display(g1.edges)
ver_size1 = g1.vertices.count()
edg_size1 = g1.edges.count()

In [0]:
display(g1.vertices)

In [0]:
import networkx as nx
def draw_graph(g, figsize, width, node_size):
  """Draw a (small) graph with networkx and return the networkx graph.

  Edges are coloured by their `relationship` value: 'partner' blue,
  'direct' green, 'produce' red. Rows with any other relationship value are
  not drawn. When several edges share the same (src, dst) pair, the first one
  seen is drawn at 3 * width, the next at 2 * width, and so on, so that
  overlapping edges stay visible underneath each other.

  Args:
    g: object exposing `.edges` (rows with src, dst, relationship, count) and
      `.vertices` (rows with id) as Spark DataFrames. All rows are collected
      to the driver, so this is only suitable for small graphs.
    figsize: (width, height) passed to `plt.figure`.
    width: multiplier applied to the per-edge line width.
    node_size: node size passed to `nx.draw`.

  Returns:
    The populated `nx.MultiDiGraph`; each edge carries the attributes
    color, relationship, weight and count.
  """
  edge_colors = {'partner': 'b', 'direct': 'g', 'produce': 'r'}
  Gplot = nx.MultiDiGraph()

  # Width level per (src, dst) pair: starts at 3 and drops by one each time the
  # same pair shows up again. A dict keyed by the pair replaces the original
  # linear scan over a list.
  level = {}

  # collect() fetches every row in one job; take(count()) ran two.
  for row in g.edges.collect():
    pair = (row['src'], row['dst'])
    level[pair] = level[pair] - 1 if pair in level else 3
    color = edge_colors.get(row['relationship'])
    if color is None:
      # Unknown relationship: still counted above, but not drawn.
      continue
    Gplot.add_edge(row['src'], row['dst'], color=color,
                   relationship=row['relationship'],
                   weight=width * level[pair], count=row['count'])

  for row in g.vertices.collect():
    Gplot.add_node(row['id'])

  # The figure must exist BEFORE drawing. Creating it afterwards (as the
  # previous version did) leaves the graph on a default-sized figure and opens
  # a second, empty one with the requested size.
  plt.figure(figsize=figsize)

  pos = nx.spring_layout(Gplot)

  # nx.draw wants one colour and one width per edge, in edge order.
  colors = []
  weight = []
  for (_, _, attrib_dict) in Gplot.edges.data():
    colors.append(attrib_dict['color'])
    weight.append(attrib_dict['weight'])

  nx.draw(Gplot, pos, node_color='y', node_size=node_size, font_size=8,
          edge_color=colors, width=weight, with_labels=True)
  plt.show()
  return Gplot

In [0]:
graph1 = draw_graph(g1, (10,10), 2,150)

In [0]:
g2 = g.filterEdges("src = 'Sarah Boberg' OR dst = 'Sarah Boberg' ").dropIsolatedVertices()
ver_size2 = g2.vertices.count()
edg_size2 = g2.edges.count()
display(g2.edges)

In [0]:
display(g2.vertices)

In [0]:
graph2 = draw_graph(g2, (30,30), 0.5,100)

In [0]:
g3 = g.filterEdges("dst = 'Sarah Boberg' OR src = ' Davide Manuli'").dropIsolatedVertices()
display(g3.edges)

In [0]:
graph3 = draw_graph(g3, (30,30), 0.3, 50)

In [0]:
graph1[' Davide Manuli']  # same as G.adj[' Davide Manuli']