In [1]:
# Load the full cleaned loan dataset from the Databricks file store.
# The first CSV row supplies the column names and column types are inferred.
graphdata_df = (
    spark.read
    .option("inferSchema", "true")
    .option("header", "true")
    .csv("/FileStore/tables/CleanedLoanData.csv")
)

In [2]:
# Expose the full dataset to Spark SQL under the name "graphdata_df".
# createOrReplaceTempView is the supported replacement for registerTempTable,
# which has been deprecated since Spark 2.0.
graphdata_df.createOrReplaceTempView("graphdata_df")
# Mark the DataFrame for caching: the following cells derive both the
# feature subset and the vertex set from it.
graphdata_df.cache()

In [3]:
# Build a narrower dataset containing only the columns used by the graph
# analysis (column order is preserved from the original selection).
graph_data = graphdata_df.select(
    'loan_amnt', 'term', 'emp_length', 'home_ownership', 'annual_inc',
    'verification_status', 'delinq_2yrs', 'Risk_Score', 'inq_last_6mths',
    'open_acc', 'revol_bal', 'revol_util', 'total_acc',
    'mths_since_last_major_derog', 'funded_amnt_inv', 'installment',
    'pub_rec', 'dti', 'addr_state', 'int_rate',
)
# Register it for Spark SQL. createOrReplaceTempView replaces the
# registerTempTable call, which has been deprecated since Spark 2.0.
graph_data.createOrReplaceTempView("graph_data")

# Cache the subset and run count() as the cell's final expression so the
# row count is shown as the cell output.
graph_data.cache()
graph_data.count()

In [4]:
from pyspark.sql.functions import *
from graphframes import *

# Create Vertices and Edges
# Vertices: the distinct rows of the full dataset.
# NOTE(review): withColumnRenamed does nothing when the source column is
# absent, and every other cell refers to the interest rate as "int_rate" --
# confirm that an "Interest rate" column really exists in the CSV.
Vertices = graphdata_df.withColumnRenamed("Interest rate", "grade").distinct()

# Edges: a projection of graph_data. "int_rate" is included because the later
# cells aggregate avg("int_rate") over Graph.edges, which cannot be resolved
# unless the column is part of the edge DataFrame.
Edges = graph_data.select(
    "annual_inc", "emp_length", "home_ownership", "total_acc",
    "installment", "Risk_Score", "int_rate",
)
# NOTE(review): GraphFrame expects an "id" column on the vertex DataFrame and
# "src"/"dst" columns on the edge DataFrame. Neither is visibly created here;
# confirm how vertices and edges are meant to be keyed before building the graph.

# Cache Vertices and Edges
Edges.cache()
Vertices.cache()

In [5]:
# Vertices
#   Render the vertex DataFrame built in the previous cell.
#   NOTE(review): an earlier note described the vertices as "the Interest Rate",
#   but Vertices holds the distinct rows of the full dataset -- confirm the intent.
display(Vertices)

In [6]:
# Edges
#   Render the edge DataFrame: a projection of feature columns taken from graph_data.
display(Edges)

In [7]:
# Build `Graph` GraphFrame
#  This GraphFrame is built from the Vertices and Edges DataFrames.
Graph = GraphFrame(Vertices, Edges)
# Function-call form of print works on both Python 2 and Python 3;
# the bare `print Graph` statement is a SyntaxError on Python 3.
print(Graph)

# Build `GraphPrime` GraphFrame
#   This GraphFrame uses a smaller set of edge columns to make it easier to
#   display motifs and subgraphs.
# NOTE(review): like Edges, this projection has no visible "src"/"dst" columns,
# which GraphFrame expects on its edge DataFrame -- confirm.
EdgesPrime = graph_data.select('loan_amnt', 'term', 'emp_length', 'home_ownership')
GraphPrime = GraphFrame(Vertices, EdgesPrime)

In [8]:
# Average int_rate per (emp_length, installment) group, restricted to edges
# with emp_length '12' and Risk_Score above 650, highest average first.
(
    Graph.edges
    .where("emp_length = '12' and Risk_Score > 650")
    .groupBy("emp_length", "installment")
    .avg("int_rate")
    .orderBy(desc("avg(int_rate)"))
)

In [9]:
# Render the average int_rate per (emp_length, installment) group for edges
# with emp_length '12' and Risk_Score above 650, highest average first.
display(
    Graph.edges
    .where("emp_length = '12' and Risk_Score > 650")
    .groupBy("emp_length", "installment")
    .avg("int_rate")
    .orderBy(desc("avg(int_rate)"))
)

In [10]:
# Keep only the edges whose Risk_Score exceeds 650 and render them.
# Once the table is shown, use Plot Options to chart the interest rates.
Interest_rate = Graph.edges.where("Risk_Score > 650")
display(Interest_rate)

In [11]:
# Degrees
#  Show the 25 rows of Graph.degrees with the largest `degree` value.
display(
    Graph.degrees
    .orderBy(desc("degree"))
    .limit(25)
)

In [12]:
# Rank vertices by importance with `pageRank`
# (reset probability 0.10, capped at 8 iterations).
ranks = Graph.pageRank(maxIter=8, resetProbability=0.10)
# Show the 20 vertices with the highest pagerank value.
display(
    ranks.vertices
    .orderBy(desc("pagerank"))
    .limit(20)
)