In [2]:
from pyspark.sql import SparkSession
import pandas as pd

# Create (or reuse) the Spark session for this notebook.
spark = SparkSession.builder.appName('L12').getOrCreate()

# Long source column names -> the short names used by the rest of the notebook.
short_names = {
    'SeriousDlqin2yrs': 'y',
    'NumberOfTime30-59DaysPastDueNotWorse': '30-59days',
    'NumberOfTime60-89DaysPastDueNotWorse': '60-89days',
}

# Load the CSV (header row, inferred column types), apply the renames, and
# cache the result because every later cell runs several counts over it.
df = spark.read.csv('./cs-training.csv', header=True, inferSchema=True)
for long_name, short_name in short_names.items():
    df = df.withColumnRenamed(long_name, short_name)
df = df.cache()

# Preview the first five rows as a pandas frame for rich notebook display.
pd.DataFrame(df.take(5), columns=df.columns)

Unnamed: 0,_c0,y,RevolvingUtilizationOfUnsecuredLines,age,30-59days,DebtRatio,MonthlyIncome,NumberOfOpenCreditLinesAndLoans,NumberOfTimes90DaysLate,NumberRealEstateLoansOrLines,60-89days,NumberOfDependents
0,1,1,0.766127,45,2,0.802982,9120,13,0,6,0,2
1,2,0,0.957151,40,0,0.121876,2600,4,0,0,0,1
2,3,0,0.65818,38,1,0.085113,3042,2,1,0,0,0
3,4,0,0.23381,30,0,0.03605,3300,5,0,0,0,0
4,5,0,0.907239,49,1,0.024926,63588,7,0,1,0,0


In [57]:
# Global configuration shared by the chart cells below.
from pyecharts.charts import Bar, Pie, Page
from pyecharts import options as opts
from IPython.core.interactiveshell import InteractiveShell
# Ask IPython to display the value of every top-level expression in a cell,
# not only the last one.
InteractiveShell.ast_node_interactivity = "all"
# Label options applied to every pie series in this notebook. The formatter uses
# ECharts placeholders: presumably {b} = slice name, {c} = value,
# {d} = percentage — confirm against the ECharts label formatter docs.
pip_label_opts = opts.LabelOpts(formatter='{b} {c} {d}%')
# Series names used by the charts: index 0 labels the y == 0 counts
# ("non-defaulting count"), index 1 labels the y == 1 counts ("defaulting count").
attr_title = ['正常人数', '违约人数']

In [58]:
# Overall picture: number of rows with y == 0 versus y == 1, shown as one pie.
total_y0, total_y1 = [df.where(df['y'] == outcome).count() for outcome in (0, 1)]

# One [label, count] pair per slice, in the same order as attr_title.
overall_slices = [
    [attr_title[0], total_y0],
    [attr_title[1], total_y1],
]

overall_pie = (
    Pie()
    .add('', overall_slices)
    .set_global_opts(title_opts=opts.TitleOpts(title='整体违约人数分布'))
    .set_series_opts(label_opts=pip_label_opts)
)
overall_pie.render_notebook()

In [59]:
# Breakdown of the default counts along one dimension
def draw(attr, y0, y1, title):
    """Build a Page with one grouped bar chart and one pie chart per category.

    Args:
        attr: category labels (strings); used as the bar x-axis and, per
            category, as the pie series name and pie title prefix.
        y0: counts for the first series (attr_title[0]), aligned with attr.
        y1: counts for the second series (attr_title[1]), aligned with attr.
        title: name of the dimension (string), embedded in the bar chart title.

    Returns:
        The pyecharts Page holding the bar chart followed by the pies, in
        category order. The caller decides how to render it.
    """
    first_label, second_label = attr_title[0], attr_title[1]

    # Bar chart: both series side by side across all categories.
    overview = (
        Bar()
        .add_xaxis(attr)
        .add_yaxis(first_label, y0)
        .add_yaxis(second_label, y1)
        .set_global_opts(title_opts=opts.TitleOpts(title='各' + title + '违约情况', pos_left='5%'))
    )

    report = Page()
    report.add(overview)

    # One pie per category showing the split between the two series.
    for idx, category in enumerate(attr):
        breakdown = Pie()
        breakdown.add(category, [[first_label, y0[idx]], [second_label, y1[idx]]])
        breakdown.set_global_opts(title_opts=opts.TitleOpts(title=category + '违约人数分布'))
        breakdown.set_series_opts(label_opts=pip_label_opts)
        report.add(breakdown)

    return report
    

In [60]:
# Age brackets
# Bracket edges in years. NOTE: the name `bin` shadows the Python builtin; it is
# kept unchanged in case other cells refer to it.
bin = [0,25,35,50,70,100]
df_age = df.select(['age', 'y'])
age_y0 = []
age_y1 = []
last_bracket = len(bin) - 2
for i in range(len(bin)-1):
    lower, upper = bin[i], bin[i+1]
    # Column.between() is inclusive on BOTH ends, which counted anyone aged
    # exactly 25/35/50/70 in two adjacent brackets. Use half-open [lower, upper)
    # brackets instead; only the last bracket keeps its upper edge so that
    # age == 100 is still included.
    if i == last_bracket:
        below_upper = df_age['age'] <= upper
    else:
        below_upper = df_age['age'] < upper
    age = df_age.filter((df_age['age'] >= lower) & below_upper)
    age_y0.append(age.filter(df_age['y']==0).count())
    age_y1.append(age.filter(df_age['y']==1).count())
age_attr = ['0-25岁', '25-35岁', '35-50岁', '50-70岁', '70-100岁']
draw(age_attr, age_y0, age_y1, '年龄段').render_notebook()

In [62]:
# People with past-due records
df_past = df.select(df['30-59days'], df['60-89days'], df['y'])

# Row conditions for the three categories, in the same order as past_attr.
# The third category (both kinds of past-due record) overlaps the first two.
has_30_59 = df_past['30-59days'] > 0
has_60_89 = df_past['60-89days'] > 0
past_conditions = [has_30_59, has_60_89, has_30_59 & has_60_89]

past_y0 = []
past_y1 = []
for condition in past_conditions:
    matching = df_past.filter(condition)
    past_y0.append(matching.filter(df_past['y']==0).count())
    past_y1.append(matching.filter(df_past['y']==1).count())

# First label fixed from '30-59day' to '30-59days' so it matches the column
# name and the sibling '60-89days' label shown on the charts.
past_attr = ['30-59days', '60-89days', '两个都有']
draw(past_attr, past_y0, past_y1, '逾期记录').render_notebook()