In [None]:
# Data source used throughout this notebook: the SQLite database file `lending_club.db`.
# Kept as a comment on purpose: evaluating the bare expression `lending_club.db`
# raises NameError, because no name `lending_club` is defined at this point.

In [1]:
import os
import pandas as pd
import numpy as np
import pyspark.pandas as ps
from pyspark.sql import SparkSession



In [2]:
# Show the current working directory; the Spark session cell further down builds
# the SQLite JDBC jar path from os.getcwd(), so the notebook must be started here.
%pwd

'/Users/a06411/Documents/GitHub/nh_lecture/nhbank_dp_2_pyspark'

## 판다스로 스키마 정보 확인하기 

In [15]:
# Location of the schema-description CSV for the Lending Club tables.
# The directory defaults to the original local path, but can be overridden with the
# LENDING_CLUB_DATA_DIR environment variable so the notebook runs on other machines.
path_sch = "{}/table_schema.csv".format(
    os.environ.get(
        "LENDING_CLUB_DATA_DIR",
        "/Users/a06411/documents/data_hub/lending_club",
    )
)

In [17]:
# Load the schema-description CSV into a pandas DataFrame.
df_sch = pd.read_csv(path_sch);

In [20]:
# Drop the leftover index column written by an earlier DataFrame.to_csv call.
# errors="ignore" keeps this cell idempotent: re-running it after the column is
# already gone no longer raises KeyError.
df_sch = df_sch.drop(columns="Unnamed: 0", errors="ignore")

In [21]:
# (rows, columns) of the schema table.
df_sch.shape

(150, 5)

In [23]:
# Preview the first rows of the schema table.
df_sch.head()

Unnamed: 0,LoanStatNew,Label,Description,Korean,lab_desc
0,emp_title,borrower,The job title supplied by the Borrower when ap...,직업,고객 정보
1,emp_length,borrower,Employment length in years. Possible values ar...,연차,
2,home_ownership,borrower,The home ownership status provided by the borr...,주택 소유 상태,
3,zip_code,borrower,The first 3 numbers of the zip code provided b...,우편번호(앞 3자리),
4,addr_state,borrower,The state provided by the borrower in the loan...,거주중인 주,


In [40]:
# Count how many schema entries carry each Label value.
df_sch['Label'].value_counts()

Label
credit                 32
loan_account           23
hardship               16
payment                13
secondary_applicant    13
revolving              11
installment             9
settlement              7
borrower                6
credit_rating           6
income                  6
inquiry                 4
trade                   4
Name: count, dtype: int64

### 판다스 쿼리로 행 선택하기 

In [48]:
# Select the schema rows whose Label is 'borrower' using a pandas query string.
df_sch.query("Label == 'borrower'")

Unnamed: 0,LoanStatNew,Label,Description,Korean,lab_desc
0,emp_title,borrower,The job title supplied by the Borrower when ap...,직업,고객 정보
1,emp_length,borrower,Employment length in years. Possible values ar...,연차,
2,home_ownership,borrower,The home ownership status provided by the borr...,주택 소유 상태,
3,zip_code,borrower,The first 3 numbers of the zip code provided b...,우편번호(앞 3자리),
4,addr_state,borrower,The state provided by the borrower in the loan...,거주중인 주,
5,member_id,borrower,A unique LC assigned Id for the borrower member.,고객 ID,


In [47]:
# Select the schema rows whose Label is 'inquiry' using a pandas query string.
df_sch.query("Label == 'inquiry'")

Unnamed: 0,LoanStatNew,Label,Description,Korean,lab_desc
66,inq_last_6mths,inquiry,The number of inquiries in past 6 months (excl...,지난 6개월 간 상담 수,상담 정보
67,inq_fi,inquiry,Number of personal finance inquiries,상담 수,
68,inq_last_12m,inquiry,Number of credit inquiries in past 12 months,지난 12개월 간 상담 수,
69,mths_since_recent_inq,inquiry,Months since most recent inquiry.,가장 최근 상담 이후의 개월 수,


## SQLite에 접속해서 테이블 정보 확인하기

In [10]:
# Location of the Lending Club SQLite database file.
# The directory defaults to the original local path, but can be overridden with the
# LENDING_CLUB_DATA_DIR environment variable so the notebook runs on other machines.
path = "{}/lending_club.db".format(
    os.environ.get(
        "LENDING_CLUB_DATA_DIR",
        "/Users/a06411/documents/data_hub/lending_club",
    )
)

In [11]:
import sqlite3

In [12]:
# Open the SQLite database read-only through a URI.
# A plain sqlite3.connect(path) silently creates an empty database file when `path`
# is wrong; with mode=ro the connection fails instead, and this notebook (which only
# reads) cannot modify the data by accident.
# NOTE: a path containing '?', '#' or '%' would need URI escaping first.
con = sqlite3.connect(f"file:{path}?mode=ro", uri=True)

In [13]:
# SQL that lists every table name recorded in SQLite's catalog (sqlite_master).
# Written as adjacent literals so the embedded newline and spacing are explicit.
query = (
    "SELECT name FROM sqlite_master  \n"
    "  WHERE type='table';"
)

In [14]:
# Run `query` over the sqlite3 connection `con` and display the result as a DataFrame.
pd.read_sql_query(query,con)

Unnamed: 0,name
0,borrower
1,credit
2,credit_rating
3,hardship
4,income
5,inquiry
6,installment
7,loan_account
8,payment
9,revolving


## 스파크 세션 연결 

In [25]:
# Build (or reuse) the Spark session bound to localhost.
# The SQLite JDBC driver jar is expected in the current working directory; it is
# registered both as a Spark jar and on the driver class path.
sqlite_jdbc_jar = "{}/sqlite-jdbc-3.34.0.jar".format(os.getcwd())

spark = (
    SparkSession.builder
    .appName('lendingclub-api')
    .config("spark.driver.host", "127.0.0.1")
    .config("spark.driver.bindAddress", "127.0.0.1")
    .config("spark.jars", sqlite_jdbc_jar)
    .config("spark.driver.extraClassPath", sqlite_jdbc_jar)
    .getOrCreate()
)

23/06/10 23:39:40 WARN NativeCodeLoader: Unable to load native-hadoop library for your platform... using builtin-java classes where applicable
Setting default log level to "WARN".
To adjust logging level use sc.setLogLevel(newLevel). For SparkR, use setLogLevel(newLevel).
23/06/10 23:39:40 WARN Utils: Service 'SparkUI' could not bind on port 4040. Attempting port 4041.
23/06/10 23:39:40 WARN Utils: Service 'SparkUI' could not bind on port 4041. Attempting port 4042.
23/06/10 23:39:40 WARN Utils: Service 'SparkUI' could not bind on port 4042. Attempting port 4043.


## 1. 테이블 이름으로 처리

In [27]:
# Read the whole `borrower` table from the SQLite file through Spark's JDBC source.
py_df_borrower = (
    spark.read.format('jdbc')
    .option('driver', 'org.sqlite.JDBC')
    .option('dbtable', 'borrower')
    .option('url', f'jdbc:sqlite:{path}')
    .load()
)

In [28]:
# Print the column names and types Spark derived for the borrower table.
py_df_borrower.printSchema()

root
 |-- id: string (nullable = true)
 |-- emp_title: string (nullable = true)
 |-- emp_length: string (nullable = true)
 |-- home_ownership: string (nullable = true)
 |-- zip_code: string (nullable = true)
 |-- addr_state: string (nullable = true)
 |-- member_id: double (nullable = true)



In [30]:
# Number of rows in the borrower DataFrame.
py_df_borrower.count()

                                                                                

2260701

In [31]:
# Register the DataFrame as the temp view `borrower` so it can be queried via spark.sql.
py_df_borrower.createOrReplaceTempView('borrower')

In [33]:
# Query the temp view and fetch the first 3 rows as a list of Row objects.
spark.sql("select * from borrower").take(3)

[Row(id='68407277', emp_title='leadman', emp_length='10+ years', home_ownership='MORTGAGE', zip_code='190xx', addr_state='PA', member_id=None),
 Row(id='68355089', emp_title='Engineer', emp_length='10+ years', home_ownership='MORTGAGE', zip_code='577xx', addr_state='SD', member_id=None),
 Row(id='68341763', emp_title='truck driver', emp_length='10+ years', home_ownership='MORTGAGE', zip_code='605xx', addr_state='IL', member_id=None)]

In [34]:
# Same view, limited to 3 rows in SQL and rendered as a text table.
spark.sql("select * from borrower limit 3").show()

+--------+------------+----------+--------------+--------+----------+---------+
|      id|   emp_title|emp_length|home_ownership|zip_code|addr_state|member_id|
+--------+------------+----------+--------------+--------+----------+---------+
|68407277|     leadman| 10+ years|      MORTGAGE|   190xx|        PA|     null|
|68355089|    Engineer| 10+ years|      MORTGAGE|   577xx|        SD|     null|
|68341763|truck driver| 10+ years|      MORTGAGE|   605xx|        IL|     null|
+--------+------------+----------+--------------+--------+----------+---------+



In [None]:
# Repeat of the table-name based read: load the full `borrower` table over JDBC.
py_df_borrower = (
    spark.read.format('jdbc')
    .option('driver', 'org.sqlite.JDBC')
    .option('dbtable', 'borrower')
    .option('url', f'jdbc:sqlite:{path}')
    .load()
)

In [None]:
# Illustrative fragment only; a leading-dot method call is a SyntaxError on its own.
# Chain it onto spark.read.format('jdbc') in place of the `dbtable` option to have the
# database run a query instead of returning a whole table.
# NOTE: `age` and `gender` are placeholder columns; the borrower schema printed
# earlier in this notebook contains neither.
# .option("query", "select id,age from borrower where gender='M'")

## 2. 하나의 테이블 쿼리 처리  

## 데이터베이스의 테이블 조회하기 

In [60]:
# SQL listing every table name in SQLite's catalog, for use with Spark's JDBC
# `query` option. Unlike the pandas version, it has no trailing semicolon.
query_tables = (
    "SELECT name FROM sqlite_master  \n"
    "  WHERE type='table'"
)

In [61]:
# Run the table-listing query through Spark's JDBC source against the SQLite file.
df_tables = (
    spark.read.format("jdbc")
    .options(
        url=f'jdbc:sqlite:{path}',
        driver='org.sqlite.JDBC',
        query=query_tables,
    )
    .load()
)

In [63]:
# Display up to 15 rows of the table list.
df_tables.show(15)

+-------------------+
|               name|
+-------------------+
|           borrower|
|             credit|
|      credit_rating|
|           hardship|
|             income|
|            inquiry|
|        installment|
|       loan_account|
|            payment|
|          revolving|
|secondary_applicant|
|         settlement|
|              trade|
+-------------------+



## 하나의 테이블 조회 

In [70]:
# Load a 10-row sample of the `trade` table by sending the SQL to SQLite over JDBC.
df_trade = (
    spark.read.format("jdbc")
    .options(
        url=f'jdbc:sqlite:{path}',
        driver='org.sqlite.JDBC',
        query="select * from trade limit 10",
    )
    .load()
)

In [71]:
# Print the column names and types Spark derived for the trade sample.
df_trade.printSchema()

root
 |-- id: string (nullable = true)
 |-- open_acc_6m: double (nullable = true)
 |-- total_cu_tl: double (nullable = true)
 |-- acc_open_past_24mths: double (nullable = true)
 |-- pct_tl_nvr_dlq: double (nullable = true)



In [72]:
# Load a 10-row sample of the `inquiry` table by sending the SQL to SQLite over JDBC.
df_inquiry = (
    spark.read.format("jdbc")
    .options(
        url=f'jdbc:sqlite:{path}',
        driver='org.sqlite.JDBC',
        query="select * from inquiry limit 10",
    )
    .load()
)

In [73]:
# Print the column names and types Spark derived for the inquiry sample.
df_inquiry.printSchema()

root
 |-- id: string (nullable = true)
 |-- inq_last_6mths: double (nullable = true)
 |-- inq_fi: double (nullable = true)
 |-- inq_last_12m: double (nullable = true)
 |-- mths_since_recent_inq: double (nullable = true)



In [38]:
# Load a 100-row sample of the `borrower` table by sending the SQL to SQLite over JDBC.
df = (
    spark.read.format("jdbc")
    .options(
        url=f'jdbc:sqlite:{path}',
        driver='org.sqlite.JDBC',
        query="select * from borrower limit 100",
    )
    .load()
)

In [39]:
# Display the sampled borrower rows as a text table.
df.show()

+--------+--------------------+----------+--------------+--------+----------+---------+
|      id|           emp_title|emp_length|home_ownership|zip_code|addr_state|member_id|
+--------+--------------------+----------+--------------+--------+----------+---------+
|68407277|             leadman| 10+ years|      MORTGAGE|   190xx|        PA|     null|
|68355089|            Engineer| 10+ years|      MORTGAGE|   577xx|        SD|     null|
|68341763|        truck driver| 10+ years|      MORTGAGE|   605xx|        IL|     null|
|66310712|Information Syste...| 10+ years|      MORTGAGE|   076xx|        NJ|     null|
|68476807| Contract Specialist|   3 years|      MORTGAGE|   174xx|        PA|     null|
|68426831|Veterinary Tecnician|   4 years|          RENT|   300xx|        GA|     null|
|68476668|Vice President of...| 10+ years|      MORTGAGE|   550xx|        MN|     null|
|67275481|         road driver| 10+ years|      MORTGAGE|   293xx|        SC|     null|
|68466926|     SERVICE MANAGER| 

## 3. 조인 처리하기

In [54]:
# Join borrower and inquiry on their shared `id`, keeping the borrower's job title
# and the inquiry count `inq_fi`, capped at 100 rows.
query = (
    "\n"
    "select b.id,b.emp_title, i.inq_fi from borrower b join inquiry i on b.id = i.id limit 100"
    "\n"
)

In [55]:
# Send the join statement held in `query` to SQLite over JDBC and load the result.
df_join = (
    spark.read.format("jdbc")
    .options(
        url=f'jdbc:sqlite:{path}',
        driver='org.sqlite.JDBC',
        query=query,
    )
    .load()
)

In [56]:
# Print the column names and types of the joined result.
df_join.printSchema()

root
 |-- id: string (nullable = true)
 |-- emp_title: string (nullable = true)
 |-- inq_fi: double (nullable = true)



In [57]:
# Display the joined rows; the captured output shows only the top 20.
df_join.show()

[Stage 7:>                                                          (0 + 1) / 1]

+--------+--------------------+------+
|      id|           emp_title|inq_fi|
+--------+--------------------+------+
|68407277|             leadman|   3.0|
|68355089|            Engineer|   0.0|
|68341763|        truck driver|   2.0|
|66310712|Information Syste...|   0.0|
|68476807| Contract Specialist|   2.0|
|68426831|Veterinary Tecnician|   0.0|
|68476668|Vice President of...|   1.0|
|67275481|         road driver|   1.0|
|68466926|     SERVICE MANAGER|   2.0|
|68616873|      Vendor liaison|   0.0|
|68356421|  Executive Director|   1.0|
|68426545|Senior Structural...|   0.0|
|68338832|   Logistics Manager|   0.0|
|66624733|    Software Manager|   2.0|
|68466961|      Senior Manager|   0.0|
|68354783|                tech|   0.0|
|68466916|       Sales Manager|   1.0|
|68577849|               GS-11|   0.0|
|68506798|             Teacher|   0.0|
|68495092|Program Coordinator |   2.0|
+--------+--------------------+------+
only showing top 20 rows



                                                                                