# MLlib package of PySpark
- spark 2.x 부터 ML 패키지는 데이터프레임에 대해 작동
    - MLlib 패키지는 RDD에 대해 작동함
    
- MLlib은 전체적으로 크게 세단계의 머신러닝 기능으로 구분된다
    1. 데이터 전처리 : 
        - 피처추출, 변형, 선택, 카테고리 피처에 대한 해석, 자연어처리
    1. 머신러닝 알고리즘 :
        - 몇몇 유명, 고급레벨인 회귀, 분류, 군집 알고리즘 지원
    1. 유틸리티 :
        - 기술통계, 카이스퀘어(ChiSquare)테스트, 선형대수, 모델평가
        
- https://books.google.co.kr/books?id=HVQoDwAAQBAJ&pg=PA84&lpg=PA84&dq=births_transformed+%3D+births_transformed.select(exprs_YNU)&source=bl&ots=tLNsHpKdfH&sig=ACfU3U37yj0SiQWLB-DgXvOuT1toOs4keQ&hl=ko&sa=X&ved=2ahUKEwjissX9v5HpAhWS7WEKHbfMAkIQ6AEwAHoECAoQAQ#v=onepage&q&f=false

In [1]:
base_path = "../../data/RDD_example/"

In [9]:
import pyspark
from pyspark import SparkContext, SparkConf
from pyspark.sql import SparkSession

conf  = pyspark.SparkConf().setAppName('appName').setMaster('local[2]')
sc    = pyspark.SparkContext(conf=conf)
spark = SparkSession(sc)

ValueError: Cannot run multiple SparkContexts at once; existing SparkContext(app=appName, master=local[2]) created by __init__ at <ipython-input-7-d75bc8a5b849>:6 

### INFANT_ALIVE_AT_REPORT 가 1인지, 0 인지를 예측 하는것이 목표
- US 2014 ~ 2015 년 출생데이터의 일부
    - 원본데이터는 300개의 피쳐로 구성, 그중 85개 선별

- 799 만개의 데이터 중 45429개 균등 샘플링

In [10]:
import pyspark.sql.types as typ

In [11]:
labels = [
    ('INFANT_ALIVE_AT_REPORT', typ.StringType()),
    ('BIRTH_YEAR', typ.IntegerType()),
    ('BIRTH_MONTH', typ.IntegerType()),
    ('BIRTH_PLACE', typ.StringType()),
    ('MOTHER_AGE_YEARS', typ.IntegerType()),
    ('MOTHER_RACE_6CODE', typ.StringType()),
    ('MOTHER_EDUCATION', typ.StringType()),
    ('FATHER_COMBINED_AGE', typ.IntegerType()),
    ('FATHER_EDUCATION', typ.StringType()),
    ('MONTH_PRECARE_RECODE', typ.StringType()),
    ('CIG_BEFORE', typ.IntegerType()),
    ('CIG_1_TRI', typ.IntegerType()),
    ('CIG_2_TRI', typ.IntegerType()),
    ('CIG_3_TRI', typ.IntegerType()),
    ('MOTHER_HEIGHT_IN', typ.IntegerType()),
    ('MOTHER_BMI_RECODE', typ.IntegerType()),
    ('MOTHER_PRE_WEIGHT', typ.IntegerType()),
    ('MOTHER_DELIVERY_WEIGHT', typ.IntegerType()),
    ('MOTHER_WEIGHT_GAIN', typ.IntegerType()),
    ('DIABETES_PRE', typ.StringType()),
    ('DIABETES_GEST', typ.StringType()),
    ('HYP_TENS_PRE', typ.StringType()),
    ('HYP_TENS_GEST', typ.StringType()),
    ('PREV_BIRTH_PRETERM', typ.StringType()),
    ('NO_RISK', typ.StringType()),
    ('NO_INFECTIONS_REPORTED', typ.StringType()),
    ('LABOR_IND', typ.StringType()),
    ('LABOR_AUGM', typ.StringType()),
    ('STEROIDS', typ.StringType()),
    ('ANTIBIOTICS', typ.StringType()),
    ('ANESTHESIA', typ.StringType()),
    ('DELIV_METHOD_RECODE_COMB', typ.StringType()),
    ('ATTENDANT_BIRTH', typ.StringType()),
    ('APGAR_5', typ.IntegerType()),
    ('APGAR_5_RECODE', typ.StringType()),
    ('APGAR_10', typ.IntegerType()),
    ('APGAR_10_RECODE', typ.StringType()),
    ('INFANT_SEX', typ.StringType()),
    ('OBSTETRIC_GESTATION_WEEKS', typ.IntegerType()),
    ('INFANT_WEIGHT_GRAMS', typ.IntegerType()),
    ('INFANT_ASSIST_VENTI', typ.StringType()),
    ('INFANT_ASSIST_VENTI_6HRS', typ.StringType()),
    ('INFANT_NICU_ADMISSION', typ.StringType()),
    ('INFANT_SURFACANT', typ.StringType()),
    ('INFANT_ANTIBIOTICS', typ.StringType()),
    ('INFANT_SEIZURES', typ.StringType()),
    ('INFANT_NO_ABNORMALITIES', typ.StringType()),
    ('INFANT_ANCEPHALY', typ.StringType()),
    ('INFANT_MENINGOMYELOCELE', typ.StringType()),
    ('INFANT_LIMB_REDUCTION', typ.StringType()),
    ('INFANT_DOWN_SYNDROME', typ.StringType()),
    ('INFANT_SUSPECTED_CHROMOSOMAL_DISORDER', typ.StringType()),
    ('INFANT_NO_CONGENITAL_ANOMALIES_CHECKED', typ.StringType()),
    ('INFANT_BREASTFED', typ.StringType())
]

In [12]:
schema = typ.StructType([ typ.StructField(e[0], e[1], False)
                        for e in labels ])

In [13]:
births = spark.read.csv(base_path+'births_train.csv', header=True, schema=schema)

In [14]:
type(births)

pyspark.sql.dataframe.DataFrame

In [15]:
births.printSchema()

root
 |-- INFANT_ALIVE_AT_REPORT: string (nullable = true)
 |-- BIRTH_YEAR: integer (nullable = true)
 |-- BIRTH_MONTH: integer (nullable = true)
 |-- BIRTH_PLACE: string (nullable = true)
 |-- MOTHER_AGE_YEARS: integer (nullable = true)
 |-- MOTHER_RACE_6CODE: string (nullable = true)
 |-- MOTHER_EDUCATION: string (nullable = true)
 |-- FATHER_COMBINED_AGE: integer (nullable = true)
 |-- FATHER_EDUCATION: string (nullable = true)
 |-- MONTH_PRECARE_RECODE: string (nullable = true)
 |-- CIG_BEFORE: integer (nullable = true)
 |-- CIG_1_TRI: integer (nullable = true)
 |-- CIG_2_TRI: integer (nullable = true)
 |-- CIG_3_TRI: integer (nullable = true)
 |-- MOTHER_HEIGHT_IN: integer (nullable = true)
 |-- MOTHER_BMI_RECODE: integer (nullable = true)
 |-- MOTHER_PRE_WEIGHT: integer (nullable = true)
 |-- MOTHER_DELIVERY_WEIGHT: integer (nullable = true)
 |-- MOTHER_WEIGHT_GAIN: integer (nullable = true)
 |-- DIABETES_PRE: string (nullable = true)
 |-- DIABETES_GEST: string (nullable = true)


- Y : yes
- N : no
- U : unknown

In [16]:
recode_dictionary = {
    'YNU' : {
         'Y' : 1
        ,'N' : 2
        ,'U' : 0
    }
}

In [17]:
type(recode_dictionary), len(recode_dictionary)

(dict, 1)

In [18]:
selected_features = [
    'INFANT_ALIVE_AT_REPORT', 
    'BIRTH_PLACE', 
    'MOTHER_AGE_YEARS', 
    'FATHER_COMBINED_AGE', 
    'CIG_BEFORE', 
    'CIG_1_TRI', 
    'CIG_2_TRI', 
    'CIG_3_TRI', 
    'MOTHER_HEIGHT_IN', 
    'MOTHER_PRE_WEIGHT', 
    'MOTHER_DELIVERY_WEIGHT', 
    'MOTHER_WEIGHT_GAIN', 
    'DIABETES_PRE', 
    'DIABETES_GEST', 
    'HYP_TENS_PRE', 
    'HYP_TENS_GEST', 
    'PREV_BIRTH_PRETERM'
]

In [19]:
births_trimmed = births.select(selected_features)

In [20]:
births_trimmed.printSchema()

root
 |-- INFANT_ALIVE_AT_REPORT: string (nullable = true)
 |-- BIRTH_PLACE: string (nullable = true)
 |-- MOTHER_AGE_YEARS: integer (nullable = true)
 |-- FATHER_COMBINED_AGE: integer (nullable = true)
 |-- CIG_BEFORE: integer (nullable = true)
 |-- CIG_1_TRI: integer (nullable = true)
 |-- CIG_2_TRI: integer (nullable = true)
 |-- CIG_3_TRI: integer (nullable = true)
 |-- MOTHER_HEIGHT_IN: integer (nullable = true)
 |-- MOTHER_PRE_WEIGHT: integer (nullable = true)
 |-- MOTHER_DELIVERY_WEIGHT: integer (nullable = true)
 |-- MOTHER_WEIGHT_GAIN: integer (nullable = true)
 |-- DIABETES_PRE: string (nullable = true)
 |-- DIABETES_GEST: string (nullable = true)
 |-- HYP_TENS_PRE: string (nullable = true)
 |-- HYP_TENS_GEST: string (nullable = true)
 |-- PREV_BIRTH_PRETERM: string (nullable = true)




- 데이터셋에는 YES, NO, UNKNOWN 값을 가진 피쳐들이 매우 많음
    - YES만 1, 나머지는 0으로 변환
- 어미의 흡연량 관련 레코드 :
    - 0      : 임신기간 동안 금연
    - 1 ~ 97 : 1 ~ 97 개피 흡연
    - 98     : 98개피 이상
    - 99     : 알수없음

In [24]:
import pyspark.sql.functions as func

def recode(col, key):
    return recode_dictionary[key][col]

- 99 : 알수없음을 제외한 것은 그대로 반환하고, 99 일때는 0을 반환

In [26]:
def correct_cig(feat):
    return func.when(func.col(feat) != 99, func.col(feat)).otherwise(0)

## 사용자 정의 함수 써보기
- def 으로 정의한것을 넣어서

In [23]:
rec_integer = func.udf( recode, typ.IntegerType())

In [27]:
births_transformed = births_trimmed \
                    .withColumn('CIG_BEFORE', correct_cig('CIG_BEFORE')) \
                    .withColumn('CIG_1_TRI', correct_cig('CIG_1_TRI')) \
                    .withColumn('CIG_2_TRI', correct_cig('CIG_2_TRI')) \
                    .withColumn('CIG_3_TRI', correct_cig('CIG_3_TRI'))

- births_trimmed.schema 에는 이름, 데이터타입, nullable 여부가 있는데
- 이중 name, dataType만 가져오겠다

In [30]:
cols = [ (col.name, col.dataType) for col in births_trimmed.schema ]
cols

[('INFANT_ALIVE_AT_REPORT', StringType),
 ('BIRTH_PLACE', StringType),
 ('MOTHER_AGE_YEARS', IntegerType),
 ('FATHER_COMBINED_AGE', IntegerType),
 ('CIG_BEFORE', IntegerType),
 ('CIG_1_TRI', IntegerType),
 ('CIG_2_TRI', IntegerType),
 ('CIG_3_TRI', IntegerType),
 ('MOTHER_HEIGHT_IN', IntegerType),
 ('MOTHER_PRE_WEIGHT', IntegerType),
 ('MOTHER_DELIVERY_WEIGHT', IntegerType),
 ('MOTHER_WEIGHT_GAIN', IntegerType),
 ('DIABETES_PRE', StringType),
 ('DIABETES_GEST', StringType),
 ('HYP_TENS_PRE', StringType),
 ('HYP_TENS_GEST', StringType),
 ('PREV_BIRTH_PRETERM', StringType)]

- Y or N 을 찾기
    - StringType이겠네?
    

In [32]:
YNU_cols = []
for i, s in enumerate(cols):
    if s[1] == typ.StringType():
        dis = births.select(s[0]) \
        .distinct() \
        .rdd \
        .map(lambda row: row[0]) \
        .collect()
        print(dis)
        if 'Y' in dis:
            YNU_cols.append(s[0])
            print(YNU_cols)

['Y', 'N']
['INFANT_ALIVE_AT_REPORT']
['7', '3', '5', '6', '9', '1', '4', '2']
['Y', 'U', 'N']
['INFANT_ALIVE_AT_REPORT', 'DIABETES_PRE']
['Y', 'U', 'N']
['INFANT_ALIVE_AT_REPORT', 'DIABETES_PRE', 'DIABETES_GEST']
['Y', 'U', 'N']
['INFANT_ALIVE_AT_REPORT', 'DIABETES_PRE', 'DIABETES_GEST', 'HYP_TENS_PRE']
['Y', 'U', 'N']
['INFANT_ALIVE_AT_REPORT', 'DIABETES_PRE', 'DIABETES_GEST', 'HYP_TENS_PRE', 'HYP_TENS_GEST']
['Y', 'U', 'N']
['INFANT_ALIVE_AT_REPORT', 'DIABETES_PRE', 'DIABETES_GEST', 'HYP_TENS_PRE', 'HYP_TENS_GEST', 'PREV_BIRTH_PRETERM']


['Y', 'N']
['INFANT_ALIVE_AT_REPORT']
['7', '3', '5', '6', '9', '1', '4', '2']
['Y', 'U', 'N']
['INFANT_ALIVE_AT_REPORT', 'DIABETES_PRE']
['Y', 'U', 'N']
['INFANT_ALIVE_AT_REPORT', 'DIABETES_PRE', 'DIABETES_GEST']
['Y', 'U', 'N']
['INFANT_ALIVE_AT_REPORT', 'DIABETES_PRE', 'DIABETES_GEST', 'HYP_TENS_PRE']
['Y', 'U', 'N']
['INFANT_ALIVE_AT_REPORT', 'DIABETES_PRE', 'DIABETES_GEST', 'HYP_TENS_PRE', 'HYP_TENS_GEST']
['Y', 'U', 'N']
['INFANT_ALIVE_AT_REPORT', 'DIABETES_PRE', 'DIABETES_GEST', 'HYP_TENS_PRE', 'HYP_TENS_GEST', 'PREV_BIRTH_PRETERM']

In [33]:
births.select('INFANT_ALIVE_AT_REPORT') \
        .distinct() \
        .rdd \
        .map(lambda row: row[0]) \
        .collect()

['Y', 'N']

In [34]:
births.select('BIRTH_PLACE') \
        .distinct() \
        .rdd \
        .map(lambda row: row[0]) \
        .collect()

['7', '3', '5', '6', '9', '1', '4', '2']

In [35]:
births.select([
    'INFANT_NICU_ADMISSION',
    rec_integer(
        'INFANT_NICU_ADMISSION'
        , func.lit('YNU')).alias('INFANT_NICU_ADMISSION_RECODE')
]).take(5)

[Row(INFANT_NICU_ADMISSION='Y', INFANT_NICU_ADMISSION_RECODE=1),
 Row(INFANT_NICU_ADMISSION='Y', INFANT_NICU_ADMISSION_RECODE=1),
 Row(INFANT_NICU_ADMISSION='U', INFANT_NICU_ADMISSION_RECODE=0),
 Row(INFANT_NICU_ADMISSION='N', INFANT_NICU_ADMISSION_RECODE=2),
 Row(INFANT_NICU_ADMISSION='U', INFANT_NICU_ADMISSION_RECODE=0)]

In [36]:
exprs_YNU = [
    rec_integer(x, func.lit('YNU')).alias(x)
    if x in YNU_cols
    else x
    for x in births_transformed.columns
]

In [37]:
exprs_YNU

[Column<b'recode(INFANT_ALIVE_AT_REPORT, YNU) AS `INFANT_ALIVE_AT_REPORT`'>,
 'BIRTH_PLACE',
 'MOTHER_AGE_YEARS',
 'FATHER_COMBINED_AGE',
 'CIG_BEFORE',
 'CIG_1_TRI',
 'CIG_2_TRI',
 'CIG_3_TRI',
 'MOTHER_HEIGHT_IN',
 'MOTHER_PRE_WEIGHT',
 'MOTHER_DELIVERY_WEIGHT',
 'MOTHER_WEIGHT_GAIN',
 Column<b'recode(DIABETES_PRE, YNU) AS `DIABETES_PRE`'>,
 Column<b'recode(DIABETES_GEST, YNU) AS `DIABETES_GEST`'>,
 Column<b'recode(HYP_TENS_PRE, YNU) AS `HYP_TENS_PRE`'>,
 Column<b'recode(HYP_TENS_GEST, YNU) AS `HYP_TENS_GEST`'>,
 Column<b'recode(PREV_BIRTH_PRETERM, YNU) AS `PREV_BIRTH_PRETERM`'>]

- YNU_cols에 있으면 사용자정의함수 처리 되고 아니면 그냥 컬럼명이 리스트에 담김

In [38]:
births_transformed = births_transformed.select(exprs_YNU)

In [43]:
births_transformed.select(YNU_cols[-5:]).show(5)

+------------+-------------+------------+-------------+------------------+
|DIABETES_PRE|DIABETES_GEST|HYP_TENS_PRE|HYP_TENS_GEST|PREV_BIRTH_PRETERM|
+------------+-------------+------------+-------------+------------------+
|           2|            2|           2|            2|                 2|
|           2|            2|           2|            2|                 2|
|           2|            2|           2|            2|                 2|
|           2|            2|           2|            2|                 1|
|           2|            2|           2|            2|                 2|
+------------+-------------+------------+-------------+------------------+
only showing top 5 rows



In [44]:
YNU_cols[-5:]

['DIABETES_PRE',
 'DIABETES_GEST',
 'HYP_TENS_PRE',
 'HYP_TENS_GEST',
 'PREV_BIRTH_PRETERM']

In [46]:
# Descriptive statistics

import pyspark.mllib.stat as st
import numpy as np

In [47]:
numeric_cols = [
#     'INFANT_ALIVE_AT_REPORT', 
#     'BIRTH_PLACE', 
    'MOTHER_AGE_YEARS', 
    'FATHER_COMBINED_AGE', 
    'CIG_BEFORE', 
    'CIG_1_TRI', 
    'CIG_2_TRI', 
    'CIG_3_TRI', 
    'MOTHER_HEIGHT_IN', 
    'MOTHER_PRE_WEIGHT', 
    'MOTHER_DELIVERY_WEIGHT', 
    'MOTHER_WEIGHT_GAIN', 
]

In [48]:
numeric_rdd = births_transformed.select(numeric_cols) \
                .rdd.map(lambda row: [e for e in row])

In [50]:
mllib_stats = st.Statistics.colStats(numeric_rdd)

In [54]:
for col, m, v in zip(numeric_cols, mllib_stats.mean(), mllib_stats.variance()):
    print( '{0:.10s}: \t{1:.2f} \t{2:.2f}'.format(col, m, np.sqrt(v)) )

MOTHER_AGE: 	28.30 	6.08
FATHER_COM: 	44.55 	27.55
CIG_BEFORE: 	1.43 	5.18
CIG_1_TRI: 	0.91 	3.83
CIG_2_TRI: 	0.70 	3.31
CIG_3_TRI: 	0.58 	3.11
MOTHER_HEI: 	65.12 	6.45
MOTHER_PRE: 	214.50 	210.21
MOTHER_DEL: 	223.63 	180.01
MOTHER_WEI: 	30.74 	26.23


- colStats()
    - 처리된 값은 기술통계를 샘플링해서 계산한 것임
    - 실제 데이터에서는 큰 문제는 없다. 
    - 데이터셋이 100개 미만일 경우에는 문제가 될 수 있다
    - 본 함수는 RDD 데이터를 취해 기술통계를 계산한다.
        - MulhvariateStahshcalSummary 객체를 리턴
            - count() : 데이터행 갯수
            - max() : 최대값
            - numNonzeros() : 0 이 아닌 갯수
            - variance() : 분산
                - Sample variance vector.
            - norml1() : L1-Norm 값
                - L1 norm of each column
            - norml2() : L2-Norm 값
                - Euclidean magnitude of each column

                - L1 Loss 이냐 L2 Loss 이냐 선택?
                    - 직관적으로 오차의 제곱을 더함
                    - Outlier에 대하여 더 Robust하다
                    - Outlier가 적당히 무시되길 원하면 L1 Loss를
                    - Outlier에 신경을 써야한다면 L2 Loss를 사용

In [55]:
categorical_cols = [e for e in births_transformed.columns if e not in numeric_cols]

In [56]:
categorical_rdd = births_transformed.select(categorical_cols) \
.rdd.map(lambda row: [e for e in row])

In [60]:
for i, col in enumerate(categorical_cols):
    agg = categorical_rdd.groupBy(lambda row: row[i]) \
                        .map( lambda row : (row[0], len(row[1])) )
    print( col, sorted(agg.collect(), key=lambda el: el[1], reverse=True) )

INFANT_ALIVE_AT_REPORT [(1, 23349), (2, 22080)]
BIRTH_PLACE [('1', 44558), ('4', 327), ('3', 224), ('2', 136), ('7', 91), ('5', 74), ('6', 11), ('9', 8)]
DIABETES_PRE [(2, 44689), (1, 548), (0, 192)]
DIABETES_GEST [(2, 43259), (1, 1978), (0, 192)]
HYP_TENS_PRE [(2, 44156), (1, 1081), (0, 192)]
HYP_TENS_GEST [(2, 43110), (1, 2127), (0, 192)]
PREV_BIRTH_PRETERM [(2, 42896), (1, 2341), (0, 192)]


In [61]:
corrs = st.Statistics.corr(numeric_rdd)

In [62]:
for i, el in enumerate(corrs > 0.5):
    correlated = [
        (numeric_cols[j], corrs[i][j])
        for j, e in enumerate(el)
        if e == 1.0 and j != i
    ]
    if len(correlated) > 0:
        for e in correlated:
            print( '{0}-to-{1}: {2:.2f}' \
                 .format(numeric_cols[i], e[0], e[1]) )

CIG_BEFORE-to-CIG_1_TRI: 0.83
CIG_BEFORE-to-CIG_2_TRI: 0.72
CIG_BEFORE-to-CIG_3_TRI: 0.62
CIG_1_TRI-to-CIG_BEFORE: 0.83
CIG_1_TRI-to-CIG_2_TRI: 0.87
CIG_1_TRI-to-CIG_3_TRI: 0.76
CIG_2_TRI-to-CIG_BEFORE: 0.72
CIG_2_TRI-to-CIG_1_TRI: 0.87
CIG_2_TRI-to-CIG_3_TRI: 0.89
CIG_3_TRI-to-CIG_BEFORE: 0.62
CIG_3_TRI-to-CIG_1_TRI: 0.76
CIG_3_TRI-to-CIG_2_TRI: 0.89
MOTHER_PRE_WEIGHT-to-MOTHER_DELIVERY_WEIGHT: 0.54
MOTHER_PRE_WEIGHT-to-MOTHER_WEIGHT_GAIN: 0.65
MOTHER_DELIVERY_WEIGHT-to-MOTHER_PRE_WEIGHT: 0.54
MOTHER_DELIVERY_WEIGHT-to-MOTHER_WEIGHT_GAIN: 0.60
MOTHER_WEIGHT_GAIN-to-MOTHER_PRE_WEIGHT: 0.65
MOTHER_WEIGHT_GAIN-to-MOTHER_DELIVERY_WEIGHT: 0.60


In [63]:
births_transformed.columns

['INFANT_ALIVE_AT_REPORT',
 'BIRTH_PLACE',
 'MOTHER_AGE_YEARS',
 'FATHER_COMBINED_AGE',
 'CIG_BEFORE',
 'CIG_1_TRI',
 'CIG_2_TRI',
 'CIG_3_TRI',
 'MOTHER_HEIGHT_IN',
 'MOTHER_PRE_WEIGHT',
 'MOTHER_DELIVERY_WEIGHT',
 'MOTHER_WEIGHT_GAIN',
 'DIABETES_PRE',
 'DIABETES_GEST',
 'HYP_TENS_PRE',
 'HYP_TENS_GEST',
 'PREV_BIRTH_PRETERM']

In [64]:
features_to_keep = [
 'INFANT_ALIVE_AT_REPORT',
 'BIRTH_PLACE',
 'MOTHER_AGE_YEARS',
 'FATHER_COMBINED_AGE',
 'CIG_1_TRI',
 'MOTHER_HEIGHT_IN',
 'MOTHER_PRE_WEIGHT',
 'DIABETES_PRE',
 'DIABETES_GEST',
 'HYP_TENS_PRE',
 'HYP_TENS_GEST',
 'PREV_BIRTH_PRETERM'
]

In [65]:
births_transformed = births_transformed.select([e for e in features_to_keep])

In [67]:
import pyspark.mllib.linalg as ln

In [68]:
for cat in categorical_cols[1:]:
    agg = births_transformed.groupby('INFANT_ALIVE_AT_REPORT') \
                            .pivot(cat).count()
    agg_rdd = agg.rdd.map(lambda row: (row[1:])) \
    .flatMap(lambda row: [0 if e == None else e for e in row]) \
    .collect()
    row_length = len(agg.collect()[0])-1
    agg = ln.Matrices.dense(row_length, 2, agg_rdd)
    
    test = st.Statistics.chiSqTest(agg)
    print(cat, round(test.pValue, 4))

BIRTH_PLACE 0.0
DIABETES_PRE 0.0
DIABETES_GEST 0.0
HYP_TENS_PRE 0.0
HYP_TENS_GEST 0.0
PREV_BIRTH_PRETERM 0.0


- Matrices.dense() 함수의 파라미터 설명
    1. 행렬에서 행의 수
    1. 카테고리 고유값의 수
    1. 행렬로 변환 할 값들에 대한 리스트

### final dataset


In [84]:
import pyspark.mllib.feature as ft
import pyspark.mllib.regression as reg

hashing = ft.HashingTF(7)
births_hashed = births_transformed.rdd.map( lambda row: [
                list(hashing.transform(row[1]).toArray())
                    if col == 'BIRTH_PLACE'
                    else row[i] 
                for i, col in enumerate(features_to_keep)] 
                                         ) \
    .map( lambda row: [[e] if type(e) == int else e for e in row] ) \
    .map( lambda row: [item for sublist in row for item in sublist] ) \
    .map( lambda row: reg.LabeledPoint(row[0], ln.Vectors.dense(row[1:])) )

### train and test data 

In [85]:
births_train, births_test = births_hashed.randomSplit([0.6, 0.4])

In [86]:
from pyspark.mllib.classification import LogisticRegressionWithLBFGS
LR_Model = LogisticRegressionWithLBFGS.train(births_train, iterations=10)

Py4JJavaError: An error occurred while calling o1201.trainLogisticRegressionModelWithLBFGS.
: org.apache.spark.SparkException: Multinomial models contain a matrix of coefficients, use coefficientMatrix instead.
	at org.apache.spark.ml.classification.LogisticRegressionModel.coefficients(LogisticRegression.scala:955)
	at org.apache.spark.mllib.classification.LogisticRegressionWithLBFGS.runWithMlLogisticRegression$1(LogisticRegression.scala:455)
	at org.apache.spark.mllib.classification.LogisticRegressionWithLBFGS.run(LogisticRegression.scala:459)
	at org.apache.spark.mllib.classification.LogisticRegressionWithLBFGS.run(LogisticRegression.scala:425)
	at org.apache.spark.mllib.classification.LogisticRegressionWithLBFGS.run(LogisticRegression.scala:355)
	at org.apache.spark.mllib.api.python.PythonMLLibAPI.trainRegressionModel(PythonMLLibAPI.scala:92)
	at org.apache.spark.mllib.api.python.PythonMLLibAPI.trainLogisticRegressionModelWithLBFGS(PythonMLLibAPI.scala:308)
	at sun.reflect.NativeMethodAccessorImpl.invoke0(Native Method)
	at sun.reflect.NativeMethodAccessorImpl.invoke(NativeMethodAccessorImpl.java:62)
	at sun.reflect.DelegatingMethodAccessorImpl.invoke(DelegatingMethodAccessorImpl.java:43)
	at java.lang.reflect.Method.invoke(Method.java:498)
	at py4j.reflection.MethodInvoker.invoke(MethodInvoker.java:244)
	at py4j.reflection.ReflectionEngine.invoke(ReflectionEngine.java:357)
	at py4j.Gateway.invoke(Gateway.java:282)
	at py4j.commands.AbstractCommand.invokeMethod(AbstractCommand.java:132)
	at py4j.commands.CallCommand.execute(CallCommand.java:79)
	at py4j.GatewayConnection.run(GatewayConnection.java:238)
	at java.lang.Thread.run(Thread.java:748)


In [77]:
LR_results = (
births_test.map(lambda row: row.label) \
.zip( LR_Model.predict(births_test.map(lambda row: row.features)) )
).map( lambda row: (row[0], row[1] * 1.0) )

NameError: name 'LR_Model' is not defined

In [78]:
LR_results.collect()

NameError: name 'LR_results' is not defined


### Model Evaluation

In [79]:
import pyspark.mllib.eveluation as ev

ModuleNotFoundError: No module named 'pyspark.mllib.eveluation'

In [81]:
LR_evaluation = ev.BinaryClassificationMetrics(LR_results)

print( 'Area under PR: {0:.2f}'.format(LR_evaluation.areaUnderPR) )
print( 'Araed under ROC: {0:.2f}'.format(LR_evaluation.areaUnderROC) )
LR_evaluation.unpersist()

NameError: name 'ev' is not defined

## ChiSqSelector()
- 피쳐 선택

In [None]:
selector = ft.ChiSqSelector(4).fit(births_train)

In [None]:
topFeatures_train = (
    
)