In [1]:
from pyspark.sql import  SQLContext 
from pyspark import SparkContext
from pyspark.ml.classification import LogisticRegression
from pyspark.ml.feature import VectorAssembler
import matplotlib.pyplot as plt
import matplotlib
import pandas as pd
matplotlib.rcParams['font.family'] = 'Malgun Gothic'
matplotlib.rcParams['axes.unicode_minus'] = False

In [2]:
sc = SparkContext('local')
sqlctx = SQLContext(sc)

In [3]:
df = sqlctx.read.csv( 'data/titanic1.csv', header=True, inferSchema=True)
df.show()

+--------+------+----+-----+-----+-------+-----------+-----+---------+--------------+-------------+
|Survived|Pclass| Age|SibSp|Parch|   Fare|Family_Size|Alone|Sex_index|Embarked_index|Initial_index|
+--------+------+----+-----+-----+-------+-----------+-----+---------+--------------+-------------+
|       0|     3|22.0|    1|    0|   7.25|          1|    0|      0.0|           0.0|          0.0|
|       1|     1|38.0|    1|    0|71.2833|          1|    0|      1.0|           1.0|          2.0|
|       1|     3|26.0|    0|    0|  7.925|          0|    1|      1.0|           0.0|          1.0|
|       1|     1|35.0|    1|    0|   53.1|          1|    0|      1.0|           0.0|          2.0|
|       0|     3|35.0|    0|    0|   8.05|          0|    1|      0.0|           0.0|          0.0|
|       0|     3|33.0|    0|    0| 8.4583|          0|    1|      0.0|           2.0|          0.0|
|       0|     1|54.0|    0|    0|51.8625|          0|    1|      0.0|           0.0|          0.0|


In [4]:
f = VectorAssembler(inputCols=['Pclass', 'Age', 'Sex_index','Family_Size'], outputCol='features')
v_df = f.transform(df)
v_df.show(v_df.count())

+--------+------+----+-----+-----+--------+-----------+-----+---------+--------------+-------------+-------------------+
|Survived|Pclass| Age|SibSp|Parch|    Fare|Family_Size|Alone|Sex_index|Embarked_index|Initial_index|           features|
+--------+------+----+-----+-----+--------+-----------+-----+---------+--------------+-------------+-------------------+
|       0|     3|22.0|    1|    0|    7.25|          1|    0|      0.0|           0.0|          0.0| [3.0,22.0,0.0,1.0]|
|       1|     1|38.0|    1|    0| 71.2833|          1|    0|      1.0|           1.0|          2.0| [1.0,38.0,1.0,1.0]|
|       1|     3|26.0|    0|    0|   7.925|          0|    1|      1.0|           0.0|          1.0| [3.0,26.0,1.0,0.0]|
|       1|     1|35.0|    1|    0|    53.1|          1|    0|      1.0|           0.0|          2.0| [1.0,35.0,1.0,1.0]|
|       0|     3|35.0|    0|    0|    8.05|          0|    1|      0.0|           0.0|          0.0| [3.0,35.0,0.0,0.0]|
|       0|     3|33.0|    0|    

In [5]:
v_df.count()

891

In [6]:
train_df, test_df = v_df.randomSplit([0.8, 0.2])

In [7]:
train_df.count()

705

In [8]:
lr = LogisticRegression( featuresCol='features',labelCol='Survived')
lr_model = lr.fit(train_df)    # 학습 시작

In [9]:
print("기울기", lr_model.coefficients)
print('절편', lr_model.intercept)

기울기 [-1.2610527554835729,-0.04104715202767053,2.724877575268035,-0.1887889730611733]
절편 2.6666971357430613


In [10]:
lr_predict = lr_model.transform(test_df)
lr_predict.show()

+--------+------+----+-----+-----+--------+-----------+-----+---------+--------------+-------------+------------------+--------------------+--------------------+----------+
|Survived|Pclass| Age|SibSp|Parch|    Fare|Family_Size|Alone|Sex_index|Embarked_index|Initial_index|          features|       rawPrediction|         probability|prediction|
+--------+------+----+-----+-----+--------+-----------+-----+---------+--------------+-------------+------------------+--------------------+--------------------+----------+
|       0|     1|24.0|    0|    1|247.5208|          1|    0|      0.0|           1.0|          0.0|[1.0,24.0,0.0,1.0]|[-0.2317237585342...|[0.44232689714021...|       1.0|
|       0|     1|28.0|    0|    0|    47.1|          0|    1|      0.0|           0.0|          0.0|[1.0,28.0,0.0,0.0]|[-0.2563241234847...|[0.43626753379104...|       1.0|
|       0|     1|28.0|    1|    0| 82.1708|          1|    0|      0.0|           1.0|          0.0|[1.0,28.0,0.0,1.0]|[-0.067535150423

In [11]:
lr_predict.select('Pclass', 'Age', 'Sex_index','Family_Size','Survived', 'prediction').show()

+------+----+---------+-----------+--------+----------+
|Pclass| Age|Sex_index|Family_Size|Survived|prediction|
+------+----+---------+-----------+--------+----------+
|     1|24.0|      0.0|          1|       0|       1.0|
|     1|28.0|      0.0|          0|       0|       1.0|
|     1|28.0|      0.0|          1|       0|       1.0|
|     1|33.0|      0.0|          0|       0|       1.0|
|     1|33.0|      0.0|          0|       0|       1.0|
|     1|37.0|      0.0|          1|       0|       0.0|
|     1|40.0|      0.0|          0|       0|       0.0|
|     1|40.0|      0.0|          0|       0|       0.0|
|     1|46.0|      0.0|          0|       0|       0.0|
|     1|46.0|      0.0|          1|       0|       0.0|
|     1|47.0|      0.0|          0|       0|       0.0|
|     1|49.0|      0.0|          2|       0|       0.0|
|     1|50.0|      0.0|          1|       0|       0.0|
|     1|56.0|      0.0|          0|       0|       0.0|
|     1|61.0|      0.0|          0|       0|    

In [12]:
lr_predict.toPandas()[['probability']]

Unnamed: 0,probability
0,"[0.4423268971402123, 0.5576731028597877]"
1,"[0.4362675337910456, 0.5637324662089545]"
2,"[0.48312262670997186, 0.5168773732900283]"
3,"[0.4872306863905191, 0.512769313609481]"
4,"[0.4872306863905191, 0.512769313609481]"
...,...
181,"[0.9094409988844881, 0.09055900111551186]"
182,"[0.9159787054307934, 0.08402129456920648]"
183,"[0.9220848725713137, 0.07791512742868632]"
184,"[0.5145134221805275, 0.48548657781947246]"
