# Logistic Regression Code Along
이것은 유명한 타이타닉 데이터 세트의 코드이며, 거의 모든 데이터 분석 언어에서 찾을 수 있는 예제이기 때문에 이 데이터 세트로 시작하자

데이터가 저장되어 있는 mysql DB와 연결하여 pandas DataFrame을 SparkDataFrame으로 변경한다.

In [1]:
import repackage
repackage.up(2)
from configuration import make_engine
import pandas as pd

In [2]:
engine = make_engine()
engine.connect()

<sqlalchemy.engine.base.Connection at 0x7fefc437e9a0>

In [3]:
query = "SELECT * FROM titanic"
titanic = pd.read_sql(query, con=engine)

In [4]:
from pyspark.sql import SparkSession

In [5]:
spark = SparkSession.builder.appName('logreg').getOrCreate()

세션 정보는 http://domain:4040에서 확인할 수 있다.

In [6]:
data = spark.createDataFrame(titanic)
data.printSchema()

root
 |-- PassengerId: long (nullable = true)
 |-- Survived: long (nullable = true)
 |-- Pclass: long (nullable = true)
 |-- Name: string (nullable = true)
 |-- Sex: string (nullable = true)
 |-- Age: double (nullable = true)
 |-- SibSp: long (nullable = true)
 |-- Parch: long (nullable = true)
 |-- Ticket: string (nullable = true)
 |-- Fare: double (nullable = true)
 |-- Cabin: string (nullable = true)
 |-- Embarked: string (nullable = true)



컬럼 확인!!

In [7]:
data.columns

['PassengerId',
 'Survived',
 'Pclass',
 'Name',
 'Sex',
 'Age',
 'SibSp',
 'Parch',
 'Ticket',
 'Fare',
 'Cabin',
 'Embarked']

In [8]:
my_cols = [
#     'PassengerId',
    'Survived',
    'Pclass',
#     'Name',
    'Sex',
    'Age',
    'SibSp',
    'Parch',
#     'Ticket',
    'Fare',
#     'Cabin',
    'Embarked'
]

In [34]:
my_cols

['Survived', 'Pclass', 'Sex', 'Age', 'SibSp', 'Parch', 'Fare', 'Embarked']

In [9]:
df = data.select(my_cols)
df.show()

+--------+------+------+----+-----+-----+-------+--------+
|Survived|Pclass|   Sex| Age|SibSp|Parch|   Fare|Embarked|
+--------+------+------+----+-----+-----+-------+--------+
|       0|     3|  male|22.0|    1|    0|   7.25|       S|
|       1|     1|female|38.0|    1|    0|71.2833|       C|
|       1|     3|female|26.0|    0|    0|  7.925|       S|
|       1|     1|female|35.0|    1|    0|   53.1|       S|
|       0|     3|  male|35.0|    0|    0|   8.05|       S|
|       0|     3|  male| NaN|    0|    0| 8.4583|       Q|
|       0|     1|  male|54.0|    0|    0|51.8625|       S|
|       0|     3|  male| 2.0|    3|    1| 21.075|       S|
|       1|     3|female|27.0|    0|    2|11.1333|       S|
|       1|     2|female|14.0|    1|    0|30.0708|       C|
|       1|     3|female| 4.0|    1|    1|   16.7|       S|
|       1|     1|female|58.0|    0|    0|  26.55|       S|
|       0|     3|  male|20.0|    0|    0|   8.05|       S|
|       0|     3|  male|39.0|    1|    5| 31.275|       

In [13]:
my_final_data = df.na.drop()
my_final_data.show()

+--------+------+------+----+-----+-----+-------+--------+
|Survived|Pclass|   Sex| Age|SibSp|Parch|   Fare|Embarked|
+--------+------+------+----+-----+-----+-------+--------+
|       0|     3|  male|22.0|    1|    0|   7.25|       S|
|       1|     1|female|38.0|    1|    0|71.2833|       C|
|       1|     3|female|26.0|    0|    0|  7.925|       S|
|       1|     1|female|35.0|    1|    0|   53.1|       S|
|       0|     3|  male|35.0|    0|    0|   8.05|       S|
|       0|     1|  male|54.0|    0|    0|51.8625|       S|
|       0|     3|  male| 2.0|    3|    1| 21.075|       S|
|       1|     3|female|27.0|    0|    2|11.1333|       S|
|       1|     2|female|14.0|    1|    0|30.0708|       C|
|       1|     3|female| 4.0|    1|    1|   16.7|       S|
|       1|     1|female|58.0|    0|    0|  26.55|       S|
|       0|     3|  male|20.0|    0|    0|   8.05|       S|
|       0|     3|  male|39.0|    1|    5| 31.275|       S|
|       0|     3|female|14.0|    0|    0| 7.8542|       

### Working with Categorical Columns
이 모든 것을 명확하게하기 위해 여러 단계로 나누어 보자

In [14]:
from pyspark.ml.feature import VectorAssembler, VectorIndexer, OneHotEncoder, StringIndexer

In [16]:
gender_indexer = StringIndexer(
    inputCol='Sex', 
    outputCol='SexIndex'
)
gender_encoder = OneHotEncoder(
    inputCol='SexIndex',
    outputCol='SexVec'
)

In [26]:
embark_indexer = StringIndexer(
    inputCol = "Embarked", 
    outputCol = "EmbarkIndex"
)
embark_encoder = OneHotEncoder(
    inputCol = "EmbarkIndex",
    outputCol = "EmbarkVec"
)

이제 인코딩이 끝났으니 데이터 column을 합치자!

In [19]:
my_final_data.columns

['Survived', 'Pclass', 'Sex', 'Age', 'SibSp', 'Parch', 'Fare', 'Embarked']

In [35]:
assembler = VectorAssembler(
    inputCols = [
        'Pclass', 'SexVec', 'Age', 'SibSp', 'Parch', 'Fare', 'EmbarkVec'
    ],
    outputCol = 'features'
)

In [36]:
from pyspark.ml.classification import LogisticRegression

### Pipelines
Pipelines을 다뤄보자!

In [37]:
from pyspark.ml import Pipeline

In [38]:
log_reg_titanic = LogisticRegression(
    featuresCol='features',
    labelCol='Survived'
)

In [39]:
pipeline = Pipeline(
    stages=[
        gender_indexer, embark_indexer,
        gender_encoder, embark_encoder,
        assembler, log_reg_titanic
    ]    
)

In [40]:
train_titanic_data, test_titanic_data = my_final_data.randomSplit([0.7, 0.3])

In [41]:
fit_model = pipeline.fit(train_titanic_data)

In [42]:
results = fit_model.transform(test_titanic_data)

In [43]:
from pyspark.ml.evaluation import BinaryClassificationEvaluator

In [44]:
my_eval = BinaryClassificationEvaluator(
    rawPredictionCol='prediction',
    labelCol='Survived'
)

In [50]:
results.select(
    'Survived', 
    results['prediction'].cast('int').alias('prediction')
).show()

+--------+----------+
|Survived|prediction|
+--------+----------+
|       0|         0|
|       0|         1|
|       0|         0|
|       0|         0|
|       0|         0|
|       0|         0|
|       0|         0|
|       0|         0|
|       0|         0|
|       0|         0|
|       0|         0|
|       0|         0|
|       0|         0|
|       0|         0|
|       0|         1|
|       0|         0|
|       0|         1|
|       0|         1|
|       0|         0|
|       0|         0|
+--------+----------+
only showing top 20 rows



In [51]:
AUC = my_eval.evaluate(results)

In [52]:
AUC

0.7881973647322681