In [1]:
from pyspark.sql import  SQLContext 
from pyspark import SparkContext
from pyspark.ml.regression import LinearRegression
from pyspark.ml.feature import VectorAssembler
import matplotlib.pyplot as plt
import matplotlib
import pandas as pd
matplotlib.rcParams['font.family'] = 'Malgun Gothic'
matplotlib.rcParams['axes.unicode_minus'] = False

In [2]:
sc = SparkContext('local')
sqlctx = SQLContext(sc)

In [6]:
df = sqlctx.read.csv( 'data1/weight.csv', header=True, inferSchema=True)
df.show()

+----+------+------+----------+------+---------+----------+
|year|height|weight|     grade|gender|gradecode|gendercode|
+----+------+------+----------+------+---------+----------+
|2017| 152.5|  47.9|elementary|   man|        2|         1|
|2017| 153.2|  46.6|elementary| woman|        2|         0|
|2017| 170.6|  63.8|    middle|   man|        0|         1|
|2017| 160.4|  54.2|    middle| woman|        0|         0|
|2017| 173.9|  72.3|      high|   man|        1|         1|
|2017| 160.9|  57.7|      high| woman|        1|         0|
+----+------+------+----------+------+---------+----------+



In [7]:
df.printSchema()

root
 |-- year: integer (nullable = true)
 |-- height: double (nullable = true)
 |-- weight: double (nullable = true)
 |-- grade: string (nullable = true)
 |-- gender: string (nullable = true)
 |-- gradecode: integer (nullable = true)
 |-- gendercode: integer (nullable = true)



In [None]:
# 특성데이터 : 키, gradecode, gendercode   (설명 변수)
# 라벨 : weight (종속 변수)

In [8]:
f = VectorAssembler(inputCols=['weight', 'gradecode', 'gendercode'], outputCol='features')
v_df = f.transform(df)
v_df.show(v_df.count())

+----+------+------+----------+------+---------+----------+--------------+
|year|height|weight|     grade|gender|gradecode|gendercode|      features|
+----+------+------+----------+------+---------+----------+--------------+
|2017| 152.5|  47.9|elementary|   man|        2|         1|[47.9,2.0,1.0]|
|2017| 153.2|  46.6|elementary| woman|        2|         0|[46.6,2.0,0.0]|
|2017| 170.6|  63.8|    middle|   man|        0|         1|[63.8,0.0,1.0]|
|2017| 160.4|  54.2|    middle| woman|        0|         0|[54.2,0.0,0.0]|
|2017| 173.9|  72.3|      high|   man|        1|         1|[72.3,1.0,1.0]|
|2017| 160.9|  57.7|      high| woman|        1|         0|[57.7,1.0,0.0]|
+----+------+------+----------+------+---------+----------+--------------+



In [9]:
v_df = v_df.select('weight', 'features')
v_df.show()

+------+--------------+
|weight|      features|
+------+--------------+
|  47.9|[47.9,2.0,1.0]|
|  46.6|[46.6,2.0,0.0]|
|  63.8|[63.8,0.0,1.0]|
|  54.2|[54.2,0.0,0.0]|
|  72.3|[72.3,1.0,1.0]|
|  57.7|[57.7,1.0,0.0]|
+------+--------------+



In [10]:
train_df, test_df = v_df.randomSplit([0.7, 0.3])

In [11]:
lr = LinearRegression( featuresCol='features',labelCol='weight', maxIter=100, regParam=0.01)
lr_model = lr.fit(train_df)    # 학습 시작

In [12]:
print("기울기", lr_model.coefficients)
print('절편', lr_model.intercept)

기울기 [0.6413681937877966,-2.8435745208471777,3.43007030676863]
절편 19.44134169292053


In [13]:
b1, b2, b3 = lr_model.coefficients
a = lr_model.intercept
p = a + b1*170 + b2*2+ b3*1
p