# 线性回归

## 1. 创建 SparkSession 对象

In [1]:
# SparkSession is the entry point to the DataFrame API; getOrCreate() reuses
# an existing session when one is already running.
from pyspark.sql import SparkSession

spark = (
    SparkSession.builder
    .appName('lin_reg')
    .getOrCreate()
)

## 2.读取数据集

In [2]:
# Load the dataset: the first row supplies the column names and the column
# types are inferred from the data.
df = spark.read.csv(
    'data/Linear_regression_dataset.csv',
    header=True,
    inferSchema=True,
)
df  # notebook display of the DataFrame's column names and types

DataFrame[var_1: int, var_2: int, var_3: int, var_4: double, var_5: double, output: double]

## 3.数据分析

In [3]:
# Shape of the dataset as (number of rows, number of columns).
print((df.count(), len(df.columns)))

(1232, 6)


In [4]:
# Print each column's name, inferred type and nullability.
df.printSchema()

root
 |-- var_1: integer (nullable = true)
 |-- var_2: integer (nullable = true)
 |-- var_3: integer (nullable = true)
 |-- var_4: double (nullable = true)
 |-- var_5: double (nullable = true)
 |-- output: double (nullable = true)



In [5]:
# Summary statistics per column; only the first 3 summary rows are displayed
# and cell values are not truncated.
df.describe().show(n=3, truncate=False)

+-------+-----------------+-----------------+------------------+--------------------+--------------------+-------------------+
|summary|var_1            |var_2            |var_3             |var_4               |var_5               |output             |
+-------+-----------------+-----------------+------------------+--------------------+--------------------+-------------------+
|count  |1232             |1232             |1232              |1232                |1232                |1232               |
|mean   |715.0819805194806|715.0819805194806|80.90422077922078 |0.3263311688311693  |0.25927272727272715 |0.39734172077922014|
|stddev |91.5342940441652 |93.07993263118064|11.458139049993724|0.015012772334166148|0.012907228928000298|0.03326689862173776|
+-------+-----------------+-----------------+------------------+--------------------+--------------------+-------------------+
only showing top 3 rows



In [6]:
# Peek at the first 3 rows as Row objects.
df.head(3)

[Row(var_1=734, var_2=688, var_3=81, var_4=0.328, var_5=0.259, output=0.418),
 Row(var_1=700, var_2=600, var_3=94, var_4=0.32, var_5=0.247, output=0.389),
 Row(var_1=712, var_2=705, var_3=93, var_4=0.311, var_5=0.247, output=0.417)]

In [7]:
from pyspark.sql.functions import corr
# Correlation between the var_1 predictor and the output (label) column.
df.select(corr('var_1','output')).show()

+-------------------+
|corr(var_1, output)|
+-------------------+
| 0.9187399607627283|
+-------------------+



## 4. 特征工程

In [8]:
# NOTE(review): Vector is not referenced in the cells shown here; only
# VectorAssembler is used below.
from pyspark.ml.linalg import Vector
from pyspark.ml.feature import VectorAssembler
# List the column names to choose the assembler's input columns from.
df.columns

['var_1', 'var_2', 'var_3', 'var_4', 'var_5', 'output']

In [9]:
# Pack the five predictor columns into a single vector column named
# 'features'; the 'output' column is left out because it is the label.
vec_assmebler = VectorAssembler(
    inputCols=['var_1', 'var_2', 'var_3', 'var_4', 'var_5'],
    outputCol='features',
)
features_df = vec_assmebler.transform(df)
# Confirm that the 'features' column now appears in the schema.
features_df.printSchema()

root
 |-- var_1: integer (nullable = true)
 |-- var_2: integer (nullable = true)
 |-- var_3: integer (nullable = true)
 |-- var_4: double (nullable = true)
 |-- var_5: double (nullable = true)
 |-- output: double (nullable = true)
 |-- features: vector (nullable = true)



In [10]:
# Inspect the first 5 assembled feature vectors without truncating them.
features_df.select('features').show(n=5, truncate=False)

+------------------------------+
|features                      |
+------------------------------+
|[734.0,688.0,81.0,0.328,0.259]|
|[700.0,600.0,94.0,0.32,0.247] |
|[712.0,705.0,93.0,0.311,0.247]|
|[734.0,806.0,69.0,0.315,0.26] |
|[613.0,759.0,61.0,0.302,0.24] |
+------------------------------+
only showing top 5 rows



In [11]:
# Keep only the two columns used for modelling: the feature vector and the
# label.
model_df = features_df.select('features', 'output')
model_df.show(n=5, truncate=False)

+------------------------------+------+
|features                      |output|
+------------------------------+------+
|[734.0,688.0,81.0,0.328,0.259]|0.418 |
|[700.0,600.0,94.0,0.32,0.247] |0.389 |
|[712.0,705.0,93.0,0.311,0.247]|0.417 |
|[734.0,806.0,69.0,0.315,0.26] |0.415 |
|[613.0,759.0,61.0,0.302,0.24] |0.378 |
+------------------------------+------+
only showing top 5 rows



In [12]:
# Shape of the modelling DataFrame as (rows, columns).
print((model_df.count(), len(model_df.columns)))

(1232, 2)


## 5.划分数据集

In [13]:
# Split the data roughly 70/30 into training and test sets. The fixed seed
# keeps the split repeatable between runs; without it the row counts and
# every downstream coefficient and metric change each time the cell is run.
train_df, test_df = model_df.randomSplit([0.7, 0.3], seed=42)
# Shape of the training split as (rows, columns).
print((train_df.count(), len(train_df.columns)))

(883, 2)


In [14]:
# Shape of the test split as (rows, columns). The column count is taken from
# test_df itself rather than train_df, so both numbers describe the same
# DataFrame.
print((test_df.count(), len(test_df.columns)))

(349, 2)


## 6.构建和训练线性回归模型

In [15]:
from pyspark.ml.regression import LinearRegression

# Configure the estimator: 'features' (the library default, written out here)
# holds the inputs and 'output' is the label to predict.
lin_reg = LinearRegression(featuresCol='features', labelCol='output')
# Fit on the training split only.
lr_model = lin_reg.fit(train_df)
# Learned weights of the fitted model.
print(lr_model.coefficients)

[0.0003290645762219201,6.002993787265466e-05,0.0002078031662952946,-0.6094874788637602,0.5126289706055822]


In [16]:
# Intercept (bias) term of the fitted model.
print(lr_model.intercept)

0.16837249079100558


In [17]:
# Evaluate the fitted model on the training split. Despite the variable's
# name, the result is used as an evaluation summary: its r2 attribute is read.
training_predictions = lr_model.evaluate(train_df)
print(training_predictions.r2)

0.8637017883173019


## 7.在测试数据上评估线性回归模型

In [18]:
# Evaluate the fitted model on the held-out test split and report R^2.
test_predictions = lr_model.evaluate(test_df)
print(test_predictions.r2)

0.8810437544916532


In [19]:
# Mean squared error on the test split, from the same evaluation summary.
print(test_predictions.meanSquaredError)

0.0001396513682817377
