# LINEAR REGRESSION

Xây dựng mô hình Linear Regression để dự đoán thời gian bay (cột `duration`) từ khoảng cách bay (cột `mile`).

In [2]:
# findspark has to run BEFORE the first pyspark import: init() locates the
# Spark installation and puts its Python bindings on sys.path, so calling it
# after `import pyspark` is too late to have any effect.
import findspark
findspark.init()

from pyspark import SparkContext
from pyspark.sql import SparkSession

In [3]:
# Build the session through the builder so this cell can be re-run safely:
# getOrCreate() reuses an already-running session, whereas constructing a
# second SparkContext directly in the same kernel raises an error.
ss = (
    SparkSession.builder
    .master('local')
    .appName('Demo linear regression')
    .getOrCreate()
)
# Keep the `sc` handle available, as before.
sc = ss.sparkContext

In [66]:
# Absolute location of the flights data set on the author's machine.
path = '/Users/vovanthuong/Desktop/9 - Big Data in Machine Learning/Data/Data ML/flights.csv'

# header=True      -> take column names from the first row of the file
# inferSchema=True -> let Spark infer each column's type from the data
# NOTE(review): `delay` contains the literal text 'NA' (visible in the preview),
# so it is presumably inferred as a string column - confirm if it is ever needed.
df = ss.read.csv(
    path,
    header=True,
    inferSchema=True,
)

In [7]:
# Preview the first 3 rows of the loaded flights data.
df.show(3)

+---+---+---+-------+------+---+----+------+--------+-----+
|mon|dom|dow|carrier|flight|org|mile|depart|duration|delay|
+---+---+---+-------+------+---+----+------+--------+-----+
| 11| 20|  6|     US|    19|JFK|2153|  9.48|     351|   NA|
|  0| 22|  2|     UA|  1107|ORD| 316| 16.33|      82|   30|
|  2| 20|  4|     UA|   226|SFO| 337|  6.17|      82|   -8|
+---+---+---+-------+------+---+----+------+--------+-----+
only showing top 3 rows



## Chuyển đổi dữ liệu

In [67]:
from pyspark.ml.linalg import Vector
from pyspark.ml.feature import VectorAssembler

# Pack the single predictor `mile` into a vector column named `features`,
# which is the column the regression below reads as its input.
vector_assembler = VectorAssembler(inputCols=['mile'], outputCol='features')

# Drop any `features` column left over from a previous run of this cell first:
# transform() refuses to write to an output column that already exists, so
# without the drop the cell could only be executed once per kernel.
# drop() is a no-op when the column is absent.
df = vector_assembler.transform(df.drop('features'))
df.show(3)

+---+---+---+-------+------+---+----+------+--------+-----+--------+
|mon|dom|dow|carrier|flight|org|mile|depart|duration|delay|features|
+---+---+---+-------+------+---+----+------+--------+-----+--------+
| 11| 20|  6|     US|    19|JFK|2153|  9.48|     351|   NA|[2153.0]|
|  0| 22|  2|     UA|  1107|ORD| 316| 16.33|      82|   30| [316.0]|
|  2| 20|  4|     UA|   226|SFO| 337|  6.17|      82|   -8| [337.0]|
+---+---+---+-------+------+---+----+------+--------+-----+--------+
only showing top 3 rows



In [68]:
# Keep only what the model needs: the assembled feature vector and the
# `duration` label.
model_columns = ['features', 'duration']
df_final = df.select(model_columns)

# Show the resulting schema, then a 3-row sample.
df_final.printSchema()
df_final.show(3)

root
 |-- features: vector (nullable = true)
 |-- duration: integer (nullable = true)

+--------+--------+
|features|duration|
+--------+--------+
|[2153.0]|     351|
| [316.0]|      82|
| [337.0]|      82|
+--------+--------+
only showing top 3 rows



## Chia tập dữ liệu train - test

In [69]:
# 80/20 train/test split. A fixed seed makes the split - and therefore every
# metric computed below - repeatable across runs. The row counts and scores
# recorded in this notebook came from an unseeded run and will differ slightly.
train, test = df_final.randomSplit([0.8, 0.2], seed=42)

In [70]:
# Number of rows that landed in the training split.
train.count()

40089

In [72]:
# Number of rows that landed in the test split.
test.count()

9911

## Xây dựng model

In [75]:
from pyspark.ml.regression import LinearRegression

# Linear regression with default hyperparameters: reads the `features` vector,
# learns to predict `duration`, and writes its output to `duration_predict`.
ln = LinearRegression(
    featuresCol='features',
    labelCol='duration',
    predictionCol='duration_predict',
)

# Fit on the training split only; the test split is held back for evaluation.
ln_model = ln.fit(train)

In [76]:
# Fitted coefficients, one per input feature (here only `mile`).
ln_model.coefficients

DenseVector([0.1216])

In [77]:
# Fitted intercept: the predicted duration at a distance of 0.
ln_model.intercept

44.45401967879791

## Đánh giá kết quả

In [78]:
# Score the fitted model on the held-out test split; the returned summary
# object exposes the error metrics read in the cells that follow.
evaluate_test= ln_model.evaluate(test)

In [79]:
# Mean absolute error on the test split (same unit as `duration`).
evaluate_test.meanAbsoluteError

13.096591755517084

In [80]:
# Mean squared error on the test split (unit of `duration`, squared).
evaluate_test.meanSquaredError

290.61453414399705

In [81]:
# Coefficient of determination (R^2) on the test split.
evaluate_test.r2

0.9623867142612381

## Lưu model

In [83]:
# Persist the fitted model. Going through write().overwrite() lets this cell be
# re-run: a plain save() raises once the target directory already exists.
ln_model.write().overwrite().save('Linear model demo')

## Load model

In [87]:
# A fitted model is restored through the *Model class (LinearRegressionModel),
# not through the LinearRegression estimator that produced it.
from pyspark.ml.regression import LinearRegressionModel
# Reload from the same directory the model was saved to.
ln_model2= LinearRegressionModel.load('Linear model demo')

## Dự đoán với dữ liệu mới

In [97]:
# Simulate unseen data: keep only the feature vector and discard the known
# `duration` label. (The variable name is kept as-is in case later cells
# refer to it.)
unlaber_test_df = test.select(test['features'])

# Apply the reloaded model; it appends its `duration_predict` column.
test_predict = ln_model2.transform(unlaber_test_df)

In [98]:
# Preview the first 3 predictions next to their input features.
test_predict.show(3)

+--------+-----------------+
|features| duration_predict|
+--------+-----------------+
|  [67.0]|52.60264089437822|
|  [67.0]|52.60264089437822|
|  [67.0]|52.60264089437822|
+--------+-----------------+
only showing top 3 rows

