In [26]:
from pyspark.sql import SparkSession
from pyspark.sql.functions import *
from pyspark.ml.feature import VectorAssembler, StringIndexer, OneHotEncoder, StandardScaler, ChiSqSelector, PCA
from pyspark.ml import Pipeline
from pyspark.ml.regression import LinearRegression
from pyspark.ml.evaluation import RegressionEvaluator
from pyspark.ml.tuning import ParamGridBuilder, CrossValidator

In [7]:
spark = SparkSession.builder.appName("").getOrCreate()

In [8]:
!ls

bitcoin.csv		  Pipeline_FeatureEngineering.ipynb
LinearRegression2.ipynb   PracticeUAS.ipynb
LinearRegression.ipynb	  student_academic_placement_performance_dataset.csv
LogisticRegression.ipynb  work


In [9]:
df = spark.read.csv("./bitcoin.csv", header = True, inferSchema = True)
df.show()

+----------+------------------+------------------+------------------+------------------+--------+
|      Date|             Close|              High|               Low|              Open|  Volume|
+----------+------------------+------------------+------------------+------------------+--------+
|2014-09-17| 457.3340148925781|468.17401123046875| 452.4219970703125|  465.864013671875|21056800|
|2014-09-18|424.44000244140625| 456.8599853515625|   413.10400390625| 456.8599853515625|34483200|
|2014-09-19| 394.7959899902344| 427.8349914550781| 384.5320129394531| 424.1029968261719|37919700|
|2014-09-20|408.90399169921875| 423.2959899902344|389.88299560546875| 394.6730041503906|36863600|
|2014-09-21| 398.8210144042969| 412.4259948730469| 393.1809997558594| 408.0849914550781|26580100|
|2014-09-22| 402.1520080566406| 406.9159851074219| 397.1300048828125| 399.1000061035156|24127600|
|2014-09-23| 435.7909851074219| 441.5570068359375| 396.1969909667969| 402.0920104980469|45099500|
|2014-09-24| 423.204

In [10]:
print("Jumlah baris = {:}".format(df.count()))

Jumlah baris = 4134


In [11]:
df = df.drop("Date")
df.show()

+------------------+------------------+------------------+------------------+--------+
|             Close|              High|               Low|              Open|  Volume|
+------------------+------------------+------------------+------------------+--------+
| 457.3340148925781|468.17401123046875| 452.4219970703125|  465.864013671875|21056800|
|424.44000244140625| 456.8599853515625|   413.10400390625| 456.8599853515625|34483200|
| 394.7959899902344| 427.8349914550781| 384.5320129394531| 424.1029968261719|37919700|
|408.90399169921875| 423.2959899902344|389.88299560546875| 394.6730041503906|36863600|
| 398.8210144042969| 412.4259948730469| 393.1809997558594| 408.0849914550781|26580100|
| 402.1520080566406| 406.9159851074219| 397.1300048828125| 399.1000061035156|24127600|
| 435.7909851074219| 441.5570068359375| 396.1969909667969| 402.0920104980469|45099500|
| 423.2049865722656|436.11199951171875| 421.1319885253906| 435.7510070800781|30627700|
| 411.5740051269531| 423.5199890136719| 409

In [12]:
df.columns

['Close', 'High', 'Low', 'Open', 'Volume']

In [13]:
# Dengan Pipeline
num_cols = ['Close', 'High', 'Low', 'Open']
assembler = VectorAssembler(inputCols = num_cols, outputCol = "features")
scaler = StandardScaler(inputCol = "features", outputCol = "scaled_features", withMean = True, withStd = True)
lm = LinearRegression(featuresCol = "scaled_features", labelCol = "Volume")

In [14]:
pipeline = Pipeline(stages = [assembler, scaler, lm])

In [15]:
train_data, test_data = df.randomSplit([0.7, 0.3], seed = 123)

In [16]:
# train_data.describe().show()

In [17]:
# test_data.describe().show()

In [18]:
model = pipeline.fit(train_data)

In [19]:
prediction = model.transform(test_data)

In [23]:
prediction.select("scaled_features", "prediction").show(truncate = False)

+---------------------------------------------------------------------------------+-------------------+
|scaled_features                                                                  |prediction         |
+---------------------------------------------------------------------------------+-------------------+
|[-0.8496698840220611,-0.8516155753404662,-0.8466943074311993,-0.8494274773741267]|6.581895311367535E9|
|[-0.8495744356973831,-0.8504697310689311,-0.8463255214888169,-0.847862895044234] |6.802389662420425E9|
|[-0.8493813622198187,-0.8510548352706829,-0.8462066147066938,-0.8488319846224648]|6.608732786909113E9|
|[-0.8493149770181748,-0.851558443597979,-0.8460973943399133,-0.8490860339241958] |6.436401989146534E9|
|[-0.8493071351618336,-0.851447076373616,-0.8462017558236008,-0.8488721748773027] |6.495664942186214E9|
|[-0.8492163594230809,-0.8513348277830471,-0.8461574675321722,-0.8491759900808108]|6.518037175147118E9|
|[-0.8491814705861432,-0.8515297994364404,-0.8460060131006538,-0

In [24]:
paramGrid = (ParamGridBuilder().addGrid(lm.regParam, [0.01, 0.1, 0.5])
            .addGrid(lm.elasticNetParam, [0.0, 0.5, 1.0])
            .build())

In [27]:
eval = RegressionEvaluator(labelCol = 'Volume', predictionCol = 'prediction', metricName = 'rmse')

In [28]:
cv = CrossValidator(estimator = pipeline, estimatorParamMaps = paramGrid, evaluator = eval, numFolds = 3)

In [29]:
cv_model = cv.fit(train_data)

In [38]:
evaluator_r2 = RegressionEvaluator(labelCol="Volume", 
                                   predictionCol="prediction", 
                                   metricName="r2")
r2 = evaluator_r2.evaluate(prediction)
r2

0.6988818222256519

In [108]:
# Tanpa Pipeline
num_cols = ['Close', 'High', 'Low', 'Open']
assembler = VectorAssembler(inputCols = num_cols, outputCol = "x")
df_vec = assembler.transform(df)

scaler = StandardScaler(inputCol = "x", outputCol = "features", withMean = True, withStd = True)
scaler = scaler.fit(df_vec)
df_scaled = scaler.transform(df_vec)

In [110]:
df_final = df_scaled.select("features", "Volume")

In [111]:
train, test = df_final.randomSplit([0.7, 0.3], seed = 123)

In [112]:
lm = LinearRegression(labelCol = "Volume")
model = lm.fit(train)

In [114]:
y_preds = model.transform(test)

In [115]:
y_preds.show()

+--------------------+--------+--------------------+
|            features|  Volume|          prediction|
+--------------------+--------+--------------------+
|[-0.8434360552860...|38421000| 6.581895311367832E9|
|[-0.8433421927248...|42147200| 6.802389662420353E9|
|[-0.8431523269395...|22516400|  6.60873278690996E9|
|[-0.8430870446511...|17201900| 6.436401989146374E9|
|[-0.8430793330781...|28943700| 6.495664942185322E9|
|[-0.8429900654732...|24435300| 6.518037175147762E9|
|[-0.8429557562739...|12939000| 6.420127574531206E9|
|[-0.8429112802474...|21905400|  6.43401857497414E9|
|[-0.8429055201020...|18056500| 6.414721654222771E9|
|[-0.8428785761850...|18936500| 6.422113351779537E9|
|[-0.8428778523845...|31808000|6.4887551572655525E9|
|[-0.8428770968847...|21469200| 6.446912133998688E9|
|[-0.8428602568765...|13957200| 6.445301446289722E9|
|[-0.8428510026047...|60869200| 6.521778664517097E9|
|[-0.8428326203780...|40783700| 6.581808853034462E9|
|[-0.8428173230686...|21604200| 6.459770582834

In [118]:
res = model.evaluate(test)

In [119]:
res.residuals.show()

+--------------------+
|           residuals|
+--------------------+
|-6.543474311367832E9|
|-6.760242462420353E9|
| -6.58621638690996E9|
|-6.419200089146374E9|
|-6.466721242185322E9|
|-6.493601875147762E9|
|-6.407188574531206E9|
| -6.41211317497414E9|
|-6.396665154222771E9|
|-6.403176851779537E9|
|-6.45694715726555...|
|-6.425442933998688E9|
|-6.431344246289722E9|
|-6.460909464517097E9|
|-6.541025153034462E9|
|-6.438166382834436E9|
|-6.404317623062122E9|
|-6.389923474823809E9|
| -6.42505248443712E9|
|-6.383959883360889E9|
+--------------------+
only showing top 20 rows



In [120]:
ul = test.select("features")

In [122]:
pred = model.transform(ul)

In [124]:
pred.show()

+--------------------+--------------------+
|            features|          prediction|
+--------------------+--------------------+
|[-0.8434360552860...| 6.581895311367832E9|
|[-0.8433421927248...| 6.802389662420353E9|
|[-0.8431523269395...|  6.60873278690996E9|
|[-0.8430870446511...| 6.436401989146374E9|
|[-0.8430793330781...| 6.495664942185322E9|
|[-0.8429900654732...| 6.518037175147762E9|
|[-0.8429557562739...| 6.420127574531206E9|
|[-0.8429112802474...|  6.43401857497414E9|
|[-0.8429055201020...| 6.414721654222771E9|
|[-0.8428785761850...| 6.422113351779537E9|
|[-0.8428778523845...|6.4887551572655525E9|
|[-0.8428770968847...| 6.446912133998688E9|
|[-0.8428602568765...| 6.445301446289722E9|
|[-0.8428510026047...| 6.521778664517097E9|
|[-0.8428326203780...| 6.581808853034462E9|
|[-0.8428173230686...| 6.459770582834436E9|
|[-0.8427752388977...| 6.424461823062122E9|
|[-0.8427559124163...| 6.404339474823809E9|
|[-0.8427534888615...|  6.45463968443712E9|
|[-0.8427526075251...| 6.4028952

In [128]:
print("MAE: ", res.meanAbsoluteError)
print('MSE: ', res.meanSquaredError)
print("RMSE: ", res.rootMeanSquaredError)
print("R2: ", res.r2)
print("Adj R2: ", res.r2adj)

MAE:  9518026857.738497
MSE:  1.557379053680848e+20
RMSE:  12479499403.745522
R2:  0.6988818222255906
Adj R2:  0.6979326711530156
