In [1]:

## Trying linear regression to see if this dataset can be better on regression than Clustering!

import findspark
findspark.init('/home/ubuntu/spark-2.3.2-bin-hadoop2.7')
import pyspark
from pyspark.sql import SparkSession
spark = SparkSession.builder.appName('Modelling').getOrCreate() 

In [2]:
#Using the 2 obtained merged datasets from previous steps

final1  = spark.read.csv("Updated Datasets after cleaning/final1.csv",header=True,inferSchema=True) 
final2  = spark.read.csv("Updated Datasets after cleaning/final2.csv",header=True,inferSchema=True) 

final1.show()
final2.show()

+-------+----+-----+----------+----------+------+--------+---------+--------+
|Country|TIME|  Sex|GDPsubject| GDP Value|Impexp|    Cost|pte_value|se_value|
+-------+----+-----+----------+----------+------+--------+---------+--------+
|    AUS|2013|WOMEN|       TOT|1102723.05|   IMP|246949.7|    38.14|    7.89|
|    AUS|2013|  MEN|       TOT|1102723.05|   IMP|246949.7|    13.65|   11.97|
|    AUS|2013|  TOT|       TOT|1102723.05|   IMP|246949.7|    24.88|    10.1|
|    AUS|2013|WOMEN|       TOT|1102723.05|   EXP|254201.7|    38.14|    7.89|
|    AUS|2013|  MEN|       TOT|1102723.05|   EXP|254201.7|    13.65|   11.97|
|    AUS|2013|  TOT|       TOT|1102723.05|   EXP|254201.7|    24.88|    10.1|
|    AUS|2014|WOMEN|       TOT|1116293.11|   IMP|238300.4|    38.33|    7.98|
|    AUS|2014|  MEN|       TOT|1116293.11|   IMP|238300.4|    14.03|   12.05|
|    AUS|2014|  TOT|       TOT|1116293.11|   IMP|238300.4|    25.21|   10.18|
|    AUS|2014|WOMEN|       TOT|1116293.11|   EXP|240425.8|    38

In [10]:
# Import VectorAssembler and Vectors
from pyspark.ml.feature import VectorAssembler

# The input columns are the feature column names, and the output column is what you'd like the new column to be named. 
vector_assembler = VectorAssembler(inputCols = ['Cost','pte_value','se_value'], outputCol = 'features')




# Now that we've created the assembler variable, let's actually transform the data.
vector_output = vector_assembler.transform(final1)


# Using print schema, you see that the features output column has been added. 
vector_output.printSchema()


# You can see that the features column is a DenseVector that combines the various features as expected.
vector_output.head(1)


root
 |-- Country: string (nullable = true)
 |-- TIME: integer (nullable = true)
 |-- Sex: string (nullable = true)
 |-- GDPsubject: string (nullable = true)
 |-- GDP Value: double (nullable = true)
 |-- Impexp: string (nullable = true)
 |-- Cost: double (nullable = true)
 |-- pte_value: double (nullable = true)
 |-- se_value: double (nullable = true)
 |-- features: vector (nullable = true)



[Row(Country='AUS', TIME=2013, Sex='WOMEN', GDPsubject='TOT', GDP Value=1102723.05, Impexp='IMP', Cost=246949.7, pte_value=38.14, se_value=7.89, features=DenseVector([246949.7, 38.14, 7.89]))]

In [28]:

#for seeing if these input can predict the 2016 gender wage gape or not?

vector_assembler2 = VectorAssembler(inputCols = ['Percent Representation in Parliament','2015_gni','2015_gnif'], 
                                    outputCol = 'features')

vector_output2 = vector_assembler2.transform(final2)
vector_output2.printSchema()
vector_output2.head(1)


root
 |-- Country: string (nullable = true)
 |-- Sex: string (nullable = true)
 |-- 2011_fte: double (nullable = true)
 |-- 2012_fte: double (nullable = true)
 |-- 2013_fte: double (nullable = true)
 |-- 2014_fte: double (nullable = true)
 |-- 2015_fte: double (nullable = true)
 |-- 2016_fte: double (nullable = true)
 |-- 2011gwg: double (nullable = true)
 |-- 2012gwg: double (nullable = true)
 |-- 2013gwg: double (nullable = true)
 |-- 2014gwg: double (nullable = true)
 |-- 2015gwg: double (nullable = true)
 |-- 2016gwg: double (nullable = true)
 |-- 2000_gnif: integer (nullable = true)
 |-- 2005_gnif: integer (nullable = true)
 |-- 2010_gnif: integer (nullable = true)
 |-- 2011_gnif: integer (nullable = true)
 |-- 2012_gnif: integer (nullable = true)
 |-- 2013_gnif: integer (nullable = true)
 |-- 2014_gnif: integer (nullable = true)
 |-- 2015_gnif: integer (nullable = true)
 |-- 2000_gni: integer (nullable = true)
 |-- 2005_gni: integer (nullable = true)
 |-- 2010_gni: integer (nulla

[Row(Country='Australia', Sex='Men', 2011_fte=80.2, 2012_fte=79.8, 2013_fte=78.7, 2014_fte=77.7, 2015_fte=77.9, 2016_fte=77.2, 2011gwg=16.0, 2012gwg=13.8, 2013gwg=18.0, 2014gwg=15.4, 2015gwg=13.0, 2016gwg=14.3, 2000_gnif=26268, 2005_gnif=29848, 2010_gnif=31749, 2011_gnif=32039, 2012_gnif=32968, 2013_gnif=33478, 2014_gnif=33751, 2015_gnif=34271, 2000_gni=45151, 2005_gni=49034, 2010_gni=47883, 2011_gni=48125, 2012_gni=49539, 2013_gni=50135, 2014gni=50577, 2015_gni=51386, GII Rank=2, Gender Inequality Index (GII)=0.11, Maternal Mortality Ratio=6, Adolescent Birth Rate=12.1, Percent Representation in Parliament=30.5, Population with Secondary Education (Female)=94.3, Population with Secondary Education (Male)=94.6, Labour Force Participation Rate (Female)=58.8, Labour Force Participation Rate (Male)=71.8, features=DenseVector([30.5, 51386.0, 34271.0]))]

In [29]:
# Because the features have been combined into one vector, we no longer need them. Below we select the features and label.
vector_output = vector_output.select(['features', 'GDP Value'])

# You can see that the dataframe now only contains two columns. 
print(vector_output.head(1))
vector_output.show(3)

[Row(features=DenseVector([246949.7, 38.14, 7.89]), GDP Value=1102723.05)]
+--------------------+----------+
|            features| GDP Value|
+--------------------+----------+
|[246949.7,38.14,7...|1102723.05|
|[246949.7,13.65,1...|1102723.05|
|[246949.7,24.88,1...|1102723.05|
+--------------------+----------+
only showing top 3 rows



In [30]:
vector_output2 = vector_output2.select(['features', '2016gwg'])
print(vector_output2.head(1))
vector_output2.show(3)


[Row(features=DenseVector([30.5, 51386.0, 34271.0]), 2016gwg=14.3)]
+--------------------+--------+
|            features| 2016gwg|
+--------------------+--------+
|[30.5,51386.0,342...|    14.3|
|[30.5,51386.0,342...|    14.3|
|[30.3,57882.0,298...|14.08125|
+--------------------+--------+
only showing top 3 rows



In [31]:
# Let's do a randomised 70/30 split. 
train_data,test_data = vector_output.randomSplit([0.7,0.3])

# Let's see our training data.
train_data.describe().show()

# And our testing data.
test_data.describe().show()

+-------+-----------------+
|summary|        GDP Value|
+-------+-----------------+
|  count|             1332|
|   mean|947248.7424474488|
| stddev|2620082.536606752|
|    min|         12784.97|
|    max|           2.11E7|
+-------+-----------------+

+-------+-----------------+
|summary|        GDP Value|
+-------+-----------------+
|  count|              576|
|   mean|881000.3485416662|
| stddev|2574203.125893797|
|    min|         13431.52|
|    max|           2.11E7|
+-------+-----------------+



In [32]:
# Let's do a randomised 60/40 split. 
train_data2,test_data2 = vector_output2.randomSplit([0.6,0.4])

# Let's see our training data.
train_data2.describe().show()

# And our testing data.
test_data2.describe().show()

+-------+------------------+
|summary|           2016gwg|
+-------+------------------+
|  count|                39|
|   mean|14.066506410256412|
| stddev|2.3682424150561503|
|    min|               4.5|
|    max|              18.1|
+-------+------------------+

+-------+------------------+
|summary|           2016gwg|
+-------+------------------+
|  count|                25|
|   mean|13.519250000000005|
| stddev| 2.567052561629128|
|    min|               4.5|
|    max|              16.5|
+-------+------------------+



In [33]:

# Importing the LR package.
from pyspark.ml.regression import LinearRegression

# Instantiate the instance.
lr = LinearRegression(featuresCol='features', labelCol='GDP Value')
lr2=LinearRegression(featuresCol='features', labelCol='2016gwg')
# Fit the training data.

train_data,test_data = vector_output.randomSplit([0.7,0.3])
train_data2,test_data2 = vector_output2.randomSplit([0.6,0.4])

lr_model = lr.fit(train_data)
lr_model2 = lr2.fit(train_data2)
# Print the coefficients.
print("Coefficients: " + str(lr_model.coefficients))

# Print the intercept.
print("Intercept: " + str(lr_model.intercept) + "\n")

# Summarise the model and print out some evaluation metrics.
training_summary = lr_model.summary

# Print RMSE. 
print("RMSE: " + str(training_summary.rootMeanSquaredError))

# Print R2.
print("R2: " + str(training_summary.r2))

#dataset 2

# Print the coefficients.
print("\nCoefficients: " + str(lr_model2.coefficients))

# Print the intercept.
print("Intercept: " + str(lr_model2.intercept) + "\n")


# Summarise the model and print out some evaluation metrics.
training_summary2 = lr_model2.summary

# Print RMSE. 
print("RMSE: " + str(training_summary2.rootMeanSquaredError))

# Print R2.
print("R2: " + str(training_summary2.r2))


Coefficients: [3.323133611060712,-10793.001093395604,6590.061016754907]
Intercept: -136246.32694589027

RMSE: 1935001.2685033777
R2: 0.3065675545265537

Coefficients: [-0.00031901984408835073,-1.1461075031489649e-05,7.370379414726121e-05]
Intercept: 12.348509562983768

RMSE: 2.0633226886018208
R2: 0.07364859375632093


In [34]:
# Evaluating the modelagainst the test data.
test_results = lr_model.evaluate(test_data)

# And print the RMSE/R2. As expected, our RMSE and R2 are slightly worse when applying the testing set.
print("RMSE on test data: " + str(test_results.rootMeanSquaredError))
print("R2 on test data: " + str(test_results.r2))


RMSE on test data: 2447853.913063798
R2 on test data: 0.41581400773039257


In [35]:
# Evaluating the model against the test data.
test_results2 = lr_model2.evaluate(test_data2)

# And print the RMSE/R2. As expected, our RMSE and R2 are slightly worse when applying the testing set.
print("RMSE on test data2: " + str(test_results2.rootMeanSquaredError))
print("R2 on test data2: " + str(test_results2.r2))

RMSE on test data2: 2.907680264449592
R2 on test data2: -0.00268188875019848


In [36]:
# Linear Regression on train data --by changing some parameters

from pyspark.ml.regression import LinearRegression
lr1 = LinearRegression(featuresCol = 'features', labelCol='GDP Value', maxIter=10, regParam=0.3, elasticNetParam=0.8)
lr_model1= lr1.fit(train_data)
print("Coefficients: " + str(lr_model1.coefficients))
print("Intercept: " + str(lr_model1.intercept))

Coefficients: [3.323132735646495,-10792.969087636659,6590.044462744225]
Intercept: -136246.28670244865


In [37]:
# Linear Regression on train data
from pyspark.ml.regression import LinearRegression
lr2 = LinearRegression(featuresCol = 'features', labelCol='2016gwg', maxIter=10, regParam=0.3, elasticNetParam=0.8)
lr_model2 = lr2.fit(train_data2)
print("Coefficients: " + str(lr_model2.coefficients))
print("Intercept: " + str(lr_model2.intercept))

Coefficients: [0.0,0.0,3.331813322936408e-05]
Intercept: 12.981333348351374


In [38]:
#linear regression algorithm

#for dataset 1

lr_predictions1 = lr_model1.transform(test_data) 
lr_predictions1.select("prediction","GDP Value","features").show(5)

from pyspark.ml.evaluation import RegressionEvaluator

lrR2_evaluator = RegressionEvaluator(predictionCol="prediction", \
                 labelCol="GDP Value",metricName="r2")
print("R Squared (R2) = %g" % lrR2_evaluator.evaluate(lr_predictions1))

lrMAE_evaluator = RegressionEvaluator(predictionCol="prediction", \
                 labelCol="GDP Value",metricName="mae")
print("Mean Absolute Error (MAE) = %g" % lrMAE_evaluator.evaluate(lr_predictions1))

lrRMSE_evaluator = RegressionEvaluator(predictionCol="prediction", \
                 labelCol="GDP Value",metricName="rmse")
print("Root Mean Squared Error (RMSE) = %g" % lrRMSE_evaluator.evaluate(lr_predictions1))
 
#as seen below, this method for dataset 1 is not at all good!! only gives 41% R2

+------------------+---------+--------------------+
|        prediction|GDP Value|            features|
+------------------+---------+--------------------+
|-85840.83391593977| 23802.34|[13589.05,6.56,11...|
| -92759.6094053844| 29862.34|[24242.35,9.26,9.54]|
|-45997.12983585165| 28783.54|[24748.29,6.75,12...|
|-45997.12983585165| 83613.58|[24748.29,6.75,12...|
|-43198.09818500299| 31471.88|[25029.41,9.16,16.5]|
+------------------+---------+--------------------+
only showing top 5 rows

R Squared (R2) = 0.415814
Mean Absolute Error (MAE) = 999428
Root Mean Squared Error (RMSE) = 2.44785e+06


In [39]:
#linear regression algorithm

#for dataset 2 

lr_predictions2 = lr_model2.transform(test_data2) 
lr_predictions2.select("prediction","2016gwg","features").show(5)

from pyspark.ml.evaluation import RegressionEvaluator

lrR2_evaluator = RegressionEvaluator(predictionCol="prediction", \
                 labelCol="2016gwg",metricName="r2")
print("R Squared (R2) = %g" % lrR2_evaluator.evaluate(lr_predictions2))

lrMAE_evaluator = RegressionEvaluator(predictionCol="prediction", \
                 labelCol="2016gwg",metricName="mae")
print("Mean Absolute Error (MAE) = %g" % lrMAE_evaluator.evaluate(lr_predictions2))

lrRMSE_evaluator = RegressionEvaluator(predictionCol="prediction", \
                 labelCol="2016gwg",metricName="rmse")
print("Root Mean Squared Error (RMSE) = %g" % lrRMSE_evaluator.evaluate(lr_predictions2))

#as seen below, for dataset2 it gives a model with good rmse value but R2 is not accuracy

+------------------+--------+--------------------+
|        prediction| 2016gwg|            features|
+------------------+--------+--------------------+
|13.574162892901448|     9.4|[10.1,29561.0,177...|
|13.479606030796514|14.08125|[15.8,28556.0,149...|
|13.608447251994464|14.08125|[18.0,27034.0,188...|
| 13.68104746430125|    16.3|[18.9,35540.0,210...|
|13.713532644199878|14.08125|[19.8,31347.0,219...|
+------------------+--------+--------------------+
only showing top 5 rows

R Squared (R2) = 0.0148614
Mean Absolute Error (MAE) = 1.80419
Root Mean Squared Error (RMSE) = 2.88213


In [40]:
# Decision Tree Algorithm - applying to dataset 2 

from pyspark.ml.regression import DecisionTreeRegressor

dt = DecisionTreeRegressor(featuresCol ='features', labelCol = '2016gwg',maxBins=50)
dt_model2 = dt.fit(train_data2)
dt_predictions2 = dt_model2.transform(test_data2)
dt_evaluator2 = RegressionEvaluator(
    labelCol="2016gwg", predictionCol="prediction", metricName="rmse")
rmse = dt_evaluator2.evaluate(dt_predictions2)
print("Root Mean Squared Error (RMSE) on test data = %g" % rmse)
dt_predictions2.select('prediction', '2016gwg', 'features').show(10)

#Gives rmse as 0.85 which is good

Root Mean Squared Error (RMSE) on test data = 0.859803
+------------------+--------+--------------------+
|        prediction| 2016gwg|            features|
+------------------+--------+--------------------+
|               9.4|     9.4|[10.1,29561.0,177...|
|14.081250000000002|14.08125|[15.8,28556.0,149...|
|          14.08125|14.08125|[18.0,27034.0,188...|
|              16.3|    16.3|[18.9,35540.0,210...|
|          14.08125|14.08125|[19.8,31347.0,219...|
|               4.5|     4.5|[21.0,32687.0,172...|
|          14.08125|14.08125|[22.1,29665.0,189...|
|          14.08125|14.08125|[22.5,39248.0,233...|
|          14.08125|14.08125|[23.4,30533.0,221...|
|14.081249999999999|    16.8|[23.5,49885.0,263...|
+------------------+--------+--------------------+
only showing top 10 rows



In [41]:
r_evaluator = RegressionEvaluator(predictionCol="prediction", \
                 labelCol="2016gwg",metricName="r2")
r2=r_evaluator.evaluate(dt_predictions2)
print("R Squared (R2) = %g" % r2)
dt_predictions2.select('prediction', '2016gwg', 'features').show(10)


#R2 value 0.91is really good!

R Squared (R2) = 0.912327
+------------------+--------+--------------------+
|        prediction| 2016gwg|            features|
+------------------+--------+--------------------+
|               9.4|     9.4|[10.1,29561.0,177...|
|14.081250000000002|14.08125|[15.8,28556.0,149...|
|          14.08125|14.08125|[18.0,27034.0,188...|
|              16.3|    16.3|[18.9,35540.0,210...|
|          14.08125|14.08125|[19.8,31347.0,219...|
|               4.5|     4.5|[21.0,32687.0,172...|
|          14.08125|14.08125|[22.1,29665.0,189...|
|          14.08125|14.08125|[22.5,39248.0,233...|
|          14.08125|14.08125|[23.4,30533.0,221...|
|14.081249999999999|    16.8|[23.5,49885.0,263...|
+------------------+--------+--------------------+
only showing top 10 rows



In [43]:
# Decision Tree Algorithm - applying to dataset 1 

from pyspark.ml.regression import DecisionTreeRegressor

dt = DecisionTreeRegressor(featuresCol ='features', labelCol = 'GDP Value',maxBins=50)
dt_model = dt.fit(train_data)
dt_predictions = dt_model2.transform(test_data)
dt_evaluator = RegressionEvaluator(
    labelCol="GDP Value", predictionCol="prediction", metricName="rmse")
rmse = dt_evaluator.evaluate(dt_predictions)
print("Root Mean Squared Error (RMSE) on test data = %g" % rmse)
dt_predictions.select('prediction', 'GDP Value', 'features').show(10)


Root Mean Squared Error (RMSE) on test data = 3.37456e+06
+----------+---------+--------------------+
|prediction|GDP Value|            features|
+----------+---------+--------------------+
|      16.5| 23802.34|[13589.05,6.56,11...|
|      16.5| 29862.34|[24242.35,9.26,9.54]|
|      16.5| 28783.54|[24748.29,6.75,12...|
|      16.5| 83613.58|[24748.29,6.75,12...|
|      16.5| 31471.88|[25029.41,9.16,16.5]|
|      16.5| 64935.93|[25029.41,12.06,1...|
|      16.5| 32729.61|[25936.72,5.22,17...|
|      16.5| 67574.53|[25936.72,5.22,17...|
|      16.5|  85652.1|[26197.98,6.95,12...|
|      16.5| 31471.88|[26666.99,6.75,19...|
+----------+---------+--------------------+
only showing top 10 rows



In [46]:
r_evaluator = RegressionEvaluator(predictionCol="prediction", \
                 labelCol="GDP Value",metricName="r2")
r2=r_evaluator.evaluate(dt_predictions)
print("R Squared (R2) = %g" % r2)
dt_predictions.select('prediction', 'GDP Value', 'features').show(10)


#R2 is minus, says this does not suit dataset 1

R Squared (R2) = -0.110232
+----------+---------+--------------------+
|prediction|GDP Value|            features|
+----------+---------+--------------------+
|      16.5| 23802.34|[13589.05,6.56,11...|
|      16.5| 29862.34|[24242.35,9.26,9.54]|
|      16.5| 28783.54|[24748.29,6.75,12...|
|      16.5| 83613.58|[24748.29,6.75,12...|
|      16.5| 31471.88|[25029.41,9.16,16.5]|
|      16.5| 64935.93|[25029.41,12.06,1...|
|      16.5| 32729.61|[25936.72,5.22,17...|
|      16.5| 67574.53|[25936.72,5.22,17...|
|      16.5|  85652.1|[26197.98,6.95,12...|
|      16.5| 31471.88|[26666.99,6.75,19...|
+----------+---------+--------------------+
only showing top 10 rows



AS SEEN, ONLY THE DATASET2 WITH PARTICULAR FEATURES GIVES AN ACCURATE MODEL 91% PREDICTION POWER.

(can be applied and seen with other attributes of dataset 2, if time permits)

Changing the split value and iterating--

In [19]:
### Iteration 2 with dataset 2  -70/30 split

# Let's do a randomised 70/30 split. 
train_data2,test_data2 = vector_output2.randomSplit([0.7,0.3])

# Let's see our training data.
train_data2.describe().show()

# And our testing data.
test_data2.describe().show()

+-------+------------------+
|summary|           2016gwg|
+-------+------------------+
|  count|                48|
|   mean|13.916666666666663|
| stddev| 2.527794079390451|
|    min|               4.5|
|    max|              18.1|
+-------+------------------+

+-------+------------------+
|summary|           2016gwg|
+-------+------------------+
|  count|                16|
|   mean|13.660937500000005|
| stddev|2.2321858448241567|
|    min|               7.8|
|    max|              18.1|
+-------+------------------+



In [20]:
# Importing the LR package.
from pyspark.ml.regression import LinearRegression

# Instantiate the instance.
lr = LinearRegression(featuresCol='features', labelCol='GDP Value')
lr2=LinearRegression(featuresCol='features', labelCol='2016gwg')
# Fit the training data.

train_data,test_data = vector_output.randomSplit([0.7,0.3])

lr_model = lr.fit(train_data)
lr_model2 = lr2.fit(train_data2)
# Print the coefficients.
print("Coefficients: " + str(lr_model.coefficients))

# Print the intercept.
print("Intercept: " + str(lr_model.intercept) + "\n")

# Summarise the model and print out some evaluation metrics.
training_summary = lr_model.summary

# Print RMSE. 
print("RMSE: " + str(training_summary.rootMeanSquaredError))

# Print R2.
print("R2: " + str(training_summary.r2))

#dataset 2

# Print the coefficients.
print("\nCoefficients: " + str(lr_model2.coefficients))

# Print the intercept.
print("Intercept: " + str(lr_model2.intercept) + "\n")


# Summarise the model and print out some evaluation metrics.
training_summary2 = lr_model2.summary

# Print RMSE. 
print("RMSE: " + str(training_summary2.rootMeanSquaredError))

# Print R2.
print("R2: " + str(training_summary2.r2))


Coefficients: [4.043977049925853,-18182.42848105325,10512.164639303077]
Intercept: -256737.21352440442

RMSE: 2084955.9206300266
R2: 0.3668494808288326

Coefficients: [0.02852237395665148,1.3420948370530414e-05,1.2153456555468153e-05]
Intercept: 12.154099010516255

RMSE: 2.4512457243638575
R2: 0.03964081777278661


In [21]:
#Dataset 1 with 70/30 --no changes in split but output is changed
# Evaluating the modelagainst the test data.
test_results = lr_model.evaluate(test_data)

# And print the RMSE/R2. As expected, our RMSE and R2 are slightly worse when applying the testing set.
print("RMSE on test data: " + str(test_results.rootMeanSquaredError))
print("R2 on test data: " + str(test_results.r2))

#Both test and train have almost same values for R2 and rmse

RMSE on test data: 2068764.9642582173
R2 on test data: 0.3516309120621469


In [22]:
# Evaluating the model against the test data.
test_results2 = lr_model2.evaluate(test_data2)

# And print the RMSE/R2. As expected, our RMSE and R2 are slightly worse when applying the testing set.
print("RMSE on test data2: " + str(test_results2.rootMeanSquaredError))
print("R2 on test data2: " + str(test_results2.r2))

#In test results, the R2 value increased and shifted more towards 1-- good sign!

RMSE on test data2: 2.093661266798269
R2 on test data2: 0.06161542307065859


In [23]:
#linear regression algorithm using Regression Evaluator

#for dataset 2 

lr_predictions2 = lr_model2.transform(test_data2) 
lr_predictions2.select("prediction","2016gwg","features").show(5)

from pyspark.ml.evaluation import RegressionEvaluator

lrR2_evaluator = RegressionEvaluator(predictionCol="prediction", \
                 labelCol="2016gwg",metricName="r2")
print("R Squared (R2) = %g" % lrR2_evaluator.evaluate(lr_predictions2))

lrMAE_evaluator = RegressionEvaluator(predictionCol="prediction", \
                 labelCol="2016gwg",metricName="mae")
print("Mean Absolute Error (MAE) = %g" % lrMAE_evaluator.evaluate(lr_predictions2))

lrRMSE_evaluator = RegressionEvaluator(predictionCol="prediction", \
                 labelCol="2016gwg",metricName="rmse")
print("Root Mean Squared Error (RMSE) = %g" % lrRMSE_evaluator.evaluate(lr_predictions2))

# 61% accuracy with linear regression on dataset2

+------------------+--------+--------------------+
|        prediction| 2016gwg|            features|
+------------------+--------+--------------------+
| 13.05515809475113|     9.4|[10.1,29561.0,177...|
|13.057065272600136|14.08125|[14.4,27034.0,106...|
|13.259076019271923|14.08125|[18.0,27034.0,188...|
| 14.08561004191009|    18.1|[19.4,64406.0,422...|
| 13.41253145985605|14.08125|[22.1,29665.0,189...|
+------------------+--------+--------------------+
only showing top 5 rows

R Squared (R2) = 0.0616154
Mean Absolute Error (MAE) = 1.21169
Root Mean Squared Error (RMSE) = 2.09366


In [24]:
# Decision Tree Algorithm - applying to dataset 2

from pyspark.ml.regression import DecisionTreeRegressor

dt = DecisionTreeRegressor(featuresCol ='features', labelCol = '2016gwg',maxBins=50)
dt_model2 = dt.fit(train_data2)
dt_predictions2 = dt_model2.transform(test_data2)
dt_evaluator2 = RegressionEvaluator(
    labelCol="2016gwg", predictionCol="prediction", metricName="rmse")
rmse = dt_evaluator2.evaluate(dt_predictions2)
print("Root Mean Squared Error (RMSE) on test data = %g" % rmse)
dt_predictions2.select('prediction', '2016gwg', 'features').show(10)

#low value of rmse--good sign!

Root Mean Squared Error (RMSE) on test data = 0.142637
+------------------+--------+--------------------+
|        prediction| 2016gwg|            features|
+------------------+--------+--------------------+
|               9.4|     9.4|[10.1,29561.0,177...|
|14.081250000000002|14.08125|[14.4,27034.0,106...|
|          14.08125|14.08125|[18.0,27034.0,188...|
|              18.1|    18.1|[19.4,64406.0,422...|
|         14.190625|14.08125|[22.1,29665.0,189...|
|14.081250000000002|14.08125|[22.5,39248.0,233...|
|14.396527777777784|14.08125|[28.5,66105.0,468...|
|14.081250000000002|14.08125|[30.1,44844.0,229...|
|14.081250000000002|14.08125|[30.1,44844.0,229...|
|14.396527777777784|14.08125|[30.3,57882.0,298...|
+------------------+--------+--------------------+
only showing top 10 rows



In [25]:
r_evaluator = RegressionEvaluator(predictionCol="prediction", \
                 labelCol="2016gwg",metricName="r2")
r2=r_evaluator.evaluate(dt_predictions2)
print("R Squared (R2) = %g" % r2)
dt_predictions2.select('prediction', '2016gwg', 'features').show(10)

#gives approximately 99% accuracy when 70/30 split is done in dataset2--- great!


R Squared (R2) = 0.995645
+------------------+--------+--------------------+
|        prediction| 2016gwg|            features|
+------------------+--------+--------------------+
|               9.4|     9.4|[10.1,29561.0,177...|
|14.081250000000002|14.08125|[14.4,27034.0,106...|
|          14.08125|14.08125|[18.0,27034.0,188...|
|              18.1|    18.1|[19.4,64406.0,422...|
|         14.190625|14.08125|[22.1,29665.0,189...|
|14.081250000000002|14.08125|[22.5,39248.0,233...|
|14.396527777777784|14.08125|[28.5,66105.0,468...|
|14.081250000000002|14.08125|[30.1,44844.0,229...|
|14.081250000000002|14.08125|[30.1,44844.0,229...|
|14.396527777777784|14.08125|[30.3,57882.0,298...|
+------------------+--------+--------------------+
only showing top 10 rows

