In [1]:
#Linear regression
from pyspark.sql import SparkSession
from pyspark import SparkContext

In [2]:
# run command export PYSPARK_SUBMIT_ARGS="--master spark://127.0.0.0"
# NOTE(review): 127.0.0.0 looks like a typo and no port is given -- confirm the
# intended master URL before relying on this command.
# Create (or reuse) a SparkSession named 'ml' using default builder settings.
# NOTE(review): getOrCreate() hands back an already-active session when one
# exists, so the more detailed builder in the next cell will presumably receive
# this session and its master/memory settings may not take effect -- confirm
# whether this cell is still needed.
spark2 = SparkSession.builder.appName('ml').getOrCreate()

In [3]:
#Create a Spark Session
# Local two-thread session used by the rest of the notebook.
SpSession = (
    SparkSession.builder
    .master("local[2]")
    .appName("ml")
    # Spark size settings take a whole number plus a unit suffix ("512m", "1g").
    # The previous value "0.1g" is fractional, which Spark's size parser rejects
    # when a new context is started, so a valid size is used instead.
    .config("spark.executor.memory", "1g")
    .config("spark.cores.max", "2")
    # NOTE(review): machine-specific absolute path; adjust when running elsewhere.
    .config("spark.sql.warehouse.dir", "/home/sushant/Projects/Spark_Project/temp")
    .getOrCreate()
)

#Get the Spark Context from Spark Session
SpContext = SpSession.sparkContext

#Test Spark
# Smoke test: distribute four numbers and count them (expected result: 4).
testData = SpContext.parallelize([3,6,4,2])
testData.count()
#check http://localhost:4040 to see if Spark is running

4

In [6]:
# Read the raw CSV as an RDD of text lines and mark it for caching
# (cache() returns the same RDD, so it can be chained).
autoData = SpContext.textFile("auto-miles-per-gallon.csv").cache()
autoData.take(5)
# Remove header: drop every line that contains the column name CYLINDERS
dataLines = autoData.filter(lambda line: "CYLINDERS" not in line)
dataLines.count()

from pyspark.sql import Row
"""--------------------------------------------------------------------------
Cleanup Data
-------------------------------------------------------------------------"""

from pyspark.sql import Row

#Use default for average HP
# Broadcast the fallback horsepower value (80.0); CleanupData reads it through
# avgHP.value whenever a record's horsepower field is "?".
avgHP =SpContext.broadcast(80.0)

#Function to cleanup Data
def CleanupData( inputStr) :
    """Parse one comma-separated data line into a Row of float values.

    Fields are read by position (indexes 0-6): MPG, CYLINDERS, DISPLACEMENT,
    HORSEPOWER, WEIGHT, ACCELERATION, MODELYEAR. A horsepower field equal to
    "?" is replaced with the broadcast default ``avgHP.value``.

    Whitespace around each field is ignored, so a padded missing-value marker
    such as " ? " is still recognised instead of failing inside float().

    Raises:
        IndexError: if the line has fewer than seven fields.
        ValueError: if a field cannot be converted to float.
    """
    # avgHP is only read here, so no ``global`` declaration is required.
    attList = [field.strip() for field in inputStr.split(",")]

    #Replace ? values with a normal value
    hpValue = attList[3]
    if hpValue == "?":
        hpValue = avgHP.value

    #Create a row with cleaned up and converted data
    values = Row(
        MPG=float(attList[0]),
        CYLINDERS=float(attList[1]),
        DISPLACEMENT=float(attList[2]),
        HORSEPOWER=float(hpValue),
        WEIGHT=float(attList[4]),
        ACCELERATION=float(attList[5]),
        MODELYEAR=float(attList[6]),
    )
    return values

#Run map for cleanup
# Parse every data line into a Row and mark the result for caching
# (cache() returns the same RDD, so it can be chained).
autoMap = dataLines.map(CleanupData).cache()
autoMap.take(5)
#Create a Data Frame with the data.
autoDf = SpSession.createDataFrame(autoMap)


In [7]:
"""--------------------------------------------------------------------------
Perform Data Analytics
-------------------------------------------------------------------------"""
#See descriptive analytics.
autoDf.select("MPG","CYLINDERS").describe().show()


#Find correlation between predictors and target
# Decide which columns to correlate from the DataFrame schema (autoDf.dtypes
# yields (column name, type name) pairs) instead of fetching a row per column.
# This avoids an extra Spark job for every column and does not depend on the
# Python type of a sampled value (on Python 2 string cells come back as
# unicode, which an isinstance(..., str) check would not catch).
for colName, colType in autoDf.dtypes:
    if colType != "string":
        print( "Correlation to MPG for ", colName, autoDf.stat.corr('MPG',colName))



+-------+------------------+------------------+
|summary|               MPG|         CYLINDERS|
+-------+------------------+------------------+
|  count|               398|               398|
|   mean|23.514572864321615| 5.454773869346734|
| stddev| 7.815984312565783|1.7010042445332123|
|    min|               9.0|               3.0|
|    max|              46.6|               8.0|
+-------+------------------+------------------+

('Correlation to MPG for ', 'ACCELERATION', 0.42028891210165054)
('Correlation to MPG for ', 'CYLINDERS', -0.7753962854205539)
('Correlation to MPG for ', 'DISPLACEMENT', -0.8042028248058979)
('Correlation to MPG for ', 'HORSEPOWER', -0.7746308409203806)
('Correlation to MPG for ', 'MODELYEAR', 0.5792671330833092)
('Correlation to MPG for ', 'MPG', 1.0)
('Correlation to MPG for ', 'WEIGHT', -0.8317409332443344)


In [9]:
'''--------------------------------------------------------------------------
Prepare data for ML
-------------------------------------------------------------------------'''

#Transform to a Data Frame for input to Machine Learning
#Drop columns that are not required (low correlation)

from pyspark.ml.linalg import Vectors
def transformToLabeledPoint(row) :
    """Convert a cleaned record into a (label, features) pair.

    The label is the record's MPG value; the features are a dense vector of
    its ACCELERATION, DISPLACEMENT and WEIGHT values, in that order.
    """
    label = row["MPG"]
    predictors = [row[column] for column in ("ACCELERATION", "DISPLACEMENT", "WEIGHT")]
    return (label, Vectors.dense(predictors))
    
# Map each cleaned record to a (label, features) pair, then build the
# two-column DataFrame used as model input.
# NOTE(review): autoDF (this frame) differs from autoDf (the cleaned frame)
# only by letter case -- easy to confuse when reading later cells.
autoLp = autoMap.map(transformToLabeledPoint)
autoDF = SpSession.createDataFrame(autoLp, schema=["label", "features"])
autoDF.select("label", "features").show(10)

+-----+-------------------+
|label|           features|
+-----+-------------------+
| 18.0|[12.0,307.0,3504.0]|
| 15.0|[11.5,350.0,3693.0]|
| 18.0|[11.0,318.0,3436.0]|
| 16.0|[12.0,304.0,3433.0]|
| 17.0|[10.5,302.0,3449.0]|
| 15.0|[10.0,429.0,4341.0]|
| 14.0| [9.0,454.0,4354.0]|
| 14.0| [8.5,440.0,4312.0]|
| 14.0|[10.0,455.0,4425.0]|
| 15.0| [8.5,390.0,3850.0]|
+-----+-------------------+
only showing top 10 rows



In [10]:
"""--------------------------------------------------------------------------
Perform Machine Learning
-------------------------------------------------------------------------"""

#Split into training and testing data
# A fixed seed keeps the 90/10 split, and therefore the fitted coefficients
# and R2 reported below, repeatable from run to run on the same input.
(trainingData, testData) = autoDF.randomSplit([0.9, 0.1], seed=42)
trainingData.count()
testData.count()

#Build the model on training data
from pyspark.ml.regression import LinearRegression

# Estimator capped at 10 iterations, fitted on the training split only.
lr = LinearRegression(maxIter=10)
lrModel = lr.fit(trainingData)

#Print the metrics
for metricLine in ("Coefficients: " + str(lrModel.coefficients),
                   "Intercept: " + str(lrModel.intercept)):
    print(metricLine)

#Predict on the test data
# transform() is applied to the held-out split; the columns shown are the
# model's prediction alongside the original label and feature vector.
predictions = lrModel.transform(testData)
predictions.select("prediction", "label", "features").show()

#Find R2 for Linear Regression
from pyspark.ml.evaluation import RegressionEvaluator

evaluator = RegressionEvaluator(
    predictionCol="prediction",
    labelCol="label",
    metricName="r2",
)
evaluator.evaluate(predictions)

Coefficients: [0.13121280656751166,-0.014334819573204522,-0.005777197029317336]
Intercept: 41.4094590992
+------------------+-----+-------------------+
|        prediction|label|           features|
+------------------+-----+-------------------+
|13.695847388400853| 10.0|[15.0,307.0,4376.0]|
| 15.63149889077511| 14.0|[14.5,302.0,4042.0]|
|14.491944696892986| 14.0|[15.5,304.0,4257.0]|
|10.216522812545879| 16.0|[11.5,400.0,4668.0]|
|19.090126557002506| 16.0|[18.0,258.0,3632.0]|
| 19.12357062854149| 16.5|[16.7,168.0,3820.0]|
|17.318536261844443| 17.6|[13.4,302.0,3725.0]|
|22.370390937091322| 18.0|[13.5,258.0,2962.0]|
|23.671406928330903| 19.0|[21.9,120.0,3270.0]|
|18.982448350606532| 19.2|[13.2,305.0,3425.0]|
| 21.79583876702373| 19.4|[17.2,232.0,3210.0]|
| 25.26841376652021| 20.3|[15.9,131.0,2830.0]|
|24.563686374686245| 22.0|[14.5,121.0,2945.0]|
|28.282358160866544| 22.0|[16.5,108.0,2379.0]|
| 27.53023470839626| 22.0|[18.0,121.0,2511.0]|
|28.672212998084355| 23.0|[14.0,122.0,2220.0]|
|2

0.6479532399744575