# Building Model using Linear Regression

In [1]:
import org.apache.spark.{SparkConf, SparkContext}
import org.apache.spark.sql._
import org.apache.spark.ml.feature._
import org.apache.spark.ml._
import org.apache.spark.sql.types.{StructType, StructField, StringType, IntegerType,DoubleType};
import org.apache.spark.ml.linalg.Vectors
import org.apache.spark.sql._
import org.apache.spark.ml.feature.{VectorAssembler,VectorIndexer}
import org.apache.spark.ml.Pipeline
import org.apache.spark.ml.regression.LinearRegression
import org.apache.spark.ml.evaluation.RegressionEvaluator

In [2]:
val conf = new SparkConf().setAppName("Regression_on_USA_House_Data")
val sc = new SparkContext(conf)
val spark = new SQLContext(sc) 

conf = org.apache.spark.SparkConf@561a3c0c
sc = org.apache.spark.SparkContext@4bda76cf
spark = org.apache.spark.sql.SQLContext@167e86f6




org.apache.spark.sql.SQLContext@167e86f6

In [3]:
val schema = StructType(Seq(StructField("AvgAreaIncome",DoubleType,true), StructField("AvgAreaHouseAge",DoubleType,true), 
               StructField("AvgAreaNumberoRooms",DoubleType,true), 
               StructField("AvgAreaNumberofBedRooms",DoubleType,true), StructField("AreaPopulation",DoubleType,true), 
               StructField("Price",DoubleType,true), StructField("Address",StringType,true)))

schema = StructType(StructField(AvgAreaIncome,DoubleType,true), StructField(AvgAreaHouseAge,DoubleType,true), StructField(AvgAreaNumberoRooms,DoubleType,true), StructField(AvgAreaNumberofBedRooms,DoubleType,true), StructField(AreaPopulation,DoubleType,true), StructField(Price,DoubleType,true), StructField(Address,StringType,true))


StructType(StructField(AvgAreaIncome,DoubleType,true), StructField(AvgAreaHouseAge,DoubleType,true), StructField(AvgAreaNumberoRooms,DoubleType,true), StructField(AvgAreaNumberofBedRooms,DoubleType,true), StructField(AreaPopulation,DoubleType,true), StructField(Price,DoubleType,true), StructField(Address,StringType,true))

In [4]:
val data = spark.read.format("csv")
                .option("header","true")
                .option("delimiter", ",")
                .schema(schema)
                .load("/user/viswatejaster9073/ML/USA_Housing.csv")

data = [AvgAreaIncome: double, AvgAreaHouseAge: double ... 5 more fields]


[AvgAreaIncome: double, AvgAreaHouseAge: double ... 5 more fields]

# Now Identify the Label (Value we need to predict) In our case its Price
- I am renaming the Price varibale as label for better understanding
- Removing unwanted columns (Dimension reduction/Feture engineering)

- <strike>-So for above requrement I need to create new data frame convert column name price as label and then drop the price column as is is redunent
- To achive this I am taking the name of the columns as array as shown below in the cell 89, exluding price
- and passing this array to selectExr,he :_* operator (which turns an array into a vararg)

## Removing Address column as it wont help for our ML model and Renaming Price column

In [5]:
val final_data = data.withColumnRenamed("price","label").drop("Address")

final_data = [AvgAreaIncome: double, AvgAreaHouseAge: double ... 4 more fields]


[AvgAreaIncome: double, AvgAreaHouseAge: double ... 4 more fields]

In [6]:
final_data.printSchema

root
 |-- AvgAreaIncome: double (nullable = true)
 |-- AvgAreaHouseAge: double (nullable = true)
 |-- AvgAreaNumberoRooms: double (nullable = true)
 |-- AvgAreaNumberofBedRooms: double (nullable = true)
 |-- AreaPopulation: double (nullable = true)
 |-- label: double (nullable = true)



## Checking Null values

In [7]:
final_data.map(x=>x.anyNull).filter($"value" === true).count

5000

### Imputing null values with median

In [8]:
val imputer = new Imputer().setInputCols(final_data.columns).setOutputCols(final_data.columns)
                .setStrategy("median")

imputer = imputer_97b4e67c0185


imputer_97b4e67c0185

In [9]:
val model = imputer.fit(final_data)
val model_final=model.transform(final_data)

model = imputer_97b4e67c0185
model_final = [AvgAreaIncome: double, AvgAreaHouseAge: double ... 4 more fields]


[AvgAreaIncome: double, AvgAreaHouseAge: double ... 4 more fields]

In [10]:
model_final.show()

+------------------+------------------+-------------------+-----------------------+------------------+------------------+
|     AvgAreaIncome|   AvgAreaHouseAge|AvgAreaNumberoRooms|AvgAreaNumberofBedRooms|    AreaPopulation|             label|
+------------------+------------------+-------------------+-----------------------+------------------+------------------+
| 79545.45857431678| 5.682861321615587|  7.009188142792237|                   4.09|23086.800502686456|1059033.5578701235|
| 68789.32900775905| 5.969800580150023| 7.0023266199799705|                   4.05|  36183.2878037408|  1232156.01224821|
| 79248.64245482568|6.0028998082752425|  6.730821019094919|                   3.09| 40173.07217364482|  1505890.91484695|
| 68789.32900775905| 5.969800580150023| 7.0023266199799705|                   4.05|  36183.2878037408|  1232156.01224821|
|61287.067178656784| 5.865889840310001|  8.512727430375099|                   5.13| 36882.15939970458|1058987.9878760849|
| 68789.32900775905| 5.9

In [11]:
model_final.map(x=>x.anyNull).filter($"value" === true).count

0

# Model building using Linear Regression

## using Vector Assemble 
- We need to convert all our  columns other than label(Predicted/dependend variable) as a vector features

In [12]:
val assembler = new VectorAssembler().setInputCols(model_final.columns.filter(!_.contains("label")))
                                .setOutputCol("features")

assembler = vecAssembler_6f97ae7e5442


vecAssembler_6f97ae7e5442

### using transformation, write the data in to a file in the format LIBSVM, so that Our ML model works faster with this format

In [14]:
val output = assembler.transform(model_final).select($"label",$"features")
            .write.format("libsvm").save("/user/viswatejaster9073/ML/libdata.txt")

lastException: Throwable = null
output: Unit = ()


### here we are reading the file which we wrote as LIBSVM format

In [15]:
val model_training = spark.read.format("libsvm").load("/user/viswatejaster9073/ML/libdata.txt")

model_training = [label: double, features: vector]


[label: double, features: vector]

### If you see the below printSchema result you can see features column added as vector type

In [16]:
model_training.printSchema

root
 |-- label: double (nullable = true)
 |-- features: vector (nullable = true)



In [17]:
model_training.show()

+------------------+--------------------+
|             label|            features|
+------------------+--------------------+
|1059033.5578701235|(5,[0,1,2,3,4],[7...|
|  1232156.01224821|(5,[0,1,2,3,4],[6...|
|  1505890.91484695|(5,[0,1,2,3,4],[7...|
|  1232156.01224821|(5,[0,1,2,3,4],[6...|
|1058987.9878760849|(5,[0,1,2,3,4],[6...|
|  1232156.01224821|(5,[0,1,2,3,4],[6...|
|1260616.8066294468|(5,[0,1,2,3,4],[6...|
|  1232156.01224821|(5,[0,1,2,3,4],[6...|
| 630943.4893385402|(5,[0,1,2,3,4],[5...|
|  1232156.01224821|(5,[0,1,2,3,4],[6...|
|1068138.0743935304|(5,[0,1,2,3,4],[8...|
|  1232156.01224821|(5,[0,1,2,3,4],[6...|
|1502055.8173744078|(5,[0,1,2,3,4],[6...|
|  1232156.01224821|(5,[0,1,2,3,4],[6...|
|1573936.5644777215|(5,[0,1,2,3,4],[7...|
|  1232156.01224821|(5,[0,1,2,3,4],[6...|
| 798869.5328331633|(5,[0,1,2,3,4],[5...|
|  1232156.01224821|(5,[0,1,2,3,4],[6...|
|1545154.8126419624|(5,[0,1,2,3,4],[8...|
|  1232156.01224821|(5,[0,1,2,3,4],[6...|
+------------------+--------------

# We are using Linear Regression as our ML model

In [18]:
val lg = new LinearRegression()

lg = linReg_038be81e87c7


linReg_038be81e87c7

### creating test and train data with 70% as training and 30% as test data

In [19]:
val Array(train,test) = model_training.randomSplit(Array(0.7,0.3),seed=123)

train = [label: double, features: vector]
test = [label: double, features: vector]


[label: double, features: vector]

In [68]:
train.count()

6948

In [69]:
test.count()

3052

### Creating pipelines

In [24]:
val pipeline = new Pipeline().setStages(Array(lg))

pipeline = pipeline_5a794761781f


lastException: Throwable = null


pipeline_5a794761781f

In [25]:
val model_lg  = pipeline.fit(train)

model_lg = pipeline_5a794761781f


pipeline_5a794761781f

### Testing Our model with test data

In [30]:
val results = model_lg.transform(test)

results = [label: double, features: vector ... 1 more field]


[label: double, features: vector ... 1 more field]

### If you see the below result column prediction is added which is the predicted values on our test data

In [32]:
results.printSchema

root
 |-- label: double (nullable = true)
 |-- features: vector (nullable = true)
 |-- prediction: double (nullable = false)



### We need only label(Price) and prediction(Predicted price) columns to check the metrics 

In [34]:
val final_result = results.select($"prediction",$"label")

final_result = [prediction: double, label: double]


[prediction: double, label: double]

# checking model accurecy using metrics like RMSE, R-square,MSE
- Refer http://waytodatascience.com/calculating-accuracy/

In [35]:
def modelev(metric: String): Double = {
 val metics = new RegressionEvaluator().setMetricName(metric).
         setPredictionCol("prediction").
         setLabelCol("label")

    metics.evaluate(final_result) 

}

modelev: (metric: String)Double


In [36]:
println("RMSE = " +modelev("rmse"))
println("MSE = " +modelev("mse"))
println("R-Square = " +modelev("r2"))
println("MAE = " +modelev("mae"))

RMSE = 70629.57227802856
MSE = 4.98853648017726E9
R-Square = 0.9219338665184864
MAE = 41423.54571564162


# we can save our model and we can use for production purpose

In [39]:
model_lg.write.overwrite.save("/user/viswatejaster9073/ML/Lg_model")

In [38]:
model_lg

pipeline_5a794761781f