<a href="https://colab.research.google.com/github/syedmahmoodiagents/BigData/blob/main/Simplistic_pyspark_LinearRegression.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [1]:
# Uncomment the next line to install PySpark if it is not already available in this runtime.
# !pip install pyspark

In [2]:
from pyspark.sql import SparkSession

In [3]:
from pyspark.ml.regression import LinearRegression

In [4]:
from pyspark.ml.feature import VectorAssembler

In [15]:
from pyspark.ml.evaluation import RegressionEvaluator

In [5]:
# Start (or reuse) the Spark session that the rest of the notebook works through.
sp = (
    SparkSession.builder
    .appName('LinearRegression')
    .getOrCreate()
)

In [6]:
import numpy as np
import pandas as pd

In [7]:
# Seed NumPy's global generator so the synthetic sample is the same on every run.
np.random.seed(42)

# Build a 20-row frame of normally distributed columns. The columns are drawn
# in the order a, b, y; each draw advances the shared random state, so this
# order determines the values.
data = pd.DataFrame(
    {
        'a': np.random.normal(loc=17, scale=2.3, size=20),
        'b': np.random.normal(loc=18, scale=2.2, size=20),
        'y': np.random.normal(loc=17.5, scale=2.5, size=20),
    }
)

In [8]:
# Keep two decimal places in every column.
data = data.round(decimals=2)

In [9]:
# Show the rounded synthetic sample as the cell output.
data

Unnamed: 0,a,b,y
0,18.14,21.22,19.35
1,16.68,17.5,17.93
2,18.49,18.15,17.21
3,20.5,14.87,16.75
4,16.46,16.8,13.8
5,16.46,18.24,15.7
6,20.63,15.47,16.35
7,18.77,18.83,20.14
8,15.92,16.68,18.36
9,18.25,17.36,13.09


In [10]:
# Convert the pandas sample into a Spark DataFrame with columns a, b, y.
df = sp.createDataFrame(data, schema=['a', 'b', 'y'])

In [11]:
# Print the first five rows of the Spark DataFrame.
df.show(n=5)

+-----+-----+-----+
|    a|    b|    y|
+-----+-----+-----+
|18.14|21.22|19.35|
|16.68| 17.5|17.93|
|18.49|18.15|17.21|
| 20.5|14.87|16.75|
|16.46| 16.8| 13.8|
+-----+-----+-----+
only showing top 5 rows



In [12]:
# Combine the two predictor columns into a single vector column named 'XX'.
vec = VectorAssembler(
    inputCols=['a', 'b'],
    outputCol='XX',
)

In [13]:
# Apply the assembler: dt is df plus the assembled 'XX' feature column.
dt = vec.transform(dataset=df)

In [14]:
# Print the assembled frame (a, b, y and the 'XX' feature vector).
dt.show()

+-----+-----+-----+-------------+
|    a|    b|    y|           XX|
+-----+-----+-----+-------------+
|18.14|21.22|19.35|[18.14,21.22]|
|16.68| 17.5|17.93| [16.68,17.5]|
|18.49|18.15|17.21|[18.49,18.15]|
| 20.5|14.87|16.75| [20.5,14.87]|
|16.46| 16.8| 13.8| [16.46,16.8]|
|16.46|18.24| 15.7|[16.46,18.24]|
|20.63|15.47|16.35|[20.63,15.47]|
|18.77|18.83|20.14|[18.77,18.83]|
|15.92|16.68|18.36|[15.92,16.68]|
|18.25|17.36|13.09|[18.25,17.36]|
|15.93|16.68|18.31|[15.93,16.68]|
|15.93|22.08|16.54|[15.93,22.08]|
|17.56|17.97|15.81|[17.56,17.97]|
| 12.6|15.67|19.03| [12.6,15.67]|
|13.03|19.81|20.08|[13.03,19.81]|
|15.71|15.31|19.83|[15.71,15.31]|
|14.67|18.46| 15.4|[14.67,18.46]|
|17.72|13.69|16.73|[17.72,13.69]|
|14.91|15.08|18.33|[14.91,15.08]|
|13.75|18.43|19.94|[13.75,18.43]|
+-----+-----+-----+-------------+



In [None]:
# Configure the estimator: features come from 'XX', the target is 'y'.
mod = LinearRegression(
    featuresCol='XX',
    labelCol='y',
)

In [None]:
# Fit the linear regression on the assembled frame.
model = mod.fit(dataset=dt)

In [None]:
# Add the fitted model's 'prediction' column to dt and print the rows.
# transform() returns a lazy DataFrame, so a bare expression would only echo
# its schema; show() is what prints the table.
model.transform(dt).show()

+-----+-----+-----+-------------+------------------+
|    a|    b|    y|           XX|        prediction|
+-----+-----+-----+-------------+------------------+
|18.14|21.22|19.35|[18.14,21.22]|17.133387562310137|
|16.68| 17.5|17.93| [16.68,17.5]|  17.4142812602453|
|18.49|18.15|17.21|[18.49,18.15]|16.862504493489183|
| 20.5|14.87|16.75| [20.5,14.87]| 16.04415511089924|
|16.46| 16.8| 13.8| [16.46,16.8]| 17.44944304005367|
|16.46|18.24| 15.7|[16.46,18.24]|  17.5234263290357|
|20.63|15.47|16.35|[20.63,15.47]| 16.03295255303351|
|18.77|18.83|20.14|[18.77,18.83]|16.806917201104397|
|15.92|16.68|18.36|[15.92,16.68]|17.617859468036848|
|18.25|17.36|13.09|[18.25,17.36]| 16.89950830670155|
|15.93|16.68|18.31|[15.93,16.68]| 17.61462647355416|
|15.93|22.08|16.54|[15.93,22.08]| 17.89206380723677|
|17.56|17.97|15.81|[17.56,17.97]|17.153925069256175|
| 12.6|15.67|19.03| [12.6,15.67]| 18.63932257943295|
|13.03|19.81|20.08|[13.03,19.81]| 18.71300577250081|
|15.71|15.31|19.83|[15.71,15.31]| 17.615365473

In [None]:
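# RegressionEvaluator is already imported earlier in the notebook, so this import stays commented out.
# from pyspark.ml.evaluation import RegressionEvaluator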

In [18]:

# Score the same frame the model was fitted on, so the metric is in-sample.
predictions = model.transform(dt)

# Compare the 'prediction' column against the 'y' label using RMSE.
evaluator = RegressionEvaluator(
    labelCol='y',
    predictionCol='prediction',
    metricName='rmse',
)
rmse = evaluator.evaluate(predictions)


In [19]:
# Report the error measured on the training data.
print(
    f"Root Mean Squared Error (RMSE) on training data = {rmse}"
)

Root Mean Squared Error (RMSE) on training data = 1.8668971828458636
