<a href="https://colab.research.google.com/github/VishwanathReddyAenugu/MyLearning/blob/main/pyspark/Pyspark5Mlib.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [1]:
!pip install pyspark

Collecting pyspark
  Downloading pyspark-3.2.0.tar.gz (281.3 MB)
[K     |████████████████████████████████| 281.3 MB 41 kB/s 
[?25hCollecting py4j==0.10.9.2
  Downloading py4j-0.10.9.2-py2.py3-none-any.whl (198 kB)
[K     |████████████████████████████████| 198 kB 54.9 MB/s 
[?25hBuilding wheels for collected packages: pyspark
  Building wheel for pyspark (setup.py) ... [?25l[?25hdone
  Created wheel for pyspark: filename=pyspark-3.2.0-py2.py3-none-any.whl size=281805912 sha256=a803c87fc4235c454841c27d6015f221ff90f2659e13032c2b39c0373faebfca
  Stored in directory: /root/.cache/pip/wheels/0b/de/d2/9be5d59d7331c6c2a7c1b6d1a4f463ce107332b1ecd4e80718
Successfully built pyspark
Installing collected packages: py4j, pyspark
Successfully installed py4j-0.10.9.2 pyspark-3.2.0


# **Machine learning with pyspark**

In [2]:
import pyspark
import pandas as pd

In [3]:
data = {
        "Name": ["krish","sudharshan","sunny","paul","harsha","shubam"],
        "age" : [31,30,29,24,21,23],
        "Experience": [10,8,4,3,1,2],
        "salary": [30000, 25000, 20000, 20000, 15000, 18000]
       }

df = pd.DataFrame.from_dict(data)     
df.to_csv("test1.csv")

In [4]:
from pyspark.sql import SparkSession
spark = SparkSession.builder.appName("Mlib").getOrCreate()

In [5]:
training = spark.read.csv("test1.csv", header = True, inferSchema=True)
training.show()

+---+----------+---+----------+------+
|_c0|      Name|age|Experience|salary|
+---+----------+---+----------+------+
|  0|     krish| 31|        10| 30000|
|  1|sudharshan| 30|         8| 25000|
|  2|     sunny| 29|         4| 20000|
|  3|      paul| 24|         3| 20000|
|  4|    harsha| 21|         1| 15000|
|  5|    shubam| 23|         2| 18000|
+---+----------+---+----------+------+



In [6]:
training.printSchema()

root
 |-- _c0: integer (nullable = true)
 |-- Name: string (nullable = true)
 |-- age: integer (nullable = true)
 |-- Experience: integer (nullable = true)
 |-- salary: integer (nullable = true)



In [7]:
training.columns

['_c0', 'Name', 'age', 'Experience', 'salary']

In [8]:
from pyspark.ml.feature import VectorAssembler
featureassembler = VectorAssembler(inputCols = ["age","Experience"], outputCol = "Independent Features")

In [9]:
output = featureassembler.transform(training)

In [11]:
output.show()

+---+----------+---+----------+------+--------------------+
|_c0|      Name|age|Experience|salary|Independent Features|
+---+----------+---+----------+------+--------------------+
|  0|     krish| 31|        10| 30000|         [31.0,10.0]|
|  1|sudharshan| 30|         8| 25000|          [30.0,8.0]|
|  2|     sunny| 29|         4| 20000|          [29.0,4.0]|
|  3|      paul| 24|         3| 20000|          [24.0,3.0]|
|  4|    harsha| 21|         1| 15000|          [21.0,1.0]|
|  5|    shubam| 23|         2| 18000|          [23.0,2.0]|
+---+----------+---+----------+------+--------------------+



In [None]:
output.columns

['_c0', 'Name', 'age', 'Experience', 'salary', 'Independent Features']

In [12]:
final_data = output.select("Independent Features","salary")

In [13]:
final_data.show()

+--------------------+------+
|Independent Features|salary|
+--------------------+------+
|         [31.0,10.0]| 30000|
|          [30.0,8.0]| 25000|
|          [29.0,4.0]| 20000|
|          [24.0,3.0]| 20000|
|          [21.0,1.0]| 15000|
|          [23.0,2.0]| 18000|
+--------------------+------+



In [15]:
from pyspark.ml.regression import LinearRegression
train_data,test_data = final_data.randomSplit([0.75,0.25])
regressor = LinearRegression(featuresCol="Independent Features", labelCol = "salary")
regressor = regressor.fit(train_data)

In [16]:
regressor.coefficients

DenseVector([-90.5483, 1608.7819])

In [17]:
regressor.intercept

16079.13669064716

In [18]:
predict_results =  regressor.evaluate(test_data)

In [19]:
predict_results.predictions.show()



+--------------------+------+------------------+
|Independent Features|salary|        prediction|
+--------------------+------+------------------+
|          [23.0,2.0]| 18000|17214.090796328448|
+--------------------+------+------------------+



In [20]:
predict_results.meanAbsoluteError, predict_results.meanSquaredError

(785.909203671552, 617653.276415653)