#### DataFrame API
#### Example of PySpark ML

In [1]:
from pyspark.sql import SparkSession

# Create the Spark session for this notebook, or reuse one that already
# exists; the application is registered under the name 'ML'.
spark = (
    SparkSession.builder
    .appName('ML')
    .getOrCreate()
)

In [2]:
# Load the CSV into a Spark DataFrame.
#   header=True      -> the first row supplies the column names
#   inferSchema=True -> Spark infers column types instead of reading all strings
df_spark = spark.read.csv(
    "Info_3.csv",
    header=True,
    inferSchema=True,
)
df_spark.show()

+--------+---+----------+------+
|    Name|Age|Experience|Salary|
+--------+---+----------+------+
|Mohammed| 33|        10| 30000|
|  Ruhaan| 31|         8| 25000|
|  Sabeel| 32|         9| 28000|
| Ibrahim| 34|        12| 32000|
|     Isa| 35|        10| 40000|
|   Rahul| 23|         2| 15000|
|   Virat| 30|         8| 25000|
+--------+---+----------+------+



In [3]:
# Print the column names and inferred types to confirm that inferSchema parsed the numeric columns as numbers.
df_spark.printSchema()

root
 |-- Name: string (nullable = true)
 |-- Age: integer (nullable = true)
 |-- Experience: integer (nullable = true)
 |-- Salary: integer (nullable = true)



In [4]:
# List the column names; 'Age' and 'Experience' are used as features and 'Salary' as the label further down.
df_spark.columns

['Name', 'Age', 'Experience', 'Salary']

In [8]:
# The regression below reads its predictors from a single vector column, so
# the independent variables (Age, Experience) are packed into one new column
# named "Independent Features". In PySpark this is done with VectorAssembler.
from pyspark.ml.feature import VectorAssembler

featureassembler = VectorAssembler(
    inputCols=['Age', 'Experience'],
    outputCol="Independent Features",
)


In [9]:
# Apply the assembler: the result keeps the original columns and adds the "Independent Features" vector column.
output = featureassembler.transform(df_spark)
output.show()

+--------+---+----------+------+--------------------+
|    Name|Age|Experience|Salary|Independent Features|
+--------+---+----------+------+--------------------+
|Mohammed| 33|        10| 30000|         [33.0,10.0]|
|  Ruhaan| 31|         8| 25000|          [31.0,8.0]|
|  Sabeel| 32|         9| 28000|          [32.0,9.0]|
| Ibrahim| 34|        12| 32000|         [34.0,12.0]|
|     Isa| 35|        10| 40000|         [35.0,10.0]|
|   Rahul| 23|         2| 15000|          [23.0,2.0]|
|   Virat| 30|         8| 25000|          [30.0,8.0]|
+--------+---+----------+------+--------------------+



In [10]:
# Keep only the columns the regression uses: the assembled feature vector
# and the Salary label.
df_final = output.select(
    "Independent Features",
    "Salary",
)

In [11]:
# Preview the modeling table: the feature vector column plus the Salary label.
df_final.show()

+--------------------+------+
|Independent Features|Salary|
+--------------------+------+
|         [33.0,10.0]| 30000|
|          [31.0,8.0]| 25000|
|          [32.0,9.0]| 28000|
|         [34.0,12.0]| 32000|
|         [35.0,10.0]| 40000|
|          [23.0,2.0]| 15000|
|          [30.0,8.0]| 25000|
+--------------------+------+



In [12]:
from pyspark.ml.regression import LinearRegression

# Fixed seed so that Restart & Run All reproduces the same train/test rows.
# An unseeded randomSplit draws a different split on every run, which changes
# the fitted coefficients and every prediction shown below.
# Note: with only seven rows, the actual split sizes can stray far from 75/25.
SPLIT_SEED = 42
train_data, test_data = df_final.randomSplit([0.75, 0.25], seed=SPLIT_SEED)

# Keep the unfitted estimator under its own name so that a single variable
# never refers to two different kinds of object (estimator, then model).
lr_estimator = LinearRegression(
    featuresCol='Independent Features',
    labelCol='Salary',
)

# `lr_spark` is the fitted model; later cells read its .coefficients and
# .intercept and call .evaluate on it.
lr_spark = lr_estimator.fit(train_data)

In [13]:
# Learned weights, in the assembler's input order: [Age, Experience].
lr_spark.coefficients

DenseVector([466.1885, 1045.082])

In [14]:
# Fitted intercept (bias) term of the regression.
lr_spark.intercept

2187.5

In [18]:
# Inspect the held-out rows produced by randomSplit.
test_data.show()

+--------------------+------+
|Independent Features|Salary|
+--------------------+------+
|          [30.0,8.0]| 25000|
|          [32.0,9.0]| 28000|
|         [33.0,10.0]| 30000|
|         [34.0,12.0]| 32000|
|         [35.0,10.0]| 40000|
+--------------------+------+



In [15]:
# Evaluate the fitted model on the held-out rows; the returned summary exposes the scored rows via `.predictions`.
pred_results = lr_spark.evaluate(test_data)

In [16]:
# Show the feature vector, the actual Salary and the model's prediction side by side.
pred_results.predictions.show()

+--------------------+------+------------------+
|Independent Features|Salary|        prediction|
+--------------------+------+------------------+
|          [30.0,8.0]| 25000|24533.811475409842|
|          [32.0,9.0]| 28000|26511.270491803283|
|         [33.0,10.0]| 30000|28022.540983606563|
|         [34.0,12.0]| 32000|30578.893442622957|
|         [35.0,10.0]| 40000| 28954.91803278689|
+--------------------+------+------------------+

