<a href="https://colab.research.google.com/github/vajiha/learnbay28aug/blob/master/Day6_PySpark_ML.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [1]:
#importing spark context object manually
!pip install pyspark py4j
from pyspark.sql import SparkSession
spark = SparkSession.builder.appName("spark_sql_learning").getOrCreate()



In [2]:
#loading a csv file into a dataframe for applyign machine learning algorithms.
df = spark.read.csv("/content/sample_data/50_Startups_h4s1of.csv",header=True,inferSchema=True)

In [3]:
df.show()

+---------+--------------+---------------+----------+---------+
|R&D Spend|Administration|Marketing Spend|     State|   Profit|
+---------+--------------+---------------+----------+---------+
| 165349.2|      136897.8|       471784.1|  New York|192261.83|
| 162597.7|     151377.59|      443898.53|California|191792.06|
|153441.51|     101145.55|      407934.54|   Florida|191050.39|
|144372.41|     118671.85|      383199.62|  New York|182901.99|
|142107.34|      91391.77|      366168.42|   Florida|166187.94|
| 131876.9|      99814.71|      362861.36|  New York|156991.12|
|134615.46|     147198.87|      127716.82|California|156122.51|
|130298.13|     145530.06|      323876.68|   Florida| 155752.6|
|120542.52|     148718.95|      311613.29|  New York|152211.77|
|123334.88|     108679.17|      304981.62|California|149759.96|
|101913.08|     110594.11|      229160.95|   Florida|146121.95|
|100671.96|      91790.61|      249744.55|California| 144259.4|
| 93863.75|     127320.38|      249839.4

In [4]:
df.columns

['R&D Spend', 'Administration', 'Marketing Spend', 'State', 'Profit']

In [5]:
#string indexing
#converts string into number

from pyspark.ml.feature import StringIndexer
indexer = StringIndexer(inputCol="State",outputCol="State_Indexed")
final_df = indexer.fit(df).transform(df)
final_df.show()

+---------+--------------+---------------+----------+---------+-------------+
|R&D Spend|Administration|Marketing Spend|     State|   Profit|State_Indexed|
+---------+--------------+---------------+----------+---------+-------------+
| 165349.2|      136897.8|       471784.1|  New York|192261.83|          1.0|
| 162597.7|     151377.59|      443898.53|California|191792.06|          0.0|
|153441.51|     101145.55|      407934.54|   Florida|191050.39|          2.0|
|144372.41|     118671.85|      383199.62|  New York|182901.99|          1.0|
|142107.34|      91391.77|      366168.42|   Florida|166187.94|          2.0|
| 131876.9|      99814.71|      362861.36|  New York|156991.12|          1.0|
|134615.46|     147198.87|      127716.82|California|156122.51|          0.0|
|130298.13|     145530.06|      323876.68|   Florida| 155752.6|          2.0|
|120542.52|     148718.95|      311613.29|  New York|152211.77|          1.0|
|123334.88|     108679.17|      304981.62|California|149759.96| 

In [7]:
#building the vectors

from pyspark.ml.linalg import Vectors
from pyspark.ml.feature import VectorAssembler

assembler = VectorAssembler(inputCols=["R&D Spend","Administration","Marketing Spend","State_Indexed"],
                            outputCol="features")
final_data_df = assembler.transform(final_df)
final_data_df.show(truncate=False)


+---------+--------------+---------------+----------+---------+-------------+-----------------------------------+
|R&D Spend|Administration|Marketing Spend|State     |Profit   |State_Indexed|features                           |
+---------+--------------+---------------+----------+---------+-------------+-----------------------------------+
|165349.2 |136897.8      |471784.1       |New York  |192261.83|1.0          |[165349.2,136897.8,471784.1,1.0]   |
|162597.7 |151377.59     |443898.53      |California|191792.06|0.0          |[162597.7,151377.59,443898.53,0.0] |
|153441.51|101145.55     |407934.54      |Florida   |191050.39|2.0          |[153441.51,101145.55,407934.54,2.0]|
|144372.41|118671.85     |383199.62      |New York  |182901.99|1.0          |[144372.41,118671.85,383199.62,1.0]|
|142107.34|91391.77      |366168.42      |Florida   |166187.94|2.0          |[142107.34,91391.77,366168.42,2.0] |
|131876.9 |99814.71      |362861.36      |New York  |156991.12|1.0          |[131876.9,9

In [8]:
final = final_data_df.select("features","Profit")
final.show(truncate=False)

+---------+-----------------------------------+
|Profit   |features                           |
+---------+-----------------------------------+
|192261.83|[165349.2,136897.8,471784.1,1.0]   |
|191792.06|[162597.7,151377.59,443898.53,0.0] |
|191050.39|[153441.51,101145.55,407934.54,2.0]|
|182901.99|[144372.41,118671.85,383199.62,1.0]|
|166187.94|[142107.34,91391.77,366168.42,2.0] |
|156991.12|[131876.9,99814.71,362861.36,1.0]  |
|156122.51|[134615.46,147198.87,127716.82,0.0]|
|155752.6 |[130298.13,145530.06,323876.68,2.0]|
|152211.77|[120542.52,148718.95,311613.29,1.0]|
|149759.96|[123334.88,108679.17,304981.62,0.0]|
|146121.95|[101913.08,110594.11,229160.95,2.0]|
|144259.4 |[100671.96,91790.61,249744.55,0.0] |
|141585.52|[93863.75,127320.38,249839.44,2.0] |
|134307.35|[91992.39,135495.07,252664.93,0.0] |
|132602.65|[119943.24,156547.42,256512.92,2.0]|
|129917.04|[114523.61,122616.84,261776.23,1.0]|
|126992.93|[78013.11,121597.55,264346.06,0.0] |
|125370.37|[94657.16,145077.58,282574.31

In [9]:
#split the data into train and test

train,test = final.randomSplit([0.7,0.3])
train.show()

+---------+--------------------+
|   Profit|            features|
+---------+--------------------+
|  14681.4|[0.0,116983.8,451...|
| 35673.41|[542.05,51743.15,...|
| 64926.08|[1000.23,124153.0...|
| 65200.33|[22177.74,154806....|
| 69758.98|[15505.73,127382....|
| 71498.49|[23640.93,96189.6...|
| 77798.83|[27892.92,84710.7...|
| 81005.76|[38558.51,82982.0...|
| 81229.06|[20229.59,65947.9...|
| 89949.14|[44069.95,51283.1...|
| 96479.51|[46014.02,85047.4...|
|  96712.8|[46426.07,157693....|
| 96778.92|[55493.95,103057....|
|101004.64|[65605.48,153032....|
|103282.38|[66051.52,182645....|
|105008.31|[72107.6,127864.5...|
|105733.54|[75328.87,144135....|
|108733.99|[67532.53,105751....|
|110352.25|[73994.56,122782....|
|111313.02|[78389.47,153773....|
+---------+--------------------+
only showing top 20 rows



In [10]:
#building the model

from pyspark.ml.regression import LinearRegression
op_lr = LinearRegression(featuresCol="features",labelCol="Profit")
trained_model = op_lr.fit(train)

In [11]:
#evaluating the model
#apply the 30% of data in the model built
results = trained_model.evaluate(test)

In [12]:
print(results.r2)

0.9751575659678717
