In [1]:
from pyspark.sql import SparkSession
from pyspark.ml.feature import StringIndexer, OneHotEncoder, VectorAssembler
from pyspark.ml.regression import LinearRegression
from pyspark.ml import Pipeline
from pyspark.ml.evaluation import RegressionEvaluator

In [2]:
# Create a SparkSession
spark = SparkSession.builder \
    .appName("Linear Regression Example") \
    .getOrCreate()


In [3]:
# Load your dataset
df = spark.read.csv("vehicle_EDA.csv", header=True, inferSchema=True)


In [4]:
df.show()

+-----+----+------------+-----+---------+------------+----+--------+------------+------------+-----+-----+-----------+-----+
|price|year|manufacturer|model|condition|   cylinders|fuel|odometer|title_status|transmission|drive| type|paint_color|state|
+-----+----+------------+-----+---------+------------+----+--------+------------+------------+-----+-----+-----------+-----+
| 6000|2011|        ford|f-150|     good|6 cylindersw| gas|   98729|       clean|   automatic|  4wd|sedan|      white|   az|
|11900|2011|        ford|f-150|     good|6 cylindersw| gas|   98729|       clean|   automatic|  4wd|sedan|      white|   ar|
|21000|2011|        ford|f-150|     good|6 cylindersw| gas|   98729|       clean|   automatic|  4wd|sedan|      white|   fl|
| 1500|2011|        ford|f-150|     good|6 cylindersw| gas|   98729|       clean|   automatic|  4wd|sedan|      white|   ma|
| 4900|2011|        ford|f-150|     good|6 cylindersw| gas|   98729|       clean|   automatic|  4wd|sedan|      white|   nc|


In [5]:
df.count()

339050

In [6]:
len(df.columns)

14

In [7]:
df.printSchema()

root
 |-- price: integer (nullable = true)
 |-- year: integer (nullable = true)
 |-- manufacturer: string (nullable = true)
 |-- model: string (nullable = true)
 |-- condition: string (nullable = true)
 |-- cylinders: string (nullable = true)
 |-- fuel: string (nullable = true)
 |-- odometer: integer (nullable = true)
 |-- title_status: string (nullable = true)
 |-- transmission: string (nullable = true)
 |-- drive: string (nullable = true)
 |-- type: string (nullable = true)
 |-- paint_color: string (nullable = true)
 |-- state: string (nullable = true)



In [8]:
df.describe().show()

+-------+-----------------+------------------+------------+--------------------+---------+------------+------+------------------+------------+------------+------+------+-----------+------+
|summary|            price|              year|manufacturer|               model|condition|   cylinders|  fuel|          odometer|title_status|transmission| drive|  type|paint_color| state|
+-------+-----------------+------------------+------------+--------------------+---------+------------+------+------------------+------------+------------+------+------+-----------+------+
|  count|           339050|            339050|      339050|              339050|   339050|      339050|339050|            339050|      339050|      339050|339050|339050|     339050|339050|
|   mean|41111.30849727179|2010.7232148650642|        null|  1942.1225550463842|     null|        null|  null|100807.04491372954|        null|        null|  null|  null|       null|  null|
| stddev|4869252.508568353| 9.780865941571017|        n

In [9]:
df.head()

Row(price=6000, year=2011, manufacturer='ford', model='f-150', condition='good', cylinders='6 cylindersw', fuel='gas', odometer=98729, title_status='clean', transmission='automatic', drive='4wd', type='sedan', paint_color='white', state='az')

In [10]:
df.corr("price","year")

-0.008449092277232303

In [11]:
df.corr("price","odometer")

0.029939993632698128

In [12]:
from pyspark.ml.feature import StringIndexer

In [15]:
#indexer=StringIndexer(inputCol="region",outputCol="region_cat",handleInvalid="skip")
#indexed=indexer.fit(df).transform(df)

In [16]:
#indexed.show(5)

+------+-----+------+------------+--------------------+---------+-----------+----+--------+------------+------------+-----+------+-----------+-----+----------+
|region|price|  year|manufacturer|               model|condition|  cylinders|fuel|odometer|title_status|transmission|drive|  type|paint_color|state|region_cat|
+------+-----+------+------------+--------------------+---------+-----------+----+--------+------------+------------+-----+------+-----------+-----+----------+
|auburn|15000|2013.0|        ford|           f-150 xlt|excellent|6 cylinders| gas|128000.0|       clean|   automatic|  rwd| truck|      black|   al|     350.0|
|auburn|27990|2012.0|         gmc|sierra 2500 hd ex...|     good|8 cylinders| gas| 68696.0|       clean|       other|  4wd|pickup|      black|   al|     350.0|
|auburn|34590|2016.0|   chevrolet|silverado 1500 do...|     good|6 cylinders| gas| 29499.0|       clean|       other|  4wd|pickup|     silver|   al|     350.0|
|auburn|35000|2019.0|      toyota|      

In [13]:
len(df.columns)

14

In [14]:
indexer=StringIndexer(inputCol="manufacturer",outputCol="manufacturer_cat")
indexed=indexer.fit(df).transform(df)
indexed=indexer.fit(indexed).transform(indexed)

IllegalArgumentException: requirement failed: Output column manufacturer_cat already exists.

In [15]:
indexed.show(2)

+-----+----+------------+-----+---------+------------+----+--------+------------+------------+-----+-----+-----------+-----+----------------+
|price|year|manufacturer|model|condition|   cylinders|fuel|odometer|title_status|transmission|drive| type|paint_color|state|manufacturer_cat|
+-----+----+------------+-----+---------+------------+----+--------+------------+------------+-----+-----+-----------+-----+----------------+
| 6000|2011|        ford|f-150|     good|6 cylindersw| gas|   98729|       clean|   automatic|  4wd|sedan|      white|   az|             0.0|
|11900|2011|        ford|f-150|     good|6 cylindersw| gas|   98729|       clean|   automatic|  4wd|sedan|      white|   ar|             0.0|
+-----+----+------------+-----+---------+------------+----+--------+------------+------------+-----+-----+-----------+-----+----------------+
only showing top 2 rows



In [20]:
#indexed.corr("price","region_cat")

0.0011606699334546834

In [16]:
indexed.corr("price","manufacturer_cat")

-0.0007130114010283831

In [17]:
indexer=StringIndexer(inputCol="model",outputCol="model_cat")
indexed=indexer.fit(indexed).transform(indexed)

In [18]:
indexer=StringIndexer(inputCol="condition",outputCol="condition_cat")
indexed=indexer.fit(indexed).transform(indexed)

In [19]:
indexer=StringIndexer(inputCol="cylinders",outputCol="cylinders_cat")
indexed=indexer.fit(indexed).transform(indexed)

In [20]:
indexer=StringIndexer(inputCol="fuel",outputCol="fuel_cat")
indexed=indexer.fit(indexed).transform(indexed)

In [21]:
indexer=StringIndexer(inputCol="title_status",outputCol="title_status_cat")
indexed=indexer.fit(indexed).transform(indexed)

In [22]:
indexer=StringIndexer(inputCol="transmission",outputCol="transmission_cat")
indexed=indexer.fit(indexed).transform(indexed)

In [23]:
indexer=StringIndexer(inputCol="drive",outputCol="drive_cat")
indexed=indexer.fit(indexed).transform(indexed)

In [24]:
indexer=StringIndexer(inputCol="type",outputCol="type_cat")
indexed=indexer.fit(indexed).transform(indexed)

In [25]:
indexer=StringIndexer(inputCol="paint_color",outputCol="paint_color_cat")
indexed=indexer.fit(indexed).transform(indexed)

In [26]:
indexer=StringIndexer(inputCol="state",outputCol="status_cat")
indexed=indexer.fit(indexed).transform(indexed)

In [27]:
from pyspark.ml.linalg import Vector
from pyspark.ml.feature import VectorAssembler

In [28]:
indexed.columns

['price',
 'year',
 'manufacturer',
 'model',
 'condition',
 'cylinders',
 'fuel',
 'odometer',
 'title_status',
 'transmission',
 'drive',
 'type',
 'paint_color',
 'state',
 'manufacturer_cat',
 'model_cat',
 'condition_cat',
 'cylinders_cat',
 'fuel_cat',
 'title_status_cat',
 'transmission_cat',
 'drive_cat',
 'type_cat',
 'paint_color_cat',
 'status_cat']

In [29]:
assembler=VectorAssembler(inputCols=["manufacturer_cat","model_cat","condition_cat","cylinders_cat","fuel_cat","title_status_cat","transmission_cat","drive_cat",
"type_cat","paint_color_cat","status_cat","odometer","year"],outputCol="features")

In [30]:
assembler

VectorAssembler_ef48b13a91c7

In [31]:
output=assembler.transform(indexed)

In [32]:
output.show()

+-----+----+------------+-----+---------+------------+----+--------+------------+------------+-----+-----+-----------+-----+----------------+---------+-------------+-------------+--------+----------------+----------------+---------+--------+---------------+----------+--------------------+
|price|year|manufacturer|model|condition|   cylinders|fuel|odometer|title_status|transmission|drive| type|paint_color|state|manufacturer_cat|model_cat|condition_cat|cylinders_cat|fuel_cat|title_status_cat|transmission_cat|drive_cat|type_cat|paint_color_cat|status_cat|            features|
+-----+----+------------+-----+---------+------------+----+--------+------------+------------+-----+-----+-----------+-----+----------------+---------+-------------+-------------+--------+----------------+----------------+---------+--------+---------------+----------+--------------------+
| 6000|2011|        ford|f-150|     good|6 cylindersw| gas|   98729|       clean|   automatic|  4wd|sedan|      white|   az|      

In [33]:
output.select("features","price").show(truncate=False)

+-------------------------------------+-----+
|features                             |price|
+-------------------------------------+-----+
|(13,[10,11,12],[15.0,98729.0,2011.0])|6000 |
|(13,[10,11,12],[33.0,98729.0,2011.0])|11900|
|(13,[10,11,12],[1.0,98729.0,2011.0]) |21000|
|(13,[10,11,12],[16.0,98729.0,2011.0])|1500 |
|(13,[10,11,12],[8.0,98729.0,2011.0]) |4900 |
|(13,[10,11,12],[3.0,98729.0,2011.0]) |1600 |
|(13,[10,11,12],[3.0,98729.0,2011.0]) |1000 |
|(13,[10,11,12],[3.0,98729.0,2011.0]) |15995|
|(13,[10,11,12],[7.0,98729.0,2011.0]) |5000 |
|(13,[10,11,12],[6.0,98729.0,2011.0]) |3000 |
|(13,[10,11,12],[17.0,98729.0,2011.0])|13995|
|(13,[10,11,12],[17.0,98729.0,2011.0])|24999|
|(13,[10,11,12],[17.0,98729.0,2011.0])|21850|
|(13,[10,11,12],[17.0,98729.0,2011.0])|26850|
|(13,[10,11,12],[17.0,98729.0,2011.0])|11999|
|(13,[10,11,12],[17.0,98729.0,2011.0])|24999|
|(13,[10,11,12],[17.0,98729.0,2011.0])|21850|
|(13,[10,11,12],[17.0,98729.0,2011.0])|26850|
|(13,[10,11,12],[17.0,98729.0,2011

In [34]:
final_dataframe=output.select("features","price")

In [35]:
final_dataframe.show()

+--------------------+-----+
|            features|price|
+--------------------+-----+
|(13,[10,11,12],[1...| 6000|
|(13,[10,11,12],[3...|11900|
|(13,[10,11,12],[1...|21000|
|(13,[10,11,12],[1...| 1500|
|(13,[10,11,12],[8...| 4900|
|(13,[10,11,12],[3...| 1600|
|(13,[10,11,12],[3...| 1000|
|(13,[10,11,12],[3...|15995|
|(13,[10,11,12],[7...| 5000|
|(13,[10,11,12],[6...| 3000|
|(13,[10,11,12],[1...|13995|
|(13,[10,11,12],[1...|24999|
|(13,[10,11,12],[1...|21850|
|(13,[10,11,12],[1...|26850|
|(13,[10,11,12],[1...|11999|
|(13,[10,11,12],[1...|24999|
|(13,[10,11,12],[1...|21850|
|(13,[10,11,12],[1...|26850|
|(13,[10,11,12],[1...|11999|
|(13,[10,11,12],[1...|24999|
+--------------------+-----+
only showing top 20 rows



In [36]:
train_data,test_data=final_dataframe.randomSplit([0.7,0.3])

In [37]:
train_data

DataFrame[features: vector, price: int]

In [38]:
train_data

DataFrame[features: vector, price: int]

In [39]:
test_data

DataFrame[features: vector, price: int]

In [40]:
from pyspark.ml.regression import LinearRegression

In [41]:
lr=LinearRegression(featuresCol="features",labelCol="price")

In [42]:
trained_model=lr.fit(train_data)

In [43]:
trained_model

LinearRegressionModel: uid=LinearRegression_060cd8ac4853, numFeatures=13

In [44]:
result=trained_model.evaluate(train_data)

In [45]:
print(result.r2)

0.001681491242096933


In [46]:
print(result.meanSquaredError)

27306343906176.316


In [47]:
print(result.meanAbsoluteError)

105062.75583147534


In [48]:
unlabeled_data=test_data.select("features")

In [49]:
predictions=trained_model.transform(unlabeled_data)

In [50]:
predictions.show()

+--------------------+-------------------+
|            features|         prediction|
+--------------------+-------------------+
|(13,[0,1,2,3,4,7,...|  43035.57753712498|
|(13,[0,1,2,3,4,8,...| -9153.354343037121|
|(13,[0,1,2,3,4,11...|  185516.1609926736|
|(13,[0,1,2,3,4,11...| 160175.39130702429|
|(13,[0,1,2,3,4,11...|  162576.7633304391|
|(13,[0,1,2,3,4,11...|-20292.825883017853|
|(13,[0,1,2,3,4,11...| 109868.14851557929|
|(13,[0,1,2,3,4,11...| 172263.56683586724|
|(13,[0,1,2,3,4,11...|-25608.860828771256|
|(13,[0,1,2,3,5,11...|-115130.98109546956|
|(13,[0,1,2,3,5,11...| -39894.12134373188|
|(13,[0,1,2,3,5,11...|-100503.51186726987|
|(13,[0,1,2,3,5,11...| 18623.554169099778|
|(13,[0,1,2,3,5,11...|  8230.756095546298|
|(13,[0,1,2,3,5,11...| -135251.4839531798|
|(13,[0,1,2,3,5,11...| -37090.57413534541|
|(13,[0,1,2,3,6,8,...|  46349.85962607153|
|(13,[0,1,2,3,6,11...| 169813.62818947528|
|(13,[0,1,2,3,6,11...| 213400.11881729309|
|(13,[0,1,2,3,6,11...| 121343.11787792854|
+----------