In [1]:
from pyspark.sql import SparkSession
from pyspark.ml.feature import StringIndexer, OneHotEncoder, VectorAssembler,StandardScaler
from pyspark.ml.regression import LinearRegression
from pyspark.ml import Pipeline
from pyspark.ml.evaluation import RegressionEvaluator

In [2]:
# Start (or reuse) the Spark session that backs every DataFrame in this notebook.
spark = (
    SparkSession.builder
    .appName("Linear Regression Example")
    .getOrCreate()
)


In [3]:
# Read the vehicle CSV: the first row supplies the column names and Spark
# infers each column's type from the data.
df = spark.read.csv(
    "vehicle_EDA.csv",
    header=True,
    inferSchema=True,
)


In [4]:
# Split the rows roughly 70/30 into training and test sets.
# The seed is pinned so that a Restart & Run All reproduces the same
# partition (and therefore the same fitted model and metrics); without it
# the split changes from one run to the next.
train, test = df.randomSplit([0.7, 0.3], seed=42)

In [5]:
# Numeric input columns; they are assembled into one vector and scaled
# before being combined with the categorical features.
numerical = [
    "odometer",
    "year",
]

In [6]:
# Pack the numeric columns into a single vector column so they can be
# scaled together.
numerical_vector_assembler = VectorAssembler(
    inputCols=numerical,
    outputCol='numerical_feature_vector',
)

# Append the assembled vector to both splits (training first, then test).
train, test = (
    numerical_vector_assembler.transform(split) for split in (train, test)
)

In [7]:
# Standardise the numeric vector (centre on the mean, scale by the standard
# deviation). The statistics are learned from the training split only and
# the resulting fitted model is what `scaler` refers to afterwards.
scaler = StandardScaler(
    inputCol='numerical_feature_vector',
    outputCol='scaled_numerical_feature_vector',
    withStd=True,
    withMean=True,
).fit(train)

# Apply the training-derived scaling to both splits.
train, test = (scaler.transform(split) for split in (train, test))

In [8]:
# Map every categorical string column to a numeric index column. The label
# mapping is learned from the training split; handleInvalid="keep" is set so
# that values absent at fit time are kept rather than raising on transform.
# After this statement `indexer` refers to the fitted model.
indexer = StringIndexer(
    inputCols=[
        'manufacturer', 'model', 'condition', 'cylinders', 'fuel',
        'title_status', 'transmission', 'drive', 'type', 'paint_color',
        'state',
    ],
    outputCols=[
        'manufacturer_index', 'm_i', 'co_i', 'cy_i', 'f_i',
        'ts_i', 'tr_i', 'd_i', 'ty_i', 'p_i',
        's_i',
    ],
    handleInvalid="keep",
).fit(train)

# Add the index columns to both splits.
train, test = (indexer.transform(split) for split in (train, test))

In [9]:
# One-hot encode each index column into a vector column with an `_h` suffix.
# The encoder is fitted on the training split, so `one_hot_encoder` refers to
# the fitted model from here on.
one_hot_encoder = OneHotEncoder(
    inputCols=[
        'manufacturer_index', 'm_i', 'co_i', 'cy_i', 'f_i',
        'ts_i', 'tr_i', 'd_i', 'ty_i', 'p_i',
        's_i',
    ],
    outputCols=[
        'manufacturer_index_h', 'm_i_h', 'co_i_h', 'cy_i_h', 'f_i_h',
        'ts_i_h', 'tr_i_h', 'd_i_h', 'ty_i_h', 'p_i_h',
        's_i_h',
    ],
).fit(train)

# Add the encoded columns to both splits.
train, test = (one_hot_encoder.transform(split) for split in (train, test))

In [10]:
# Concatenate the scaled numeric vector with all one-hot vectors into the
# single feature column consumed by the regression model.
assembler = VectorAssembler(
    inputCols=[
        'scaled_numerical_feature_vector',
        'manufacturer_index_h', 'm_i_h', 'co_i_h', 'cy_i_h', 'f_i_h',
        'ts_i_h', 'tr_i_h', 'd_i_h', 'ty_i_h', 'p_i_h',
        's_i_h',
    ],
    outputCol='final_feature_vector',
)

# Add the final feature column to both splits.
train, test = (assembler.transform(split) for split in (train, test))

In [11]:
# Linear regression estimator that predicts `price` from the assembled
# feature vector; no other hyper-parameters are set here.
lr = LinearRegression(
    featuresCol='final_feature_vector',
    labelCol='price',
)

In [12]:
# Fit on the training split. `lr` is rebound from the estimator to the
# fitted model here; the cells that follow use `lr` as the model.
lr = lr.fit(train)

In [17]:
# Score the training split and give the prediction column a descriptive name.
pred_train_df = (
    lr.transform(train)
    .withColumnRenamed('prediction', 'predicted_vehicle_value')
)

# Preview the first five scored rows.
pred_train_df.show(5)

+-----+----+------------+-------+---------+-----------+-----+--------+------------+------------+-----+-----+-----------+-----+------------------------+-------------------------------+------------------+------+----+----+---+----+----+---+----+---+---+--------------------+--------------------+-------------+-------------+-------------+-------------+-------------+-------------+--------------+--------------+--------------+--------------------+-----------------------+
|price|year|manufacturer|  model|condition|  cylinders| fuel|odometer|title_status|transmission|drive| type|paint_color|state|numerical_feature_vector|scaled_numerical_feature_vector|manufacturer_index|   m_i|co_i|cy_i|f_i|ts_i|tr_i|d_i|ty_i|p_i|s_i|manufacturer_index_h|               m_i_h|       co_i_h|       cy_i_h|        f_i_h|       ts_i_h|       tr_i_h|        d_i_h|        ty_i_h|         p_i_h|         s_i_h|final_feature_vector|predicted_vehicle_value|
+-----+----+------------+-------+---------+-----------+-----+-----

In [18]:
# Score the held-out test split, renaming the prediction column the same way
# as for the training predictions.
pred_test_df = (
    lr.transform(test)
    .withColumnRenamed('prediction', 'predicted_vehicle_value')
)

In [19]:
# Compute the regression summary for the fitted model.
# NOTE(review): this scores the *training* split, so the metrics are
# in-sample; consider also evaluating `test` to measure generalisation.
result = lr.evaluate(train)

In [20]:
# Print the R² value from the evaluation summary held in `result`.
print(result.r2)

0.045268164165857994


In [21]:
# Keep only the feature vector from the test split, leaving out the label
# and every other column.
unlabeled_data = test.select("final_feature_vector")

In [22]:
# Run the fitted model over the feature-only frame; this appends a
# `prediction` column.
predictions = lr.transform(unlabeled_data)

In [23]:
# Print the leading rows of the feature-vector / prediction pairs.
predictions.show()

+--------------------+-------------------+
|final_feature_vector|         prediction|
+--------------------+-------------------+
|(20918,[0,1,2,261...| 1008992.0028683455|
|(20918,[0,1,2,196...| -67303.56096432231|
|(20918,[0,1,2,196...| -67303.56096432231|
|(20918,[0,1,2,196...| -67303.56096432231|
|(20918,[0,1,3,260...|  179550.4253027456|
|(20918,[0,1,2,150...| -547479.8662257438|
|(20918,[0,1,2,208...|  670042.6498239018|
|(20918,[0,1,8,44,...| 1175430.7775578927|
|(20918,[0,1,2,143...| 259228.27718296228|
|(20918,[0,1,3,298...| 1615376.8392090765|
|(20918,[0,1,24,20...|-21075.512010765728|
|(20918,[0,1,2,208...| -383175.6778566007|
|(20918,[0,1,3,926...| -315681.9526317855|
|(20918,[0,1,2,208...|-3348.3193358114077|
|(20918,[0,1,2,323...| 240068.55161260505|
|(20918,[0,1,2,81,...| -41006.83781834139|
|(20918,[0,1,4,47,...|  536757.7944496732|
|(20918,[0,1,4,89,...| 132657.79096820363|
|(20918,[0,1,3,316...| 115230.10938838418|
|(20918,[0,1,2,323...|  189193.8136616185|
+----------