In [1]:
import findspark
findspark.init()
import pyspark
from pyspark.sql import SparkSession
from pyspark.sql.functions import *

spark = SparkSession.builder.appName("UsedCarPricePrediction").getOrCreate()
sc = spark.sparkContext

# Reading csv file in pyspark dataframe:
df = spark.read.csv(r"D:\vehicles.csv",header=True, inferSchema=True)

df.printSchema()
df.count()

#Drop the independent columns :
columns_to_delete = ['id', 'url', 'region_url','VIN','image_url','description','county','lat','long','posting_date','size','state']
df1 = df.drop(*columns_to_delete)

# Show column after deleting:
df1.columns

#drop duplicated records :
df2 = df1.distinct()
df2.count()

#Calculate the percentage of null values for each column :
null_counts = df2.select([sum(col(column).isNull().cast('int')).alias(column) for column in df2.columns])
total_rows = df2.count()
null_percentages = null_counts.select([((col(column) / total_rows) * 100).alias(column + "_null_percentage") for column in df2.columns])
null_percentages.show() # show null value %

#Drop rows with null values in specified columns :
df3 = df2.na.drop(subset=['region','price','year','model','odometer','manufacturer','transmission','title_status','fuel'])
df3.count()

#Handling Missing Values with Categorical Encoding :
df4 = df3.fillna('unknown')

#handling cloumns :

#manufacturer :
# Define the list of top 20 manufacturers
manufacturer_values = ['nissan','honda','chevrolet','mercedes-benz','ram','dodge','ford','jeep','toyota','bmw','subaru','volkswagen','kia','cadillac','hyundai','lexus','audi','chrysler','acura','buick']

# Use when function to update the 'manufacturer' column
df5 = df4.withColumn('manufacturer', 
                   when(df4['manufacturer'].isin(manufacturer_values), df4['manufacturer'])
                   .otherwise('others'))

# region :
# Count the occurrences of each region value
manufacturer_counts = df5.groupBy('region').count()

# Sort the counts in descending order and select the top 50 region
top_manufacturers = manufacturer_counts.orderBy('count', ascending=False).limit(50)

# Extract the top 50 region values
manufacturer_values = [row['region'] for row in top_manufacturers.collect()]

# Use when function to update the 'region' column
df6 = df5.withColumn('region', 
                     when(df5['region'].isin(manufacturer_values), df5['region'])
                     .otherwise('others'))

#model :
# Count the occurrences of each model value
manufacturer_counts = df6.groupBy('model').count()

# Sort the counts in descending order and select the top 50 model
top_manufacturers = manufacturer_counts.orderBy('count', ascending=False).limit(50)

# Extract the top 50 model values
manufacturer_values = [row['model'] for row in top_manufacturers.collect()]

# Use when function to update the 'model' column
df7 = df6.withColumn('model',
                     when(df6['model'].isin(manufacturer_values), df6['model'])
                     .otherwise('others'))


# transmission :
names_to_match = ['automatic','manual','other','unknown'] 
df8 = df7.filter((col("transmission").isin(names_to_match)) )

#year :
#converting year, odometer, price column type to integer type:

df9 = df8.withColumn("year", col('year').cast("int"))
df10 = df9.withColumn("odometer", col('odometer').cast("int"))
df11 = df10.withColumn("price", col('price').cast("int"))
df11.printSchema()

# handling outliers :
#price:
# Calculate quartiles

price_percentiles = df11.approxQuantile("price", [0.15, 0.75], 0.01)
price_percentile15 = price_percentiles[0]
price_percentile75 = price_percentiles[1]

# Calculate IQR and upper/lower limits

price_iqr = price_percentile75 - price_percentile15
price_upper_limit = price_percentile75 + 1.5 * price_iqr
price_lower_limit = price_percentile15

# Filter DataFrame based on limits
df12 = df11.filter((col("price") < price_upper_limit) & (col("price") > price_lower_limit))


#odometer:
# Calculate percentiles
odometer_percentiles = df12.approxQuantile("odometer", [0.05, 0.25, 0.75], 0.01)
odometer_percentile05 = odometer_percentiles[0]
odometer_percentile25 = odometer_percentiles[1]
odometer_percentile75 = odometer_percentiles[2]

# Calculate IQR and upper/lower limits
odometer_iqr = odometer_percentile75 - odometer_percentile25
odometer_upper_limit = odometer_percentile75 + 1.5 * odometer_iqr
odometer_lower_limit = odometer_percentile05

# Filter DataFrame based on limits
df13 = df12.filter((col("odometer") < odometer_upper_limit) & (col("odometer") > odometer_lower_limit))

#year : removing year before 1996 based on barplot distribution :
# Filter DataFrame based on the condition
df14 = df13.where(df13['year'] > 1996)

# Drop records where year column has a value of 2022
df15 = df14.filter(df['year'] != 2022)

# adding new column 'car_age' based on purchase year and till 2022
df16 = df15.withColumn('car_age', 2024 - col('year'))

#droping year :
df17 = df16.drop('year')

root
 |-- id: string (nullable = true)
 |-- url: string (nullable = true)
 |-- region: string (nullable = true)
 |-- region_url: string (nullable = true)
 |-- price: string (nullable = true)
 |-- year: string (nullable = true)
 |-- manufacturer: string (nullable = true)
 |-- model: string (nullable = true)
 |-- condition: string (nullable = true)
 |-- cylinders: string (nullable = true)
 |-- fuel: string (nullable = true)
 |-- odometer: string (nullable = true)
 |-- title_status: string (nullable = true)
 |-- transmission: string (nullable = true)
 |-- VIN: string (nullable = true)
 |-- drive: string (nullable = true)
 |-- size: string (nullable = true)
 |-- type: string (nullable = true)
 |-- paint_color: string (nullable = true)
 |-- image_url: string (nullable = true)
 |-- description: string (nullable = true)
 |-- county: string (nullable = true)
 |-- state: string (nullable = true)
 |-- lat: string (nullable = true)
 |-- long: string (nullable = true)
 |-- posting_date: string (nu

In [2]:
from pyspark.sql import SparkSession
from pyspark.ml.feature import StringIndexer, OneHotEncoder, VectorAssembler,StandardScaler
from pyspark.ml.regression import LinearRegression,RandomForestRegressor,DecisionTreeRegressor
from pyspark.ml import Pipeline
from pyspark.ml.evaluation import RegressionEvaluator
from pyspark.ml.regression import GBTRegressor
from pyspark.ml.tuning import ParamGridBuilder, CrossValidator

In [3]:
train, test = df17.randomSplit([0.8, 0.2],seed=23)
numerical=["odometer","car_age"]
numerical_vector_assembler = VectorAssembler(inputCols=numerical,outputCol='numerical_feature_vector')

train = numerical_vector_assembler.transform(train)
test = numerical_vector_assembler.transform(test)

scaler = StandardScaler(inputCol='numerical_feature_vector',outputCol='scaled_numerical_feature_vector',withStd=True, withMean=True)

scaler = scaler.fit(train)

train = scaler.transform(train)
test = scaler.transform(test)

indexer = StringIndexer(inputCols=['manufacturer','model','condition','cylinders','fuel','title_status','transmission','drive','type','paint_color','region'],
                        outputCols=['manufacturer_index','m_i','co_i','cy_i','f_i','ts_i','tr_i','d_i','ty_i','p_i','r_i'],handleInvalid="keep")

indexer = indexer.fit(train)
train = indexer.transform(train)
test = indexer.transform(test)


one_hot_encoder = OneHotEncoder(inputCols=['manufacturer_index','m_i','co_i','cy_i','f_i','ts_i','tr_i','d_i','ty_i','p_i','r_i'],
                                outputCols=['manufacturer_index_h','m_i_h','co_i_h','cy_i_h','f_i_h','ts_i_h','tr_i_h','d_i_h','ty_i_h','p_i_h','r_i_h'])

one_hot_encoder = one_hot_encoder.fit(train)

train = one_hot_encoder.transform(train)
test = one_hot_encoder.transform(test)

assembler = VectorAssembler(inputCols=['scaled_numerical_feature_vector',
                                       'manufacturer_index_h','m_i_h','co_i_h','cy_i_h','f_i_h','ts_i_h','tr_i_h','d_i_h','ty_i_h','p_i_h','r_i_h'],
                            outputCol='final_feature_vector')

train = assembler.transform(train)
test = assembler.transform(test)

In [4]:
lr = LinearRegression(featuresCol='final_feature_vector',labelCol='price')
lr=lr.fit(train)
#pred_test_df = lr.transform(test).withColumnRenamed('prediction', 'predicted_vehicle_value')
#result=lr.evaluate(test)

#pred_train_df = lr.transform(train).withColumnRenamed('prediction','predicted_vehicle_value')
#pred_test_df = lr.transform(test).withColumnRenamed('prediction', 'predicted_vehicle_value')

#result=lr.evaluate(test)
#print(result.r2)

#unlabeled_data=test.select("final_feature_vector")
#predictions=lr.transform(unlabeled_data)
#predictions.show()


# Step 7: Make predictions on the testing data
predictions = lr.transform(test)

# Step 8: Evaluate the model
evaluator = RegressionEvaluator(labelCol="price", predictionCol="prediction", metricName="r2")
r2 = evaluator.evaluate(predictions)
mse = evaluator.evaluate(predictions, {evaluator.metricName: "mse"})
mae = evaluator.evaluate(predictions, {evaluator.metricName: "mae"})
rmse = evaluator.evaluate(predictions, {evaluator.metricName: "rmse"})

# Print the evaluation metrics
print("R2 on test data = %g" % r2)
print("Mean squared error:", mse)
print("Mean absolute error:", mae)
print("Root mean squared error:", rmse)

# Show predictions
predictions.select("final_feature_vector", "price", "prediction").show()


R2 on test data = 0.743116
Mean squared error: 38948822.38252994
Mean absolute error: 4640.350445107468
Root mean squared error: 6240.899164585976
+--------------------+-----+-------------------+
|final_feature_vector|price|         prediction|
+--------------------+-----+-------------------+
|(186,[0,1,3,23,76...| 3900|  6108.907040803881|
|(186,[0,1,3,23,74...| 4500| 112.30122763755207|
|(186,[0,1,6,23,75...| 4750| -7905.007637644871|
|(186,[0,1,16,23,7...| 4900| 1440.0850407869839|
|(186,[0,1,2,28,74...| 5991|-171.45731900099054|
|(186,[0,1,20,23,7...| 6500|  9519.871136631193|
|(186,[0,1,6,30,75...| 6500|  12724.29484689311|
|(186,[0,1,2,23,76...| 7200|  15326.06860554286|
|(186,[0,1,5,23,76...| 7500| -5820.476367295847|
|(186,[0,1,6,23,76...| 8200| 12847.421796108987|
|(186,[0,1,10,23,7...| 8499|  9910.185377823325|
|(186,[0,1,3,23,76...| 9500| 12604.677624211541|
|(186,[0,1,3,23,76...| 9988|  16891.53947227353|
|(186,[0,1,6,23,76...|11000| 15118.939766590569|
|(186,[0,1,5,39,76..

In [5]:
rf = RandomForestRegressor(featuresCol='final_feature_vector',labelCol='price')
rf = rf.fit(train)

predictions = rf.transform(test)

# Evaluate the model
evaluator = RegressionEvaluator(labelCol="price", predictionCol="prediction", metricName="r2")

# Evaluate the model
r2 = evaluator.evaluate(predictions)
mse = evaluator.evaluate(predictions, {evaluator.metricName: "mse"})
mae = evaluator.evaluate(predictions, {evaluator.metricName: "mae"})
rmse = evaluator.evaluate(predictions, {evaluator.metricName: "rmse"})

# Print the evaluation metrics
print("R2 on test data = %g" % r2)
print("Mean squared error:", mse)
print("Mean absolute error:", mae)
print("Root mean squared error:", rmse)

# Show predictions
predictions.select("final_feature_vector", "price", "prediction").show()

R2 on test data = 0.686468
Mean squared error: 47537754.49670209
Mean absolute error: 5159.37308984287
Root mean squared error: 6894.762831069833
+--------------------+-----+------------------+
|final_feature_vector|price|        prediction|
+--------------------+-----+------------------+
|(186,[0,1,3,23,76...| 3900|10114.175231275975|
|(186,[0,1,3,23,74...| 4500|10376.878064782726|
|(186,[0,1,6,23,75...| 4750| 8146.762035620438|
|(186,[0,1,16,23,7...| 4900| 9500.227944210896|
|(186,[0,1,2,28,74...| 5991| 8780.978165985589|
|(186,[0,1,20,23,7...| 6500| 9155.349491119863|
|(186,[0,1,6,30,75...| 6500|  9850.82213390965|
|(186,[0,1,2,23,76...| 7200| 18495.53578962089|
|(186,[0,1,5,23,76...| 7500|  9921.94636707278|
|(186,[0,1,6,23,76...| 8200|19053.153481524063|
|(186,[0,1,10,23,7...| 8499|10352.797791822859|
|(186,[0,1,3,23,76...| 9500|20262.772349341856|
|(186,[0,1,3,23,76...| 9988|11245.759102995926|
|(186,[0,1,6,23,76...|11000|14016.323812685529|
|(186,[0,1,5,39,76...|12900| 18677.019

In [6]:
# Define Decision Tree Regressor
dt = DecisionTreeRegressor(featuresCol='final_feature_vector',
                      labelCol='price')

# Train the model
dt = dt.fit(train)

# Make predictions
predictions = dt.transform(test)

# Evaluate the model
evaluator = RegressionEvaluator(labelCol="price", predictionCol="prediction", metricName="r2")
r2 = evaluator.evaluate(predictions)
mse = evaluator.evaluate(predictions, {evaluator.metricName: "mse"})
mae = evaluator.evaluate(predictions, {evaluator.metricName: "mae"})
rmse = evaluator.evaluate(predictions, {evaluator.metricName: "rmse"})

# Print the evaluation metrics
print("R2 on test data = %g" % r2)
print("Mean squared error:", mse)
print("Mean absolute error:", mae)
print("Root mean squared error:", rmse)

# Show predictions
predictions.select("final_feature_vector", "price", "prediction").show()

R2 on test data = 0.650171
Mean squared error: 53041059.0453581
Mean absolute error: 5393.497656813513
Root mean squared error: 7282.929290152288
+--------------------+-----+------------------+
|final_feature_vector|price|        prediction|
+--------------------+-----+------------------+
|(186,[0,1,3,23,76...| 3900|10982.724018589934|
|(186,[0,1,3,23,74...| 4500|10982.724018589934|
|(186,[0,1,6,23,75...| 4750| 7788.588836394521|
|(186,[0,1,16,23,7...| 4900| 7788.588836394521|
|(186,[0,1,2,28,74...| 5991|12500.075196344424|
|(186,[0,1,20,23,7...| 6500|10982.724018589934|
|(186,[0,1,6,30,75...| 6500| 7788.588836394521|
|(186,[0,1,2,23,76...| 7200| 20115.38405162984|
|(186,[0,1,5,23,76...| 7500| 7788.588836394521|
|(186,[0,1,6,23,76...| 8200|15833.057570977919|
|(186,[0,1,10,23,7...| 8499| 7788.588836394521|
|(186,[0,1,3,23,76...| 9500| 20115.38405162984|
|(186,[0,1,3,23,76...| 9988|10982.724018589934|
|(186,[0,1,6,23,76...|11000|11405.149932157396|
|(186,[0,1,5,39,76...|12900|22734.9459

In [7]:
# Define Ridge Regression model
ridge = LinearRegression(featuresCol="final_feature_vector", labelCol="price", elasticNetParam=0.0, regParam=0.5) # regParam is the regularization parameter for Ridge Regression

# Train the model
ridge_model = ridge.fit(train)

# Make predictions
predictions = ridge_model.transform(test)

# Evaluate the model
evaluator = RegressionEvaluator(labelCol="price", predictionCol="prediction", metricName="r2")
r2 = evaluator.evaluate(predictions)
mse = evaluator.evaluate(predictions, {evaluator.metricName: "mse"})
mae = evaluator.evaluate(predictions, {evaluator.metricName: "mae"})
rmse = evaluator.evaluate(predictions, {evaluator.metricName: "rmse"})
print("R2 on test data = %g" % r2)
print("Mean squared error:", mse)
print("Mean absolute error:", mae)
print("Root mean squared error:", rmse)
# Show predictions
predictions.select("final_feature_vector", "price", "prediction").show()

R2 on test data = 0.743116
Mean squared error: 38948809.745570175
Mean absolute error: 4640.338939907425
Root mean squared error: 6240.8981521548785
+--------------------+-----+-------------------+
|final_feature_vector|price|         prediction|
+--------------------+-----+-------------------+
|(186,[0,1,3,23,76...| 3900|  6109.265210978125|
|(186,[0,1,3,23,74...| 4500| 112.87396984635416|
|(186,[0,1,6,23,75...| 4750| -7904.114396578705|
|(186,[0,1,16,23,7...| 4900| 1440.7160952953163|
|(186,[0,1,2,28,74...| 5991|-171.09320751106134|
|(186,[0,1,20,23,7...| 6500|  9519.996705272728|
|(186,[0,1,6,30,75...| 6500| 12724.176367756787|
|(186,[0,1,2,23,76...| 7200| 15326.391892851847|
|(186,[0,1,5,23,76...| 7500| -5819.570609181599|
|(186,[0,1,6,23,76...| 8200| 12847.224991158264|
|(186,[0,1,10,23,7...| 8499|  9910.534971907164|
|(186,[0,1,3,23,76...| 9500|  12604.76843654692|
|(186,[0,1,3,23,76...| 9988| 16891.541002381928|
|(186,[0,1,6,23,76...|11000| 15118.846256434274|
|(186,[0,1,5,39,76

In [8]:
# Define Ridge Regression model
lasso = LinearRegression(featuresCol="final_feature_vector", labelCol="price", elasticNetParam=1.0, regParam=0.1) # regParam is the regularization parameter for Ridge Regression

# Train the model
las_model = lasso.fit(train)

# Make predictions
predictions = las_model.transform(test)

# Evaluate the model
evaluator = RegressionEvaluator(labelCol="price", predictionCol="prediction", metricName="r2")
r2 = evaluator.evaluate(predictions)
mse = evaluator.evaluate(predictions, {evaluator.metricName: "mse"})
mae = evaluator.evaluate(predictions, {evaluator.metricName: "mae"})
rmse = evaluator.evaluate(predictions, {evaluator.metricName: "rmse"})
print("R2 on test data = %g" % r2)
print("Mean squared error:", mse)
print("Mean absolute error:", mae)
print("Root mean squared error:", rmse)
# Show predictions
predictions.select("final_feature_vector", "price", "prediction").show()

R2 on test data = 0.743115
Mean squared error: 38949005.4006498
Mean absolute error: 4640.342148746028
Root mean squared error: 6240.913827369338
+--------------------+-----+-------------------+
|final_feature_vector|price|         prediction|
+--------------------+-----+-------------------+
|(186,[0,1,3,23,76...| 3900|  6109.338796938748|
|(186,[0,1,3,23,74...| 4500| 111.15723483642068|
|(186,[0,1,6,23,75...| 4750| -7903.279073356662|
|(186,[0,1,16,23,7...| 4900| 1439.1016819446595|
|(186,[0,1,2,28,74...| 5991|-173.57784649423047|
|(186,[0,1,20,23,7...| 6500|  9518.195562997218|
|(186,[0,1,6,30,75...| 6500| 12722.274636017126|
|(186,[0,1,2,23,76...| 7200| 15325.396707734355|
|(186,[0,1,5,23,76...| 7500| -5823.042511518564|
|(186,[0,1,6,23,76...| 8200| 12848.318277258508|
|(186,[0,1,10,23,7...| 8499|  9910.226508231712|
|(186,[0,1,3,23,76...| 9500| 12605.221170141886|
|(186,[0,1,3,23,76...| 9988|  16891.04591368686|
|(186,[0,1,6,23,76...|11000|  15121.45301514355|
|(186,[0,1,5,39,76...

In [9]:
# Train GBoost model
gbt = GBTRegressor(featuresCol="final_feature_vector", labelCol="price", maxIter=175)
model = gbt.fit(train)

# Make predictions
predictions = model.transform(test)


# Evaluate the model
evaluator = RegressionEvaluator(labelCol="price", predictionCol="prediction", metricName="r2")
r2 = evaluator.evaluate(predictions)
mse = evaluator.evaluate(predictions, {evaluator.metricName: "mse"})
mae = evaluator.evaluate(predictions, {evaluator.metricName: "mae"})
rmse = evaluator.evaluate(predictions, {evaluator.metricName: "rmse"})
print("R2 on test data = %g" % r2)
print("Mean squared error:", mse)
print("Mean absolute error:", mae)
print("Root mean squared error:", rmse)
# Show predictions
predictions.select("final_feature_vector", "price", "prediction").show()

R2 on test data = 0.842289
Mean squared error: 23912227.2681711
Mean absolute error: 3388.127346053956
Root mean squared error: 4890.013013088114
+--------------------+-----+------------------+
|final_feature_vector|price|        prediction|
+--------------------+-----+------------------+
|(186,[0,1,3,23,76...| 3900|6808.4406292880785|
|(186,[0,1,3,23,74...| 4500|7285.7724225155625|
|(186,[0,1,6,23,75...| 4750| 4634.417508310227|
|(186,[0,1,16,23,7...| 4900| 7640.617234001238|
|(186,[0,1,2,28,74...| 5991| 4349.747365716682|
|(186,[0,1,20,23,7...| 6500|  8255.21521167016|
|(186,[0,1,6,30,75...| 6500| 7822.015681993523|
|(186,[0,1,2,23,76...| 7200|14452.627643046984|
|(186,[0,1,5,23,76...| 7500| 7428.094462120806|
|(186,[0,1,6,23,76...| 8200|10971.939164453655|
|(186,[0,1,10,23,7...| 8499| 8915.405505002782|
|(186,[0,1,3,23,76...| 9500| 12071.43422107346|
|(186,[0,1,3,23,76...| 9988| 12383.41063188269|
|(186,[0,1,6,23,76...|11000| 12906.71767793949|
|(186,[0,1,5,39,76...|12900|19281.8823

In [11]:
from pyspark.sql.types import StructType, StructField, StringType, DoubleType

# Define the schema for the DataFrame
schema = StructType([
    StructField("Model", StringType(), True),
    StructField("R2", DoubleType(), True),
    StructField("Mse", DoubleType(), True),
    StructField("Mae",DoubleType(),True),
    StructField("Rmse",DoubleType(),True)
])

# Define the data for the DataFrame
data = [
    ("Linear", 0.743116,38948822.382529,4640.35044,6240.89916),
    ("RandomForest",0.686468,47537754.496702,5159.3730898,6894.7628310),
    ("DecisionTree", 0.650171,53041059.0453581, 5393.4976568,7282.929290),
    ("Ridge",0.743116,38948809.745570,4640.3389399,6240.8981521),
    ("Lasso",0.743115,38949005.4006498,4640.34214874,6240.9138273),
    ("GradientBoosting",0.842289,23912227.2681711,3388.12734605,4890.013013088)
]

# Create a DataFrame using the schema and data
df = spark.createDataFrame(data, schema)

# Show the DataFrame
df.show()


ConnectionRefusedError: [WinError 10061] No connection could be made because the target machine actively refused it

In [None]:
'''
paramGrid = (ParamGridBuilder()
             .addGrid(gbt.maxDepth, [5, 10, 15])
             .addGrid(gbt.maxBins, [20, 30])
             .build())

evaluator = RegressionEvaluator(predictionCol="prediction", labelCol="price", metricName="r2")

cv = CrossValidator(estimator=gbt, estimatorParamMaps=paramGrid, evaluator=evaluator, numFolds=5)

cvModel = cv.fit(train)

predictions = cvModel.transform(test)
r2= evaluator.evaluate(predictions)
print("R2 on test data = %g" % r2)


'''