In [1]:
pip install pyspark 

Note: you may need to restart the kernel to use updated packages.



[notice] A new release of pip available: 22.3.1 -> 23.3.1
[notice] To update, run: python.exe -m pip install --upgrade pip


In [2]:
pip install numpy




[notice] A new release of pip available: 22.3.1 -> 23.3.1
[notice] To update, run: python.exe -m pip install --upgrade pip


In [3]:
import os
from pyspark.sql import SparkSession

# Point at the local Hadoop installation before the Spark session is created.
os.environ["HADOOP_HOME"] = "C:\\hadoop-3.3.5"

# NOTE(review): this sets an OS environment variable named "hadoop.home.dir",
# not a JVM system property. Kept unchanged; confirm whether anything reads it.
os.environ["hadoop.home.dir"] = "C:\\hadoop-3.3.5"

# Initialize (or reuse) the SparkSession used by the rest of the notebook
spark = SparkSession.builder.appName("ebay_analysis").getOrCreate()

# File location and type. os.path.join yields the same backslash-separated
# relative path on Windows and a valid path on POSIX systems as well.
file_location = os.path.join("Data", "Data with coordinates", "Raw_Dataset.csv")
file_type = "csv"

# CSV options: schema inference is off, so every column is loaded as a string;
# the first row supplies the column names.
infer_schema = "false"
first_row_is_header = "true"
delimiter = ","

# Read data from the CSV file into a DataFrame
raw_dataset = (
    spark.read.format(file_type)
    .option("inferSchema", infer_schema)
    .option("header", first_row_is_header)
    .option("sep", delimiter)
    .load(file_location)
)

# Preview the loaded rows
raw_dataset.show()

+--------------------+----------+------------+------+--------------------+------------+----------------+---------------------+-------------+--------------------+---------------+--------------------+--------------------+---------------+
|            Location|  Latitude|   Longitude|Gender|               Title|       Price|Total Sold Items|Total Available Items|       Rating|         Seller Name|Seller Feedback|   Product Condition|                 URL|       Category|
+--------------------+----------+------------+------+--------------------+------------+----------------+---------------------+-------------+--------------------+---------------+--------------------+--------------------+---------------+
|Boston, United St...|42.3554334|  -71.060511|Unisex|Car Steering Whee...|   US $15.46|          2 sold|          3 available|Not Available|             luobo-e|           100%|                 New|https://www.ebay....|Car Accessories|
|Chicago, Illinois...|41.8755616| -87.6244212|Unisex|1pc

In [4]:
# Preview the raw, still all-string DataFrame using show()'s default settings
# spelled out explicitly (first 20 rows, long values truncated).
raw_dataset.show(n=20, truncate=True)

+--------------------+----------+------------+------+--------------------+------------+----------------+---------------------+-------------+--------------------+---------------+--------------------+--------------------+---------------+
|            Location|  Latitude|   Longitude|Gender|               Title|       Price|Total Sold Items|Total Available Items|       Rating|         Seller Name|Seller Feedback|   Product Condition|                 URL|       Category|
+--------------------+----------+------------+------+--------------------+------------+----------------+---------------------+-------------+--------------------+---------------+--------------------+--------------------+---------------+
|Boston, United St...|42.3554334|  -71.060511|Unisex|Car Steering Whee...|   US $15.46|          2 sold|          3 available|Not Available|             luobo-e|           100%|                 New|https://www.ebay....|Car Accessories|
|Chicago, Illinois...|41.8755616| -87.6244212|Unisex|1pc

In [5]:
from pyspark.sql.functions import regexp_replace, col, regexp_extract, when, rand, round

# Fixed seed for the placeholder ratings generated below, so that a fresh
# kernel run over the same input yields the same imputed values.
RATING_FILL_SEED = 42

# Strip everything except digits and the decimal point from "Price"
# (e.g. "US $15.46" -> "15.46"), then convert it to a numeric format.
raw_dataset_df = raw_dataset.withColumn("Price", regexp_replace(col("Price"), "[^0-9.]", ""))
raw_dataset_df = raw_dataset_df.withColumn("Price", col("Price").cast("float"))

# Keep only the first run of digits (e.g. "2 sold" -> 2).
raw_dataset_df = raw_dataset_df.withColumn(
    "Total Sold Items",
    regexp_extract(col("Total Sold Items"), r'(\d+)', 1).cast("integer"),
)

# Ratings marked "Not Available" are replaced by a random placeholder:
# rand() * 9 + 1 lies in [1, 10), rounding gives 1..10, and halving gives a
# value between 0.5 and 5.0 in steps of 0.5. Every other rating is cast to float.
# `round` here is pyspark.sql.functions.round, not the Python builtin.
raw_dataset_df = raw_dataset_df.withColumn(
    "Rating",
    when(
        col("Rating") == "Not Available",
        (round(rand(seed=RATING_FILL_SEED) * 9 + 1) / 2).cast("float"),
    ).otherwise(col("Rating").cast("float")),
)

# Keep only the first run of digits (e.g. "3 available" -> 3).
raw_dataset_df = raw_dataset_df.withColumn(
    "Total Available Items",
    regexp_extract(col("Total Available Items"), r'(\d+)', 1).cast("integer"),
)

raw_dataset_df.show()

+--------------------+----------+------------+------+--------------------+-----+----------------+---------------------+------+--------------------+---------------+--------------------+--------------------+---------------+
|            Location|  Latitude|   Longitude|Gender|               Title|Price|Total Sold Items|Total Available Items|Rating|         Seller Name|Seller Feedback|   Product Condition|                 URL|       Category|
+--------------------+----------+------------+------+--------------------+-----+----------------+---------------------+------+--------------------+---------------+--------------------+--------------------+---------------+
|Boston, United St...|42.3554334|  -71.060511|Unisex|Car Steering Whee...|15.46|               2|                    3|   2.0|             luobo-e|           100%|                 New|https://www.ebay....|Car Accessories|
|Chicago, Illinois...|41.8755616| -87.6244212|Unisex|1pc Carbon Fiber ...| 6.54|               4|               

In [6]:
# Discard every row that contains a null in any column.
raw_dataset_df = raw_dataset_df.dropna(how="any")
raw_dataset_df.show()


+--------------------+----------+------------+------+--------------------+-----+----------------+---------------------+------+--------------------+---------------+--------------------+--------------------+---------------+
|            Location|  Latitude|   Longitude|Gender|               Title|Price|Total Sold Items|Total Available Items|Rating|         Seller Name|Seller Feedback|   Product Condition|                 URL|       Category|
+--------------------+----------+------------+------+--------------------+-----+----------------+---------------------+------+--------------------+---------------+--------------------+--------------------+---------------+
|Boston, United St...|42.3554334|  -71.060511|Unisex|Car Steering Whee...|15.46|               2|                    3|   2.0|             luobo-e|           100%|                 New|https://www.ebay....|Car Accessories|
|Chicago, Illinois...|41.8755616| -87.6244212|Unisex|1pc Carbon Fiber ...| 6.54|               4|               

In [7]:
from pyspark.sql.functions import col

# Target Spark type for each numeric column; applied one column at a time,
# in the order listed.
numeric_column_types = [
    ("Price", "float"),
    ("Total Sold Items", "int"),
    ("Total Available Items", "int"),
    ("Rating", "double"),
]

for column_name, spark_type in numeric_column_types:
    raw_dataset_df = raw_dataset_df.withColumn(column_name, col(column_name).cast(spark_type))

raw_dataset_df.show()

+--------------------+----------+------------+------+--------------------+-----+----------------+---------------------+-----------------+--------------------+---------------+--------------------+--------------------+---------------+
|            Location|  Latitude|   Longitude|Gender|               Title|Price|Total Sold Items|Total Available Items|           Rating|         Seller Name|Seller Feedback|   Product Condition|                 URL|       Category|
+--------------------+----------+------------+------+--------------------+-----+----------------+---------------------+-----------------+--------------------+---------------+--------------------+--------------------+---------------+
|Boston, United St...|42.3554334|  -71.060511|Unisex|Car Steering Whee...|15.46|               2|                    3|              2.0|             luobo-e|           100%|                 New|https://www.ebay....|Car Accessories|
|Chicago, Illinois...|41.8755616| -87.6244212|Unisex|1pc Carbon Fibe

In [8]:
from pyspark.sql.functions import monotonically_increasing_id

# Tag every row with a generated identifier. Per the Spark documentation these
# ids are unique and increasing, but not guaranteed to be consecutive.
raw_dataset_df = raw_dataset_df.withColumn("PID", monotonically_increasing_id())

# Move "PID" and "Category" to the front; the remaining columns keep their
# existing order. The column list is taken from the DataFrame being reordered
# (not the raw one), and the loop variable no longer shadows the imported `col`.
leading_columns = ["PID", "Category"]
remaining_columns = [name for name in raw_dataset_df.columns if name not in leading_columns]
raw_dataset_df = raw_dataset_df.select(*leading_columns, *remaining_columns)

raw_dataset_df.show()

+---+---------------+--------------------+----------+------------+------+--------------------+-----+----------------+---------------------+-----------------+--------------------+---------------+--------------------+--------------------+
|PID|       Category|            Location|  Latitude|   Longitude|Gender|               Title|Price|Total Sold Items|Total Available Items|           Rating|         Seller Name|Seller Feedback|   Product Condition|                 URL|
+---+---------------+--------------------+----------+------------+------+--------------------+-----+----------------+---------------------+-----------------+--------------------+---------------+--------------------+--------------------+
|  0|Car Accessories|Boston, United St...|42.3554334|  -71.060511|Unisex|Car Steering Whee...|15.46|               2|                    3|              2.0|             luobo-e|           100%|                 New|https://www.ebay....|
|  1|Car Accessories|Chicago, Illinois...|41.8755616

In [9]:
from pyspark.sql.functions import col

# Define the allowed categories.
# Each entry needs its own trailing comma: adjacent string literals without a
# comma are silently concatenated by Python into a single (wrong) entry.
allowed_categories = [
    "Antiques",
    "Men Accessories",
    "Home and Industrial Accessories",
    "Vehicles",
    "Kids",
    "Electronics",
    "Automotive",
    "Sports",
    "Women Clothing",
    "Musical Instruments",
    "Car Accessories",
    "Miscellaneous",
]

# Filter the DataFrame to keep only rows whose Category is in the allowed list
raw_dataset_df = raw_dataset_df.filter(col("Category").isin(allowed_categories))

# Show the result without truncating long cell values
raw_dataset_df.show(truncate=False)

+---+---------------+------------------------------------------+----------+------------+------+--------------------------------------------------------------------------------+-----+----------------+---------------------+-----------------+---------------------+---------------+-----------------------+---------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------+
|PID|Category       |Location                                  |Latitude  |Longitude   |Gender|Title                                                                           |Price|Total Sold Items|Total Available Items|Rating           |Seller Name

In [10]:
# Print the column names, types and nullability of the cleaned DataFrame
# to confirm the numeric casts applied in the earlier cells.
raw_dataset_df.printSchema()

root
 |-- PID: long (nullable = false)
 |-- Category: string (nullable = true)
 |-- Location: string (nullable = true)
 |-- Latitude: string (nullable = true)
 |-- Longitude: string (nullable = true)
 |-- Gender: string (nullable = true)
 |-- Title: string (nullable = true)
 |-- Price: float (nullable = true)
 |-- Total Sold Items: integer (nullable = true)
 |-- Total Available Items: integer (nullable = true)
 |-- Rating: double (nullable = true)
 |-- Seller Name: string (nullable = true)
 |-- Seller Feedback: string (nullable = true)
 |-- Product Condition: string (nullable = true)
 |-- URL: string (nullable = true)


AttributeError: 'DataFrame' object has no attribute 'indexExists'

In [11]:
# Reduce the DataFrame to a single partition, then write it as CSV with a
# header row, replacing whatever is already in the output directory.
output_dir = "E:\\Ebay Data Analysis\\Ebay_Analysis\\Output"
(
    raw_dataset_df.coalesce(1)
    .write.mode("overwrite")
    .option("header", True)
    .csv(output_dir)
)
