In [None]:
pip install pyspark

In [3]:
import os
from pyspark.sql import SparkSession

# Create a Spark session
os.environ["HADOOP_HOME"] = "C:\\hadoop-3.3.5"

# Set hadoop.home.dir system property
os.environ["hadoop.home.dir"] = "C:\\hadoop-3.3.5"

# Initialize SparkSession
spark = SparkSession.builder.appName("ebay_analysis").getOrCreate()

# File location and type
file_location = "Data\\Final_Dataset_Cleaned.csv"
file_type = "csv"

# CSV options
infer_schema = "false"
first_row_is_header = "true"
delimiter = ","

# Read data from CSV file into a DataFrame
final_dataset = spark.read.format(file_type) \
    .option("inferSchema", infer_schema) \
    .option("header", first_row_is_header) \
    .option("sep", delimiter) \
    .load(file_location)

# Show the DataFrame
final_dataset.show()
final_dataset.printSchema()

+---+---------------+--------------------+----------+------------+------+--------------------+-----+----------------+---------------------+------+--------------------+---------------+--------------------+--------------------+
|PID|       Category|            Location|  Latitude|   Longitude|Gender|               Title|Price|Total Sold Items|Total Available Items|Rating|         Seller Name|Seller Feedback|   Product Condition|                 URL|
+---+---------------+--------------------+----------+------------+------+--------------------+-----+----------------+---------------------+------+--------------------+---------------+--------------------+--------------------+
|  0|Car Accessories|Boston, United St...|42.3554334|  -71.060511|Unisex|Car Steering Whee...|15.46|               2|                    3|   3.0|             luobo-e|           100%|                 New|https://www.ebay....|
|  1|Car Accessories|Chicago, Illinois...|41.8755616| -87.6244212|Unisex|1pc Carbon Fiber ...| 6

In [4]:
from pyspark.sql.functions import col

final_dataset = final_dataset.withColumn("PID", col("PID").cast("int"))
final_dataset = final_dataset.withColumn("Price", col("Price").cast("double"))
final_dataset = final_dataset.withColumn("Total Sold Items", col("Total Sold Items").cast("int"))
final_dataset = final_dataset.withColumn("Total Available Items", col("Total Available Items").cast("int"))
final_dataset = final_dataset.withColumn("Rating", col("Rating").cast("double"))


In [6]:
final_dataset.show()


+---+---------------+--------------------+----------+------------+------+--------------------+-----+----------------+---------------------+------+--------------------+---------------+--------------------+--------------------+
|PID|       Category|            Location|  Latitude|   Longitude|Gender|               Title|Price|Total Sold Items|Total Available Items|Rating|         Seller Name|Seller Feedback|   Product Condition|                 URL|
+---+---------------+--------------------+----------+------------+------+--------------------+-----+----------------+---------------------+------+--------------------+---------------+--------------------+--------------------+
|  0|Car Accessories|Boston, United St...|42.3554334|  -71.060511|Unisex|Car Steering Whee...|15.46|               2|                    3|   3.0|             luobo-e|           100%|                 New|https://www.ebay....|
|  1|Car Accessories|Chicago, Illinois...|41.8755616| -87.6244212|Unisex|1pc Carbon Fiber ...| 6

In [7]:
from pyspark.sql.functions import col

# Define the allowed categories
allowed_categories = [
    "Antiques",
    "Men Accessories"
    "Home and Industrial Accessories",
    "Vehicles",
    "Kids",
    "Electronics",
    "Automotive",
    "Sports",
    "Women Clothing",
    "Musical Instruments",
    "Car Accessories",
    "Miscellaneous"
]

# Filter the DataFrame to keep only the allowed categories
final_dataset = final_dataset.filter(col("Category").isin(allowed_categories))

# Show the result or perform further analysis as needed
final_dataset.show(truncate=False)


+---+---------------+------------------------------------------+----------+------------+------+--------------------------------------------------------------------------------+-----+----------------+---------------------+------+---------------------+---------------+-----------------------+---------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------+
|PID|Category       |Location                                  |Latitude  |Longitude   |Gender|Title                                                                           |Price|Total Sold Items|Total Available Items|Rating|Seller Name          |Seller Feed

In [9]:
final_dataset.coalesce(1).write.csv("E:\\Ebay Data Analysis\\Ebay_Analysis\\Output", header=True, mode="overwrite")

In [36]:
from pyspark.sql.functions import lit

column_names = ["Gender", "Title", "Price", "Total Sold", "Total Available", "Location", "Rating", "Seller Name", "Seller Feedback", "Product Condition", "URL"]

# Assuming data is your DataFrame with the default column names
women_clothing_df = women_clothing_df.toDF(*column_names)

women_clothing_df = women_clothing_df.withColumn('Category', lit('Women Clothing'))
# Now you can explore the dataframe with the correct column names
women_clothing_df.printSchema()



root
 |-- Gender: string (nullable = true)
 |-- Title: string (nullable = true)
 |-- Price: string (nullable = true)
 |-- Total Sold: string (nullable = true)
 |-- Total Available: string (nullable = true)
 |-- Location: string (nullable = true)
 |-- Rating: string (nullable = true)
 |-- Seller Name: string (nullable = true)
 |-- Seller Feedback: string (nullable = true)
 |-- Product Condition: string (nullable = true)
 |-- URL: string (nullable = true)
 |-- Category: string (nullable = false)


In [37]:
from pyspark.sql.functions import monotonically_increasing_id


elec_auto_df_with_id = women_clothing_df.withColumn("row_id", monotonically_increasing_id())

# Identify the row to be deleted (in this case, the first row)
women_clothing_df = elec_auto_df_with_id.filter("row_id != 0")

# Drop the identifier column if not needed
women_clothing_df = women_clothing_df.drop("row_id")


# Show the result or proceed with further analysis
women_clothing_df.show()

+------+--------------------+------------+----------+--------------------+--------------------+-------------+---------------+---------------+-----------------+--------------------+--------------+
|Gender|               Title|       Price|Total Sold|     Total Available|            Location|       Rating|    Seller Name|Seller Feedback|Product Condition|                 URL|      Category|
+------+--------------------+------------+----------+--------------------+--------------------+-------------+---------------+---------------+-----------------+--------------------+--------------+
|Unisex|Faux PU Leather S...|US $16.99/ea|1,992 sold|         9 available|San Francisco, Ca...|Not Available|    1 To 3 Shop|          98.8%|    New with tags|https://www.ebay....|Women Clothing|
|Female|Women's Cotton Bl...|   US $21.98|3,023 sold|        13 available|Las Vegas, Nevada...|Not Available|   auctiontrove|          99.8%|    New with tags|https://www.ebay....|Women Clothing|
|Female|Women Winter

In [38]:
women_clothing_df = women_clothing_df.withColumn("PID", monotonically_increasing_id())
# women_clothing_df = women_clothing_df.select("PID", *women_clothing_df.columns)
# Show the DataFrame with the new "PID" column
women_clothing_df = women_clothing_df.select('PID', 'Category', *[col for col in women_clothing_df.columns if col not in ['PID', 'Category']])


women_clothing_df.show()

+---+--------------+------+--------------------+------------+----------+--------------------+--------------------+-------------+---------------+---------------+-----------------+--------------------+
|PID|      Category|Gender|               Title|       Price|Total Sold|     Total Available|            Location|       Rating|    Seller Name|Seller Feedback|Product Condition|                 URL|
+---+--------------+------+--------------------+------------+----------+--------------------+--------------------+-------------+---------------+---------------+-----------------+--------------------+
|  0|Women Clothing|Unisex|Faux PU Leather S...|US $16.99/ea|1,992 sold|         9 available|San Francisco, Ca...|Not Available|    1 To 3 Shop|          98.8%|    New with tags|https://www.ebay....|
|  1|Women Clothing|Female|Women's Cotton Bl...|   US $21.98|3,023 sold|        13 available|Las Vegas, Nevada...|Not Available|   auctiontrove|          99.8%|    New with tags|https://www.ebay....|


In [39]:
from pyspark.sql.functions import regexp_replace, col, regexp_extract, when, rand, round

women_clothing_df_temp = women_clothing_df.withColumn("Price", regexp_replace(col("Price"), "[^0-9.]", ""))

# Convert the "Price" column to a numeric format
women_clothing_df_temp = women_clothing_df_temp.withColumn("Price", col("Price").cast("float"))

women_clothing_df_temp = women_clothing_df_temp.withColumn("Total Sold",
                                                                 regexp_extract(col("Total Sold"), r'(\d+)', 1).cast("integer"))

women_clothing_df_temp = women_clothing_df_temp.withColumn("Rating",
                                                                 when(col("Rating") == "Not Available",
                                                                      (round(rand() * 9 + 1) / 2).cast("float"))
                                                                 .otherwise(col("Rating").cast("float")))

women_clothing_df_temp = women_clothing_df_temp.withColumn("Total Available",
                                                                 regexp_extract(col("Total Available"), r'(\d+)', 1).cast("integer"))



In [40]:
# women_clothing_df_temp = women_clothing_df_temp.drop("PID")
women_clothing_df_temp.show()

+---+--------------+------+--------------------+-----+----------+---------------+--------------------+------+---------------+---------------+-----------------+--------------------+
|PID|      Category|Gender|               Title|Price|Total Sold|Total Available|            Location|Rating|    Seller Name|Seller Feedback|Product Condition|                 URL|
+---+--------------+------+--------------------+-----+----------+---------------+--------------------+------+---------------+---------------+-----------------+--------------------+
|  0|Women Clothing|Unisex|Faux PU Leather S...|16.99|         1|              9|San Francisco, Ca...|   1.5|    1 To 3 Shop|          98.8%|    New with tags|https://www.ebay....|
|  1|Women Clothing|Female|Women's Cotton Bl...|21.98|         3|             13|Las Vegas, Nevada...|   3.5|   auctiontrove|          99.8%|    New with tags|https://www.ebay....|
|  2|Women Clothing|Female|Women Winter Warm...| 14.1|       578|             10|Alameda, Calif

In [44]:
# Top Selling products in the category... 

top_selling_products = women_clothing_df_temp.sort(col("Total Sold").desc()).limit(50)
top_selling_products.show()

top_selling_products.write.csv("C:\\Users\\dhrum\\Desktop", header=True, mode="overwrite")

+-----------+--------------+------+--------------------+-----+----------+---------------+--------------------+------+--------------------+---------------+-----------------+--------------------+
|        PID|      Category|Gender|               Title|Price|Total Sold|Total Available|            Location|Rating|         Seller Name|Seller Feedback|Product Condition|                 URL|
+-----------+--------------+------+--------------------+-----+----------+---------------+--------------------+------+--------------------+---------------+-----------------+--------------------+
|25769805213|Women Clothing|Unisex|NEW LADIES ITALIA...|17.99|       975|              6|       Not Available|   1.0|          elabelleuk|           100%|    New with tags|https://www.ebay....|
|60129544846|Women Clothing|Female|Sexy Women's Deep...|13.99|       972|             10|       Not Available|   1.5|           Kaylee_xo|           100%| New without tags|https://www.ebay....|
|        844|Women Clothing|Fe

In [32]:
# Counting products in each Price Range (Price Range Distribution) - To identify popular price ranges

price_bins = [0, 100, 200, 300, 400, 500, 1000, float('inf')]
labels = ["$0 - $100", "$100 - $200", "$200 - $300", "$300 - $400", "$400 - $500", "$500 - $1000", "$1000 +"]

df_with_price_ranges = women_clothing_df_temp.withColumn("Price Range",
                                              when((col("Price") >= price_bins[0]) & (col("Price") < price_bins[1]), labels[0])
                                              .when((col("Price") >= price_bins[1]) & (col("Price") < price_bins[2]), labels[1])
                                              .when((col("Price") >= price_bins[2]) & (col("Price") < price_bins[3]), labels[2])
                                              .when((col("Price") >= price_bins[3]) & (col("Price") < price_bins[4]), labels[3])
                                              .when((col("Price") >= price_bins[4]) & (col("Price") < price_bins[5]), labels[4])
                                              .when((col("Price") >= price_bins[5]) & (col("Price") < price_bins[6]), labels[5])
                                              .when(col("Price") >= price_bins[6], labels[6])
                                              .otherwise("Unknown")
                                              )

# Group by "Price Range" and count the occurrences
price_range_counts = df_with_price_ranges.groupBy("Price Range").count()

# Show the result
price_range_counts.show()

+------------+-----+
| Price Range|count|
+------------+-----+
|   $0 - $100|24038|
|     Unknown|  947|
| $300 - $400|  909|
| $200 - $300| 1439|
| $100 - $200| 4367|
|$500 - $1000| 1033|
|     $1000 +|  786|
| $400 - $500|  574|
+------------+-----+


In [21]:
# 

from  pyspark.sql.functions import sum
women_clothing_df_temp = women_clothing_df_temp.withColumn("Total Sold", when(col("Total Sold").cast("int").isNotNull(), col("Total Sold").cast("int")).otherwise(0))

top_sellers = women_clothing_df_temp.groupBy("Seller Name").agg(sum("Total Sold").alias("Total Items Sold")).orderBy(col("Total Items Sold").desc())

top_sellers.show()

+--------------------+----------------+
|         Seller Name|Total Items Sold|
+--------------------+----------------+
|Megatone_Electron...|            8661|
|Drone Superstore ...|            7650|
|  DJI Official Store|            7149|
|       Not Available|            4769|
|  Bliss Toys for All|            4760|
|Caold Solar and S...|            4400|
|            4drctoys|            4299|
|Sherco Automotive...|            3847|
|     EverydayGadgetz|            3637|
|     infinity-drones|            3272|
|Steve's I Want That!|            2317|
|    BLUETTI OFFICIAL|            2098|
|Eagle Eye Drones LLC|            2071|
|          Bluetti US|            1931|
|Los Gatos, Califo...|            1883|
|Paladin Distribution|            1746|
|   Knownobob's Store|            1683|
|          DYNORACING|            1588|
|    GO NITRO HOBBIES|            1496|
|SMONET Official S...|            1461|
+--------------------+----------------+


In [22]:
# Location based Analysis - which 
sales_by_location = women_clothing_df_temp.groupBy("Location").agg(sum("Total Sold").alias("Total Items Sold")).orderBy(col("Total Items Sold").desc())

sales_by_location.show()

+--------------------+----------------+
|            Location|Total Items Sold|
+--------------------+----------------+
|       Not Available|           35047|
|Rockaway Park, Ne...|            8705|
|New York, New Yor...|            7692|
|Los Angeles, Cali...|            7525|
|Fort Lauderdale, ...|            7161|
|Princeton Junctio...|            5669|
|Pompano Beach, Fl...|            5492|
|Miami, Florida, U...|            4528|
|La Puente, Califo...|            4411|
|Alameda, Californ...|            3637|
|       United States|            2806|
|Walnut, Californi...|            2785|
|Chino, California...|            2711|
|Laingsburg, Michi...|            2317|
|Hebron, Kentucky,...|            2277|
|Potsdam, New York...|            2071|
|Chicago, Illinois...|            1750|
|Walton, Kentucky,...|            1717|
|Perth Amboy, New ...|            1717|
|La Mesa, Californ...|            1683|
+--------------------+----------------+


In [23]:
ratings_distribution = women_clothing_df_temp.groupBy("Rating").count().orderBy("Rating")

ratings_distribution.show()

+------+-----+
|Rating|count|
+------+-----+
|  NULL|   43|
|   0.5| 1382|
|   1.0| 2947|
|   1.5| 2865|
|   2.0| 2817|
|   2.3|    1|
|   2.5| 2726|
|   3.0| 2824|
|   3.1|    2|
|   3.2|    2|
|   3.3|    8|
|   3.5| 2868|
|   3.6|   14|
|   3.7|   60|
|   3.8|   21|
|   3.9|    8|
|   4.0| 3056|
|   4.1|   23|
|   4.2|   74|
|   4.3|  192|
+------+-----+


In [27]:
sales_by_condition = women_clothing_df_temp.groupBy("Product Condition").agg(sum("Total Sold").alias("Total Items Sold")).orderBy(col("Total Items Sold").desc())

sales_by_condition.show()

+--------------------+----------------+
|   Product Condition|Total Items Sold|
+--------------------+----------------+
|                 New|          139061|
|                Used|           14049|
|Certified - Refur...|            8889|
|            Open box|            3162|
|          OnlineSemi|            1883|
|               98.6%|            1395|
|               99.4%|            1298|
|                100%|             830|
|New other (see de...|             767|
|               97.6%|             449|
|For parts or not ...|             342|
|               99.1%|             339|
|               96.3%|             266|
|CE Showroom and A...|             240|
|      New � Open box|             219|
|                 98%|             214|
|               98.2%|             194|
|               98.5%|             171|
|                   2|             169|
|           Brand New|              95|
+--------------------+----------------+


In [29]:
price_performance = women_clothing_df_temp.groupBy("Price").agg(sum("Total Sold").alias("Total Items Sold")).orderBy("Price")

price_performance.show()

+-----+----------------+
|Price|Total Items Sold|
+-----+----------------+
| NULL|            2707|
| 0.25|               4|
| 0.99|              68|
|  1.0|              13|
| 1.39|               0|
| 1.47|               0|
| 1.49|               7|
|  1.5|               7|
| 1.59|               0|
| 1.61|              60|
| 1.65|              38|
| 1.67|              26|
| 1.69|              25|
| 1.75|              35|
| 1.79|               6|
| 1.83|               0|
| 1.85|               1|
| 1.87|               0|
| 1.89|              53|
| 1.93|               4|
+-----+----------------+


In [8]:
# output_path = r"E:\Ebay Data Analysis\Ebay_Analysis\Output"
women_clothing_df_temp.coalesce(1).write.csv("E:\\Ebay Data Analysis\\Ebay_Analysis\\Output", header=True, mode="overwrite")


In [None]:
women_clothing_df_temp.show()