<a href="https://colab.research.google.com/github/zaraaa12/BigData/blob/main/macbook_preprocessing.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [4]:
from pyspark.sql import SparkSession
spark = SparkSession.builder.appName('MacBook').getOrCreate()


In [5]:
df = spark.read.csv('/content/Data_Sales_MacBook.csv', header = True, inferSchema = True)
df.show(100)

+------+-----------------+-------------+----------+--------------------+--------------------+
|Rating|         Location|         Sold|     Price|                URLs|        Product_Name|
+------+-----------------+-------------+----------+--------------------+--------------------+
|  NULL|            Bogor|         NULL| 1.500.000|https://shopee.co...|Macbook Air M1 20...|
|   5.0|    Jakarta Pusat|    9 Terjual| 6.100.000|https://shopee.co...|MacBook Air  2020...|
|   5.0|  Jakarta Selatan|1,6RB Terjual|12.499.000|https://shopee.co...|MacBook Air 13 in...|
|   4.3|           Bekasi|   14 Terjual|12.499.000|https://shopee.co...|Apple MacBook Air...|
|   3.0|           Bekasi|   10 Terjual|17.499.000|https://shopee.co...|Apple MacBook Air...|
|  NULL|           Bekasi|    1 Terjual|18.999.000|https://shopee.co...|Apple Macbook Air...|
|   5.0|           Bekasi|    2 Terjual|22.499.000|https://shopee.co...|Apple Macbook Air...|
|   5.0|    Jakarta Pusat|   13 Terjual| 7.200.000|https://s

In [6]:
df_cleaned = df.dropDuplicates()
df.show()

+------+---------------+-------------+----------+--------------------+--------------------+
|Rating|       Location|         Sold|     Price|                URLs|        Product_Name|
+------+---------------+-------------+----------+--------------------+--------------------+
|  NULL|          Bogor|         NULL| 1.500.000|https://shopee.co...|Macbook Air M1 20...|
|   5.0|  Jakarta Pusat|    9 Terjual| 6.100.000|https://shopee.co...|MacBook Air  2020...|
|   5.0|Jakarta Selatan|1,6RB Terjual|12.499.000|https://shopee.co...|MacBook Air 13 in...|
|   4.3|         Bekasi|   14 Terjual|12.499.000|https://shopee.co...|Apple MacBook Air...|
|   3.0|         Bekasi|   10 Terjual|17.499.000|https://shopee.co...|Apple MacBook Air...|
|  NULL|         Bekasi|    1 Terjual|18.999.000|https://shopee.co...|Apple Macbook Air...|
|   5.0|         Bekasi|    2 Terjual|22.499.000|https://shopee.co...|Apple Macbook Air...|
|   5.0|  Jakarta Pusat|   13 Terjual| 7.200.000|https://shopee.co...|Macbook Pr

In [7]:
df_cleaned = df_cleaned.na.drop()
df_cleaned.show(100)

+------+-----------------+-------------+----------+--------------------+--------------------+
|Rating|         Location|         Sold|     Price|                URLs|        Product_Name|
+------+-----------------+-------------+----------+--------------------+--------------------+
|   4.9|    Jakarta Pusat|  654 Terjual|10.488.000|https://shopee.co...|"MacBook M1 | Mac...|
|   5.0|         Semarang|    2 Terjual| 8.703.000|https://shopee.co...|Macbook Air 13 M1...|
|   5.0|          Bandung|    5 Terjual|10.699.000|https://shopee.co...|"Macbook Air M1 8...|
|   5.0|      Kab. Sleman|    1 Terjual|10.699.000|https://shopee.co...|Macbook Air M1 8/...|
|   5.0|    Jakarta Timur|    1 Terjual| 9.000.000|https://shopee.co...|[PRELOVED] MacBoo...|
|   5.0|         Surabaya|   64 Terjual| 6.000.000|https://shopee.co...|Macbook Pro M1 Ch...|
|   5.0|  Jakarta Selatan|   10 Terjual|23.499.000|https://shopee.co...|Macbook Air 15 in...|
|   5.0|    Jakarta Utara|  393 Terjual|10.654.000|https://s

In [8]:
from os import truncate
from pyspark.sql.functions import regexp_replace, when, col

df_cleaned = df_cleaned.withColumn(
    "Sold",
    regexp_replace(col("Sold"), r"(\s*Terjual|\+)", "")  # hapus "Terjual" dan "+"
)

# Step 2: Tangani "RB"
df_cleaned = df_cleaned.withColumn(
    "Sold",
    when(
        col("Sold").rlike(r"(?i)RB"),  # jika ada RB
        regexp_replace(col("Sold"), r"(?i)RB", "")
        .cast("float") * 1000
    ).otherwise(
        col("Sold").cast("int")  # jika tidak ada RB
    )
)


df_cleaned = df_cleaned.withColumn(
    "Price",
    regexp_replace(col("Price"), r"\.", "").cast("int")
)

df_cleaned.show(100)

+------+-----------------+------+--------+--------------------+--------------------+
|Rating|         Location|  Sold|   Price|                URLs|        Product_Name|
+------+-----------------+------+--------+--------------------+--------------------+
|   4.9|    Jakarta Pusat| 654.0|10488000|https://shopee.co...|"MacBook M1 | Mac...|
|   5.0|         Semarang|   2.0| 8703000|https://shopee.co...|Macbook Air 13 M1...|
|   5.0|          Bandung|   5.0|10699000|https://shopee.co...|"Macbook Air M1 8...|
|   5.0|      Kab. Sleman|   1.0|10699000|https://shopee.co...|Macbook Air M1 8/...|
|   5.0|    Jakarta Timur|   1.0| 9000000|https://shopee.co...|[PRELOVED] MacBoo...|
|   5.0|         Surabaya|  64.0| 6000000|https://shopee.co...|Macbook Pro M1 Ch...|
|   5.0|  Jakarta Selatan|  10.0|23499000|https://shopee.co...|Macbook Air 15 in...|
|   5.0|    Jakarta Utara| 393.0|10654000|https://shopee.co...|IBOX APPLE MACBOO...|
|   5.0|            Depok|  15.0| 8940000|https://shopee.co...|Ma

In [9]:
df_cleaned.show(100)

+------+-----------------+------+--------+--------------------+--------------------+
|Rating|         Location|  Sold|   Price|                URLs|        Product_Name|
+------+-----------------+------+--------+--------------------+--------------------+
|   4.9|    Jakarta Pusat| 654.0|10488000|https://shopee.co...|"MacBook M1 | Mac...|
|   5.0|         Semarang|   2.0| 8703000|https://shopee.co...|Macbook Air 13 M1...|
|   5.0|          Bandung|   5.0|10699000|https://shopee.co...|"Macbook Air M1 8...|
|   5.0|      Kab. Sleman|   1.0|10699000|https://shopee.co...|Macbook Air M1 8/...|
|   5.0|    Jakarta Timur|   1.0| 9000000|https://shopee.co...|[PRELOVED] MacBoo...|
|   5.0|         Surabaya|  64.0| 6000000|https://shopee.co...|Macbook Pro M1 Ch...|
|   5.0|  Jakarta Selatan|  10.0|23499000|https://shopee.co...|Macbook Air 15 in...|
|   5.0|    Jakarta Utara| 393.0|10654000|https://shopee.co...|IBOX APPLE MACBOO...|
|   5.0|            Depok|  15.0| 8940000|https://shopee.co...|Ma

In [10]:
df_cleaned.describe(['Price', 'Sold']).show()

+-------+-----------------+------------------+
|summary|            Price|              Sold|
+-------+-----------------+------------------+
|  count|               79|                78|
|   mean|9538156.962025316| 84.85897435897436|
| stddev|4714535.502466965|254.78933107589728|
|    min|            80000|               1.0|
|    max|         23499000|            2000.0|
+-------+-----------------+------------------+



In [11]:
from pyspark.sql.functions import corr
df_cleaned.select(corr('Price', 'Sold')).show()

+-------------------+
|  corr(Price, Sold)|
+-------------------+
|-0.2528219677071286|
+-------------------+



In [12]:
from pyspark.sql.functions import sum

df_cleaned.groupBy("Location").agg(sum("Sold").alias("Total_Sold")) \
    .orderBy("Total_Sold", ascending=False).show()

+-----------------+----------+
|         Location|Total_Sold|
+-----------------+----------+
|    Jakarta Utara|    3367.0|
|    Jakarta Pusat|    1226.0|
|    Jakarta Barat|     771.0|
|  Jakarta Selatan|     473.0|
|        Tangerang|     207.0|
|      Kab. Sleman|     122.0|
|         Surabaya|      98.0|
|Tangerang Selatan|      91.0|
|     Kab. Cirebon|      63.0|
|           Bekasi|      60.0|
|   Kab. Tangerang|      58.0|
|          Bandung|      36.0|
|         Makassar|      17.0|
|            Depok|      15.0|
|        Pekanbaru|       5.0|
|         Semarang|       4.0|
|    Kab. Karawang|       2.0|
|    Jakarta Timur|       2.0|
|        Pontianak|       1.0|
|           Malang|       1.0|
+-----------------+----------+



In [13]:
from pyspark.ml.feature import Tokenizer, StopWordsRemover, CountVectorizer

tokenizer = Tokenizer(inputCol="Product_Name", outputCol="words")
wordsData = tokenizer.transform(df_cleaned)

remover = StopWordsRemover(inputCol="words", outputCol="filtered")
filteredData = remover.transform(wordsData)

vectorizer = CountVectorizer(inputCol="filtered", outputCol="features")
model = vectorizer.fit(filteredData)
result = model.transform(filteredData)

# Tampilkan 10 kata yang paling sering muncul
vocab = model.vocabulary
frequencies = result.select("features").rdd \
    .map(lambda row: row.features.toArray()) \
    .reduce(lambda x, y: x + y)

for word, freq in sorted(zip(vocab, frequencies), key=lambda x: -x[1])[:10]:
    print(word, int(freq))

air 68
macbook 67
m1 52
2020 39
256gb 30
13 28
inch 24
gb 23
ssd 23
second 20


In [14]:
#tokenize
from pyspark.ml.feature import Tokenizer

tokenizer = Tokenizer(inputCol="Product_Name", outputCol="items")
data_tokenized = tokenizer.transform(df_cleaned)

In [15]:
from pyspark.ml.fpm import FPGrowth
from pyspark.sql.functions import array_distinct

# Apply array_distinct to the "items" column to remove duplicates
data_tokenized_unique = data_tokenized.withColumn("items", array_distinct("items"))

# Now, apply FPGrowth to the data with unique items
fpGrowth = FPGrowth(itemsCol="items", minSupport=0.01, minConfidence=0.3)
model = fpGrowth.fit(data_tokenized_unique)

In [16]:
model.freqItemsets.show(100, truncate=False)

+----------------------------------------------------------+----+
|items                                                     |freq|
+----------------------------------------------------------+----+
|[mgn63id]                                                 |1   |
|[mgn63id, garansi]                                        |1   |
|[mgn63id, garansi, gb]                                    |1   |
|[mgn63id, garansi, gb, m1]                                |1   |
|[mgn63id, garansi, gb, m1, macbook]                       |1   |
|[mgn63id, garansi, gb, m1, macbook, air]                  |1   |
|[mgn63id, garansi, gb, m1, air]                           |1   |
|[mgn63id, garansi, gb, ssd]                               |1   |
|[mgn63id, garansi, gb, ssd, m1]                           |1   |
|[mgn63id, garansi, gb, ssd, m1, macbook]                  |1   |
|[mgn63id, garansi, gb, ssd, m1, macbook, air]             |1   |
|[mgn63id, garansi, gb, ssd, m1, air]                      |1   |
|[mgn63id,

In [None]:
model.associationRules.show(truncate=False)

In [None]:
model.associationRules.filter("array_contains(antecedent, 'murah')").show(truncate=False)

In [None]:
df_fix.write.csv('/content/output/Data_Sales_MacBook.csv', header = True)