In [60]:
from selenium.webdriver.chrome.service import Service
from selenium.webdriver.chrome.options import Options
from selenium.webdriver.common.by import By
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC
from selenium import webdriver
from urllib import request
from tabulate import tabulate
import pandas as pd
import requests
import random
import luigi
import time
import json
import csv 
import re
import os

In [2]:
class ExtractTokpedExsportData(luigi.Task):
    def requires(self):
        pass # Tidak ada task yang diperlukan
    
    def output(self):
        return luigi.LocalTarget('extract-raw-data/exsport_tokped_raw.csv') # MTempat penyimpanan data yang diekstrak


    def run(self):
        base_url = "https://www.tokopedia.com/exsportstore/product/page/{}" # URL dasar untuk mengambil data produk exsport dari Tokopedia

        # Mengatur opsi untuk webdriver Chrome
        options = webdriver.ChromeOptions()
        options.add_argument('--disable-blink-features=AutomationControlled') # Menonaktifkan fitur otomatisasi
        options.add_experimental_option('useAutomationExtension', False) # Menonaktifkan ekstensi otomatisasi
        options.add_experimental_option("excludeSwitches", ["enable-automation"]) # Mengecualikan switch otomatisasi
        driver = webdriver.Chrome(options=options) # Membuat instance dari webdriver Chrome

        product_data = [] # List untuk menyimpan data produk

        try:
            for page in range(1, 12): # Mengambil data dari halaman 1 hingga 11
                url = base_url.format(page) # Membuat URL untuk halaman saat ini
                driver.get(url) # Mengakses URL

                # Menunggu hingga elemen body muncul
                WebDriverWait(driver, 15).until(
                    EC.presence_of_element_located((By.TAG_NAME, 'body'))
                )

                # Menggulir halaman untuk memuat lebih banyak produk
                for _ in range(5):
                    driver.execute_script("window.scrollTo(0, document.body.scrollHeight);") # Menggulir ke bawah
                    time.sleep(2) # Menunggu 2 detik
                    driver.execute_script("window.scrollTo(0, document.body.scrollHeight / 2);") # Menggulir ke atas
                    time.sleep(2) # Menunggu 2 detik

                # Mengambil elemen produk
                product_containers = driver.find_elements(By.CSS_SELECTOR, "[data-testid='divProductWrapper']")

                for container in product_containers: # Iterasi setiap elemen produk
                    try:
                        name = container.find_element(By.CSS_SELECTOR, "[data-testid='linkProductName']").text # Mengambil nama produk
                    except:
                        name = None # Jika gagal, set nama menjadi None
                    
                    # Mengambil link produk dari elemen
                    try:
                        link = container.find_element(By.CSS_SELECTOR, "a.pcv3__info-content").get_attribute('href') # Mencari elemen link produk dan mengambil atribut 'href'
                    except:
                        link = None # Jika gagal, set link menjadi None

                    # Mengambil harga jual produk dari elemen
                    try:
                        price_sale_elem = container.find_element(By.CSS_SELECTOR, "[data-testid='linkProductPrice']") # Mencari elemen harga jual produk
                        price_sale = price_sale_elem.text if price_sale_elem else None # Mengambil teks dari elemen harga jual produk
                    except:
                        price_sale = None # Jika gagal, set harga jual menjadi None

                    # Mengambil harga asli produk dari elemen
                    try:
                        price_elem = container.find_element(By.CSS_SELECTOR, "[data-testid='lblProductSlashPrice']") # Mencari elemen harga asli produk
                        price = price_elem.text if price_elem else None # Mengambil teks dari elemen harga asli produk
                    except:
                        price = None # Jika gagal, set harga asli menjadi None

                    try:
                        discount_elem = container.find_element(By.CSS_SELECTOR, "[data-testid='lblProductDiscount']") # Mencari elemen diskon produk
                        discount = discount_elem.text if discount_elem else None # Mengambil teks dari elemen diskon produk
                    except:
                        discount = None # Jika gagal, set diskon menjadi None

                    # Mengambil rating produk dari elemen
                    try:
                        rating_elem = container.find_element(By.CSS_SELECTOR, ".prd_rating-average-text") # Mencari elemen rating produk
                        rating = rating_elem.text if rating_elem else None # Mengambil teks dari elemen rating produk
                    except:
                        rating = None # Jika gagal, set rating menjadi None
                    
                    # Mengambil jumlah produk yang terjual dari elemen
                    try:
                        sold_elem = container.find_element(By.CSS_SELECTOR, ".prd_label-integrity") # Mencari elemen jumlah produk yang terjual
                        sold = sold_elem.text if sold_elem else None # Mengambil teks dari elemen jumlah produk yang terjual
                    except:
                        sold = None # Jika gagal, set jumlah produk yang terjual menjadi None

                    # Mengambil link gambar produk dari elemen
                    try:
                        image_elem = container.find_element(By.CSS_SELECTOR, ".css-1q90pod") # Mencari elemen gambar produk
                        image = image_elem.get_attribute('src') if image_elem else None # Mengambil atribut 'src' dari elemen gambar produk
                    except:
                        image = None # Jika gagal, set link gambar menjadi None

                    # Menambahkan data produk ke dalam list product_data
                    product_data.append({
                        'name_product': name, # Nama produk
                        'product_link': link, # Link produk
                        'price_sale': price_sale, # Harga jual
                        'price_original': price, # Harga asli
                        'discount': discount, # Diskon
                        'sold': sold, # Jumlah produk yang terjual
                        'rating': rating, # Rating produk
                        'image_link': image # Link gambar produk
                    })

            # Mengonversi list product_data ke dalam DataFrame
            exsport_tokped_df = pd.DataFrame(product_data)

            # Menyimpan DataFrame ke dalam file CSV
            exsport_tokped_df.to_csv(self.output().path, index=False)

        except Exception as e:
            print(f"Terjadi kesalahan: {e}") # Menampilkan pesan kesalahan jika terjadi kesalahan
        
        finally:
            driver.quit() # Menutup browser

In [14]:
luigi.build([ExtractTokpedExsportData()], local_scheduler=True) # Menjalankan task ExtractTokpedExsportData

DEBUG: Checking if ExtractTokpedExsportData() is complete
INFO: Informed scheduler that task   ExtractTokpedExsportData__99914b932b   has status   PENDING
INFO: Done scheduling tasks
INFO: Running Worker with 1 processes
DEBUG: Asking scheduler for work...
DEBUG: Pending tasks: 1
INFO: [pid 26588] Worker Worker(salt=3377661383, workers=1, host=zueible, username=LENOVO, pid=26588) running   ExtractTokpedExsportData()
INFO: [pid 26588] Worker Worker(salt=3377661383, workers=1, host=zueible, username=LENOVO, pid=26588) done      ExtractTokpedExsportData()
DEBUG: 1 running tasks, waiting for next task to finish
INFO: Informed scheduler that task   ExtractTokpedExsportData__99914b932b   has status   DONE
DEBUG: Asking scheduler for work...
DEBUG: Done
DEBUG: There are no more tasks to run at this time
INFO: Worker Worker(salt=3377661383, workers=1, host=zueible, username=LENOVO, pid=26588) was stopped. Shutting down Keep-Alive thread
INFO: 
===== Luigi Execution Summary =====

Scheduled 1 t

True

In [5]:
class ExtractTokpedStockExsportData(luigi.Task):
    def requires(self):
        return ExtractTokpedExsportData() # Task yang diperlukan
    
    def output(self):
        return luigi.LocalTarget('extract-raw-data/exsport_stock_tokped_raw.csv') # Tempat penyimpanan data yang diekstrak


    def run(self):
        extract_data = pd.read_csv(self.input().path) # Membaca file CSV yang diekstrak sebelumnya
        pages = extract_data['product_link'].tolist() # Mengambil kolom product_link dan mengonversinya ke dalam list

        # Mengatur opsi untuk webdriver Chrome
        options = Options()
        options.add_argument("--headless")
        options.add_argument("--disable-gpu")
        options.add_argument("--window-size=1920x1080")
        driver = webdriver.Chrome(options=options) # Membuat instance dari webdriver Chrome

        product_data = [] # List untuk menyimpan data produk

        try:
            for page in pages: # Mengambil data dari halaman 1 hingga 11
                url = page # Membuat URL untuk halaman saat ini
                driver.get(url) # Mengakses URL

                # Menunggu hingga elemen body muncul
                WebDriverWait(driver, 15).until(
                    EC.presence_of_element_located((By.TAG_NAME, 'body'))
                )

                # Menggulir halaman untuk memuat lebih banyak produk
                for _ in range(5):
                    driver.execute_script("window.scrollTo(0, document.body.scrollHeight);") # Menggulir ke bawah
                    time.sleep(2) # Menunggu 2 detik
                    driver.execute_script("window.scrollTo(0, document.body.scrollHeight / 2);") # Menggulir ke atas
                    time.sleep(2) # Menunggu 2 detik

                # Mengambil elemen produk
                product_containers = driver.find_elements(By.CSS_SELECTOR, "[class='css-856ghu']")

                for container in product_containers: # Iterasi setiap elemen produk
                    try:
                        name = container.find_element(By.CSS_SELECTOR, "[data-testid='lblPDPDetailProductName']").text # Mengambil nama produk
                    except:
                        name = None # Jika gagal, set nama menjadi None

                    # Mengambil stock jual produk dari elemen
                    try:
                        stock_elem = container.find_element(By.CSS_SELECTOR, "[data-testid='stock-label']")  # Mencari elemen stok
                        stock_text = stock_elem.text.strip()  # Mengambil teks dan menghapus spasi ekstra
                        
                        # Mengambil angka stok setelah "Stok Total: "
                        stock = stock_text.split(":")[-1].strip()  
                    except:
                        stock = None  # Jika gagal, set stok menjadi None

                    # Mengambil kategori produk dari elemen
                    try:
                        # Mencari elemen etalase berdasarkan class
                        etalase_elem = container.find_element(By.CSS_SELECTOR, "li.css-1i6xy22 a b")
                        etalase = etalase_elem.text.strip()  # Mengambil teks dari elemen <b>
                    except:
                        etalase = None  # Jika gagal, set etalase menjadi None

                    # Menambahkan data produk ke dalam list product_data
                    product_data.append({
                        'name_product': name,
                        'stock': stock,
                        'kategori': etalase
                    })


            # Mengonversi list product_data ke dalam DataFrame
            exsport_stock_tokped_df = pd.DataFrame(product_data)

            # Menyimpan DataFrame ke dalam file CSV
            exsport_stock_tokped_df.to_csv(self.output().path, index=False)

        except Exception as e:
            print(f"Terjadi kesalahan: {e}") # Menampilkan pesan kesalahan jika terjadi kesalahan
        
        finally:
            driver.quit() # Menutup browser

In [6]:
luigi.build([ExtractTokpedStockExsportData()], local_scheduler=True) # Menjalankan task ExtractTokpedStockExsportData

DEBUG: Checking if ExtractTokpedStockExsportData() is complete
DEBUG: Checking if ExtractTokpedExsportData() is complete
INFO: Informed scheduler that task   ExtractTokpedStockExsportData__99914b932b   has status   PENDING
INFO: Informed scheduler that task   ExtractTokpedExsportData__99914b932b   has status   DONE
INFO: Done scheduling tasks
INFO: Running Worker with 1 processes
DEBUG: Asking scheduler for work...
DEBUG: Pending tasks: 1
INFO: [pid 13744] Worker Worker(salt=9433573625, workers=1, host=zueible, username=LENOVO, pid=13744) running   ExtractTokpedStockExsportData()
INFO: [pid 13744] Worker Worker(salt=9433573625, workers=1, host=zueible, username=LENOVO, pid=13744) done      ExtractTokpedStockExsportData()
DEBUG: 1 running tasks, waiting for next task to finish
INFO: Informed scheduler that task   ExtractTokpedStockExsportData__99914b932b   has status   DONE
DEBUG: Asking scheduler for work...
DEBUG: Done
DEBUG: There are no more tasks to run at this time
INFO: Worker Wo

True

In [6]:
class ValidateData(luigi.Task):
    def requires(self):
        return [ExtractTokpedExsportData(), ExtractTokpedStockExsportData()] # Task yang diperlukan
    
    def output(self):
        return luigi.LocalTarget('validate-raw-data/validate_data.txt')

    def run(self):
        # Buat direktori jika belum ada
        os.makedirs(os.path.dirname(self.output().path), exist_ok=True)
        
        raw_exsport_data = pd.read_csv(self.input()[0].path) # Membaca file CSV yang diekstrak sebelumnya
        raw_exsport_stock_data = pd.read_csv(self.input()[1].path) # Membaca file CSV yang diekstrak sebelumnya

        list_df = {
            'raw_exsport_data': raw_exsport_data,
            'raw_exsport_stock_data': raw_exsport_stock_data
        }
        
        with open(self.output().path, 'w', newline='') as f:
            # Check Data Shape
            f.write("==================== Check Data Shape ====================\n\n")
            result_df = []
            for name, df in list_df.items():
                n_columns = df.shape[1]
                n_rows = df.shape[0]
                result_df.append([name, n_columns, n_rows])
                
            headers_shape = ['Dataframe', 'Columns', 'Rows']
            f.write(tabulate(result_df, headers_shape, tablefmt="grid"))
            f.write("\n\n")

            # Check Data Values
            for name, df in list_df.items():
                result_val = []
                for col in df.columns:
                    col_type = df[col].dtype
                    sum_na = round(df[col].isna().sum() * 100 / len(df))
                    sum_dup = round(df.duplicated(keep=False).sum())
                    result_val.append([col, col_type, sum_na, sum_dup])

                headers_val = ['Column Name', 'Data Type', 'Missing Values (%)', 'Duplicate Values (count)']
                f.write(f"Checking Data Values: {name}\n")
                f.write(tabulate(result_val, headers_val, tablefmt="grid"))
                f.write("\n\n")

In [7]:
luigi.build([ValidateData()], local_scheduler=True) # Menjalankan task ValidateData

DEBUG: Checking if ValidateData() is complete
DEBUG: Checking if ExtractTokpedExsportData() is complete
DEBUG: Checking if ExtractTokpedStockExsportData() is complete
INFO: Informed scheduler that task   ValidateData__99914b932b   has status   PENDING
INFO: Informed scheduler that task   ExtractTokpedStockExsportData__99914b932b   has status   DONE
INFO: Informed scheduler that task   ExtractTokpedExsportData__99914b932b   has status   DONE
INFO: Done scheduling tasks
INFO: Running Worker with 1 processes
DEBUG: Asking scheduler for work...
DEBUG: Pending tasks: 1
INFO: [pid 23944] Worker Worker(salt=2009130413, workers=1, host=zueible, username=LENOVO, pid=23944) running   ValidateData()
INFO: [pid 23944] Worker Worker(salt=2009130413, workers=1, host=zueible, username=LENOVO, pid=23944) done      ValidateData()
DEBUG: 1 running tasks, waiting for next task to finish
INFO: Informed scheduler that task   ValidateData__99914b932b   has status   DONE
DEBUG: Asking scheduler for work...
D

True

In [49]:
df = pd.read_csv('extract-raw-data/exsport_tokped_raw.csv', index_col=False)
df.head()

Unnamed: 0,name_product,product_link,price_sale,price_original,discount,sold,rating,image_link
0,Exsport All Set Multipurpose Pouch - Dark Green,https://www.tokopedia.com/exsportstore/exsport...,Rp141.550,Rp149.000,5%,,,https://images.tokopedia.net/img/cache/200-squ...
1,Exsport All Set Multipurpose Pouch - Light Brown,https://www.tokopedia.com/exsportstore/exsport...,Rp141.550,Rp149.000,5%,,,https://images.tokopedia.net/img/cache/200-squ...
2,Exsport All Set Multipurpose Pouch - Dark Purple,https://www.tokopedia.com/exsportstore/exsport...,Rp141.550,Rp149.000,5%,,,https://images.tokopedia.net/img/cache/200-squ...
3,Exsport All Set Multipurpose Pouch - Black,https://www.tokopedia.com/exsportstore/exsport...,Rp141.550,Rp149.000,5%,,,https://images.tokopedia.net/img/cache/200-squ...
4,Exsport Basic Half Moon Mini Sling Bag - Blue...,https://www.tokopedia.com/exsportstore/exsport...,Rp170.050,Rp179.000,5%,,,https://images.tokopedia.net/img/cache/200-squ...


In [4]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 802 entries, 0 to 801
Data columns (total 8 columns):
 #   Column          Non-Null Count  Dtype  
---  ------          --------------  -----  
 0   name_product    802 non-null    object 
 1   product_link    802 non-null    object 
 2   price_sale      802 non-null    object 
 3   price_original  327 non-null    object 
 4   discount        327 non-null    object 
 5   sold            720 non-null    object 
 6   rating          662 non-null    float64
 7   image_link      802 non-null    object 
dtypes: float64(1), object(7)
memory usage: 50.3+ KB


In [37]:
def extract_color(name_product):
    color_list = [
        'DARK GREEN', 'SKY BLUE', 'BIRU MUDA', 'BLUE', 'ORANGE', 'LIGHT YELLOW', 'GREEN ARMY', 'FUCHSIA',
        'LIGHT BROWN', 'DARK BLUE', 'MINT GREEN', 'DARK GREY', 'UNGU MUDA', 'FUCSHIA', 'LIGHT PINK', 'CREAM',
        'GOLD', 'DARK OLIVE', 'RED', 'GREY', 'WHITE', 'DARK PURPLE', 'SALEM', 'FUSCHIA', 'HITAM', 'LIME',
        'BROWN', 'BLUEBERRY', 'SOFT BLUE', 'BLACK', 'DARK SALEM', 'YELLOW', 'MAROON', 'DARK BROWN', 'BEIGE',
        'LIGHT BLUE', 'GREEN', 'COKELAT MUDA', 'LIGHT GREEN', 'CURRY', 'PINK', 'KREM', 'BIRU TUA', 'KHAKI',
        'PURPLE', 'EMERALD GREEN', 'DARK ORANGE', 'FUCHIA', 'NAVY', 'LIGHT PURPLE', 'OLIVE', 'LIGHT GREY'
    ]

    color_translation = {
        'KREM': 'CREAM',
        'COKELAT MUDA': 'LIGHT BROWN',
        'BIRU TUA': 'DARK BLUE',
        'HITAM': 'BLACK',
        'BIRU MUDA': 'LIGHT BLUE',
        'SALEM': 'SALMON',
        'UNGU MUDA': 'LIGHT PURPLE',
        'FUCSHIA': 'FUCHSIA',
        'FUCHSIA': 'FUCHSIA',
        'FUCHIA': 'FUCHSIA',
    }

    name_product = str(name_product).strip()

    # Konversi ke huruf besar
    name_upper = name_product.upper()

    # Ambil warna setelah tanda "-"
    color_product = name_upper.split('-')[-1].strip()

    # Urutan pengecekan:
    if color_product in color_translation:  # 1. Cek di color_translation dulu
        return color_translation[color_product]
    elif color_product in color_list:  # 2. Cek di color_list
        return color_product
    else:  # 3. Jika tidak ada di keduanya
        return 'Tidak ada spesifikasi warna'

In [50]:
df['color_product'] = df['name_product'].apply(extract_color)
df['name_product'] = df['name_product'].str.split('-').str[0].str.upper()
df.head()

Unnamed: 0,name_product,product_link,price_sale,price_original,discount,sold,rating,image_link,color_product
0,EXSPORT ALL SET MULTIPURPOSE POUCH,https://www.tokopedia.com/exsportstore/exsport...,Rp141.550,Rp149.000,5%,,,https://images.tokopedia.net/img/cache/200-squ...,DARK GREEN
1,EXSPORT ALL SET MULTIPURPOSE POUCH,https://www.tokopedia.com/exsportstore/exsport...,Rp141.550,Rp149.000,5%,,,https://images.tokopedia.net/img/cache/200-squ...,LIGHT BROWN
2,EXSPORT ALL SET MULTIPURPOSE POUCH,https://www.tokopedia.com/exsportstore/exsport...,Rp141.550,Rp149.000,5%,,,https://images.tokopedia.net/img/cache/200-squ...,DARK PURPLE
3,EXSPORT ALL SET MULTIPURPOSE POUCH,https://www.tokopedia.com/exsportstore/exsport...,Rp141.550,Rp149.000,5%,,,https://images.tokopedia.net/img/cache/200-squ...,BLACK
4,EXSPORT BASIC HALF MOON MINI SLING BAG,https://www.tokopedia.com/exsportstore/exsport...,Rp170.050,Rp179.000,5%,,,https://images.tokopedia.net/img/cache/200-squ...,BLUEBERRY


In [51]:
df['price_original'] = df['price_original'].fillna(df['price_sale'])
df['price_original'] = df['price_original'].str.replace('Rp', '').str.replace('.', '').str.strip().apply(pd.to_numeric)
df['price_sale'] = df['price_sale'].str.replace('Rp', '').str.replace('.', '').str.strip().apply(pd.to_numeric)
df['discount'] = df['discount'].fillna('Tidak ada Discount')
df['sold'] = df['sold'].str.split().str[0].fillna('0')
df['rating'] = df['rating'].fillna('Tidak ada Rating')

In [61]:
def generate_product_id(index):
    letters = "ETWS"  # Menggunakan ETWS secara berurutan
    numbers = ''.join(random.choices("0123456789", k=3))  # 3 angka acak
    
    return letters + numbers

In [67]:
def generate_color_id(index):
    letters = "ECLR"  # Menggunakan ETWS secara berurutan
    numbers = ''.join(random.choices("0123456789", k=3))  # 3 angka acak
    
    return letters + numbers

In [62]:
df["product_id"] = [generate_product_id(i) for i in range(len(df))]

In [68]:
df_color = {
    'color_product' : df['color_product'].unique(),
    'color_id' : [generate_color_id(i) for i in range(len(df['color_product'].unique()))]
}

In [69]:
df_color = pd.DataFrame(df_color)
df_color.head()

Unnamed: 0,color_product,color_id
0,DARK GREEN,ECLR602
1,LIGHT BROWN,ECLR976
2,DARK PURPLE,ECLR107
3,BLACK,ECLR056
4,BLUEBERRY,ECLR112


In [66]:
df.head()

Unnamed: 0,name_product,product_link,price_sale,price_original,discount,sold,rating,image_link,color_product,product_id
0,EXSPORT ALL SET MULTIPURPOSE POUCH,https://www.tokopedia.com/exsportstore/exsport...,141550,149000,5%,0,Tidak ada Rating,https://images.tokopedia.net/img/cache/200-squ...,DARK GREEN,ETWS420
1,EXSPORT ALL SET MULTIPURPOSE POUCH,https://www.tokopedia.com/exsportstore/exsport...,141550,149000,5%,0,Tidak ada Rating,https://images.tokopedia.net/img/cache/200-squ...,LIGHT BROWN,ETWS485
2,EXSPORT ALL SET MULTIPURPOSE POUCH,https://www.tokopedia.com/exsportstore/exsport...,141550,149000,5%,0,Tidak ada Rating,https://images.tokopedia.net/img/cache/200-squ...,DARK PURPLE,ETWS822
3,EXSPORT ALL SET MULTIPURPOSE POUCH,https://www.tokopedia.com/exsportstore/exsport...,141550,149000,5%,0,Tidak ada Rating,https://images.tokopedia.net/img/cache/200-squ...,BLACK,ETWS882
4,EXSPORT BASIC HALF MOON MINI SLING BAG,https://www.tokopedia.com/exsportstore/exsport...,170050,179000,5%,0,Tidak ada Rating,https://images.tokopedia.net/img/cache/200-squ...,BLUEBERRY,ETWS824


In [71]:
df = df.merge(df_color[['color_product', 'color_id']], on='color_product', how='left')

In [72]:
df.head()

Unnamed: 0,name_product,product_link,price_sale,price_original,discount,sold,rating,image_link,color_product,product_id,color_id
0,EXSPORT ALL SET MULTIPURPOSE POUCH,https://www.tokopedia.com/exsportstore/exsport...,141550,149000,5%,0,Tidak ada Rating,https://images.tokopedia.net/img/cache/200-squ...,DARK GREEN,ETWS420,ECLR602
1,EXSPORT ALL SET MULTIPURPOSE POUCH,https://www.tokopedia.com/exsportstore/exsport...,141550,149000,5%,0,Tidak ada Rating,https://images.tokopedia.net/img/cache/200-squ...,LIGHT BROWN,ETWS485,ECLR976
2,EXSPORT ALL SET MULTIPURPOSE POUCH,https://www.tokopedia.com/exsportstore/exsport...,141550,149000,5%,0,Tidak ada Rating,https://images.tokopedia.net/img/cache/200-squ...,DARK PURPLE,ETWS822,ECLR107
3,EXSPORT ALL SET MULTIPURPOSE POUCH,https://www.tokopedia.com/exsportstore/exsport...,141550,149000,5%,0,Tidak ada Rating,https://images.tokopedia.net/img/cache/200-squ...,BLACK,ETWS882,ECLR056
4,EXSPORT BASIC HALF MOON MINI SLING BAG,https://www.tokopedia.com/exsportstore/exsport...,170050,179000,5%,0,Tidak ada Rating,https://images.tokopedia.net/img/cache/200-squ...,BLUEBERRY,ETWS824,ECLR112


In [56]:
df_stock = pd.read_csv('extract-raw-data/exsport_stock_tokped_raw.csv', index_col=False)
df_stock.head()

Unnamed: 0,name_product,stock,kategori
0,Exsport All Set Multipurpose Pouch - Dark Green,13,Semua Etalase
1,Exsport All Set Multipurpose Pouch - Light Brown,14,Semua Etalase
2,Exsport All Set Multipurpose Pouch - Dark Purple,12,Semua Etalase
3,Exsport All Set Multipurpose Pouch - Black,14,Semua Etalase
4,Exsport Basic Half Moon Mini Sling Bag - Blueb...,21,Tas Sling Bag


In [57]:
df_stock['color_product'] = df_stock['name_product'].apply(extract_color)
df_stock['name_product'] = df_stock['name_product'].str.split('-').str[0].str.upper()
df_stock['kategori'] = df_stock['kategori'].str.upper()
df_stock.head()

Unnamed: 0,name_product,stock,kategori,color_product
0,EXSPORT ALL SET MULTIPURPOSE POUCH,13,SEMUA ETALASE,DARK GREEN
1,EXSPORT ALL SET MULTIPURPOSE POUCH,14,SEMUA ETALASE,LIGHT BROWN
2,EXSPORT ALL SET MULTIPURPOSE POUCH,12,SEMUA ETALASE,DARK PURPLE
3,EXSPORT ALL SET MULTIPURPOSE POUCH,14,SEMUA ETALASE,BLACK
4,EXSPORT BASIC HALF MOON MINI SLING BAG,21,TAS SLING BAG,BLUEBERRY


In [64]:
df_stock = df_stock.merge(df[['name_product', 'product_id']], on='name_product', how='left')

In [65]:
df_stock.head()

Unnamed: 0,name_product,stock,kategori,color_product,product_id
0,EXSPORT ALL SET MULTIPURPOSE POUCH,13,SEMUA ETALASE,DARK GREEN,ETWS420
1,EXSPORT ALL SET MULTIPURPOSE POUCH,13,SEMUA ETALASE,DARK GREEN,ETWS485
2,EXSPORT ALL SET MULTIPURPOSE POUCH,13,SEMUA ETALASE,DARK GREEN,ETWS822
3,EXSPORT ALL SET MULTIPURPOSE POUCH,13,SEMUA ETALASE,DARK GREEN,ETWS882
4,EXSPORT ALL SET MULTIPURPOSE POUCH,14,SEMUA ETALASE,LIGHT BROWN,ETWS420


In [55]:
print(list(set(df_stock['kategori'])))

['Tas Sling Bag', 'Tas Tote Bag', 'Poppin Series', 'Semua Etalase', 'TAS LAPTOP', 'STARTER TENNIS SERIES', 'SWEET VALENTINE GIFTS', 'BASIC COLLECTION', 'JUNIOR SERIES THE FUTURE ME', 'Vacation Series', 'Tas Mini Backpack', 'FILL IN COLOUR SERIES', 'Tas Backpack']
