In [1]:
import json
import random
from copy import deepcopy
from os import path
import os
import requests

In [2]:
# API endpoint and API key used for every request.
# The key is taken from the NOTEB_API_KEY environment variable when it is set, so a real
# credential does not have to be stored in the notebook; otherwise the value that was
# previously hard-coded here is used.
API_URL = "https://noteb.com/api/webservice.php"
API_KEY = os.environ.get("NOTEB_API_KEY", "112233aabbcc")

# Cache files used to avoid crawling the same data twice.
MODEL_CACHE_FILE = path.join("cache", "cached_model.txt")
CONFIGURATE_ID_CACHE_FILE = path.join("cache","cached_configurate_id.txt")
# Path to cached_gen_configurate.txt, which stores the hash ids of the configurations
# that have already been generated.
GEN_CONFIGURATE_CACHE_FILE = path.join("cache","cached_gen_configurate.txt")

In [3]:
# Feature domains: the accepted values for the producer, the processor brand
# and the processor model.
producer_domain = {"apple", "asus", "hp", "lenovo", "lg", "dell", "acer"}
processor_name_domain = {"intel", "amd", "cpu"}
processor_model_domain = {"i3", "i5", "i7", "ryzen 5", "ryzen 3", "ryzen 7", "integrated"}

# Maps each section of a model description to the request parameter that carries its id
# for the API's get_conf_info method.
# "model_info" maps to "model_id" (previously misspelled "mode_id"), the parameter name
# used by every request in this notebook.
config_map = {
    "model_info": "model_id",
    "cpu": "cpu_id",
    "display": "display_id",
    "memory": "memory_id",
    "primary_storage": "primary_storage_id",
    "secondary_storage": "secondary_storage_id",
    "gpu": "gpu_id",
    "wireless": "wireless_id",
    "optical_drive": "optical_drive_id",
    "motherboard": "motherboard_id",
    "chassis": "chassis_id",
    "battery": "battery_id",
    "warranty": "warranty_id",
    "operating_system": "operating_system_id"
}

# Per-feature settings. "param" is the request parameter name; "allow_custom" is added
# below for the features whose value may be swapped when generating extra data.
feature_config_map = {key: {"param": value} for key, value in config_map.items()}

# Features that are varied to generate additional data.
require_feature = ["memory", "primary_storage", "secondary_storage", "operating_system"]
for feature in require_feature:
  feature_config_map[feature]["allow_custom"] = True

##  Class Feature

Class `Feature` là class cha của các class con `Producer`, `Processor`, `Ram`, `HDDStorage`, `SSDStorage`, `Graphic`, `Screen`, `Chassis`, `OperatingSystem`, `Battery`.

Constructor cho mỗi class có tham số là `data`: `data` là dictionary result được lấy từ response json.

In [4]:
# Base type shared by all feature classes
class Feature:
    """Common parent of the feature classes; it carries no data of its own."""

    def __init__(self):
        return None

    def is_valid(self):
        """Placeholder validity check; the base implementation returns None."""
        return None

class Producer(Feature):
    """Laptop producer, recognised from the model's raw name."""

    def __init__(self, data):
        # data must provide the raw model name under "noteb_name".
        self.name = self.extract_name(data["noteb_name"])

    def extract_name(self, raw_name):
        """Return the entry of producer_domain contained in raw_name, or "N/A" if none is."""
        normalized = raw_name.strip().lower()
        return next(
            (candidate for candidate in producer_domain if candidate in normalized),
            "N/A",
        )

    def is_valid(self):
        """True when the extracted name is one of the known producers."""
        return self.name in producer_domain

    def __str__(self):
        return self.name

class Processor(Feature):
    """Processor description: brand, model family, core count and clock speeds."""

    def __init__(self, data):
        brand = data["prod"].strip().lower()
        self.name = brand
        self.model = self.extract_model(data["model"])
        self.cores = int(data["cores"])
        self.core_speed = float(data["base_speed"])
        self.boost_speed = float(data['boost_speed'])

    def extract_model(self, raw_model):
        """Return the entry of processor_model_domain contained in raw_model, or "N/A"."""
        normalized = raw_model.strip().lower()
        return next(
            (candidate for candidate in processor_model_domain if candidate in normalized),
            "N/A",
        )

    def is_valid(self):
        """True when both the brand and the model family belong to their domains."""
        if self.name not in processor_name_domain:
            return False
        return self.model in processor_model_domain

    def __str__(self):
        parts = (self.name, self.model, self.cores, self.core_speed, self.boost_speed)
        return ", ".join(str(part) for part in parts)

class Ram(Feature):
    """Memory description: normalised type string and integer size."""

    def __init__(self, data):
        kind = data["type"]
        self.type = kind.strip().lower()
        self.size = int(data["size"])

    def is_valid(self):
        """True when the size is a positive integer."""
        if not isinstance(self.size, int):
            return False
        return self.size > 0

    def __str__(self):
        return f"{self.type}, {self.size}"

class HDDStorage(Feature):
    """Total capacity of the storage slots whose model string contains "HDD"."""

    def __init__(self, data):
        self.size = self.extract_size(data)

    def extract_size(self, data):
        """Add up "cap" over the primary and secondary storage entries marked as HDD."""
        return sum(
            int(data[slot]["cap"])
            for slot in ("primary_storage", "secondary_storage")
            if "HDD" in data[slot]["model"]
        )

    def is_valid(self):
        """True when the size is a non-negative integer (0 means no HDD)."""
        if not isinstance(self.size, int):
            return False
        return self.size >= 0

    def __str__(self):
        return f"{self.size}"

class SSDStorage(Feature):
    """Total capacity of the storage slots whose model string contains "SSD"."""

    def __init__(self, data):
        self.size = self.extract_size(data)

    def extract_size(self, data):
        """Add up "cap" over the primary and secondary storage entries marked as SSD."""
        return sum(
            int(data[slot]["cap"])
            for slot in ("primary_storage", "secondary_storage")
            if "SSD" in data[slot]["model"]
        )

    def is_valid(self):
        """True when the size is a non-negative integer (0 means no SSD)."""
        if not isinstance(self.size, int):
            return False
        return self.size >= 0

    def __str__(self):
        return f"{self.size}"

class Graphic(Feature):
    """Graphics adapter: producer, memory size and base/boost speed values."""

    def __init__(self, data):
        memory_size = int(data["memory_size"])
        self.size = memory_size
        self.prod = data["prod"].strip().lower()
        self.base_speed = int(data["base_speed"])
        self.boost_speed = int(data["boost_speed"])

    def is_valid(self):
        """True when size is a positive integer and prod is a string."""
        if not isinstance(self.size, int):
            return False
        if not isinstance(self.prod, str):
            return False
        return self.size > 0

    def __str__(self):
        parts = (self.prod, self.size, self.base_speed, self.boost_speed)
        return ", ".join(str(part) for part in parts)

class Screen(Feature):
    """Display description: size, panel type, resolution and sRGB value."""

    def __init__(self, data):
        diagonal = float(data["size"])
        self.size = diagonal
        self.type = data["type"].strip().lower()
        self.horizontal_resolution = int(data["horizontal_resolution"])
        self.vertical_resolution = int(data["vertical_resolution"])
        self.sRGB = int(data["sRGB"])

    def is_valid(self):
        """True when size is a positive float and type is a string."""
        if not isinstance(self.size, float):
            return False
        if not isinstance(self.type, str):
            return False
        return self.size > 0

    def __str__(self):
        parts = (
            self.type,
            self.size,
            self.horizontal_resolution,
            self.vertical_resolution,
            self.sRGB,
        )
        return ", ".join(str(part) for part in parts)

class Chassis(Feature):
    """Chassis description; only the weight (read from "weight_kg") is kept."""

    def __init__(self, data):
        raw_weight = data["weight_kg"]
        self.weight = float(raw_weight)

    def is_valid(self):
        """True when the weight is a positive float."""
        if not isinstance(self.weight, float):
            return False
        return self.weight > 0

    def __str__(self):
        return f"{self.weight}"

class OperatingSystem(Feature):
    """Operating system name, accepted either as a plain string or as a dict with "name"."""

    def __init__(self, data):
        raw_name = data if isinstance(data, str) else data["name"]
        self.name = raw_name.strip().lower()

    def is_valid(self):
        """True when the stored name is a string."""
        return isinstance(self.name, str)

    def __str__(self):
        return self.name

class Battery(Feature):
    """Battery description; only the capacity is kept."""

    def __init__(self, data):
        raw_capacity = data["capacity"]
        self.capacity = float(raw_capacity)

    def is_valid(self):
        """True when the capacity is a positive float."""
        if not isinstance(self.capacity, float):
            return False
        return self.capacity > 0

    def __str__(self):
        return f"{self.capacity}"

In [5]:
# The Laptop class keeps all of its attributes in `features`, a single dictionary
# of feature objects, plus a numeric `price`.
class Laptop:
    """One laptop configuration built from an API result dictionary."""

    def __init__(self, data):
        # Insertion order matters: __str__ emits the features in this order.
        self.features = {
            "producer": Producer(data["model_info"][0]),
            "processor": Processor(data["cpu"]),
            "ram": Ram(data["memory"]),
            "hdd_storage": HDDStorage(data),
            "ssd_storage": SSDStorage(data),
            "graphic": Graphic(data["gpu"]),
            "screen": Screen(data["display"]),
            "weight": Chassis(data["chassis"]),
            "os": OperatingSystem(data["operating_system"]),
            "battery": Battery(data["battery"]),
        }
        # Price is the midpoint of the minimum and maximum configuration price.
        lowest = int(data["config_price_min"])
        highest = int(data["config_price_max"])
        self.price = (lowest + highest) / 2

    def is_valid(self):
        """True only when every feature reports itself as valid."""
        return all(feature.is_valid() for feature in self.features.values())

    def __str__(self):
        """Comma-separated features (in insertion order) followed by the price."""
        joined = ','.join(str(feature) for feature in self.features.values())
        return joined.strip() + ',' + str(self.price)

In [6]:
# The Controller class exposes one static method per API method; each one sends a
# request to the web service and returns the data found in the response.
class Controller:
  """Thin client for the web service at API_URL.

  Every public method posts a form payload built by ``create_payload``, prints the
  ``daily_hits_left`` counter from the response and returns (part of) its ``result``.
  """

  # Seconds to wait for the web service before a request is abandoned
  # (requests raises a Timeout error instead of blocking indefinitely).
  REQUEST_TIMEOUT = 30

  @staticmethod
  def create_payload(method = None, params = None):
    """Build the form payload for one API call.

    Args:
      method: name of the API method to call.
      params: mapping of parameter name to value; each entry is sent as
        ``param[<name>]``. ``None`` (the default) means no parameters.

    Returns:
      dict containing "apikey", "method" and one "param[...]" entry per parameter.
    """
    payload = {
      "apikey": API_KEY,
      "method": method,
    }

    # `params or {}` keeps the documented default (None) from crashing on .items().
    for key, value in (params or {}).items():
      payload[f"param[{key}]"] = value

    return payload

  @staticmethod
  def _request(method, params):
    """Post one API call, print the remaining daily hits and return data["result"]."""
    payload = Controller.create_payload(method=method, params=params)

    r = requests.post(API_URL, data=payload, timeout=Controller.REQUEST_TIMEOUT)

    data = r.json()

    print("daily_hits_left: {}".format(data["daily_hits_left"]))

    return data["result"]

  @staticmethod
  def get_list_model(model_name=None):
    """Call "list_models"; an empty/None model_name is sent as an empty string.

    Returns the response's "result" unchanged.
    """
    params = {
      "model_name": model_name if model_name else '',
    }
    return Controller._request("list_models", params)

  @staticmethod
  def get_model_info(model_id):
    """Call "get_model_info" and return result["0"], or None when the result is empty."""
    result = Controller._request("get_model_info", {"model_id": model_id})
    return result["0"] if result else None

  @staticmethod
  def get_model_info_all(model_id):
    """Call "get_model_info_all" and return result["0"], or None when the result is empty."""
    result = Controller._request("get_model_info_all", {"model_id": model_id})
    return result["0"] if result else None

  @staticmethod
  def get_conf_info(config):
    """Call "get_conf_info" with the given id parameters.

    Returns the response's "result", or None when it is empty.
    """
    result = Controller._request("get_conf_info", config)
    return result if result else None

In [8]:
# The Crawler class collects the data. It has two main entry points:
# - crawl: calls get_model_info with a laptop id to collect that model's data.
# - crawl_with_custom_config: calls get_model_info_all, generates additional data from
#   alternative configurations, then calls get_conf_info to collect each one's price.
class Crawler:
  """Crawls laptop data into data.csv, using cache files to skip work already done."""

  def __init__(self):
    self.models = None
    # Ids / hashes persisted by earlier runs (empty sets when the files do not exist).
    self.cached_model = self.get_cache(file_name=MODEL_CACHE_FILE)
    self.cached_configurate_id = self.get_cache(file_name=CONFIGURATE_ID_CACHE_FILE)
    self.cached_gen_configurate = self.get_cache(file_name=GEN_CONFIGURATE_CACHE_FILE)

  @staticmethod
  def _ensure_parent_dir(file_name):
    """Create the directory that will contain file_name if it does not exist yet."""
    parent = os.path.dirname(file_name)
    if parent:
      os.makedirs(parent, exist_ok=True)

  @staticmethod
  def _cache_key(value):
    """Normalise an id to int, the type get_cache loads; non-numeric ids are kept as-is."""
    try:
      return int(value)
    except (TypeError, ValueError):
      return value

  def get_cache(self, file_name):
    """Load a cache file holding one integer per line.

    Returns an empty set when the file does not exist; blank lines are ignored.
    """
    if not os.path.isfile(file_name):
      return set()

    cached = set()
    with open(file_name) as f:
      for line in f:
        line = line.strip()
        if line:
          cached.add(int(line))

    return cached

  def get_conf_hash(self, configure):
    """Return an integer fingerprint of a configuration dict.

    The value depends only on the dict's content (not on key order) and is the same in
    every Python process, so it can be written to the cache file and compared in a later
    session. The built-in hash() cannot be used for that: string hashes are randomized
    per process.
    """
    import hashlib

    canonical = json.dumps(configure, sort_keys=True, default=str)
    digest = hashlib.sha256(canonical.encode("utf-8")).digest()
    # 8 bytes are enough for a cache key and keep the cache file lines short.
    return int.from_bytes(digest[:8], "big")

  def generate_configurate(self, model_info_all, max_attempt=20, max_generate=2):
    """Generate configurations that differ from the model's selected one.

    For every feature flagged "allow_custom" in feature_config_map a random option other
    than the selected one is picked (when one exists); all other features keep their
    selected option. A configuration is kept only if at least one feature changed and
    its hash is not already in self.cached_gen_configurate.

    Args:
      model_info_all: result dictionary of get_model_info_all for one model.
      max_attempt: maximum number of random draws.
      max_generate: maximum number of configurations to return.

    Returns:
      list of (configure, info) tuples: the id parameters for get_conf_info and the
      matching feature data.
    """
    configures = []
    tried = 0

    while tried < max_attempt and len(configures) < max_generate:
      tried += 1
      configure = {}
      info = {}
      changed = False

      for feature, config in feature_config_map.items():
        if feature == "model_info":
          configure["model_id"] = model_info_all["model_info"][0]["id"]
          info["model_info"] = model_info_all["model_info"]
          continue

        selected = str(model_info_all[feature]["selected"])
        choice = selected

        if "allow_custom" in config:
          # Every key except the "selected" marker and the selected option itself.
          options = [
            option for option in model_info_all[feature]
            if option != "selected" and option != selected
          ]

          if options:
            choice = random.choice(options)
            changed = True

        configure[config["param"]] = choice
        info[feature] = model_info_all[feature][choice]

      config_hash = self.get_conf_hash(configure)
      if changed and config_hash not in self.cached_gen_configurate:
        self.cached_gen_configurate.add(config_hash)
        configures.append((configure, info))

    return configures

  def save_model(self, data):
    """Append one row to data.csv, writing the header first when the file is empty."""
    # Column order follows Laptop.features: the HDD size is written before the SSD size.
    # NOTE(review): sample output shows GPU speeds such as 300 / 1100, which look like MHz
    # rather than GHz; the labels are left unchanged because readers of data.csv may
    # refer to the columns by name - confirm before renaming.
    headers = "producer,processor prod,processor model,cores,core base speed (GHz),core boost speed (GHz),ram type,ram cap (GB),hdd (GB),ssd (GB),gpu prod,gpu size (MB),gpu base speed (GHz),gpu boost speed (GHz),screen type,screen size (inch),screen horizontal resolution,screen vertical resolution,sRGB (%),weight (kg),os,battery capacity (WHr),price(USD)\n"

    with open("data.csv", "a") as f:
      if os.path.getsize("data.csv") == 0:
        f.write(headers)
      f.write(f"{data}\n")

  def save_gen_configurate(self, configurate_id):
    """Append one generated-configuration hash to its cache file."""
    self._ensure_parent_dir(GEN_CONFIGURATE_CACHE_FILE)
    with open(GEN_CONFIGURATE_CACHE_FILE, "a") as f:
      f.write(f"{configurate_id}\n")

  def save_cache(self):
    """Rewrite the configuration-id and model-id cache files from the in-memory sets."""
    self._ensure_parent_dir(CONFIGURATE_ID_CACHE_FILE)
    with open(CONFIGURATE_ID_CACHE_FILE, "w") as f:
      for config_id in self.cached_configurate_id:
        f.write(f"{config_id}\n")

    self._ensure_parent_dir(MODEL_CACHE_FILE)
    with open(MODEL_CACHE_FILE, "w") as f:
      for model_id in self.cached_model:
        f.write(f"{model_id}\n")

  def crawl_with_custom_config(self, producers=[None]):
    """For every model of the given producers, generate, price and save extra configurations."""
    for producer in producers:
      models = Controller.get_list_model(model_name=producer)
      for model in models.values():
        model_id = model["model_info"][0]["id"]

        print(f"Generate configuration for model {model_id}")

        model_info_all = Controller.get_model_info_all(model_id)

        if not model_info_all:
          print("Empty info {}".format(model_id))
          continue

        gen_configures = self.generate_configurate(model_info_all)

        print(f"Generated {len(gen_configures)} config")

        saved_model = 0

        for config, model_info in gen_configures:
          extra_info = Controller.get_conf_info(config)

          if not extra_info:
            continue

          model_info["config_price_min"] = extra_info["config_price_min"]
          model_info["config_price_max"] = extra_info["config_price_max"]

          laptop = Laptop(model_info)

          if laptop.is_valid():
            self.save_model(laptop)
            self.save_gen_configurate(self.get_conf_hash(config))
            print(laptop)
            saved_model += 1

        print(f"Successful save {saved_model} generate model\n")

  def crawl(self, producers=[None]):
    """Collect the default configuration of every not-yet-cached model of the given producers."""
    for producer in producers:
      models = Controller.get_list_model(model_name=producer)

      for model in models.values():
        model_id = model["model_info"][0]["id"]
        # The cache file is reloaded as integers, so compare with the same type.
        model_key = self._cache_key(model_id)
        if model_key in self.cached_model:
          print(f"Ignore model {model_id}")
          continue
        print(f"Crawl model {model_id}")

        model_info = Controller.get_model_info(model_id)

        if model_info is None:
          print("Empty info :(")
          continue

        laptop = Laptop(model_info)

        if laptop.is_valid():
          config_id = int(model_info["config_id"])
          if config_id in self.cached_configurate_id:
            print("Configuration already save")
            continue
          self.cached_configurate_id.add(config_id)
          self.cached_model.add(model_key)
          self.save_model(laptop)
          self.save_cache()
          print(f"{laptop}\n")

In [10]:
# Build the crawler; its constructor loads the cache files left by earlier runs.
crawler = Crawler()

# Producer names used as the model-name filter when listing models.
producers = ["hp"]

# First collect the default configuration of each listed model, then generate
# and price alternative configurations for the same producers.
crawler.crawl(producers)
crawler.crawl_with_custom_config(producers)

daily_hits_left: 160
Crawl model 626
daily_hits_left: 159
Crawl model 877
daily_hits_left: 158
Crawl model 1083
daily_hits_left: 157
Crawl model 1270
daily_hits_left: 156
hp,intel, i5, 4, 1.6, 3.4,ddr4, 8,0,256,intel, 1700, 300, 1100,led ips, 15.6, 1920, 1080, 0,2.1,windows pro 10.00,48.0,1049.0

Crawl model 1280
daily_hits_left: 155
Empty info :(
Crawl model 1304
daily_hits_left: 154
hp,intel, i7, 4, 1.8, 4.0,ddr4, 16,0,512,intel, 1700, 300, 1150,led ips, 13.3, 1920, 1080, 90,1.25,windows home 10.00,60.0,1300.0

Crawl model 1475
daily_hits_left: 153
hp,intel, i5, 4, 1.6, 3.4,ddr4, 8,0,256,intel, 1700, 300, 1100,led ips, 13.3, 1920, 1080, 0,1.55,windows pro 10.00,50.0,1063.5

Crawl model 1476
daily_hits_left: 152
hp,intel, i5, 4, 1.6, 3.4,ddr4, 16,0,512,intel, 1700, 300, 1100,led ips, 14.0, 1920, 1080, 90,1.48,windows pro 10.00,50.0,1199.5

Crawl model 1552
daily_hits_left: 151
Crawl model 1704
daily_hits_left: 150
hp,amd, ryzen 5, 4, 2.0, 3.6,ddr4, 8,0,256,amd, 2048, 1100, 1200,led ip

daily_hits_left: 99
hp,intel, i7, 6, 2.6, 4.5,ddr4, 16,0,512,intel, 1700, 350, 1150,led ips, 15.6, 1920, 1080, 60,2.6,windows pro 10.00,90.0,2560.5

Crawl model 3557
daily_hits_left: 98
Crawl model 3559
daily_hits_left: 97
hp,amd, ryzen 3, 4, 2.1, 3.5,ddr4, 8,0,256,amd, 2048, 1100, 1500,led ips, 14.0, 1920, 1080, 60,1.51,windows pro 10.00,50.0,1007.0

Crawl model 3560
daily_hits_left: 96
hp,intel, i7, 4, 1.8, 4.9,ddr4, 16,0,512,nvidia, 4096, 1519, 1582,led ips, 17.3, 1920, 1080, 90,2.78,windows home 10.00,52.0,1156.5

Crawl model 3561
daily_hits_left: 95
hp,intel, i7, 4, 1.8, 4.9,ddr4, 8,0,256,intel, 1700, 300, 1150,led ips, 13.3, 1920, 1080, 80,1.17,windows home 10.00,53.2,895.5

Crawl model 3563
daily_hits_left: 94
hp,intel, i7, 4, 1.8, 4.9,ddr4, 8,0,512,intel, 1700, 300, 1150,led ips, 15.6, 1920, 1080, 60,2.05,windows home 10.00,53.2,885.5

Crawl model 3565
daily_hits_left: 93
hp,amd, ryzen 5, 4, 2.1, 3.7,ddr4, 8,0,256,amd, 2048, 1100, 1200,led ips, 13.3, 1920, 1080, 90,1.44,windows

daily_hits_left: 38
hp,intel, i5, 4, 2.5, 4.5,ddr4, 16,0,256,nvidia, 3072, 1366, 1442,led ips, 15.6, 1920, 1080, 60,2.23,windows home 10.00,52.5,630.0

Crawl model 4603
daily_hits_left: 37
Crawl model 4608
daily_hits_left: 36
hp,intel, i3, 2, 1.2, 3.4,ddr4, 8,0,256,intel, 8192, 300, 900,led tn, 15.6, 1366, 768, 0,1.7,windows home 10.00,41.0,445.0

Crawl model 4609
daily_hits_left: 35
hp,intel, i7, 4, 1.2, 4.5,ddr4, 16,0,256,intel, 1700, 300, 1150,led ips, 13.3, 1920, 1080, 90,1.47,windows home 10.00,54.28,1100.0

Crawl model 4621
daily_hits_left: 34
hp,intel, i7, 4, 1.8, 4.6,ddr4, 8,0,512,intel, 1700, 300, 1150,led ips, 15.6, 1920, 1080, 60,2.04,windows home 10.00,52.5,711.0

Crawl model 4624
daily_hits_left: 33
hp,intel, i5, 4, 1.6, 3.9,ddr4, 8,0,128,intel, 1700, 300, 1100,led ips, 14.0, 1920, 1080, 60,1.58,windows home 10.00,41.0,699.5

Crawl model 4633
daily_hits_left: 32
hp,amd, ryzen 3, 4, 2.7, 3.7,ddr4, 4,0,128,amd, 2048, 300, 1400,led ips, 13.3, 1920, 1080, 60,1.49,windows pro 1

KeyboardInterrupt: 