In [1]:
!pip install certifi==2020.12.5
!pip install chardet==4.0.0
!pip install idna==2.10
!pip install requests==2.25.1
!pip install urllib3==1.26.2

Collecting certifi==2020.12.5
  Using cached certifi-2020.12.5-py2.py3-none-any.whl (147 kB)
Installing collected packages: certifi
  Attempting uninstall: certifi
    Found existing installation: certifi 2020.6.20
    Uninstalling certifi-2020.6.20:
      Successfully uninstalled certifi-2020.6.20
Successfully installed certifi-2020.12.5
Collecting chardet==4.0.0

ERROR: spyder 4.1.4 requires pyqt5<5.13; python_version >= "3", which is not installed.
ERROR: spyder 4.1.4 requires pyqtwebengine<5.13; python_version >= "3", which is not installed.
ERROR: requests 2.24.0 has requirement chardet<4,>=3.0.2, but you'll have chardet 4.0.0 which is incompatible.



  Using cached chardet-4.0.0-py2.py3-none-any.whl (178 kB)
Installing collected packages: chardet
  Attempting uninstall: chardet
    Found existing installation: chardet 3.0.4
    Uninstalling chardet-3.0.4:
      Successfully uninstalled chardet-3.0.4
Successfully installed chardet-4.0.0
Collecting requests==2.25.1
  Using cached requests-2.25.1-py2.py3-none-any.whl (61 kB)
Installing collected packages: requests
  Attempting uninstall: requests
    Found existing installation: requests 2.24.0
    Uninstalling requests-2.24.0:
      Successfully uninstalled requests-2.24.0
Successfully installed requests-2.25.1
Collecting urllib3==1.26.2
  Using cached urllib3-1.26.2-py2.py3-none-any.whl (136 kB)
Installing collected packages: urllib3
  Attempting uninstall: urllib3
    Found existing installation: urllib3 1.25.9
    Uninstalling urllib3-1.25.9:
      Successfully uninstalled urllib3-1.25.9
Successfully installed urllib3-1.26.2


In [24]:
import hashlib
import json
import os
import random
from copy import deepcopy
from os import path

import requests

In [25]:
# config
API_URL = "https://noteb.com/api/webservice.php"
# Prefer a key supplied through the NOTEB_API_KEY environment variable so a
# real credential never has to be written into the notebook; the original
# literal remains the fallback when the variable is not set.
API_KEY = os.environ.get("NOTEB_API_KEY", "112233aabbcc")

# Path of the text file holding the hashes of generated configurations.
GEN_CONFIGURATE_CACHE_FILE = path.join("cache", "cached_gen_configurate.txt")

In [26]:
# feature_domain
# Closed vocabularies used to normalise and validate parsed values.
producer_domain = {"apple", "asus", "hp", "lenovo", "lg", "dell", "acer"}
processor_name_domain = {"intel", "amd", "cpu"}
processor_model_domain = {"i3", "i5", "i7", "ryzen 5", "ryzen 3", "ryzen 7", "integrated"}

# Component section name -> name of the request parameter carrying its id.
config_map = {
    "model_info": "model_id",  # was misspelled "mode_id"
    "cpu": "cpu_id",
    "display": "display_id",
    "memory": "memory_id",
    "primary_storage": "primary_storage_id",
    "secondary_storage": "secondary_storage_id",
    "gpu": "gpu_id",
    "wireless": "wireless_id",
    "optical_drive": "optical_drive_id",
    "motherboard": "motherboard_id",
    "chassis": "chassis_id",
    "battery": "battery_id",
    "warranty": "warranty_id",
    "operating_system": "operating_system_id"
}

# Per-component settings: {"param": <request parameter>} plus, for some
# components, an "allow_custom" flag set below.
feature_config_map = {
    section: {"param": param_name}
    for section, param_name in config_map.items()
}

# Components that receive the "allow_custom" flag.
require_feature = ["memory", "primary_storage", "secondary_storage", "operating_system"]
for feature in require_feature:
    feature_config_map[feature]["allow_custom"] = True

In [27]:
# feature
class Feature:
    """Base class for the feature wrappers; both methods are empty placeholders."""

    def __init__(self):
        """Do nothing."""

    def is_valid(self):
        """Placeholder validity check; returns None."""
        return None

class Producer(Feature):
    """Laptop manufacturer, normalised to a name from ``producer_domain``."""

    def __init__(self, data):
        """Derive the producer from ``data["noteb_name"]``."""
        self.name = self.extract_name(data["noteb_name"])

    def extract_name(self, raw_name):
        """Return the known producer that occurs first in ``raw_name``.

        Matching is a substring search on the stripped, lower-cased text.
        When more than one known producer occurs, the one at the smallest
        index wins, so the result never depends on the iteration order of the
        ``producer_domain`` set. Returns "N/A" when nothing matches.
        """
        text = raw_name.strip().lower()
        hits = [
            (text.find(candidate), candidate)
            for candidate in producer_domain
            if candidate in text
        ]
        return min(hits)[1] if hits else "N/A"

    def is_valid(self):
        """Return True when the extracted name is a known producer."""
        return self.name in producer_domain

    def __str__(self):
        return self.name

class Processor(Feature):
    """CPU described by vendor name, model family, core count and base speed."""

    def __init__(self, data):
        """Parse the "prod", "model", "cores" and "base_speed" entries of ``data``."""
        self.name = data["prod"].strip().lower()
        self.model = self.extract_model(data["model"])
        self.cores = int(data["cores"])
        self.core_speed = float(data["base_speed"])

    def extract_model(self, raw_model):
        """Return the known model family that occurs first in ``raw_model``.

        Matching is a substring search on the stripped, lower-cased text.
        When several families occur, the one at the smallest index wins, so
        the result never depends on the iteration order of the
        ``processor_model_domain`` set. Returns "N/A" when nothing matches.
        """
        text = raw_model.strip().lower()
        hits = [
            (text.find(candidate), candidate)
            for candidate in processor_model_domain
            if candidate in text
        ]
        return min(hits)[1] if hits else "N/A"

    def is_valid(self):
        """Return True when both the vendor name and the model family are known."""
        return self.name in processor_name_domain and self.model in processor_model_domain

    def __str__(self):
        return f"{self.name}, {self.model}, {self.cores}, {self.core_speed}"

class Ram(Feature):
    """Memory described by a normalised type label and an integer size."""

    def __init__(self, data):
        """Lower-case and strip ``data["type"]``; parse ``data["size"]`` as int."""
        self.type = data["type"].strip().lower()
        self.size = int(data["size"])

    def is_valid(self):
        """Return True only when the size is a strictly positive int."""
        if not isinstance(self.size, int):
            return False
        return self.size > 0

    def __str__(self):
        return f"{self.type}, {self.size}"

class HDDStorage(Feature):
    """Total capacity of the storage entries whose model text contains "HDD"."""

    def __init__(self, data):
        """Compute the total from the storage sections of ``data``."""
        self.size = self.extract_size(data)

    def extract_size(self, data):
        """Sum ``cap`` over the primary and secondary storage entries marked "HDD"."""
        return sum(
            int(data[slot]["cap"])
            for slot in ("primary_storage", "secondary_storage")
            if data[slot]["model"].find("HDD") >= 0
        )

    def is_valid(self):
        """Return True when the size is a non-negative int."""
        if not isinstance(self.size, int):
            return False
        return self.size >= 0

    def __str__(self):
        return f"{self.size}"

class SSDStorage(Feature):
    """Total capacity of the storage entries whose model text contains "SSD"."""

    def __init__(self, data):
        """Compute the total from the storage sections of ``data``."""
        self.size = self.extract_size(data)

    def extract_size(self, data):
        """Sum ``cap`` over the primary and secondary storage entries marked "SSD"."""
        return sum(
            int(data[slot]["cap"])
            for slot in ("primary_storage", "secondary_storage")
            if data[slot]["model"].find("SSD") >= 0
        )

    def is_valid(self):
        """Return True when the size is a non-negative int."""
        if not isinstance(self.size, int):
            return False
        return self.size >= 0

    def __str__(self):
        return f"{self.size}"

class Graphic(Feature):
    """GPU described by its integer memory size and normalised producer name."""

    def __init__(self, data):
        """Parse ``data["memory_size"]`` as int and normalise ``data["prod"]``."""
        self.size = int(data["memory_size"])
        self.prod = data["prod"].strip().lower()

    def is_valid(self):
        """Return True for an int size above zero paired with a string producer."""
        well_typed = isinstance(self.size, int) and isinstance(self.prod, str)
        if not well_typed:
            return False
        return self.size > 0

    def __str__(self):
        return "{}, {}".format(self.prod, self.size)

class Screen(Feature):
    """Display described by its float size and normalised type label."""

    def __init__(self, data):
        """Parse ``data["size"]`` as float and normalise ``data["type"]``."""
        self.size = float(data["size"])
        self.type = data["type"].strip().lower()

    def is_valid(self):
        """Return True for a float size above zero paired with a string type."""
        well_typed = isinstance(self.size, float) and isinstance(self.type, str)
        if not well_typed:
            return False
        return self.size > 0

    def __str__(self):
        return "{}, {}".format(self.type, self.size)

class Chassis(Feature):
    """Chassis feature that keeps only the weight read from ``weight_kg``."""

    def __init__(self, data):
        """Parse ``data["weight_kg"]`` as float."""
        self.weight = float(data["weight_kg"])

    def is_valid(self):
        """Return True when the weight is a float greater than zero."""
        if not isinstance(self.weight, float):
            return False
        return self.weight > 0

    def __str__(self):
        return f"{self.weight}"

class OperatingSystem(Feature):
    """Operating-system name, stripped and lower-cased."""

    def __init__(self, data):
        """Accept either the name itself (a str) or a mapping with a "name" entry."""
        raw_name = data if isinstance(data, str) else data["name"]
        self.name = raw_name.strip().lower()

    def is_valid(self):
        """Return True when the stored name is a string."""
        return isinstance(self.name, str)

    def __str__(self):
        return self.name

In [28]:
# model
class Laptop:
    """One laptop configuration: an ordered dict of parsed features plus a price."""

    def __init__(self, data):
        """Build each feature from its section of ``data``.

        The price is the midpoint of ``config_price_min`` and
        ``config_price_max``, each parsed as int.
        """
        self.features = dict(
            producer=Producer(data["model_info"][0]),
            processor=Processor(data["cpu"]),
            ram=Ram(data["memory"]),
            hdd_storage=HDDStorage(data),
            ssd_storage=SSDStorage(data),
            graphic=Graphic(data["gpu"]),
            screen=Screen(data["display"]),
            weight=Chassis(data["chassis"]),
            os=OperatingSystem(data["operating_system"]),
        )
        lowest = int(data["config_price_min"])
        highest = int(data["config_price_max"])
        self.price = (lowest + highest) / 2

    def is_valid(self):
        """Return True when every feature reports itself valid."""
        return all(feature.is_valid() for feature in self.features.values())

    def __str__(self):
        """Render the features in insertion order, then the price, comma-separated."""
        columns = [str(feature) for feature in self.features.values()]
        return ",".join(columns).strip() + "," + str(self.price)

In [29]:
# controller
class Controller:
    """Static helpers that call the web service at ``API_URL``."""

    # Seconds to wait for the server; without a timeout a stalled connection
    # would block the notebook indefinitely.
    REQUEST_TIMEOUT = 30

    @staticmethod
    def create_payload(method = None, params = None):
        """Build the form payload for one API call.

        Args:
            method: name of the remote method.
            params: mapping of parameter names to values; each entry is sent
                under the key ``param[<name>]``. ``None`` means no parameters.

        Returns:
            dict holding "apikey", "method" and one "param[...]" entry per
            parameter.
        """
        payload = {
            "apikey": API_KEY,
            "method": method,
        }
        # The default params=None used to crash on .items(); treat it as empty.
        for key, value in (params or {}).items():
            payload[f"param[{key}]"] = value
        return payload

    @staticmethod
    def _call(method, params):
        """POST one request, print the remaining daily quota, return its "result".

        Raises whatever ``requests.post`` raises (including a timeout after
        ``REQUEST_TIMEOUT`` seconds) and KeyError when the response lacks
        "daily_hits_left" or "result".
        """
        payload = Controller.create_payload(method=method, params=params)
        response = requests.post(API_URL, data=payload, timeout=Controller.REQUEST_TIMEOUT)
        data = response.json()
        print("daily_hits_left: {}".format(data["daily_hits_left"]))
        return data["result"]

    @staticmethod
    def get_list_model(model_name=None):
        """Return the "result" of ``list_models``; a falsy name is sent as ''."""
        params = {
            "model_name": model_name if model_name else '',
        }
        return Controller._call("list_models", params)

    @staticmethod
    def get_model_info(model_id):
        """Return entry "0" of the ``get_model_info`` result, or None if it is empty."""
        result = Controller._call("get_model_info", {"model_id": model_id})
        return result["0"] if result else None

    @staticmethod
    def get_model_info_all(model_id):
        """Return entry "0" of the ``get_model_info_all`` result, or None if it is empty."""
        result = Controller._call("get_model_info_all", {"model_id": model_id})
        return result["0"] if result else None

    @staticmethod
    def get_conf_info(config):
        """Return the ``get_conf_info`` result for ``config``, or None if it is empty."""
        result = Controller._call("get_conf_info", config)
        return result if result else None

In [40]:
# crawler
class Crawler:
    """Fetches laptop models through ``Controller`` and appends valid ones to data.csv."""

    def __init__(self):
        self.models = None
        # Hashes of the configurations already generated, preloaded from disk.
        self.cached_gen_configurate = self.get_cache(file_name=GEN_CONFIGURATE_CACHE_FILE)

    def get_cache(self, file_name):
        """Load the integer hashes stored one per line in ``file_name``.

        Returns an empty set when the file does not exist. Blank lines are
        skipped instead of raising ValueError.
        """
        if not os.path.isfile(file_name):
            return set()

        with open(file_name) as f:
            return {int(line) for line in f if line.strip()}

    def get_conf_hash(self, configure):
        """Return an integer fingerprint of ``configure`` that is stable across runs.

        The built-in ``hash`` is salted per interpreter process for strings,
        so its values cannot be matched against hashes an earlier session
        wrote to the cache file. A SHA-256 digest of the key-sorted JSON form
        is used instead; its first 8 bytes are returned as an int because the
        cache file is parsed with ``int``.
        """
        canonical = json.dumps(configure, sort_keys=True, default=str)
        digest = hashlib.sha256(canonical.encode("utf-8")).digest()
        return int.from_bytes(digest[:8], "big")

    def generate_configurate(self, model_info_all, max_attempt=20, max_generate=2):
        """Generate random variants of a model's configuration.

        For every feature flagged "allow_custom" in ``feature_config_map`` a
        random option other than the selected one is chosen when one exists;
        all other features keep their selected option.

        Args:
            model_info_all: per-feature option mappings, each with a "selected"
                entry, plus a "model_info" list whose first item has an "id".
            max_attempt: maximum number of random draws.
            max_generate: stop once this many new configurations were produced.

        Returns:
            list of ``(configure, info)`` tuples: the request parameters and
            the option details chosen for each feature. Only configurations
            that differ from the default and whose hash is not yet cached are
            returned; their hashes are added to the in-memory cache.
        """
        configures = []
        attempts = 0

        while attempts < max_attempt and len(configures) < max_generate:
            attempts += 1
            configure = {}
            info = {}
            changed = False

            for feature, config in feature_config_map.items():
                if feature == "model_info":
                    configure["model_id"] = model_info_all["model_info"][0]["id"]
                    info["model_info"] = model_info_all["model_info"]
                    continue

                param = config["param"]
                selected = str(model_info_all[feature]["selected"])
                configure[param] = selected

                if "allow_custom" in config:
                    # Candidates are every option key except the current
                    # selection and the "selected" marker itself.
                    options = [
                        key for key in model_info_all[feature]
                        if key not in (selected, "selected")
                    ]
                    if options:
                        configure[param] = random.choice(options)
                        changed = True

                info[feature] = model_info_all[feature][configure[param]]

            config_hash = self.get_conf_hash(configure)
            if changed and config_hash not in self.cached_gen_configurate:
                self.cached_gen_configurate.add(config_hash)
                configures.append((configure, info))

        return configures

    def save_model(self, data):
        """Append ``data`` as one line of data.csv, writing the header first if the file is empty."""
        # Column order follows Laptop.__str__, which emits hdd_storage before
        # ssd_storage; the header previously listed the two in reverse.
        headers = "producer,processor prod,processor model,cores,core base speed (GHz),ram type,ram cap (GB),hdd (GB),ssd (GB),gpu prod,gpu size (MB),screen type,screen size (inch),weight (kg),os,price(USD)\n"

        with open("data.csv", "a") as f:
            if os.path.getsize("data.csv") == 0:
                f.write(headers)
            f.write(f"{data}\n")

    def save_gen_configurate(self, configurate_id):
        """Append ``configurate_id`` to the cache file, creating its directory if needed."""
        cache_dir = os.path.dirname(GEN_CONFIGURATE_CACHE_FILE)
        if cache_dir:
            # Opening in append mode does not create missing parent directories.
            os.makedirs(cache_dir, exist_ok=True)

        with open(GEN_CONFIGURATE_CACHE_FILE, "a") as f:
            f.write(f"{configurate_id}\n")

    def crawl_with_custom_config(self, producers=(None,)):
        """Save randomly customised configurations of every listed model.

        Args:
            producers: iterable of model-name filters passed to
                ``Controller.get_list_model``; ``None`` sends an empty filter.
        """
        for producer in producers:
            models = Controller.get_list_model(model_name=producer)
            for model in models.values():
                model_id = model["model_info"][0]["id"]

                print(f"Generate configuration for model {model_id}")

                model_info_all = Controller.get_model_info_all(model_id)

                if not model_info_all:
                    print("Empty info {}".format(model_id))
                    continue

                gen_configures = self.generate_configurate(model_info_all)

                print(f"Generated {len(gen_configures)} config")

                saved_model = 0

                for config, model_info in gen_configures:
                    extra_info = Controller.get_conf_info(config)

                    if not extra_info:
                        continue

                    model_info["config_price_min"] = extra_info["config_price_min"]
                    model_info["config_price_max"] = extra_info["config_price_max"]

                    laptop = Laptop(model_info)

                    if laptop.is_valid():
                        self.save_model(laptop)
                        self.save_gen_configurate(self.get_conf_hash(config))
                        print(laptop)
                        saved_model += 1

                print(f"Successful save {saved_model} generate model\n")

    def crawl(self, producers=(None,)):
        """Save every listed model, as returned by ``get_model_info``, that validates.

        Args:
            producers: iterable of model-name filters passed to
                ``Controller.get_list_model``; ``None`` sends an empty filter.
        """
        for producer in producers:
            models = Controller.get_list_model(model_name=producer)

            for model in models.values():
                model_id = model["model_info"][0]["id"]
                model_info = Controller.get_model_info(model_id)

                if model_info is None:
                    print("Empty info :(")
                    continue

                laptop = Laptop(model_info)

                if laptop.is_valid():
                    self.save_model(laptop)
                    print(f"{laptop}\n")

In [42]:
# Run the plain crawl first, then the crawl that generates custom
# configurations, both restricted to the same producer filter.
producers = ["apple"]
crawler = Crawler()

for run_crawl in (crawler.crawl, crawler.crawl_with_custom_config):
    run_crawl(producers = producers)

daily_hits_left: 142
daily_hits_left: 141
apple,intel, i7, 6, 2.6,ddr4, 16,0,256,amd, 4096,led ips, 15.4,1.83,macos  11.00,2499.5

daily_hits_left: 140
apple,intel, i3, 2, 1.1,ddr4, 8,0,256,intel, 8192,led ips, 13.3,1.25,macos  11.00,869.5

daily_hits_left: 139
apple,intel, i5, 4, 1.4,ddr3, 8,0,256,intel, 128,led ips, 13.3,1.37,macos  11.00,1099.5

daily_hits_left: 138
apple,intel, i7, 6, 2.6,ddr4, 16,0,512,amd, 4096,led ips, 16.1,2.0,macos  11.00,2199.5

daily_hits_left: 137
apple,intel, i5, 4, 2.0,ddr4, 16,0,512,intel, 8192,led ips, 13.3,1.4,macos  11.00,1649.0

daily_hits_left: 136
daily_hits_left: 135
daily_hits_left: 134
Generate configuration for model 1848
daily_hits_left: 133
Generated 2 config
daily_hits_left: 132
apple,intel, i7, 6, 2.6,ddr4, 32,0,4096,amd, 4096,led ips, 15.4,1.83,macos  11.00,4410.0
daily_hits_left: 131
apple,intel, i7, 6, 2.6,ddr4, 32,0,2048,amd, 4096,led ips, 15.4,1.83,macos  11.00,3620.0
Successful save 2 generate model

Generate configuration for model 3