### Import packages

In [12]:
import json
import re
import warnings

import pandas as pd
import requests
from bs4 import BeautifulSoup

### Load html file 

Load the html file (downloaded manually) from https://www.senheng.com.my/all-products/tvs-audios/tv.html. To parse the html file, we will use the BeautifulSoup package.

In [13]:
# Parse the manually downloaded page. The context manager closes the file
# handle as soon as BeautifulSoup has consumed it, instead of leaving it open
# for the lifetime of the kernel.
with open("../data/data.html", encoding="utf8") as html_file:
    soup = BeautifulSoup(html_file, "html.parser")

Upon manual inspection, we find that the information about the different TVs is stored within an HTML ordered list (`<ol>`) element with the class name "products-list".

In [14]:
# Collect every <ol class="products-list"> on the page, then pull all <li>
# elements out of the first one.
product_list = soup.find_all(name="ol", class_="products-list")
li_elems = product_list[0].find_all(name="li")

In [15]:
# Show the extracted <li> elements as the cell output for manual inspection.
li_elems

[<li class="item odd" data-val='["HAI-LE32K6000T","Haier 32-Inch Full HD LED TV Model LE32K6000T"]' onclick="sendGa('HAI-LE32K6000T','Haier 32-Inch Full HD LED TV Model LE32K6000T')"><div class="item-area">
 <div class="product-image-area">
 <a class="product-image newamastylabels" href="https://www.senheng.com.my/haier-32-inch-full-hd-led-tv.html" title="Haier 32-Inch Full HD LED TV Model LE32K6000T">
 <img alt="Haier 32-Inch Full HD LED TV Model LE32K6000T" class="" id="product-collection-image-25677" src="./data_manual_files/hai-le32k6000t_front_6xx.jpg"/>
 </a>
 </div>
 <div class="details-area">
 <h2 class="product-name"><a href="https://www.senheng.com.my/haier-32-inch-full-hd-led-tv.html" title="Haier 32-Inch Full HD LED TV Model LE32K6000T">Haier 32-Inch Full HD LED TV Model LE32K6000T</a></h2>
 <div class="ratings">
 <div class="rating-box">
 <div class="rating" style="width:0"></div>
 </div>
 </div>
 <div class="short-desc">
 <p>Energy saving and Eco-friendly with auto contro

### Feature extraction

Now, we parse each HTML li element and extract the key information of each product (TV):
- product id
- name of the product
- old price 
- current price
- additional description

In addition to the information above, we will also parse the name of the product and determine the following attributes:
- brand of the product
- inch (of the TV)
- is it a 4K TV?
- is it a UHD TV?
- is it an LED TV?
- is it an OLED TV?
- is it a ULED TV?
- is it a QLED TV?

In [16]:
def _parse_data_val(raw):
    """Return ``(product_id, product_name)`` from an li's ``data-val`` attribute.

    In the inspected page the attribute looks like a JSON array, e.g.
    ``["HAI-LE32K6000T","Haier 32-Inch ..."]``, so it is decoded with ``json``.
    This keeps names that contain commas intact. If the value is not a JSON
    list with at least two items, fall back to the positional comma split.
    """
    try:
        parsed = json.loads(raw)
    except ValueError:
        parsed = None
    if isinstance(parsed, list) and len(parsed) >= 2:
        return str(parsed[0]), str(parsed[1])
    parts = raw.split(",")
    return parts[0][2:-1], parts[1][1:-2]


def _parse_price(text):
    """Return the first number in ``text`` as a float, or None if there is none.

    Thousands separators are removed, so ``"RM1,885.00"`` becomes ``1885.0``.
    Searching for the number (rather than cutting two leading characters)
    avoids losing digits when the currency prefix is absent.
    """
    match = re.search(r'\d[\d,]*(?:\.\d+)?', text)
    if match is None:
        return None
    return float(match.group().replace(',', ''))


def _parse_inch(name):
    """Return the screen size found in a product name as a float, or None.

    Recognises ``55-inch`` / ``55 inch`` / ``55inch`` (case-insensitive) and
    ``55"``. The number is taken from the matched span itself, so a model
    number appearing earlier in the name cannot be mistaken for the size.
    """
    match = re.search(r'(\d+(?:\.\d+)?)\s*-?\s*inch', name, re.IGNORECASE)
    if match is None:
        match = re.search(r'(\d+(?:\.\d+)?)\s?"', name)
    return float(match.group(1)) if match else None


# Column name -> pattern searched (case-insensitively) in the product name.
# Word boundaries are used instead of surrounding whitespace so a keyword at
# the very start or end of the name is still detected, while "LED" does not
# fire on "OLED"/"ULED"/"QLED".
_FLAG_PATTERNS = (
    ('X4K', r'\b4K\b'),
    ('UHD', r'\b(?:ultra\s?|u)hd\b'),
    ('LED', r'\bled\b'),
    ('OLED', r'\boled\b'),
    ('ULED', r'\buled\b'),
    ('QLED', r'\bqled\b'),
)

dict_ls = []

for li in li_elems:
    ###########################################################################
    #  Get Raw Features                                                       #
    ###########################################################################
    product_id, product_name = _parse_data_val(li.attrs['data-val'])

    # Key order is kept identical to the original cell: raw features first,
    # then derived features.
    product_info = {
        'id': product_id,
        'name': product_name,
        'old_price': None,
        'current_price': None,
        'description': None,
        'brand': None,
        'inch': None,
        'X4K': 0,
        'UHD': 0,
        'LED': 0,
        'OLED': 0,
        'ULED': 0,
        'QLED': 0,
    }

    # Only the first two price spans are considered. A span whose id contains
    # "old" holds the previous price; any other span is the current price.
    for price_elem in li.find_all('span', class_='price')[0:2]:
        span_id = price_elem.get('id') or ''
        price_key = 'old_price' if 'old' in span_id else 'current_price'
        product_info[price_key] = _parse_price(price_elem.get_text())

    # The description is optional: tolerate a missing container or paragraph.
    desc_div = li.find('div', class_='short-desc')
    desc_par = desc_div.find('p') if desc_div is not None else None
    if desc_par is not None:
        product_info['description'] = desc_par.get_text()

    ###########################################################################
    #  Get Derived Features                                                   #
    ###########################################################################
    # The brand code is the part of the product id before the first hyphen.
    product_info['brand'] = product_id.split('-')[0]
    product_info['inch'] = _parse_inch(product_name)

    # get 4K, UHD, OLED, QLED, LED or ULED
    for column, pattern in _FLAG_PATTERNS:
        if re.search(pattern, product_name, re.IGNORECASE):
            product_info[column] = 1

    dict_ls.append(product_info)

Now we convert the list of dictionaries to a Pandas DataFrame.

In [17]:
# One row per product dictionary; the bare expression renders the table.
df = pd.DataFrame(data=dict_ls)
df

Unnamed: 0,LED,OLED,QLED,UHD,ULED,X4K,brand,current_price,description,id,inch,name,old_price
0,1,0,0,0,0,0,HAI,749.0,Energy saving and Eco-friendly with auto contr...,HAI-LE32K6000T,32.0,Haier 32-Inch Full HD LED TV Model LE32K6000T,889.0
1,1,0,0,1,0,1,HAI,1885.0,Smart Share function which connects with smart...,HAI-LE50U6600U,50.0,Haier 50-inch 4K Ultra HD Smart LED TV LE50U6600U,2829.0
2,1,0,0,0,0,0,HSE,1599.0,Hisense new VIDAA U Smart Tv Operating SystemO...,HSE-49A5700PW,49.0,Hisense 49-Inch HD LED Smart TV Model 49A5700PW,1899.0
3,1,0,0,0,0,0,HSE,1499.0,49 inch narrow frame Display3 HDMI and 1 USB P...,HSE-49N2173P,49.0,Hisense 49-inch LED TV HSE-49N2173P,1829.0
4,0,0,0,1,0,1,HSE,2399.0,Hisense new VIDAA Smart Tv Operating SystemSma...,HSE-55A6100UW,55.0,Hisense 55-inch 4K Ultra HD Smart TV Model 55A...,2699.0
5,0,0,0,1,0,0,HSE,2599.0,65-Inch Ultra HD 3840×2160 HDR Technology Remo...,HSE-55A6501UW,55.0,Hisense 55-Inch UHD TV A6501,2999.0
6,0,0,0,1,0,0,HSE,3899.0,65-Inch Ultra HD 3840×2160 HDR Technology Remo...,HSE-65A6501UW,65.0,Hisense 65-Inch UHD TV A6501,4999.0
7,0,0,0,1,0,0,LG,1799.0,High-Resolution for a razor sharp imageConsist...,LG-43UK6300,43.0,LG 43UK6300PTE 43-inch UK63 Series UHD HDR Sma...,2357.0
8,0,0,0,1,0,0,LG,1999.0,49-inchMulti-channelHigh-Resolution4K Active HDR,LG-49UK6320,49.0,LG 49-inch UHD TV LG-49UK6320,3017.0
9,0,0,0,1,0,0,LG,3499.0,"Rich, accurate color and deeper blackSpectacul...",LG-49SK8000,49.0,LG 49SK8000PTA 49-inch SK80 Series Super UHD H...,4299.0


### Save DataFrame as csv for later use

In [18]:
# Persist the table without the positional index column.
df.to_csv(path_or_buf='../data/data.csv', index=False)