In [1]:
import pandas as pd
from loguru import logger
from hmd.core import app_config
from hmd.crawlers.detail_page import DetailPageCrawler
from tqdm import tqdm

Project Root: /Users/phuocbui/Desktop/Master/BigData/housing-market-dashboard


### Import the rental listings (list thuê nhà)

#### Using dask

In [None]:
import dask
from dask.diagnostics import ProgressBar
import dask.dataframe as dd

In [2]:
# Read the rental listing CSV as a dask DataFrame and create the crawler
# instance that crawl_url uses as a module-level global.
list_post = dd.read_csv(app_config.DATA_PATH.joinpath("list_thue.csv"), index_col=False)
crawler = DetailPageCrawler()

In [2]:
def crawl_url(post_id, post_url):
    """Crawl one post's detail page and return a flat result record.

    Args:
        post_id: Identifier of the post; copied into the record unchanged.
        post_url: URL handed to the module-level ``crawler``.

    Returns:
        dict with the keys ``post_id``, ``post_url``, ``data`` and ``error``.
        On success ``data`` holds ``str(detail.dict())`` and ``error`` is None;
        on failure ``data`` is None and ``error`` holds the exception message.
        Both keys are always present so every record shares one schema.
    """
    record = {'post_id': post_id, 'post_url': post_url, 'data': None, 'error': None}
    try:
        detail = crawler.crawl(post_url)
        record['data'] = str(detail.dict())
    except Exception as e:
        # Deliberate best-effort: one failing page must not abort the whole
        # crawl, so the error is recorded on the row instead of being raised.
        record['error'] = str(e)
    return record

In [5]:
# Columns of every crawled partition, in order. The names match the keys that
# crawl_url puts in its records (note: 'post_url', not 'url').
RESULT_COLUMNS = ["post_id", "post_url", "data", "error"]


def _crawl_partition(df):
    """Crawl every post in one partition and return one result row per post.

    Args:
        df: pandas DataFrame partition with 'post_id' and 'post_url' columns.

    Returns:
        pandas DataFrame with exactly RESULT_COLUMNS, indexed like ``df``.
        Keys missing from a crawl_url record (e.g. no 'error' on success)
        become missing values, so every partition has the same columns.
    """
    records = [
        crawl_url(post_id, post_url)
        for post_id, post_url in zip(df["post_id"], df["post_url"])
    ]
    return pd.DataFrame(records, columns=RESULT_COLUMNS, index=df.index)


# meta must describe the DataFrame that _crawl_partition actually returns.
results = list_post.map_partitions(
    _crawl_partition,
    meta={
        "post_id": list_post["post_id"].dtype,
        "post_url": "object",
        "data": "object",
        "error": "object",
    },
)

In [None]:
# Run the crawl defined by `results` and keep the computed output in
# `computed_results`; the ProgressBar context reports progress while
# compute() runs.
with ProgressBar():
    computed_results = results.compute()

In [None]:
# Convert the computed results (not the lazy dask object `results`) to a
# pandas DataFrame and save them.
if isinstance(computed_results, pd.Series):
    # A Series of per-post dicts: expand each dict into its own columns.
    results_df = pd.DataFrame(computed_results.tolist(), index=computed_results.index)
else:
    results_df = pd.DataFrame(computed_results)
results_df.to_csv('crawled_data.csv', index=False)

#### Using pandas

In [2]:
# Load the post list (post_id / post_url columns are read by the crawl loop)
# from a CSV file into a pandas DataFrame.
list_post = pd.read_csv(app_config.DATA_PATH.joinpath("list_thue.csv"), index_col=False)
# Uncomment to try the crawl on a random 5-row sample first:
# list_post = list_post.sample(n=5)
crawler = DetailPageCrawler()

In [3]:
# Crawl every post sequentially. Each entry of `result` is a
# (post_id, post_url, payload) tuple where payload is the crawled detail
# object on success or the error message (str) on failure.
result = []
for idx, row in tqdm(list_post.iterrows(), total=len(list_post), desc="Processing posts"):
    # Read the identifiers outside the try block so the except branch can
    # never see unbound values or values left over from the previous row.
    post_id = row["post_id"]
    post_url = row["post_url"]
    try:
        detail = crawler.crawl(post_url)
    except Exception as e:
        # Best-effort crawl: record the failure and continue with the next post.
        logger.warning(f"Failed to crawl {post_url}: {e}")
        result.append((post_id, post_url, str(e)))
    else:
        result.append((post_id, post_url, detail))

Processing posts:   0%|          | 0/1980 [00:00<?, ?it/s]

Processing posts:  15%|█▍        | 290/1980 [05:12<38:07,  1.35s/it]  



Processing posts:  15%|█▍        | 291/1980 [05:13<28:22,  1.01s/it]

'NoneType' object has no attribute 'find'


Processing posts:  15%|█▌        | 298/1980 [05:18<24:56,  1.12it/s]



Processing posts:  15%|█▌        | 299/1980 [05:18<19:18,  1.45it/s]

'NoneType' object has no attribute 'find'


Processing posts:  15%|█▌        | 300/1980 [05:19<15:06,  1.85it/s]

'NoneType' object has no attribute 'find'


Processing posts:  15%|█▌        | 301/1980 [05:19<12:14,  2.29it/s]

'NoneType' object has no attribute 'find'


Processing posts:  15%|█▌        | 302/1980 [05:19<10:39,  2.62it/s]

'NoneType' object has no attribute 'find'


Processing posts:  15%|█▌        | 303/1980 [05:19<09:00,  3.10it/s]

'NoneType' object has no attribute 'find'


Processing posts:  15%|█▌        | 304/1980 [05:19<08:02,  3.47it/s]

'NoneType' object has no attribute 'find'


Processing posts:  15%|█▌        | 305/1980 [05:20<07:22,  3.79it/s]

'NoneType' object has no attribute 'find'


Processing posts:  15%|█▌        | 306/1980 [05:20<06:43,  4.15it/s]

'NoneType' object has no attribute 'find'


Processing posts:  16%|█▌        | 307/1980 [05:20<06:20,  4.40it/s]

'NoneType' object has no attribute 'find'


Processing posts:  16%|█▌        | 308/1980 [05:20<06:04,  4.59it/s]

'NoneType' object has no attribute 'find'


Processing posts:  16%|█▌        | 309/1980 [05:20<06:05,  4.57it/s]

'NoneType' object has no attribute 'find'


Processing posts:  16%|█▌        | 310/1980 [05:21<06:00,  4.63it/s]

'NoneType' object has no attribute 'find'


Processing posts:  16%|█▌        | 311/1980 [05:21<05:49,  4.77it/s]

'NoneType' object has no attribute 'find'


Processing posts:  16%|█▌        | 312/1980 [05:22<10:34,  2.63it/s]

'NoneType' object has no attribute 'find'


Processing posts:  16%|█▌        | 313/1980 [05:22<09:00,  3.09it/s]

'NoneType' object has no attribute 'find'


Processing posts:  16%|█▌        | 314/1980 [05:22<07:59,  3.48it/s]

'NoneType' object has no attribute 'find'


Processing posts:  16%|█▌        | 315/1980 [05:22<07:10,  3.87it/s]

'NoneType' object has no attribute 'find'


Processing posts:  16%|█▌        | 316/1980 [05:22<06:34,  4.22it/s]

'NoneType' object has no attribute 'find'


Processing posts:  16%|█▌        | 317/1980 [05:23<06:32,  4.23it/s]

'NoneType' object has no attribute 'find'


Processing posts:  16%|█▌        | 318/1980 [05:23<06:17,  4.41it/s]

'NoneType' object has no attribute 'find'


Processing posts:  16%|█▌        | 319/1980 [05:23<06:12,  4.45it/s]

'NoneType' object has no attribute 'find'


Processing posts:  16%|█▌        | 320/1980 [05:23<05:57,  4.64it/s]

'NoneType' object has no attribute 'find'


Processing posts:  16%|█▌        | 321/1980 [05:23<05:38,  4.90it/s]

'NoneType' object has no attribute 'find'


Processing posts:  16%|█▋        | 322/1980 [05:24<05:37,  4.91it/s]

'NoneType' object has no attribute 'find'


Processing posts:  16%|█▋        | 323/1980 [05:24<05:34,  4.96it/s]

'NoneType' object has no attribute 'find'


Processing posts:  16%|█▋        | 324/1980 [05:24<05:33,  4.97it/s]

'NoneType' object has no attribute 'find'


Processing posts:  16%|█▋        | 325/1980 [05:24<05:34,  4.95it/s]

'NoneType' object has no attribute 'find'


Processing posts:  16%|█▋        | 326/1980 [05:24<05:29,  5.02it/s]

'NoneType' object has no attribute 'find'
'NoneType' object has no attribute 'find'


Processing posts:  17%|█▋        | 327/1980 [05:25<05:30,  5.01it/s]



Processing posts:  17%|█▋        | 328/1980 [05:25<05:25,  5.08it/s]

'NoneType' object has no attribute 'find'


Processing posts:  17%|█▋        | 329/1980 [05:25<05:41,  4.84it/s]

'NoneType' object has no attribute 'find'


Processing posts:  17%|█▋        | 330/1980 [05:25<05:31,  4.98it/s]

'NoneType' object has no attribute 'find'


Processing posts:  17%|█▋        | 331/1980 [05:25<05:24,  5.09it/s]

'NoneType' object has no attribute 'find'


Processing posts:  17%|█▋        | 332/1980 [05:26<05:20,  5.14it/s]

'NoneType' object has no attribute 'find'


Processing posts:  17%|█▋        | 333/1980 [05:26<05:24,  5.08it/s]

'NoneType' object has no attribute 'find'


Processing posts:  17%|█▋        | 334/1980 [05:26<05:20,  5.14it/s]

'NoneType' object has no attribute 'find'


Processing posts:  17%|█▋        | 335/1980 [05:26<05:29,  4.99it/s]

'NoneType' object has no attribute 'find'


Processing posts:  17%|█▋        | 336/1980 [05:26<05:25,  5.06it/s]

'NoneType' object has no attribute 'find'


Processing posts:  17%|█▋        | 337/1980 [05:27<05:36,  4.89it/s]

'NoneType' object has no attribute 'find'


Processing posts:  17%|█▋        | 338/1980 [05:27<05:25,  5.04it/s]

'NoneType' object has no attribute 'find'


Processing posts:  17%|█▋        | 339/1980 [05:27<05:37,  4.86it/s]

'NoneType' object has no attribute 'find'


Processing posts:  17%|█▋        | 340/1980 [05:27<05:37,  4.87it/s]

'NoneType' object has no attribute 'find'


Processing posts:  17%|█▋        | 341/1980 [05:27<05:35,  4.89it/s]

'NoneType' object has no attribute 'find'


Processing posts:  17%|█▋        | 342/1980 [05:28<05:28,  4.99it/s]

'NoneType' object has no attribute 'find'


Processing posts:  17%|█▋        | 343/1980 [05:28<05:25,  5.03it/s]

'NoneType' object has no attribute 'find'


Processing posts:  17%|█▋        | 344/1980 [05:28<05:39,  4.82it/s]

'NoneType' object has no attribute 'find'


Processing posts:  17%|█▋        | 345/1980 [05:28<05:27,  4.99it/s]

'NoneType' object has no attribute 'find'


Processing posts:  17%|█▋        | 346/1980 [05:29<05:42,  4.77it/s]

'NoneType' object has no attribute 'find'


Processing posts:  18%|█▊        | 347/1980 [05:29<05:31,  4.93it/s]

'NoneType' object has no attribute 'find'


Processing posts:  18%|█▊        | 348/1980 [05:29<05:41,  4.78it/s]

'NoneType' object has no attribute 'find'


Processing posts:  18%|█▊        | 349/1980 [05:29<05:35,  4.86it/s]

'NoneType' object has no attribute 'find'


Processing posts:  18%|█▊        | 350/1980 [05:29<05:32,  4.91it/s]

'NoneType' object has no attribute 'find'


Processing posts:  18%|█▊        | 351/1980 [05:30<05:31,  4.92it/s]

'NoneType' object has no attribute 'find'


Processing posts:  18%|█▊        | 352/1980 [05:30<05:42,  4.75it/s]

'NoneType' object has no attribute 'find'


Processing posts:  18%|█▊        | 353/1980 [05:30<05:29,  4.93it/s]

'NoneType' object has no attribute 'find'


Processing posts:  18%|█▊        | 354/1980 [05:30<05:32,  4.89it/s]

'NoneType' object has no attribute 'find'


Processing posts:  18%|█▊        | 355/1980 [05:30<05:25,  5.00it/s]

'NoneType' object has no attribute 'find'


Processing posts:  18%|█▊        | 356/1980 [05:31<05:23,  5.02it/s]

'NoneType' object has no attribute 'find'


Processing posts:  18%|█▊        | 357/1980 [05:31<05:40,  4.77it/s]

'NoneType' object has no attribute 'find'


Processing posts:  18%|█▊        | 358/1980 [05:31<05:36,  4.83it/s]

'NoneType' object has no attribute 'find'


Processing posts:  18%|█▊        | 359/1980 [05:31<05:31,  4.89it/s]

'NoneType' object has no attribute 'find'


Processing posts:  18%|█▊        | 360/1980 [05:31<05:22,  5.02it/s]

'NoneType' object has no attribute 'find'


Processing posts:  18%|█▊        | 361/1980 [05:32<05:17,  5.11it/s]

'NoneType' object has no attribute 'find'


Processing posts:  18%|█▊        | 362/1980 [05:32<05:16,  5.10it/s]

'NoneType' object has no attribute 'find'


Processing posts:  18%|█▊        | 363/1980 [05:32<05:39,  4.76it/s]

'NoneType' object has no attribute 'find'


Processing posts:  18%|█▊        | 364/1980 [05:32<05:32,  4.86it/s]

'NoneType' object has no attribute 'find'


Processing posts:  18%|█▊        | 364/1980 [05:32<24:37,  1.09it/s]


KeyboardInterrupt: 

In [4]:
# Number of posts recorded so far (the loop appends one entry per post,
# for successes and failures alike).
len(result)

364

In [24]:
# Keep only the successfully crawled posts. The crawl loop stores the error
# message (a str) in the third slot of a failed entry, so filter on that
# instead of slicing at an index that is only valid for one particular run.
# Re-running this cell is a no-op.
result = [entry for entry in result if not isinstance(entry[2], str)]

In [27]:
# Inspect the first record: a (post_id, post_url, detail) tuple.
sample = result[0]
sample

(119292855,
 'https://www.nhatot.com/mua-ban-nha-dat-quan-12-tp-ho-chi-minh/119292855.htm#px=SR-special_display_ad-[PO-9][PL-default]',
 DetailPageData(post_title='NHÀ TÂN THỚI NHẤT TRƯỜNG CHINH GẦN QUỐC LỘ - GẦN 40M2- CHỈ HƠN 2 ĐỒNG.', tags=['1 PN', 'Nhà ngõ, hẻm'], post_desc='NHÀ TÂN THỚI NHẤT TRƯỜNG CHINH GẦN QUỐC LỘ - GẦN 40M2- CHỈ HƠN 2 ĐỒNG.\n****************************\n✔️Chính chủ trang bìa chưa qua mua bán.\n✔️Vị trí đẹp gần Trường Chinh sát Quốc Lộ tiện di chuyển đi trung tâm cũng như các tỉnh thành.\n✔️Kết cấu : nhà cấp 4 gồm phòng khách phòng ngủ vệ sinh ,phù hợp gia đình nhỏ hoặc xây mới ,khu xây dựng tự do, đất hiện hữu.\n✔️Sổ hồng vay bank.', address='Đường Tân Thới Nhất 1, Phường Tân Thới Nhất, Quận 12, Tp Hồ Chí Minh', last_update=datetime.datetime(2024, 10, 29, 11, 59, 13, 624854), params={'price_m2': 71.25, 'rooms': 1.0, 'property_legal_document': 'Đã có sổ', 'house_type': 'Nhà ngõ, hẻm', 'size': 40.0}))

In [30]:
# Show the crawled detail object (third tuple element) as a plain dict.
sample[2].dict()

{'post_title': 'NHÀ TÂN THỚI NHẤT TRƯỜNG CHINH GẦN QUỐC LỘ - GẦN 40M2- CHỈ HƠN 2 ĐỒNG.',
 'tags': ['1 PN', 'Nhà ngõ, hẻm'],
 'post_desc': 'NHÀ TÂN THỚI NHẤT TRƯỜNG CHINH GẦN QUỐC LỘ - GẦN 40M2- CHỈ HƠN 2 ĐỒNG.\n****************************\n✔️Chính chủ trang bìa chưa qua mua bán.\n✔️Vị trí đẹp gần Trường Chinh sát Quốc Lộ tiện di chuyển đi trung tâm cũng như các tỉnh thành.\n✔️Kết cấu : nhà cấp 4 gồm phòng khách phòng ngủ vệ sinh ,phù hợp gia đình nhỏ hoặc xây mới ,khu xây dựng tự do, đất hiện hữu.\n✔️Sổ hồng vay bank.',
 'address': 'Đường Tân Thới Nhất 1, Phường Tân Thới Nhất, Quận 12, Tp Hồ Chí Minh',
 'last_update': datetime.datetime(2024, 10, 29, 11, 59, 13, 624854),
 'params': {'price_m2': 71.25,
  'rooms': 1.0,
  'property_legal_document': 'Đã có sổ',
  'house_type': 'Nhà ngõ, hẻm',
  'size': 40.0}}