In [1]:
import requests
from bs4 import BeautifulSoup
import pandas as pd

In [3]:
url = "https://cellphones.com.vn/laptop.html"

# Fetch the page content.
# - timeout: without it requests waits indefinitely on a stalled connection,
#   which would hang the notebook cell.
# - raise_for_status(): stop here on a 4xx/5xx response instead of parsing an
#   error page, which would silently produce empty result lists further down.
response = requests.get(url, timeout=30)
response.raise_for_status()
html_content = response.content

# Parse the raw response bytes with Python's built-in HTML parser
soup = BeautifulSoup(html_content, 'html.parser')

In [16]:
# Lists to store scraped data.
# Each product card appends exactly ONE entry to every list (None when the
# field is missing), so the four lists always have the same length and stay
# row-aligned when they are later combined into a DataFrame.
laptop_names = []
laptop_prices = []
laptop_links = []
laptop_img_urls = []

# Extracting relevant data from the page
for product in soup.find_all('div', class_='product-info'):
    # Get the laptop name (None if the card has no name element)
    name = product.find('div', class_='product__name')
    laptop_names.append(name.text.strip() if name else None)

    # Get the laptop price (None if the card shows no price)
    price = product.find('p', class_='product__price--show')
    laptop_prices.append(price.text.strip() if price else None)

    # Get the laptop link; .get() returns None instead of raising KeyError
    # when the anchor has no href attribute
    link = product.find('a', class_='product__link button__link')
    laptop_links.append(link.get('href') if link else None)

    # Get the first image URL; .get() returns None instead of raising
    # KeyError when the <img> tag has no src attribute
    img_url = product.find('img')
    laptop_img_urls.append(img_url.get('src') if img_url else None)

In [11]:
# Combine the four scraped lists into a single DataFrame, one column per list.
laptops_df = pd.DataFrame(
    dict(
        Name=laptop_names,
        Price=laptop_prices,
        Link=laptop_links,
        Img_url=laptop_img_urls,
    )
)

In [13]:
# Persist the scraped table. index=False keeps the positional RangeIndex out
# of the file; otherwise it is written as an unnamed first column and comes
# back as 'Unnamed: 0' when the CSV is read again.
laptops_df.to_csv('cellphones_laptops.csv', index=False)

In [67]:
# Convert the scraped laptop CSV into the product-import CSV layout:
# read the scraped file, map its columns onto the template columns, fill the
# remaining columns with fixed defaults, de-duplicate and write the result.

# Load the original CSV
input_csv = 'cellphones_laptops.csv'  # Replace with your input CSV file path
output_csv = 'output_file.csv'  # Replace with the desired output file path

# Read the input CSV into a pandas DataFrame
df = pd.read_csv(input_csv)

# Column layout of the output file; the order here is the order written to disk
output_columns = [
    'Product Id', 'Product Handle', 'Product Title', 'Product Subtitle', 'Product Description',
    'Product Status', 'Product Thumbnail', 'Product Weight', 'Product Length', 'Product Width',
    'Product Height', 'Product HS Code', 'Product Origin Country', 'Product MID Code',
    'Product Material', 'Product Collection Title', 'Product Collection Handle', 'Product Type',
    'Product Tags', 'Product Discountable', 'Product External Id', 'Product Profile Name',
    'Product Profile Type', 'Variant Id', 'Variant Title', 'Variant SKU', 'Variant Barcode',
    'Variant Inventory Quantity', 'Variant Allow Backorder', 'Variant Manage Inventory',
    'Variant Weight', 'Variant Length', 'Variant Width', 'Variant Height', 'Variant HS Code',
    'Variant Origin Country', 'Variant MID Code', 'Variant Material', 'Price EUR', 'Price USD',
    'Option 1 Name', 'Option 1 Value', 'Image 1 Url', 'Image 2 Url'
]

output_df = pd.DataFrame(columns=output_columns)

# Populate the new DataFrame with the data from the old DataFrame.
# The first assignment is a Series so the empty frame adopts df's row index;
# the scalar defaults below then broadcast to every row.
output_df['Product Title'] = df['Name']
# Assuming the price is in USD.
# NOTE(review): the value is copied verbatim from the scraped text and is
# presumably a formatted local-currency string -- confirm before importing.
output_df['Price USD'] = df['Price']
output_df['Product Thumbnail'] = df['Img_url']
# Create the handle by lower-casing the name and replacing spaces and slashes
# with dashes. This must use the .str accessor: plain Series.replace(' ', '-')
# only replaces cells whose ENTIRE value equals ' ', so it left names unchanged.
output_df['Product Handle'] = (
    df['Name']
    .str.lower()
    .str.replace(' ', '-', regex=False)
    .str.replace('/', '-', regex=False)
)
output_df['Variant Title'] = df['Name']  # Use unique variant titles to avoid duplication error
output_df['Image 1 Url'] = df['Img_url']  # First image URL from original data

# Fixed default / placeholder values applied to every row
output_df['Product Status'] = 'published'
output_df['Product Discountable'] = 'true'
output_df['Variant Inventory Quantity'] = '100'
output_df['Variant Allow Backorder'] = 'false'
output_df['Variant Manage Inventory'] = 'true'
output_df['Option 1 Name'] = 'Size'  # Follow template format
output_df['Option 1 Value'] = 'One Size'  # Use a default option value
output_df['Product Description'] = "'Every programmer's best friend.'"  # Placeholder description
output_df['Product Weight'] = '400'  # Placeholder weight as per template

# Columns intentionally left as empty strings (ids, SKU, subtitle, collection,
# type, tags, profile, variant physical attributes, EUR price, second image).
# Template columns not listed here or above are never assigned and are written
# as empty fields as well.
blank_columns = [
    'Product Id', 'Variant Id', 'Variant SKU', 'Product Subtitle',
    'Product Collection Title', 'Product Collection Handle', 'Product Type',
    'Product Tags', 'Product External Id', 'Product Profile Name',
    'Product Profile Type', 'Variant Barcode', 'Variant Weight', 'Variant Length',
    'Variant Width', 'Variant Height', 'Variant HS Code', 'Variant Origin Country',
    'Variant MID Code', 'Variant Material', 'Price EUR', 'Image 2 Url',
]
for blank_column in blank_columns:
    output_df[blank_column] = ''

# Remove duplicate rows; reassigning instead of inplace=True keeps the cell
# safe to re-run and makes the data flow explicit
output_df = output_df.drop_duplicates()

# Save the new DataFrame to the desired output CSV file (semicolon-separated,
# without the index column)
output_df.to_csv(output_csv, sep=';', index=False)

print(f'CSV converted and saved as {output_csv}')

CSV converted and saved as output_file.csv
