In [1]:
import requests
import time 
import pandas as pd
 
from bs4 import BeautifulSoup

In [2]:
url = "https://www.ss.com/en/for-children/toys/dolls/"
url

'https://www.ss.com/en/for-children/toys/dolls/'

In [3]:
req = requests.get(url) 
req.status_code

200

In [4]:
soup = BeautifulSoup(req.text, 'lxml') 

soup.title

<title>SS.COM Toys, swings - Dolls, Prices - Advertisements</title>

In [5]:
def getColList(soup):
    column_list = ["description","url"] 
    headline = soup.find("tr", {"id":"head_line"})
    headtds = headline.find_all("td")
    headcolumns = [el.text for el in headtds[1:]] 
    column_list += headcolumns
    return column_list

In [6]:
column_names = getColList(soup)
column_names

['description', 'url', 'Name', 'Cond.', 'Price']

In [7]:
def getRowList(soup):
    trows = soup.find_all('tr')
    aprows = [row for row in trows if row.get('id',"").startswith("tr_") and not row.get('id',"").startswith("tr_bnr") ]
    return aprows

In [8]:
def getRow(row, colist=column_names):
    row_tds = row.find_all('td')
    rowDict = {}
    if len(row_tds) < 3: # a little sanity check
        print("Hmm bad row")
        return rowDict
    
    rowDict[colist[0]] = row_tds[2].text # so the big assumption is that we always get description in 3rd column
    rowDict[colist[1]] = "https://ss.com" + row_tds[1].find('a').get('href')
    for td,key in zip(row_tds[3:],colist[2:]): 
        rowDict[key] = td.text
    return rowDict

In [9]:
def getRows(rowlist,colist=column_names):
    return [getRow(row, colist=colist) for row in rowlist]

In [10]:
def getDFfromURL(url):
    
    req = requests.get(url)
    if req.status_code != 200:
        print("Request Fail with", req.status_code)
        return None 
    soup = BeautifulSoup(req.text, 'lxml')
    column_names = getColList(soup)
    rowlist = getRowList(soup)
    rows = getRows(rowlist,colist=column_names)
    return pd.DataFrame(rows, columns=column_names)

In [14]:
dolls = "https://www.ss.com/en/for-children/toys/dolls/"
idf = getDFfromURL(dolls)
idf.head(15)

Unnamed: 0,description,url,Name,Cond.,Price
0,Disneja princese Frozen 2 komplekts Annas Box ...,https://ss.com/msg/en/for-children/toys/dolls/...,Disneja princese Frozen 2 ko,new,10 €
1,Продаем куклу beby born в коробке горшочек в п...,https://ss.com/msg/en/for-children/toys/dolls/...,Baby born,used,25 €
2,"Reborn lelle. \r\n50cm, 2kg. \r\n\r\nManeklīti...",https://ss.com/msg/en/for-children/toys/dolls/...,Reborn,used,70 €
3,"Продаю комплект кукол, б/у.Riga district, Baldone",https://ss.com/msg/en/for-children/toys/dolls/...,Lelles,used,10 €
4,"Pārdodu Barbi lelli. Kājas lokas, somiņa un ci...",https://ss.com/msg/en/for-children/toys/dolls/...,Barbi,used,5 €
5,"Комплект кукол, б/у, + новый пазл.Riga distric...",https://ss.com/msg/en/for-children/toys/dolls/...,Lelles,used,12 €
6,"Продаю комплект кукол в хорошем состоянии, б/у...",https://ss.com/msg/en/for-children/toys/dolls/...,Simba toys,used,15 €
7,Продаю Машеньку с коляской и бутылочкой в идеа...,https://ss.com/msg/en/for-children/toys/dolls/...,Маша и медведь,new,7 €
8,"Крупные персонажи, полные озорства, с дружелюб...",https://ss.com/msg/en/for-children/toys/dolls/...,Бренд: The Puppet Company,new,19 €
9,Советская кукла Красная шапочка высота 45см. В...,https://ss.com/msg/en/for-children/toys/dolls/...,Красная шапочка,used,20 €


In [19]:
idf.drop("url", inplace=True, axis=1)
idf

Unnamed: 0,description,Name,Cond.,Price
0,Disneja princese Frozen 2 komplekts Annas Box ...,Disneja princese Frozen 2 ko,new,10 €
1,Продаем куклу beby born в коробке горшочек в п...,Baby born,used,25 €
2,"Reborn lelle. \r\n50cm, 2kg. \r\n\r\nManeklīti...",Reborn,used,70 €
3,"Продаю комплект кукол, б/у.Riga district, Baldone",Lelles,used,10 €
4,"Pārdodu Barbi lelli. Kājas lokas, somiņa un ci...",Barbi,used,5 €
5,"Комплект кукол, б/у, + новый пазл.Riga distric...",Lelles,used,12 €
6,"Продаю комплект кукол в хорошем состоянии, б/у...",Simba toys,used,15 €
7,Продаю Машеньку с коляской и бутылочкой в идеа...,Маша и медведь,new,7 €
8,"Крупные персонажи, полные озорства, с дружелюб...",Бренд: The Puppet Company,new,19 €
9,Советская кукла Красная шапочка высота 45см. В...,Красная шапочка,used,20 €


In [15]:
idf_pivoted = pd.pivot_table(data=idf, columns=["Cond."], values=["Price"])
idf_pivoted

  idf_pivoted = pd.pivot_table(data=idf, columns=["Cond."], values=["Price"])


Cond.


In [None]:
from matplotlib import ticker # ticker is a module that helps with formatting plots

# dataframe plot returns an axis object that we can use to customize the plot
ax = idf.plot(
    kind='bar', rot=0, xlabel='', ylabel='Price',
    figsize=(8, 1.5), title='New and used dolls prices'
)

# use month abbreviations for the ticks on the x-axis
ax.set_xticklabels(calendar.month_abbr[1:])

# show y-axis labels in millions instead of scientific notation
ax.yaxis.set_major_formatter(ticker.EngFormatter())

# customize the legend
ax.legend(title='', loc='center', bbox_to_anchor=(0.5, -0.3), ncol=3, frameon=False)

# change plot size in centimeters
ax.figure.set_size_inches(12, 2.5)

In [16]:
idf.to_excel("dolls.xlsx", encoding="UTF-8") 

  return func(*args, **kwargs)


In [17]:
dolls = pd.read_csv('C:\Users\zirav\Desktop\Accenture\Day20_WebScraping\dolls.xlsx', encoding="utf-8")
dolls

SyntaxError: (unicode error) 'unicodeescape' codec can't decode bytes in position 2-3: truncated \UXXXXXXXX escape (731728630.py, line 1)