# TODO scrape any car data from ss.com
* clean up prices as numerics
* group by make and model
* plot the data - pricing for sure
* if you do more advanced analysis that also could be used for the final project

* for those who want scraping part of final project
* scrape data from somewhere else - maybe Lithuanian ss.com or wikipedia?

In [51]:
import pandas as pd
import requests
from bs4 import BeautifulSoup
import unicodedata

# Get URL and scrape the data, save to CSV file 

In [72]:
url = "https://www.ss.com/en/transport/cars/audi/"
url


'https://www.ss.com/en/transport/cars/audi/'

In [73]:
req = requests.get(url)  #extracting the html
req.status_code

200

In [74]:
soup = BeautifulSoup(req.text, 'lxml') 
soup.title

<title>SS.COM Cars - Audi, Prices - Advertisements</title>

In [75]:
# lets combine the above cells into a function which will always get us columns
def getColList(soup):
    column_list = ["description","url"] # we decided to that we need these two column names no matter the html
    headline = soup.find("tr", {"id":"head_line"}) #this will work as long as html has a table row with id head_line
    headtds = headline.find_all("td")
    headcolumns = [el.text for el in headtds[1:]]  # this will get all column names starting with 2nd in HTML
    column_list += headcolumns
    return column_list

In [76]:
column_names = getColList(soup)
column_names  # to be used later when creating our table

['description', 'url', 'Model', 'Year', 'Volume', 'Run', 'Price']

In [77]:
def getRowList(soup):
    trows = soup.find_all('tr')
    aprows = [row for row in trows if row.get('id',"").startswith("tr_") and not row.get('id',"").startswith("tr_bnr") ]
    return aprows

In [78]:
# from row get dictionary with values 
def getRow(row, colist=column_names):
    row_tds = row.find_all('td')
    rowDict = {}
    if len(row_tds) < 3: # a little sanity check
        print("Hmm bad row")
        return rowDict
    
    rowDict[colist[0]] = row_tds[2].text # so the big assumption is that we always get description in 3rd column
    rowDict[colist[1]] = "https://ss.com" + row_tds[1].find('a').get('href')
    for td,key in zip(row_tds[3:],colist[2:]): 
        rowDict[key] = td.text
    return rowDict

In [79]:
def getRows(rowlist,colist=column_names):
    return [getRow(row, colist=colist) for row in rowlist]

In [80]:
def getDFfromURL(url):
    # print("getting data from", url)
    req = requests.get(url)
    if req.status_code != 200:
        print("Request Fail with", req.status_code)
        return None # maybe return empty dataframe here
    soup = BeautifulSoup(req.text, 'lxml')
    column_names = getColList(soup)
    rowlist = getRowList(soup)
    rows = getRows(rowlist,colist=column_names)
    return pd.DataFrame(rows)
    

In [81]:
audi_cars = "https://www.ss.com/en/transport/cars/audi/"
idf = getDFfromURL(audi_cars)
idf.head()

Unnamed: 0,description,url,Model,Year,Volume,Run,Price
0,Mercedes Benz C220 2.0d. \r\n125 kw. Avantgard...,https://ss.com/msg/en/transport/cars/mercedes/...,C220,2012,2.0D,291 thd.,"9,900 €changing"
1,Audi A3 1.6 Tdi (115Zs) ar 6 ātrumu mehānisku ...,https://ss.com/msg/en/transport/cars/audi/a3/a...,A3,2016,1.6D,109 thd.,"17,550 €"
2,Vispilnākā komplektācija\r\n\r\nAudi A4 2.0 Td...,https://ss.com/msg/en/transport/cars/audi/a4/c...,A4,2018,2.0D,134 thd.,"24,150 €"
3,"Pārdodu/продаю Audi a4b6 1.9tdi, 2001g, \r\nFi...",https://ss.com/msg/en/transport/cars/audi/a4/h...,A4,2001,1.9D,294 thd.,"2,400 €"
4,"Audi A2 1.4tdi 55kw no Vācijas, Latvijā nav ek...",https://ss.com/msg/en/transport/cars/audi/a2/c...,A2,2001,1.4D,244 thd.,"3,500 €"


# Read CSV file, clean up prices, sort data by model and year

In [82]:
idf.to_csv("audi_sell_list.csv")

In [83]:
audi_df = pd.read_csv("audi_sell_list.csv")
audi_df.head()

Unnamed: 0.1,Unnamed: 0,description,url,Model,Year,Volume,Run,Price
0,0,Mercedes Benz C220 2.0d. \r\n125 kw. Avantgard...,https://ss.com/msg/en/transport/cars/mercedes/...,C220,2012,2.0D,291 thd.,"9,900 €changing"
1,1,Audi A3 1.6 Tdi (115Zs) ar 6 ātrumu mehānisku ...,https://ss.com/msg/en/transport/cars/audi/a3/a...,A3,2016,1.6D,109 thd.,"17,550 €"
2,2,Vispilnākā komplektācija\r\n\r\nAudi A4 2.0 Td...,https://ss.com/msg/en/transport/cars/audi/a4/c...,A4,2018,2.0D,134 thd.,"24,150 €"
3,3,"Pārdodu/продаю Audi a4b6 1.9tdi, 2001g, \r\nFi...",https://ss.com/msg/en/transport/cars/audi/a4/h...,A4,2001,1.9D,294 thd.,"2,400 €"
4,4,"Audi A2 1.4tdi 55kw no Vācijas, Latvijā nav ek...",https://ss.com/msg/en/transport/cars/audi/a2/c...,A2,2001,1.4D,244 thd.,"3,500 €"


In [84]:
audi_df['Price'] = audi_df['Price'].str.replace(unicodedata.lookup('EURO SIGN'), '')
audi_df['Price'] = audi_df['Price'].str.replace(",", '')

audi_df.head()



Unnamed: 0.1,Unnamed: 0,description,url,Model,Year,Volume,Run,Price
0,0,Mercedes Benz C220 2.0d. \r\n125 kw. Avantgard...,https://ss.com/msg/en/transport/cars/mercedes/...,C220,2012,2.0D,291 thd.,9900 changing
1,1,Audi A3 1.6 Tdi (115Zs) ar 6 ātrumu mehānisku ...,https://ss.com/msg/en/transport/cars/audi/a3/a...,A3,2016,1.6D,109 thd.,17550
2,2,Vispilnākā komplektācija\r\n\r\nAudi A4 2.0 Td...,https://ss.com/msg/en/transport/cars/audi/a4/c...,A4,2018,2.0D,134 thd.,24150
3,3,"Pārdodu/продаю Audi a4b6 1.9tdi, 2001g, \r\nFi...",https://ss.com/msg/en/transport/cars/audi/a4/h...,A4,2001,1.9D,294 thd.,2400
4,4,"Audi A2 1.4tdi 55kw no Vācijas, Latvijā nav ek...",https://ss.com/msg/en/transport/cars/audi/a2/c...,A2,2001,1.4D,244 thd.,3500


In [85]:
audi_df['Price'].describe()  

count         30
unique        28
top       1100  
freq           2
Name: Price, dtype: object

In [86]:
audi_df['Price'] = pd.to_numeric(audi_df['Price'], errors='coerce')  #convert dtyper: object to dtype:float64
audi_df['Price'].describe()

count       26.000000
mean      6491.884615
std       6214.375620
min        489.000000
25%       2317.500000
50%       4600.000000
75%       7687.500000
max      24150.000000
Name: Price, dtype: float64

In [87]:
audi_sorted_by_price = audi_df.sort_values(by="Price",ascending=True)
audi_sorted_by_price.tail(15)

Unnamed: 0.1,Unnamed: 0,description,url,Model,Year,Volume,Run,Price
11,11,"3.0tdi, Quattro, melns recaro ādas salons, sēd...",https://ss.com/msg/en/transport/cars/audi/allr...,Allroad,2006,3.0D,-,4990.0
23,23,Pārdodu Audi A6 3.0 tdi 165kW. Auto labā stāvo...,https://ss.com/msg/en/transport/cars/audi/a6/a...,A6,2005,3.0D,367 thd.,5200.0
12,12,"S- Line, 3.0tdi, Quattro, melns s-line recaro ...",https://ss.com/msg/en/transport/cars/audi/a6/a...,A6,2006,3.0D,-,5790.0
19,19,"Pārdodu Audi A6 Allroad 2, 7 Tdi dīzelis. 4X4,...",https://ss.com/msg/en/transport/cars/audi/allr...,Allroad,2007,2.7D,250 thd.,6900.0
9,9,"Pārdodu Audi A6 C6 Facelift, s-line, 3.0 tdi, ...",https://ss.com/msg/en/transport/cars/audi/a6/g...,A6,2009,3.0D,317 thd.,7950.0
7,7,Tikko no Holandes. Audi A3 S-Line. Servisa vēs...,https://ss.com/msg/en/transport/cars/audi/a3/b...,A3,2015,1.6D,-,12990.0
5,5,Uzņēmums pārdod saudzīgi lietotu 2010. gada S-...,https://ss.com/msg/en/transport/cars/audi/a5/c...,A5,2010,3.0D,175 thd.,13800.0
8,8,"Ambition Lux, 2.0Tdi. 130Kw. Atvesta no Franci...",https://ss.com/msg/en/transport/cars/audi/q3/b...,Q3,2012,2.0D,247 thd.,14900.0
6,6,"Audi A3, 1.6 Tdi (110z/s) ar mehānisko pārnesu...",https://ss.com/msg/en/transport/cars/audi/a3/b...,A3,2017,1.6D,103 thd.,16400.0
1,1,Audi A3 1.6 Tdi (115Zs) ar 6 ātrumu mehānisku ...,https://ss.com/msg/en/transport/cars/audi/a3/a...,A3,2016,1.6D,109 thd.,17550.0


In [97]:
audi_sorted_by_year = audi_df.sort_values(['Year'], ascending=[True])
audi_sorted_by_year.head()

Unnamed: 0.1,Unnamed: 0,description,url,Model,Year,Volume,Run,Price
28,28,В хорошем состоянии,https://ss.com/msg/en/transport/cars/audi/100/...,100,1994,2.0,326 thd.,1100.0
26,26,"Auto ir apbruzats laikā gaitā, bet kalpo labi....",https://ss.com/msg/en/transport/cars/audi/a4/b...,A4,1996,1.9D,605 thd.,1100.0
25,25,"Pārdodu Audi kungu, kurš piedzimis iepriekšējā...",https://ss.com/msg/en/transport/cars/audi/a3/b...,A3,1997,1.6,220 thd.,489.0
13,13,"2.5tdi, 85kw, mehānika, 5-ātrumi, ādas recaro ...",https://ss.com/msg/en/transport/cars/audi/a6/b...,A6,1997,2.5D,-,
27,27,Pardodu projekta auto. motors 100%kapitali izr...,https://ss.com/msg/en/transport/cars/volkswage...,Bora,1999,2.3,203 thd.,


In [88]:
audi_df.groupby(['Model']).agg({'Price':['min','max','count']})  #A3 model max price should be 18595eur,but it's not here why?

Unnamed: 0_level_0,Price,Price,Price
Unnamed: 0_level_1,min,max,count
Model,Unnamed: 1_level_2,Unnamed: 2_level_2,Unnamed: 3_level_2
100,1100.0,1100.0,1
A2,3500.0,3500.0,1
A3,489.0,17550.0,6
A4,1100.0,24150.0,7
A5,13800.0,13800.0,1
A6,1650.0,7950.0,6
Allroad,2490.0,6900.0,3
Bora,,,0
C220,,,0
Q3,14900.0,14900.0,1


In [94]:
audi_df.groupby(['Model']).describe()['Price']


Unnamed: 0_level_0,count,mean,std,min,25%,50%,75%,max
Model,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1
100,1.0,1100.0,,1100.0,1100.0,1100.0,1100.0,1100.0
A2,1.0,3500.0,,3500.0,3500.0,3500.0,3500.0,3500.0
A3,6.0,9086.5,7467.783172,489.0,2917.5,8895.0,15547.5,17550.0
A4,7.0,5230.0,8368.141968,1100.0,1695.0,2400.0,2785.0,24150.0
A5,1.0,13800.0,,13800.0,13800.0,13800.0,13800.0,13800.0
A6,6.0,4996.666667,2046.906609,1650.0,4547.5,5095.0,5642.5,7950.0
Allroad,3.0,4793.333333,2211.568071,2490.0,3740.0,4990.0,5945.0,6900.0
Bora,0.0,,,,,,,
C220,0.0,,,,,,,
Q3,1.0,14900.0,,14900.0,14900.0,14900.0,14900.0,14900.0


In [95]:
audi_df.groupby(['Year']).describe()['Price']

Unnamed: 0_level_0,count,mean,std,min,25%,50%,75%,max
Year,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1
1994,1.0,1100.0,,1100.0,1100.0,1100.0,1100.0,1100.0
1996,1.0,1100.0,,1100.0,1100.0,1100.0,1100.0,1100.0
1997,1.0,489.0,,489.0,489.0,489.0,489.0,489.0
1999,0.0,,,,,,,
2000,1.0,1400.0,,1400.0,1400.0,1400.0,1400.0,1400.0
2001,2.0,2950.0,777.817459,2400.0,2675.0,2950.0,3225.0,3500.0
2002,1.0,1650.0,,1650.0,1650.0,1650.0,1650.0,1650.0
2003,1.0,1990.0,,1990.0,1990.0,1990.0,1990.0,1990.0
2004,2.0,2390.0,141.421356,2290.0,2340.0,2390.0,2440.0,2490.0
2005,2.0,3995.0,1704.127343,2790.0,3392.5,3995.0,4597.5,5200.0


# Plot the data

In [None]:
bmw_df_price = audi_df.sort_values(['Year'], ascending=[True])
bmw_df_price = pd.DataFrame(bmw_df_price,columns=['Year','Price'])
bmw_df_price.plot(x ='Year', y='Price', kind = 'scatter')
plt.ylabel('Price, \u20ac')
plt.show()