# Autotrader Extraction Project

In [1]:
import numpy as np
import pandas as pd
import calendar
import matplotlib.pyplot as plt
import math
from datetime import datetime

In [2]:
# Display settings. Use pandas' canonical option names instead of look-alike
# keys such as 'display.max.columns', which only resolve because set_option
# treats the key as a regular expression.
pd.set_option("display.max_columns", 20)       # show up to 20 columns
pd.set_option("display.max_colwidth", None)    # never truncate cell text
pd.set_option("display.max_rows", 1000)        # show up to 1000 rows
pd.set_option("display.float_format", "{:.2f}".format)  # floats with two decimals

## Load Dataset

In [109]:
# Read the dealer dataset from the results folder into the raw dataframe
df = pd.read_csv("./results/car_dealers_dataset.csv")
# Show the first record as a quick check of the load
df.head(1)

Unnamed: 0,name,fullname,rating,description,address,Website,Phone number,link
0,Group1 Toyota Wirral,Group1 Toyota Wirral,4.9 out of 5,"FCA Regulated and Approved, Finance Available, Home Delivery Available","Docks Link, , Wallasey, Merseyside, CH44 3EQ",www.group1auto.co.uk,(0151) 382 4783,https://www.autotrader.co.uk/dealers/group1-toyota-wirral-10044382?channel=cars


### Staging Dataset

In [165]:
# Work on a copy so the raw dataframe `df` stays untouched by the cleaning steps
df1 = df.copy()
# Show the first record of the staging copy
df1.head(1)

Unnamed: 0,name,fullname,rating,description,address,Website,Phone number,link
0,Group1 Toyota Wirral,Group1 Toyota Wirral,4.9 out of 5,"FCA Regulated and Approved, Finance Available, Home Delivery Available","Docks Link, , Wallasey, Merseyside, CH44 3EQ",www.group1auto.co.uk,(0151) 382 4783,https://www.autotrader.co.uk/dealers/group1-toyota-wirral-10044382?channel=cars


## Overview

In [166]:
# Check the shape (rows, columns) of the staging dataframe
df1.shape

(240, 8)

In [167]:
# List the column names of the staging dataframe
df1.columns

Index(['name', 'fullname', 'rating', 'description', 'address', 'Website',
       'Phone number', 'link'],
      dtype='object')

In [168]:
# Data types of the staging dataframe's columns
# (inspect df1, the frame every other overview cell looks at, not the raw df)
df1.dtypes

name            object
fullname        object
rating          object
description     object
address         object
Website         object
Phone number    object
link            object
dtype: object

In [169]:
# Dataset preview: first record of the staging dataframe
df1.head(1)

Unnamed: 0,name,fullname,rating,description,address,Website,Phone number,link
0,Group1 Toyota Wirral,Group1 Toyota Wirral,4.9 out of 5,"FCA Regulated and Approved, Finance Available, Home Delivery Available","Docks Link, , Wallasey, Merseyside, CH44 3EQ",www.group1auto.co.uk,(0151) 382 4783,https://www.autotrader.co.uk/dealers/group1-toyota-wirral-10044382?channel=cars


## Data Cleaning

In [170]:
# Check the shape of the staging dataframe before any cleaning step runs
df1.shape

(240, 8)

In [171]:
# Build the ID column from the trailing token of each dealer link,
# e.g. ".../dealers/group1-toyota-wirral-10044382?channel=cars" -> "10044382"
def _dealer_id_from_link(link):
    """Return the text after the last '-' in the link's final path segment.

    The final segment is whatever follows the last '/'; anything from its
    first '?' onwards (the query string) is discarded before the split on '-'.
    Non-string input is stringified first, so a missing link (NaN) yields "nan".
    """
    last_segment = str(link).rsplit("/", 1)[-1]
    without_query = last_segment.split("?", 1)[0]
    return without_query.rsplit("-", 1)[-1]


df1.loc[:, 'ID'] = df1['link'].map(_dealer_id_from_link)

In [172]:
# Keep only rows whose ID consists purely of decimal digits. This drops rows
# with a missing link (whose ID is the string "nan") as well as any link
# without a numeric suffix, which would otherwise make the int32 conversion
# in the next step fail.
# .copy() gives an independent frame for the column assignments that follow.
df1 = df1[df1['ID'].astype(str).str.isdecimal()].copy()

In [173]:
# Cast the ID column to a 32-bit integer
df1 = df1.astype({"ID": "int32"})

In [174]:
# Select the columns to keep, in their final order
# ('fullname' and 'link' are not part of the selection)
df1 = df1.loc[:, [
    'ID',
    'name',
    'rating',
    'description',
    'address',
    'Phone number',
    'Website',
]]

In [175]:
# Convert the column headings to Title_Snake_Case:
# title-case every word, then replace spaces with underscores
df1.columns = df1.columns.str.title().str.replace(" ", "_")

In [176]:
# Rename the "Id" column to the upper-case "ID"
df1 = df1.rename(columns={"Id": "ID"})

In [177]:
# Clean the Rating column: "4.9 out of 5" -> 4.9, "No recent reviews" -> 0.0
# NOTE(review): dealers without recent reviews become 0.0, which cannot be
# told apart from a genuine zero rating afterwards.
rating_text = (
    df1['Rating']
    .astype(str)
    .str.replace("No recent reviews", "0", regex=False)
    .str.replace("out of 5", "", regex=False)
    .str.strip()
)
df1['Rating'] = rating_text.astype("float")

In [178]:
# Clean the Address column:
#  - collapse runs of empty components
#    ("Docks Link, , Wallasey" -> "Docks Link, Wallasey")
#  - trim surrounding whitespace
# Missing addresses are kept as NaN instead of being stringified to the literal
# text "nan", so the NA-cleaning step below can still blank them out.
address_raw = df1['Address']
address_clean = (
    address_raw
    .astype(str)
    .str.replace(r",(?:\s*,)+", ",", regex=True)
    .str.strip()
)
# Restore NaN wherever the original address was missing
df1['Address'] = address_clean.where(address_raw.notna())

In [179]:
# Replace missing values with "" in the non-numeric (text) columns only.
# Filling a numeric column that has gaps with a string would turn it into an
# object column mixing numbers and text.
text_columns = df1.select_dtypes(exclude="number").columns
df1 = df1.fillna({column: "" for column in text_columns})

In [180]:
# Replace the spaces inside phone numbers with hyphens,
# e.g. "(0151) 382 4783" -> "(0151)-382-4783"
phone_hyphenated = df1['Phone_Number'].str.replace(" ", "-", regex=False)
df1['Phone_Number'] = phone_hyphenated

In [181]:
# Preview the first ten cleaned records
df1.head(10)

Unnamed: 0,ID,Name,Rating,Description,Address,Phone_Number,Website
0,10044382,Group1 Toyota Wirral,4.9,"FCA Regulated and Approved, Finance Available, Home Delivery Available","Docks Link, Wallasey, Merseyside, CH44 3EQ",(0151)-382-4783,www.group1auto.co.uk
1,10003711,Williams Liverpool Mini,0.0,"Family Business for 110 years, 100% Approved Used Vehicles, Over 200 Approved Used MINIs","4 GREAT HOWARD STREET, Liverpool, Mersyside, L3 7HT",(0151)-382-7853,www.williamsgroup.co.uk
2,3896,Williams Liverpool Bmw,4.6,"Family Business for 110 years, 100% Approved Used Vehicles, Over 650 Approved Used BMWs","4 Great Howard Street, Liverpool, Merseyside, L3 7HT",(0151)-382-7977,www.williamsgroup.co.uk
3,10004792,The Van Place Ltd,4.0,,"UNIT 1A SANDON INDUSTRIAL ESTATE, SANDON WAY, Liverpool, Lancashire, L5 9YN",(07441)-914750,www.thevanplace.com
4,27433,Mercedes-Benz Of Liverpool,4.7,"Manufacturer Approved Dealer., Price checked to market, Aftersales facilities","66-68 Pall Mall, Liverpool, Merseyside, L3 7DB",(0151)-382-7799,www.group1auto.co.uk
5,27515,Smart Of Liverpool,0.0,,"66-68 Pall Mall, Liverpool, Merseyside, L3 7DB",(0151)-382-8750,www.group1auto.co.uk
6,14266,Stoneacre Wallasey,1.5,"We offer competitive valuation, Now available to all customers, A large panel of lenders","Rowson Street, New Brighton, Wallasey, Merseyside, CH45 2NA",(0151)-382-7597,www.stoneacre.co.uk
7,10011029,Johnsons Skoda Liverpool,4.9,,"SANDHILLS LANE, Liverpool, Lancashire, L5 9XN",(0151)-382-8177,www.johnsonscars.co.uk
8,6840,Liverpool Audi,4.0,"Franchised Audi Dealer, 95% Of Customers Recommend Us, Exceptional Audi Aftercare","41 Sandhills Lane, Liverpool, Merseyside, L5 9XN",(0151)-382-8421,www.stratstone.com
9,749157,Johnsons Seat Liverpool,4.8,,"Pall Mall, Liverpool, Merseyside, L3 6AL",(0151)-382-7754,www.johnsonscars.co.uk


## Save Result

In [185]:
# Write the cleaned dataframe to CSV; index=False leaves the row index out of the file
df1.to_csv("./results/car_dealers_dataset_cleaned.csv", index = False)