___

<p style="text-align: center;"><img src="https://docs.google.com/uc?id=1lY0Uj5R04yMY3-ZppPWxqCr5pvBLYPnV" class="img-fluid" alt="CLRSWY"></p>

___

#  The Capstone Project of Data Analytics Module

# Car Price Prediction EDA

## Introduction
Welcome to the "***Car Price Prediction EDA Project***". This is the capstone project of the ***Data Analytics*** Module. The **Auto Scout** data used for this project was scraped from an online car trading company in 2019 and contains many features of 9 different car models. In this project, you will have the opportunity to apply many commonly used techniques for Data Cleaning and Exploratory Data Analysis using Python libraries such as NumPy, Pandas, Matplotlib, Seaborn, and SciPy.

The project consists of 3 parts:
* The first part is about 'data cleaning'. It deals with incorrect headers, incorrect formats, anomalies, and dropping useless columns.
* The second part is about 'filling data'. It deals with missing values, and the categorical-to-numeric transformation is done there.
* The third part is about 'handling outliers of data' using visualisation libraries. Some insights are extracted.

# PART- 1 `( Data Cleaning )`

In [None]:
import numpy as np
import pandas as pd
import seaborn as sns 
import matplotlib.pyplot as plt

import warnings
warnings.filterwarnings("ignore")
warnings.warn("this will not show")

%matplotlib inline
# %matplotlib notebook

plt.rcParams["figure.figsize"] = (10,6)
# plt.rcParams['figure.dpi'] = 100

sns.set_style("whitegrid")
pd.set_option('display.float_format', lambda x: '%.3f' % x)

pd.options.display.max_rows = 100
pd.options.display.max_columns = 100

In [None]:
df = pd.read_json("data/scout_car.json", lines=True)

In [None]:
df.head(3).T

In [None]:
df.info()

In [None]:
df.columns

In [None]:
df["Comfort_Convenience"] = df["\nComfort & Convenience\n"]
df["Entertainment_Media"] = df["\nEntertainment & Media\n"]
df["Extras"] = df["\nExtras\n"]
df["Safety_Security"] = df["\nSafety & Security\n"]

In [None]:
drop_columns = ["\nComfort & Convenience\n","\nEntertainment & Media\n","\nExtras\n","\nSafety & Security\n"]
df.drop(drop_columns, axis = 1, inplace = True)

In [None]:
df.info()

In [None]:
df.shape

In [None]:
df.isnull().sum()*100/df.shape[0]

**Dropping columns in which 90% or more of the values are missing.**

In [None]:
def show_nans(df, limit):
    """Return the columns of ``df`` whose share of missing values reaches ``limit``.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame to inspect.
    limit : float
        Inclusive threshold, expressed in percent (0-100).

    Returns
    -------
    pandas.Series
        Percentage of missing values per column, restricted to the columns
        whose percentage is greater than or equal to ``limit``.
    """
    n_rows = len(df)
    pct_missing = df.isna().sum() * 100 / n_rows
    reaches_limit = pct_missing >= limit
    return pct_missing[reaches_limit]

def perc_nans(serial):
    """Return the percentage (0-100) of missing values in the Series ``serial``."""
    n_missing = serial.isna().sum()
    n_total = len(serial)
    return n_missing / n_total * 100

In [None]:
show_nans(df,90)

In [None]:
drop_columns = show_nans(df,90).index
drop_columns

In [None]:
df.drop(drop_columns, axis = 1, inplace = True)

In [None]:
df.drop("null", axis = 1, inplace = True)

In [None]:
df.info()

## Let's examine and clean all the columns/features one by one

## url

In [None]:
df.url.value_counts(dropna=False)

In [None]:
df.drop("url", axis = 1, inplace = True)

## make_model

In [None]:
df.make_model.value_counts(dropna=False)

## Make

In [None]:
df.Make.value_counts(dropna=False)

In [None]:
df["Make"] = df.Make.str.strip('\n')

In [None]:
df.Make.value_counts(dropna=False)

In [None]:
df.drop("Make", axis=1, inplace=True)

## Model

In [None]:
df.Model.value_counts(dropna=False)

In [None]:
df["Model"] = df.Model.str[1]

In [None]:
df.Model.value_counts(dropna=False)

In [None]:
df.drop("Model", axis=1, inplace=True)

## short_description

In [None]:
df.short_description.value_counts(dropna=False)

In [None]:
df.drop("short_description", axis = 1, inplace = True)

## description

In [None]:
df.description.value_counts(dropna=False)

In [None]:
df.drop("description", axis = 1, inplace = True)

## body_type

In [None]:
df.body_type.value_counts(dropna=False)

## Body

In [None]:
df.Body.value_counts(dropna=False)

In [None]:
df.drop("Body", axis = 1, inplace = True)

## price (target column)

In [None]:
df.price.isnull().sum()

In [None]:
# Pass the column as `x` explicitly: a bare positional argument is bound to `x`
# by older seaborn releases but to `data` by newer ones, which changes the plot.
# The trailing semicolon hides the Axes repr in the cell output.
sns.boxplot(x=df.price);

In [None]:
# Distribution of the target column; the trailing semicolon hides the Axes repr.
# NOTE(review): seaborn's documentation marks `distplot` as deprecated in favour of
# `histplot`/`displot` — consider migrating once the installed seaborn version is confirmed.
sns.distplot(df.price);

## vat
The Value Added Tax, or VAT, in the European Union is a general, broadly based consumption tax assessed on the value added to goods and services.

In [None]:
df.vat.value_counts(dropna=False)

## km

In [None]:
df.km.value_counts(dropna=False)

In [None]:
# Preview of the extraction used for the conversion: commas removed, then the first
# run of up to eight digits captured. Raw string so `\d` is not an invalid escape.
df.km.str.replace(',', '', regex=False).str.extract(r'(\d{1,8})')

In [None]:
# Remove the commas (presumably thousands separators — TODO confirm), capture the
# first run of up to eight digits and cast it to float; rows without any digit
# produce no match and therefore become NaN.
# `extract` returns a DataFrame with one column per capture group, which is why
# column 0 is selected before the dtype conversion.
# The pattern is a raw string so that `\d` is not an invalid escape sequence.
df["km"] = (
    df.km.str.replace(",", "", regex=False)
    .str.extract(r"(\d{1,8})")[0]
    .astype("float")
)

In [None]:
df.km.value_counts(dropna=False)

## registration & First Registration

In [None]:
df.registration.value_counts(dropna=False)

In [None]:
df["First Registration"].value_counts(dropna=False)

I prefer the "First Registration" column because I don't need the month values.

In [None]:
df['First Registration'] = df['First Registration'].str[1].astype('float')

In [None]:
df["First Registration"].value_counts(dropna=False)

In [None]:
df['age'] = 2019 - df['First Registration']

In [None]:
df.age.value_counts(dropna=False)

In [None]:
df.drop(["registration", "First Registration"], axis=1, inplace=True)

## prev_owner & Previous Owners

In [None]:
df.prev_owner.value_counts(dropna=False)

In [None]:
df["Previous Owners"].value_counts(dropna=False)

I prefer the "Previous Owners" column because it has fewer NaN values.

In [None]:
[item[0] if type(item) == list else item for item in df["Previous Owners"]]

In [None]:
df["Previous_Owners"] = [item[0] if type(item) == list else item for item in df["Previous Owners"]]
df["Previous_Owners"]

In [None]:
df["Previous_Owners2"] = df["Previous Owners"].apply(lambda item: item[0] if type(item) == list else item)
df["Previous_Owners2"]

In [None]:
df["Previous_Owners"] = df["Previous_Owners"].str.strip("\n").astype('float')

In [None]:
df["Previous_Owners"].value_counts(dropna=False)

In [None]:
df.drop(["prev_owner", "Previous Owners", "Previous_Owners2"], axis=1, inplace=True)

## hp

In [None]:
df.hp.value_counts(dropna=False)

In [None]:
# Capture the first run of up to four digits from the raw `hp` text and store it
# as a float; rows without digits become NaN.
# The pattern is a raw string so that `\d` is not an invalid escape sequence.
df["hp_kW"] = df.hp.str.extract(r"(\d{1,4})")[0].astype("float")

In [None]:
#Alternative method
#df.replace({"hp" : {" kW" : ""}}, regex = True, inplace = True)
#df["hp_kw"] = pd.to_numeric(df.hp)

In [None]:
df.drop('hp', axis=1, inplace=True)

## Type

In [None]:
df.Type.value_counts(dropna=False)

In [None]:
df["Type"] = df.Type.str[1]

In [None]:
df['Type'].value_counts(dropna=False)

## Next Inspection & Inspection new

In [None]:
df["Next Inspection"].value_counts(dropna=False)

In [None]:
df["Inspection new"].value_counts(dropna=False)

In [None]:
# Cells that hold a list contribute only their first element; other cells pass through.
df["Inspection_new"] = df["Inspection new"].apply(
    lambda cell: cell[0] if type(cell) == list else cell
)
# Drop the newline characters that wrap the raw text.
df["Inspection_new"] = df["Inspection_new"].str.strip("\n")

In [None]:
df["Inspection_new"].value_counts(dropna=False)

In [None]:
df.drop(["Next Inspection", "Inspection new"], axis=1, inplace=True)

## Warranty

In [None]:
df.Warranty.value_counts(dropna=False)

In [None]:
# Cells that hold a list contribute only their first element; other cells pass through.
df["Warranty"] = df.Warranty.apply(lambda x: x[0] if isinstance(x, list) else x)
# Capture the first one- or two-digit number and store it as a float; cells
# without any digit become NaN.
# The pattern is a raw string so that `\d` is not an invalid escape sequence.
df["Warranty"] = (
    df.Warranty.str.strip("\n")
    .str.extract(r"(\d{1,2})")[0]
    .astype("float")
)

In [None]:
df.Warranty.value_counts(dropna=False)

## Full Service

In [None]:
df['Full Service'].value_counts(dropna=False)

In [None]:
df.drop("Full Service", axis=1, inplace=True)

## Non-smoking Vehicle

In [None]:
df['Non-smoking Vehicle'].value_counts(dropna=False)

In [None]:
df.drop("Non-smoking Vehicle", axis=1, inplace=True)

## Offer Number

In [None]:
df['Offer Number'].value_counts(dropna=False)

In [None]:
df.drop("Offer Number", axis=1, inplace=True)

## Body Color

In [None]:
df['Body Color'].value_counts(dropna=False)

In [None]:
df['Body_Color'] = df['Body Color'].str[1]

In [None]:
df['Body_Color'].value_counts(dropna=False)

In [None]:
df.drop("Body Color", axis=1, inplace=True)

## Body Color Original

In [None]:
df['Body Color Original'].value_counts(dropna=False)

In [None]:
df.drop("Body Color Original", axis=1, inplace=True)

## Paint Type

In [None]:
df['Paint Type'].value_counts(dropna=False)

In [None]:
df['Paint_Type'] = df['Paint Type'].str[0].str.strip('\n')

In [None]:
df['Paint_Type'].value_counts(dropna=False)

In [None]:
df.drop("Paint Type", axis=1, inplace=True)

## Upholstery

In [None]:
df['Upholstery'].value_counts(dropna=False)

In [None]:
df["Upholstery"] = [item[0] if type(item) == list else item for item in df.Upholstery]

In [None]:
df["Upholstery"] = df.Upholstery.str.strip("\n").str.split(", ")

In [None]:
df["Upholstery"].value_counts(dropna=False)

In [None]:
u_type = ["Cloth", 'Part leather', 'Full leather', 'Velour', 'alcantara']
df["Upholstery_type"] = df["Upholstery"].apply(lambda x : x[0] if type(x) == list and x[0] in u_type else np.nan)

In [None]:
df["Upholstery_type"].value_counts(dropna=False)

In [None]:
color = ['Black', 'Grey', 'Brown', 'Beige', 'White', 'Blue', 'Red', 'Yellow', 'Orange']


def finder(x):
    """Pick the colour part out of a split ``Upholstery`` entry.

    * not a list            -> NaN
    * list of two elements  -> its second element
    * any other list        -> its first element if that is listed in ``color``,
                               otherwise NaN
    """
    if type(x) != list:
        return np.nan
    if len(x) == 2:
        return x[1]
    first = x[0]
    return first if first in color else np.nan
    
df['Upholstery_color'] = df.Upholstery.apply(finder)

In [None]:
df["Upholstery_color"].value_counts(dropna=False)

In [None]:
df.drop("Upholstery", axis=1, inplace=True)

## Nr. of Doors

In [None]:
df["Nr. of Doors"].value_counts(dropna=False)

In [None]:
df['Nr_of_Doors'] = df['Nr. of Doors'].str[0].str.strip('\n').astype('float')

In [None]:
df["Nr_of_Doors"].value_counts(dropna=False)

In [None]:
df.drop("Nr. of Doors", axis=1, inplace=True)

## Nr. of Seats

In [None]:
df["Nr. of Seats"].value_counts(dropna=False)

In [None]:
df['Nr_of_Seats'] = df['Nr. of Seats'].str[0].str.strip('\n').astype('float')

In [None]:
df["Nr_of_Seats"].value_counts(dropna=False)

In [None]:
df.drop("Nr. of Seats", axis=1, inplace=True)

## Model Code

In [None]:
df["Model Code"].value_counts(dropna=False)

In [None]:
df.drop("Model Code", axis=1, inplace=True)

## Gearing Type

In [None]:
df["Gearing Type"].value_counts(dropna=False)

In [None]:
df['Gearing_Type'] = df['Gearing Type'].str[1]

In [None]:
df["Gearing_Type"].value_counts(dropna=False)

In [None]:
df.drop("Gearing Type", axis=1, inplace=True)

## Displacement

In [None]:
df["Displacement"].value_counts(dropna=False)

In [None]:
# Take the first element of each cell, drop the wrapping newlines and the commas,
# then capture the first run of up to five digits as a float (NaN when no digits).
# The pattern is a raw string so that `\d` is not an invalid escape sequence.
df["Displacement"] = (
    df.Displacement.str[0]
    .str.strip("\n")
    .str.replace(",", "", regex=False)
    .str.extract(r"(\d{1,5})")[0]
    .astype("float")
)

In [None]:
df["Displacement_cc"] = df["Displacement"]

In [None]:
df["Displacement_cc"].value_counts(dropna=False)

In [None]:
df.drop("Displacement", axis=1, inplace=True)

## Cylinders

In [None]:
df["Cylinders"].value_counts(dropna=False)

In [None]:
df["Cylinders"] = df.Cylinders.str[0].str.strip('\n').astype("float")

In [None]:
df["Cylinders"].value_counts(dropna=False)

## Weight

In [None]:
df["Weight"].value_counts(dropna=False)

In [None]:
# Take the first element of each cell, drop the wrapping newlines and the commas,
# then capture the first run of up to six digits as a float (NaN when no digits).
# The pattern is a raw string so that `\d` is not an invalid escape sequence.
df["Weight_kg"] = (
    df.Weight.str[0]
    .str.strip("\n")
    .str.replace(",", "", regex=False)
    .str.extract(r"(\d{1,6})")[0]
    .astype("float")
)

In [None]:
df["Weight_kg"].value_counts(dropna=False)

In [None]:
df.drop("Weight", axis=1, inplace=True)

## Drive chain

In [None]:
df["Drive chain"].value_counts(dropna=False)

In [None]:
df['Drive_chain'] = df['Drive chain'].str[0].str.strip('\n')

In [None]:
df["Drive_chain"].value_counts(dropna=False)

In [None]:
df.drop("Drive chain", axis=1, inplace=True)

## Fuel

In [None]:
df["Fuel"].value_counts(dropna=False)

In [None]:
df["Fuel"] = df.Fuel.str[1].str.split("/").str[0].str.strip()

In [None]:
df["Fuel"].value_counts(dropna=False)

In [None]:
df["Fuel"] = df.Fuel.str.split("(").str[0].str.strip()

In [None]:
df["Fuel"].value_counts(dropna=False)

In [None]:
benzine = ["Gasoline", "Super 95", "Regular", "Super E10 95", "Super Plus 98", "Super Plus E10 98", "Others"]
lpg = ["LPG", "Liquid petroleum gas", "CNG", "Biogas", "Domestic gas H"]


def fueltype(x):
    """Collapse a detailed fuel label into its broad category.

    Labels listed in ``benzine`` become "Benzine", labels listed in ``lpg``
    become "LPG/CNG"; every other value (including NaN) is returned unchanged.
    """
    for category, members in (("Benzine", benzine), ("LPG/CNG", lpg)):
        if x in members:
            return category
    return x
df["Fuel"] = df.Fuel.apply(fueltype)

In [None]:
df.Fuel.value_counts(dropna=False)

In [None]:
#alternative method
#df["fuel_new"] = df.Fuel.str[1]
#diesel_bool = df["fuel_new"].str.contains("diesel", case = False, regex = True)
#lpg_bool = df["fuel_new"].str.contains("lpg|cng|bio|domestic|electric", case = False, regex = True)
#df.loc[diesel_bool, "fuel_new"] = "Diesel"
#df.loc[lpg_bool, "fuel_new"] = "LPG/CNG"
#benz = list(df.fuel_new.loc[lambda x : x != "Diesel"][lambda x : x != "LPG/CNG"].index)
#df.fuel_new.iloc[benz] = "Benzine"

## Consumption

In [None]:
df["Consumption"].value_counts(dropna=False)

In [None]:
def _is_missing(x):
    """Return True for the placeholders (None or any float, e.g. NaN) of an absent cell."""
    return x is None or isinstance(x, float)


def _nested_item(x, pos):
    """Nested layout: return the first item of ``x[pos]``, or NaN if it is absent or empty."""
    if pos >= len(x) or x[pos] == []:
        return np.nan
    return x[pos][0]


def _flat_has_slot3(x):
    """Flat layout: report whether slot 3 exists, is a string and ends with ')'."""
    return len(x) > 3 and isinstance(x[3], str) and x[3].endswith(')')


def parser1(x):
    """Return the first consumption entry of a raw ``Consumption`` cell.

    Two layouts are handled: a nested one, where ``x[0]`` is itself a list and
    the value is ``x[0][0]``, and a flat one, where the value is ``x[1]``.
    Missing cells, empty lists and lists too short to hold the value give NaN
    instead of raising ``IndexError``.
    """
    if _is_missing(x) or len(x) == 0:
        return np.nan
    if isinstance(x[0], list):
        return _nested_item(x, 0)
    return x[1] if len(x) > 1 else np.nan


def parser2(x):
    """Return the second consumption entry of a raw ``Consumption`` cell.

    Nested layout: ``x[1][0]``. Flat layout: ``x[3]``, but only when that slot
    ends with ')' (i.e. looks like a labelled value). Anything else, including
    cells too short to hold the slot, gives NaN.
    """
    if _is_missing(x) or len(x) == 0:
        return np.nan
    if isinstance(x[0], list):
        return _nested_item(x, 1)
    return x[3] if _flat_has_slot3(x) else np.nan


def parser3(x):
    """Return the third consumption entry of a raw ``Consumption`` cell.

    Nested layout: ``x[2][0]``. Flat layout: ``x[5]``, taken only when slot 3
    ends with ')' (same precondition as ``parser2``) and slot 5 exists.
    Anything else gives NaN.
    """
    if _is_missing(x) or len(x) == 0:
        return np.nan
    if isinstance(x[0], list):
        return _nested_item(x, 2)
    if _flat_has_slot3(x) and len(x) > 5:
        return x[5]
    return np.nan

In [None]:
# Either a decimal with one fractional digit ("5.6", "13.8") or a plain integer of
# up to three digits ("10"). The dot is escaped so it matches only a literal decimal
# point (an unescaped "." matches any character), and the pattern is a raw string so
# that `\d` is not an invalid escape sequence.
cons_pattern = r'(\d{1,2}\.\d|\d{1,3})'

# Each parser pulls one raw text entry out of the Consumption cell; the first number
# found in that text is stored as a float (NaN when the entry is missing).
df['cons_comb'] = df.Consumption.apply(parser1).str.extract(cons_pattern)[0].astype("float")
df['cons_city'] = df.Consumption.apply(parser2).str.extract(cons_pattern)[0].astype("float")
df['cons_country'] = df.Consumption.apply(parser3).str.extract(cons_pattern)[0].astype("float")

In [None]:
df["cons_comb"].value_counts(dropna=False).head()

In [None]:
df.drop('Consumption', axis=1,inplace=True)

## CO2 Emission

In [None]:
df["CO2 Emission"].value_counts(dropna=False)

In [None]:
# Cells that hold a list contribute only their first element; other cells pass through.
df["CO2_Emission"] = [item[0] if isinstance(item, list) else item for item in df["CO2 Emission"]]
# Remove the unit suffix as a SUFFIX. `str.rstrip(" g CO2/km (comb)")` treats its
# argument as a set of characters, and that set contains "2", so it also ate trailing
# digits of the reading itself ("102 g CO2/km (comb)" -> "10", "122 ..." -> "1").
# Commas are then turned into decimal points before the float conversion.
# NOTE(review): this keeps the original comma handling; confirm the commas are decimal
# marks and not thousands separators (the km column removes them instead).
df["CO2_Emission"] = (
    df["CO2_Emission"]
    .str.strip("\n")
    .str.replace(r"\s*g CO2/km(?: \(comb\))?\s*$", "", regex=True)
    .str.replace(",", ".", regex=False)
    .astype("float")
)

In [None]:
df["CO2_Emission"].value_counts(dropna=False)

In [None]:
df.drop("CO2 Emission", axis=1, inplace=True)

## Emission Class

In [None]:
df["Emission Class"].value_counts(dropna=False)

In [None]:
# Cells that hold a list contribute only their first element; other cells pass through.
df["Emission_Class"] = df["Emission Class"].apply(
    lambda cell: cell[0] if type(cell) == list else cell
)
# Drop the newline characters that wrap the raw text.
df["Emission_Class"] = df["Emission_Class"].str.strip("\n")

In [None]:
df["Emission_Class"].value_counts(dropna=False)

In [None]:
# Fold the Euro 6 sub-variants into the single label "Euro 6". The keys are applied as
# regular expressions (regex=True) and only to the "Emission_Class" column.
# NOTE(review): "Euro 6d-TEMP" is listed before "Euro 6d"; presumably the order matters
# so that the longer variant is not left as "Euro 6-TEMP" — verify before reordering.
df.replace({"Emission_Class" : {"Euro 6d-TEMP":"Euro 6", "Euro 6c":"Euro 6", "Euro 6d":"Euro 6"}}, regex = True, inplace = True)

In [None]:
df["Emission_Class"].value_counts(dropna=False)

In [None]:
df.drop("Emission Class", axis=1, inplace=True)

## Emission Label

In [None]:
df["Emission Label"].value_counts(dropna=False)

In [None]:
df.drop("Emission Label", axis=1, inplace=True)

## Gears

In [None]:
df["Gears"].value_counts(dropna=False)

In [None]:
df["Gears"] = df.Gears.str[0].str.strip('\n')

In [None]:
df["Gears"].value_counts(dropna=False)

## Country version

In [None]:
df["Country version"].value_counts(dropna=False)

In [None]:
df.drop("Country version", axis=1, inplace=True)

## Comfort_Convenience

In [None]:
df["Comfort_Convenience"].value_counts(dropna=False).head()

In [None]:
# Flatten list cells into one comma-separated string; non-list cells (e.g. NaN) pass through.
df["Comfort_Convenience"] = df["Comfort_Convenience"].apply(
    lambda entry: ",".join(entry) if type(entry) == list else entry
)

In [None]:
df["Comfort_Convenience"].value_counts(dropna=False).head()

## Entertainment_Media

In [None]:
df["Entertainment_Media"].value_counts(dropna=False).head()

In [None]:
# Flatten list cells into one comma-separated string; non-list cells (e.g. NaN) pass through.
df["Entertainment_Media"] = df["Entertainment_Media"].apply(
    lambda entry: ",".join(entry) if type(entry) == list else entry
)

In [None]:
df["Entertainment_Media"].value_counts(dropna=False).head()

## Extras

In [None]:
df["Extras"].value_counts(dropna=False).head()

In [None]:
# Flatten list cells into one comma-separated string; non-list cells (e.g. NaN) pass through.
df["Extras"] = df["Extras"].apply(
    lambda entry: ",".join(entry) if type(entry) == list else entry
)

In [None]:
df["Extras"].value_counts(dropna=False).head()

## Safety_Security

In [None]:
df["Safety_Security"].value_counts(dropna=False).head()

In [None]:
# Flatten list cells into one comma-separated string; non-list cells (e.g. NaN) pass through.
df["Safety_Security"] = df["Safety_Security"].apply(
    lambda entry: ",".join(entry) if type(entry) == list else entry
)

In [None]:
df["Safety_Security"].value_counts(dropna=False).head()

## The End of Part-1

In [None]:
df.shape

In [None]:
df.head().T

In [None]:
df.info()

In [None]:
df.to_csv("clean_scout.csv", index=False)