In [139]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here's several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)
import seaborn as sns
import matplotlib.pyplot as plt
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.linear_model import LinearRegression
from sklearn.neighbors import KNeighborsRegressor
from sklearn.svm import SVR
from sklearn.pipeline import make_pipeline
from sklearn.compose import make_column_transformer
from sklearn.preprocessing import PowerTransformer
from scipy.stats import mode
from sklearn.metrics import mean_squared_error 
from sklearn.metrics import r2_score
from math import sqrt
from sklearn.ensemble import RandomForestRegressor,GradientBoostingRegressor
from xgboost import XGBRegressor

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
# Print the full path of every file found under the Kaggle input directory.
input_paths = (
    os.path.join(root, name)
    for root, _, names in os.walk('/kaggle/input')
    for name in names
)
for path in input_paths:
    print(path)

# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

# **Problem Statement**

A Chinese automobile company Geely Auto aspires to enter the US market by setting up their manufacturing unit there and producing cars locally to give competition to their US and European counterparts.

They have contracted an automobile consulting company to understand the factors on which the pricing of cars depends. Specifically, they want to understand the factors affecting the pricing of cars in the American market, since those may be very different from the Chinese market. The company wants to know:

- Which variables are significant in predicting the price of a car
- How well those variables describe the price of a car

Based on various market surveys, the consulting firm has gathered a large data set of different types of cars across the American market.

# **Hypothesis Generation**

From the above problem statement, we have to consider the factors (features) affecting the market. The factors that will affect the car price as the Chinese company enters the US market are:
1. Spare parts - Availability and production of spare parts.
2. Fuel type - Gas prices have been rising rapidly.
3. Engine CC - This is an important feature: the higher the vehicle's cc, the higher its speed, HP, torque and price.
4. Car dimensions - Traffic is an increasing issue, so car dimensions have a big impact.
5. Carbon emission - Cars these days are built for the lowest possible carbon emissions, yet this factor still adds to the country's contribution to global warming.

**Let's make a copy of the original dataset to avoid the loss of data and display first five rows**

In [140]:
# Show every column when a frame is displayed.
pd.set_option('display.max_columns', None)

# Load the raw data and keep an untouched copy next to the working frame.
car = pd.read_csv("/kaggle/input/car-price-prediction/CarPrice_Assignment.csv")
car_copy = car.copy()
car.head()

# Getting a basic information regarding data

In [141]:
# Column dtypes and non-null counts of the raw frame
car.info()

In [142]:
# Summary statistics, transposed so each described column is a row
car.describe().T

In [143]:
# Number of distinct values per column, displayed as a one-column frame
pd.DataFrame(car.nunique())

In [144]:
# Count of rows that are exact duplicates of an earlier row
car.duplicated().sum()

# EDA 

Let's first drop the car ID and model name columns

In [145]:
# Drop the car ID and model-name columns.
# Built from the untouched copy (not from `car` itself) so the cell is
# idempotent: re-running it does not fail on columns that are already gone.
car = car_copy.drop(columns=['car_ID', 'CarName'])

# Categorical values
1. Fuel type
2. Aspiration
3. Door number
4. Car body
5. Drive wheel
6. Engine location
7. Engine type
8. Cylinder number
9. Fuel system
10. Symboling

In [146]:
def value_cnts(fea, data=None):
    """Print the value counts of each requested column.

    Parameters
    ----------
    fea : str or iterable of str
        Column name, or collection of column names, to summarise. A single
        string is treated as one column name rather than iterated
        character by character.
    data : pandas.DataFrame, optional
        Frame to read the columns from. Defaults to the notebook-level
        ``car`` frame, which keeps existing ``value_cnts(cols)`` calls working.
    """
    frame = car if data is None else data
    columns = [fea] if isinstance(fea, str) else fea
    for column in columns:
        print(frame[column].value_counts())

In [147]:
# Columns treated as categorical during EDA and dummy encoding.
cat_fea = [
    'fueltype',
    'aspiration',
    'doornumber',
    'carbody',
    'drivewheel',
    'enginelocation',
    'enginetype',
    'cylindernumber',
    'fuelsystem',
    'symboling',
]


In [148]:
def cat_vis(fea):
    """Draw a count plot of one categorical column of ``car``.

    Parameters
    ----------
    fea : str
        Name of the column in ``car`` whose category frequencies are plotted.
    """
    # One figure with a single axes: no stray empty figure and no blank
    # right-hand half of the canvas.
    fig, ax = plt.subplots(figsize=(8, 5))
    # Pass the column as `x=`; newer seaborn releases no longer treat a
    # positional argument as the x variable.
    sns.countplot(x=car[fea], ax=ax)
    ax.set_title(f'Count of cars by {fea}')
    plt.show()


In [149]:
# One count plot per categorical feature
for i in cat_fea:
    cat_vis(i)

**From the above graphs**
1. We can conclude that the categories are heavily imbalanced (i.e. one category has a high value count while the others have very low counts).
2. This categorical data needs to be treated before applying a model to it.

# Relation between Categorical values and Target (Price)

In [150]:
def rel_cat(fea):
    """Draw a strip plot of price against one categorical column of ``car``.

    Parameters
    ----------
    fea : str
        Name of the column in ``car`` placed on the x axis; ``car['price']``
        is placed on the y axis.
    """
    # One figure with a single axes: no stray empty figure and no blank
    # right-hand half of the canvas.
    fig, ax = plt.subplots(figsize=(8, 5))
    sns.stripplot(x=car[fea], y=car['price'], ax=ax)
    ax.set_title(f'price by {fea}')
    plt.show()


In [151]:
# One price-vs-category strip plot per categorical feature
for i in cat_fea:
    rel_cat(i)
    

**From the above graphs**
1. The relationship between each category and the target variable (price) is only moderate relative to its value counts.
2. Many categories with high counts are nonetheless only weakly related to price.

# **Numerical Values**
1. Wheel base
2. Car length
3. Car width
4. Car height
5. Curb weight
6. Engine size
7. Bore ratio
8. Stroke
9. Compressionratio
10. Horsepower
11. Peak rpm
12. City mpg
13. Highway mpg


In [152]:
# Columns treated as continuous numeric features during EDA.
num_fea = [
    'wheelbase',
    'carlength',
    'carwidth',
    'carheight',
    'curbweight',
    'enginesize',
    'boreratio',
    'stroke',
    'compressionratio',
    'horsepower',
    'peakrpm',
    'citympg',
    'highwaympg',
]

In [153]:
def num_visual(fea):
    """Show the distribution and box plot of one numeric feature side by side.

    Parameters
    ----------
    fea : pandas.Series
        Values to visualise (the notebook passes ``car[column]``).
    """
    # A single two-panel figure; the earlier extra `plt.figure(1)` call only
    # produced an empty figure.
    fig, (ax_dist, ax_box) = plt.subplots(1, 2, figsize=(16, 5))
    # `histplot(..., kde=True)` replaces the deprecated `distplot`;
    # stat='density' keeps the y axis on a density scale like `distplot` did.
    sns.histplot(x=fea, kde=True, stat='density', ax=ax_dist)
    # Pass the data as `x=`; newer seaborn releases no longer treat a
    # positional argument as the x variable.
    sns.boxplot(x=fea, ax=ax_box)
    plt.show()

In [154]:
# Distribution and box plot for each numeric feature
for i in num_fea:
    num_visual(car[i])

# Observations
1. Almost all the features are negatively skewed.
2. Some features have outliers that need to be treated before modelling.

# Relation between Numerical values and Target variable (Price)

In [155]:
# Display the numeric feature list used by the scatter plots below
num_fea

In [156]:
def rel_num(fea):
    """Draw a scatter plot of price against one numeric column of ``car``.

    Parameters
    ----------
    fea : str
        Name of the column in ``car`` placed on the x axis; ``car['price']``
        is placed on the y axis.
    """
    # One figure with a single axes: no stray empty figure and no blank
    # right-hand half of the canvas.
    fig, ax = plt.subplots(figsize=(8, 8))
    sns.scatterplot(x=car[fea], y=car['price'], ax=ax)
    ax.set_title(f'price vs {fea}')
    ax.grid()
    plt.show()

In [157]:
# One price-vs-feature scatter plot per numeric feature
for i in num_fea:
    rel_num(i)

# Treating Skewed columns

In [158]:
# Numeric features whose absolute skewness exceeds the limit, most
# positively skewed first.
skew_limit = 0.75
skew_vals = car[num_fea].skew()
skew_cols = skew_vals.loc[skew_vals.abs() > skew_limit].sort_values(ascending=False)
skew_cols

# Correlation of Whole Dataset

In [159]:
# Correlation heatmap of the numeric columns.
# `car` still holds string-typed categorical columns at this point, so the
# numeric columns are selected explicitly; calling `.corr()` on the mixed
# frame fails on recent pandas versions.
plt.figure(figsize=(16,8))
sns.heatmap(car.select_dtypes('number').corr(), annot=True, cmap='BuPu')
plt.title('Correlation between numeric features')
plt.show()

# Handling missing values

In [160]:
# Number of missing values in each column
car.isna().sum()

**We can see that there are no missing values.**

# Transforming Categorical data into numbers

Here we use `pd.get_dummies()` instead of scikit-learn's `OneHotEncoder` because the encoding is applied to the whole dataset at once.

In [161]:
# Dummy-encode the categorical columns; drop_first=True omits the first level
# of each encoded column.
car2 = pd.get_dummies(car, columns=cat_fea, drop_first=True)
car2.head()

# Train test Split

In [162]:
# Separate the target from the predictors.
y = car2['price']
X = car2.drop(columns='price')

# Hold out 30% of the rows for testing; the fixed seed makes the split repeatable.
x_train, x_test, y_train, y_test = train_test_split(
    X, y, test_size=0.3, random_state=42
)


In [163]:
# Shapes of the train/test predictors and targets
x_train.shape, x_test.shape, y_train.shape, y_test.shape

In [164]:
# Names of the numeric-dtype predictor columns of the encoded frame (target excluded)
num_fea2 = car2.drop(['price'], axis=1).select_dtypes('number').columns
num_fea2

# Creating Model
**Transforming the numerical data to normal distribution and then passing the normalised data to every model to check which model works the best.**

In [165]:
# Fit three baseline regressors behind the same preprocessing and collect
# their test-set RMSE and R2.
rmse_test1 = []
r2_test1 = []
model_names = []

std = StandardScaler()
pw = PowerTransformer()
lr = LinearRegression()
knn = KNeighborsRegressor()
svr = SVR()

models = [lr, knn, svr]

# Route every numeric column to exactly one transformer. Listing the skewed
# columns under both transformers makes the ColumnTransformer emit them twice
# (once scaled, once power-transformed), which duplicates those features.
# PowerTransformer standardises its output by default, so the skewed columns
# still end up on the same scale as the rest.
skewed_cols = list(skew_cols.index)
scale_cols = [col for col in num_fea2 if col not in skewed_cols]

for m in models:
    ct = make_column_transformer((std, scale_cols), (pw, skewed_cols), remainder='passthrough')
    pipe = make_pipeline(ct, m)
    pipe.fit(x_train, y_train)
    y_pred = pipe.predict(x_test)
    # Compute each metric once and reuse it for both the lists and the log line.
    rmse = round(np.sqrt(mean_squared_error(y_test, y_pred)), 2)
    r2 = round(r2_score(y_test, y_pred), 2)
    rmse_test1.append(rmse)
    r2_test1.append(r2)
    print(f'Model:{m} and rmse_score : {rmse}, r2 score is {r2}')
Models = ['Linear Regression','KNeighbors','SVR']
res_df = pd.DataFrame({'RMSE':rmse_test1,'r2 score':r2_test1}, index=Models)
res_df

In [166]:
# Side-by-side comparison of the baseline models' RMSE and R2.
# x/y are passed by keyword: newer seaborn releases reject two positional
# data arguments to barplot.
fig, (ax_rmse, ax_r2) = plt.subplots(1, 2, figsize=(10, 8))
sns.barplot(x=Models, y=rmse_test1, ax=ax_rmse)
ax_rmse.set_title('RMSE')
sns.barplot(x=Models, y=r2_test1, ax=ax_r2)
ax_r2.set_title('R2 score')
plt.show()

**We can observe that linear regression gave the best R2 score and the lowest RMSE**

# Ensembling models

In [167]:
# Fit three tree-based ensembles directly on the encoded features and collect
# their test-set RMSE and R2.
rmse_test2, r2_test2, model_names = [], [], []

y = car2['price']
x = car2.drop(columns='price')

x_train, x_test, y_train, y_test = train_test_split(
    x, y, test_size=0.3, random_state=42
)

rf = RandomForestRegressor(random_state=42)
gb = GradientBoostingRegressor(random_state=42)
xgb = XGBRegressor(random_state=42)

models = [rf, gb, xgb]

for m in models:
    m.fit(x_train, y_train)
    y_pred = m.predict(x_test)
    rmse = round(np.sqrt(mean_squared_error(y_test, y_pred)), 2)
    r2 = round(r2_score(y_test, y_pred), 2)
    rmse_test2.append(rmse)
    r2_test2.append(r2)
    print(f'Model: {m}, RMSE score:{rmse}, R2 score:{r2}' )
m_n = ['Random Forest','Gradient Boosting','XG Boosting']
res_df = pd.DataFrame({"RMSE":rmse_test2, "R2 score":r2_test2}, index=m_n)
res_df

In [168]:
# Side-by-side comparison of the ensemble models' RMSE and R2.
# Both panels are labelled with the ensemble names (`m_n`); the R2 panel was
# previously labelled with the baseline model names. x/y are passed by
# keyword: newer seaborn releases reject two positional data arguments.
fig, (ax_rmse, ax_r2) = plt.subplots(1, 2, figsize=(10, 8))
sns.barplot(x=m_n, y=rmse_test2, ax=ax_rmse)
ax_rmse.set_title('RMSE')
sns.barplot(x=m_n, y=r2_test2, ax=ax_r2)
ax_r2.set_title('R2 score')
plt.show()

# Observations
1. Random forest got 0.94 R2 and lowest RMSE.
2. XGBoost also did a good job with 0.91 R2