In [None]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from pandas.plotting import scatter_matrix
from matplotlib import rc
import plotly.graph_objs as go
from sklearn import preprocessing
import statsmodels.api as sm
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LinearRegression
from sklearn.feature_selection import RFE
from sklearn.linear_model import RidgeCV, LassoCV, Ridge, Lasso
from sklearn import metrics
from sklearn.metrics import r2_score
import math
from sklearn.metrics import mean_squared_error as MSE
from sklearn import tree
from sklearn.tree import export_graphviz
from sklearn.ensemble import RandomForestRegressor

### Import Data

In [None]:
# Read the raw CSV; the path is relative to the notebook's working directory.
vehicles = pd.read_csv("vehicles.csv")
# Print column names, dtypes and non-null counts of the raw frame.
vehicles.info()

### Clean data

In [None]:
# Keep only the columns used by the descriptive analysis, then discard every
# row that has a missing value in any of them.
analysis_columns = [
    'region', 'price', 'year', 'manufacturer', 'model', 'condition',
    'cylinders', 'fuel', 'odometer', 'title_status', 'transmission',
    'drive', 'size', 'type', 'paint_color', 'state', 'lat', 'long',
    'posting_date',
]
data = vehicles[analysis_columns].dropna()
data.info()

In [None]:
# (rows, columns) of `data` at this point in the notebook.
data.shape

In [None]:
# Convert data types: 'year' becomes an integer and 'posting_date' is parsed
# into UTC timestamps.
data = data.assign(
    year=data['year'].astype(int),
    posting_date=pd.to_datetime(data['posting_date'], utc=True),
)
data.info()

In [None]:
# Only analyse cars produced in 2000 or later. `.copy()` makes the filtered
# frame independent, so adding the 'age' column below does not write into a
# slice of the previous frame (which pandas flags with SettingWithCopyWarning).
REFERENCE_YEAR = 2021  # year against which a car's age is measured
data = data.loc[data['year'] > 1999].copy()
# Vectorised subtraction replaces the per-row lambda.
data['age'] = REFERENCE_YEAR - data['year']
data.shape

### Descriptive analysis

#### The top manufacturers, types, models and colors

In [None]:
# 2x2 grid of count plots: the ten most frequent values of four categorical
# columns (manufacturer, type, model, paint colour).
fig, ax = plt.subplots(2, 2, figsize=(25, 15))
plt.rcParams["axes.labelsize"] = 20
plt.rcParams["axes.grid"] = False

# (column, x-axis label, palette, target axes) for each panel, in drawing order.
panel_specs = [
    ('manufacturer', "Manufacturer", "ch:start=.2,rot=-.3", ax[0][0]),
    ('type', "Type", "ch:s=-.2,r=.6", ax[0][1]),
    ('model', "Model", "YlOrBr", ax[1][0]),
    ('paint_color', "Color", "crest", ax[1][1]),
]

panels = []
for column, axis_label, palette_name, target_ax in panel_specs:
    top_ten = data[column].value_counts().iloc[:10].index
    panel = sns.countplot(x=column, data=data, order=top_ten,
                          ax=target_ax, palette=palette_name)
    panel.set_xlabel(axis_label, fontsize=20)
    panel.set_ylabel("Count", fontsize=20)
    panel.tick_params(labelsize=15)
    panels.append(panel)

# Keep the per-panel handles under their original names.
a, b, c, d = panels
# Rotate the tick labels of the model panel by 20 degrees.
c.set_xticklabels(c.get_xticklabels(), rotation=20);

#### The distribution of the age of the used cars

In [None]:
# Kernel density estimate of the 'age' column.
plt.rcParams["axes.labelsize"] = 15
plt.figure(figsize=(10, 5))
plt.title("Density plot of the Age of the Used Cars")
plt.xlabel("Age of the Used Car")
stat = sns.kdeplot(data['age'])

In [None]:
# Count listings for every (manufacturer, type) pair, then pivot the types
# into columns so the table can be drawn as a heatmap.
manu_type = data.groupby('manufacturer')['type'].value_counts()
manu_type_df = pd.DataFrame(manu_type.unstack())

type_fig, type_ax = plt.subplots(figsize=(20, 10))
h1 = sns.heatmap(
    manu_type_df,
    cmap='Blues',
    linecolor='white',
    linewidth=1,
    ax=type_ax,
)
h1.set_xlabel("Type", fontsize=20)
h1.set_ylabel("Manufacturer", fontsize=20)
h1.tick_params(labelsize=15);

#### Manufacturer with most used cars for sale each year

In [None]:
# Count listings for every (year, manufacturer) pair and pivot manufacturers
# into columns: rows are model years, columns are manufacturers.
manf_sale_year = data.groupby('year')['manufacturer'].value_counts()
manf_sale_year_df = pd.DataFrame(manf_sale_year.unstack())

year_fig, year_ax = plt.subplots(figsize=(20, 10))
h2 = sns.heatmap(
    manf_sale_year_df,
    cmap='icefire',
    linecolor='white',
    linewidth=1,
    ax=year_ax,
)
h2.set_xlabel("Manufacturer", fontsize=20)
h2.set_ylabel("Year", fontsize=20)
h2.tick_params(labelsize=15);

In [None]:
# Per-year listing counts for five manufacturers, one line per manufacturer.
top_manufacturers = ['ford', 'chevrolet', 'toyota', 'honda', 'nissan']
manf_sale_year_df2 = manf_sale_year_df[top_manufacturers].copy()

fig, ax = plt.subplots(figsize=(18, 10))
m = sns.lineplot(data=manf_sale_year_df2, ax=ax)
ax.set_xlim(2000, 2021)
ax.set_xticks(range(2000, 2021))
m.set_xlabel("Year", fontsize=20)
m.set_ylabel("Count", fontsize=20)
m.tick_params(labelsize=15)
plt.show()

#### The posted price of the used cars

In [None]:
def _mean_price_by(column):
    """Return the mean 'price' per value of `column`, highest mean first."""
    return (
        data[['price', column]]
        .groupby(column)
        .mean()
        .sort_values(by=['price'], ascending=False)
    )


# Mean price per manufacturer, body type and paint colour.
price_mean_manu = _mean_price_by('manufacturer')
price_mean_type = _mean_price_by('type')
price_mean_color = _mean_price_by('paint_color')

fig, ax = plt.subplots(1, 3, figsize=(25, 15))
fig.tight_layout(pad=10)

# (mean-price table, y-axis label, palette) for each panel, left to right.
bar_specs = [
    (price_mean_manu, "Manufacturer", "ch:start=.2,rot=-.3"),
    (price_mean_type, "Type", "ch:s=-.2,r=.6"),
    (price_mean_color, "Color", "YlOrBr"),
]

bars = []
for position, (means, axis_label, palette_name) in enumerate(bar_specs):
    # Only the ten highest means are drawn.
    bar = sns.barplot(x='price', y=means.index[:10], data=means.iloc[:10],
                      ax=ax[position], palette=palette_name)
    bar.set_xlabel("Price Mean", fontsize=20)
    bar.set_ylabel(axis_label, fontsize=20)
    bar.tick_params(labelsize=15)
    bars.append(bar)

# Keep the per-panel handles under their original names.
ap, bp, cp = bars

#### The region comparison of the for-sale used cars

In [None]:
# Keep listings whose coordinates fall strictly inside the bounding box
# 24 < lat < 50 and -125 < long < -65.
inside_box = (
    data['lat'].gt(24) & data['lat'].lt(50)
    & data['long'].gt(-125) & data['long'].lt(-65)
)
location = data[inside_box]

# Restrict the map to five manufacturers.
mapped_manufacturers = ['ford', 'chevrolet', 'toyota', 'honda', 'nissan']
manu_loc = location[location['manufacturer'].isin(mapped_manufacturers)]

plt.figure(figsize=(18, 10))
plt.title('Manufacturer Compare by State')
sns.scatterplot(data=manu_loc, x="long", y="lat", hue='manufacturer')
plt.show()

### Correlation analysis

#### Using Pearson Correlation

In [None]:
# Show the raw frame's columns, dtypes and non-null counts again before preparing it for modelling.
vehicles.info()

In [None]:
# Discard the identifier, URL, coordinate and free-text columns.
# Rebinding `vehicles` (instead of inplace=True) together with errors='ignore'
# lets this cell be re-run without a KeyError once the columns are gone.
unused_columns = ['id', 'url', 'region_url', 'VIN', 'image_url', 'lat', 'long', 'description']
vehicles = vehicles.drop(columns=unused_columns, errors='ignore')
# Missing-value count for every remaining column.
print(vehicles.isnull().sum())

In [None]:
# Missing-value summary: one row per column that has at least one null, with
# the null count and its percentage of all rows, largest count first.
null_counts = vehicles.isnull().sum()
null_val = null_counts[null_counts > 0].to_frame('Nan_sum')
null_val['Percentage'] = (null_val['Nan_sum'] / len(vehicles)) * 100
null_val = null_val.sort_values(by=['Nan_sum'], ascending=False)
null_val

Columns with missing values can be handled in different ways depending on the project's goal. For this project, we only keep columns that have less than 40% missing values.

In [None]:
# Columns retained from the missing-value table above.
kept_columns = [
    'cylinders', 'paint_color', 'drive', 'type', 'odometer', 'manufacturer',
    'model', 'fuel', 'title_status', 'transmission', 'year', 'posting_date',
]
vehicles_cleaned = vehicles[kept_columns]
vehicles_cleaned.shape

We will drop all rows with missing values.

In [None]:
# Drop every row that still has a missing value. `vehicles_cleaned` holds only
# the columns picked from the missing-value table, but the following cells also
# read 'price', 'region' and 'state', so those columns are added back first.
modelling_columns = ['price', 'region', 'state'] + list(vehicles_cleaned.columns)
vehicles_data = vehicles[modelling_columns].dropna()
# The following cells all operate on `vehicles_df`, which was never defined
# (NameError on a fresh kernel). An independent copy keeps `vehicles_data`
# unchanged when `vehicles_df` is filtered and modified later.
vehicles_df = vehicles_data.copy()
vehicles_data

**Handling outliers.**

In [None]:
# Box plot of price; showfliers=False hides the individual outlier points.
plt.figure(figsize=(3,6))
sns.boxplot(y='price', data=vehicles_df,showfliers=False);

In [None]:
# Smallest price present in the frame.
vehicles_df.price.min()

A vehicle's price can never be zero, so we remove the rows where the price is 0.

In [None]:
# Keep only rows with a strictly positive price.
vehicles_df = vehicles_df[vehicles_df['price']>0]

In [None]:
# (rows, columns) after removing non-positive prices.
vehicles_df.shape

In [None]:
# Boolean mask over the rows: True where the price lies between the 5th and
# 95th percentiles (Series.between includes both bounds by default).
# Despite its name, `removed_outliers` marks the rows that are KEPT.
y = vehicles_df['price']
removed_outliers = y.between(y.quantile(.05), y.quantile(.95))
removed_outliers

In [None]:
# Number of rows inside (True) and outside (False) the percentile band.
print(removed_outliers.value_counts())

In [None]:
# `removed_outliers` is True for rows to keep, so invert it to get the index
# labels of the rows outside the 5th-95th percentile price band.
index_names = vehicles_df[~removed_outliers].index
# Rebind rather than drop in place: the result is a new frame and no existing
# object is mutated behind the reader's back.
vehicles_df = vehicles_df.drop(index=index_names)
vehicles_df.describe()

In [None]:
# Box plot of odometer; showfliers=False hides the individual outlier points.
plt.figure(figsize=(3,6))
sns.boxplot(y='odometer', data=vehicles_df,showfliers=False);

In [None]:
# Display the frame after the price-outlier rows were removed.
vehicles_df

In [None]:
# Smallest price remaining after the filtering steps above.
vehicles_df.price.min()

A vehicle's price can never be zero, so we remove the rows where the price is 0.

The snippet below repeats the zero-price filter and outlier-trimming steps from the preceding cells:

```python
vehicles_df = vehicles_df[vehicles_df['price']>0]

vehicles_df.shape

y = vehicles_df['price']
removed_outliers = y.between(y.quantile(.05), y.quantile(.95))
removed_outliers

print(removed_outliers.value_counts())

index_names = vehicles_df[~removed_outliers].index # INVERT removed_outliers!!
vehicles_df.drop(index_names, inplace=True)
vehicles_df.describe()

plt.figure(figsize=(3,6))
sns.boxplot(y='odometer', data=vehicles_df,showfliers=False);

vehicles_df
```

**Move price to the front of the columns, since price is the variable we are predicting.**

In [None]:
# Put 'price' first and keep only the modelling columns. `.copy()` makes the
# result an independent frame: later cells assign into its columns (label
# encoding, odometer scaling), which on a plain column-subset selection can
# trigger pandas' SettingWithCopyWarning.
vehicles_df = vehicles_df[['price','region','year','manufacturer','model','cylinders','fuel','odometer','title_status','transmission','drive','type','paint_color','state']].copy()

In [None]:
# Boxen plot of the price distribution for each manufacturer.
sns.catplot(y="manufacturer", x="price",kind="boxen", data=vehicles_df)

In [None]:
# Violin plot of the price distribution for each fuel value.
sns.violinplot(x=vehicles_df.fuel, y=vehicles_df.price)

In [None]:
# Violin plot of the price distribution for each vehicle type.
sns.catplot(y="type", x="price",kind="violin", data=vehicles_df)

In [None]:
# Scatter plot of price (vertical axis) against model year (horizontal axis).
y = vehicles_df['price']
x = vehicles_df['year']
plt.scatter(x, y)
plt.xlabel('year')
plt.ylabel('price')
plt.show()

We will use a label encoder because our categorical variables have many categories. Label encoding converts each value in a column to a number, which makes the column easier to use in predictive modeling.

In [None]:
# A single LabelEncoder instance; the encoding cell below calls its fit_transform on each categorical column.
laborE = preprocessing.LabelEncoder()

In [None]:
# Display the frame before label encoding.
vehicles_df

In [None]:
# Replace each categorical column with the integer codes produced by
# `laborE.fit_transform`. The encoder is re-fitted for every column, so after
# this cell it only holds the classes of the last column processed.
categorical_columns = [
    'region', 'manufacturer', 'model', 'cylinders', 'fuel', 'title_status',
    'transmission', 'drive', 'type', 'paint_color', 'state',
]
encoded_columns = vehicles_df[categorical_columns].apply(laborE.fit_transform)
vehicles_df[categorical_columns] = encoded_columns

In [None]:
# Display the frame after label encoding.
vehicles_df

**Odometer has a much larger magnitude than the other features. To keep it from dominating the prediction model and producing inaccurate results, we rescale it to the [0, 1] range with min-max scaling and then take the square root of the scaled values.**

In [None]:
from sklearn.preprocessing import MinMaxScaler

# Min-max scale the odometer readings, then take the square root of the scaled
# values. Re-running this cell applies the transform again to the
# already-transformed column.
odometer_scaled = preprocessing.minmax_scale(vehicles_df["odometer"])
vehicles_df["odometer"] = np.sqrt(odometer_scaled)

In [None]:
# Display the frame after the odometer transform.
vehicles_df

### Predictive Modeling.

**Multiple Linear Regression**


**(1) Filter Method**

In [None]:
# Pairwise correlation matrix of all columns (DataFrame.corr uses Pearson by
# default), drawn as an annotated heatmap.
plt.figure(figsize=(12,10))
cor = vehicles_df.corr()
sns.heatmap(cor, annot=True, cmap=plt.cm.Reds)
plt.show()

**As the heatmap shows, drive, odometer, cylinders, transmission, and year have relatively strong positive or negative correlations with price, so we will drop all other features.**

**Independent variables need to be independent with each other. We also need to check if these variables are related with each other.**

In [None]:
# Print the 2x2 correlation matrix for each consecutive pair of the selected
# features (the last pair wraps around from 'year' back to 'drive').
feature_pairs = [
    ("drive", "odometer"),
    ("odometer", "cylinders"),
    ("cylinders", "transmission"),
    ("transmission", "year"),
    ("year", "drive"),
]
for first, second in feature_pairs:
    print(vehicles_df[[first, second]].corr())

**None of these features appear to be highly correlated with each other.**

In [None]:
# Predictors chosen with the correlation filter, and the single-column target frame.
filter_columns = ["drive", "odometer", "cylinders", "transmission", "year"]
features_p = vehicles_df[filter_columns]
target_p = vehicles_df[["price"]]

In [None]:
# Random 75% / 25% train/test split with a fixed seed.
x_train, x_test, y_train, y_test = train_test_split(
    features_p, target_p, test_size=0.25, random_state=0
)

# Report the number of rows in each part.
split_report = [
    ('Our training prediction variable contains :', y_train),
    ('Our training independent variable contains :', x_train),
    ('Our testing prediction variable contains :', y_test),
    ('Our testing independent variable contains :', x_test),
]
for message, part in split_report:
    print(message, len(part), 'rows')

In [None]:
# Linear regression model for the features chosen with the Pearson-correlation filter.
reg_model_p = LinearRegression()

In [None]:
# Fit on the training split, then print the intercept and one coefficient per
# feature. The target is a single-column frame, so intercept_ and coef_ carry
# an extra leading dimension that is indexed with [0].
reg_model_p.fit(x_train, y_train)
print('Intercept :', reg_model_p.intercept_[0], '\n')
coefficient_table = pd.DataFrame({
    'features': x_train.columns,
    'coeficients': reg_model_p.coef_[0],
})
print(coefficient_table)


In [None]:
# Predict prices for the held-out test split with the filter-method model.
lr_pred_p = reg_model_p.predict(x_test)

In [None]:
# Error metrics of the filter-method model on the test split.
mae = metrics.mean_absolute_error(y_test, lr_pred_p)
mse = metrics.mean_squared_error(y_test, lr_pred_p)
print('Mean Absolute Error:', mae)
print('Mean Squared Error:', mse)
print('Root Mean Squared Error:', np.sqrt(mse))

In [None]:
# R^2 of the filter-method linear model on the test split.
score = r2_score(y_test, lr_pred_p)
score

**(2) Backward Elimination Method**

In [None]:
# Every column except 'price' is a candidate predictor; 'price' alone forms
# the single-column target frame.
features = vehicles_df.drop(columns='price')
target = vehicles_df[['price']]

In [None]:
# Add the constant (intercept) column, fit an OLS model on all candidate
# predictors and show the p-value of every term.
X_1 = sm.add_constant(features)
model = sm.OLS(target, X_1).fit()
model.pvalues

In [None]:
# Backward elimination: refit OLS on the remaining predictors and discard the
# one with the largest p-value, until no p-value exceeds 0.05.
cols = list(features.columns)
pmax = 1
while cols:
    X_1 = sm.add_constant(features[cols])
    model = sm.OLS(target, X_1).fit()
    # The first p-value is skipped; this assumes add_constant placed the
    # constant in the first column (its default behaviour).
    p = pd.Series(model.pvalues.values[1:], index=cols)
    pmax = max(p)
    feature_with_p_max = p.idxmax()
    if not pmax > 0.05:
        break
    cols.remove(feature_with_p_max)

selected_features_BE = cols
print(selected_features_BE)

In [None]:
# Use only the predictors that survived backward elimination. Previously every
# non-price column was selected again, so `selected_features_BE` was computed
# but never applied to the model in this section.
features_be = vehicles_df[selected_features_BE]
# Single-column target frame.
target_be = vehicles_df[['price']]

In [None]:
# Random 75% / 25% train/test split with a fixed seed.
X_train, X_test, y_train, y_test = train_test_split(
    features_be, target_be, test_size=0.25, random_state=0
)

# Report the number of rows in each part.
split_report = [
    ('Our training prediction variable contains :', y_train),
    ('Our training independent variable contains :', X_train),
    ('Our testing prediction variable contains :', y_test),
    ('Our testing independent variable contains :', X_test),
]
for message, part in split_report:
    print(message, len(part), 'rows')

In [None]:
# Linear regression model for the backward-elimination section.
reg_model_back = LinearRegression()

In [None]:
# Fit on the training split, then print the intercept and one coefficient per
# feature. The target is a single-column frame, so intercept_ and coef_ carry
# an extra leading dimension that is indexed with [0].
reg_model_back.fit(X_train, y_train)
print('Intercept :', reg_model_back.intercept_[0], '\n')
coefficient_table = pd.DataFrame({
    'features': X_train.columns,
    'coeficients': reg_model_back.coef_[0],
})
print(coefficient_table)

In [None]:
# Predict prices for the held-out test split with the backward-elimination model.
lr_pred_back = reg_model_back.predict(X_test)

In [None]:
# Error metrics of the backward-elimination model on the test split.
mae = metrics.mean_absolute_error(y_test, lr_pred_back)
mse = metrics.mean_squared_error(y_test, lr_pred_back)
print('Mean Absolute Error:', mae)
print('Mean Squared Error:', mse)
print('Root Mean Squared Error:', np.sqrt(mse))

In [None]:
# R^2 of the backward-elimination linear model on the test split.
score = r2_score(y_test, lr_pred_back)
score

### Random Forest Regressor

In [None]:
# Fit a random forest on the lowercase x_train / x_test, i.e. the split of the
# five filter-method features, and predict the test split.
# NOTE(review): y_train / y_test were last assigned by the backward-elimination
# split. Both splits use test_size=0.25 and random_state=0, which presumably
# keeps their rows aligned with x_train / x_test -- confirm. Also confirm that
# the lowercase x_* (filter features) rather than X_* is intended here.
rf = RandomForestRegressor(random_state=1).fit(x_train, y_train.values.ravel())
rf_pred = rf.predict(x_test)

In [None]:
# Error metrics of the random forest on the test split.
mae = metrics.mean_absolute_error(y_test, rf_pred)
mse = metrics.mean_squared_error(y_test, rf_pred)
print('Mean Absolute Error:', mae)
print('Mean Squared Error:', mse)
print('Root Mean Squared Error:', np.sqrt(mse))

In [None]:
# R^2 of the random forest on the test split.
score = r2_score(y_test, rf_pred)
score