In [27]:
import os
import matplotlib
import matplotlib.pyplot as plt
import matplotlib.style as style
import numpy as np
import pandas as pd
import plotly.express as px
import seaborn as sns
from scipy import stats
import warnings
import copy

from lightgbm import LGBMClassifier
from sklearn import metrics
from sklearn import model_selection
from sklearn import preprocessing
from sklearn.datasets import make_classification
from sklearn.ensemble import ExtraTreesRegressor
from sklearn.ensemble import GradientBoostingClassifier
from sklearn.ensemble import GradientBoostingRegressor
from sklearn.ensemble import RandomForestClassifier
from sklearn.ensemble import RandomForestRegressor
from sklearn.linear_model import LogisticRegression
from sklearn.linear_model import Ridge
from sklearn.metrics import accuracy_score
from sklearn.metrics import classification_report, confusion_matrix
from sklearn.metrics import mean_squared_error
from sklearn.metrics import mean_squared_error, mean_absolute_error
from sklearn.metrics import plot_confusion_matrix
from sklearn.model_selection import cross_val_score
from sklearn.model_selection import train_test_split
from sklearn.naive_bayes import GaussianNB
from sklearn.neighbors import KNeighborsClassifier
from sklearn.neighbors import KNeighborsRegressor
from sklearn.preprocessing import StandardScaler, Normalizer
from sklearn.tree import DecisionTreeClassifier
from sklearn.tree import DecisionTreeRegressor
from xgboost import XGBClassifier
from xgboost.sklearn import XGBRegressor
from catboost import CatBoostRegressor
from sklearn.ensemble import GradientBoostingRegressor,AdaBoostRegressor,BaggingRegressor, RandomForestRegressor
from sklearn.neural_network import MLPRegressor
import xgboost as xgb

In [28]:
from powerbiclient import Report, models

In [None]:
from powerbiclient.authentication import DeviceCodeLoginAuthentication

In [None]:
# Create the device-code login credential that is handed to Report(...) further down.
# NOTE(review): presumably this triggers an interactive sign-in prompt — confirm.
device_auth = DeviceCodeLoginAuthentication()

In [None]:
# Embed URL of the Power BI report page; passed to Report(embed_url=...) further down.
embed_url = "https://app.powerbi.com/groups/me/reports/1be2156a-1ef6-4d3e-9ec2-9c03f69c745a/ReportSectionb3d88c09a3a08e94e414"

In [None]:
# Build the embedded-report object from the URL and the device-code credential.
report = Report(embed_url=embed_url, auth=device_auth)

In [None]:
# A bare expression as the last line of a cell is rendered as the cell output.
report

In [None]:
# Load the training set (`data`) and the test set (`test`) from CSV files given by relative path.
data = pd.read_csv('train-data.csv')
test = pd.read_csv('test-data.csv')



# Data Preprocessing

In [None]:
# Preview the first rows of the test set.
test.head()


In [None]:
# Preview the first rows of the training set.
data.head()

In [None]:
# Column dtypes of the training set.
data.dtypes

In [None]:
# Column dtypes of the test set.
test.dtypes

In [None]:
# Report the size of the training set.
train_rows, train_cols = data.shape
print(f"Rows in dataset are : {train_rows} \nColumns in dataset are : {train_cols}")

In [None]:
# Report the size of the test set.
test_rows, test_cols = test.shape
print(f"Rows in test dataset are : {test_rows} \nColumns in test dataset are : {test_cols}")

In [None]:
# Summary statistics of the training set as produced by DataFrame.describe().
data.describe()

In [None]:
# Summary statistics of the test set as produced by DataFrame.describe().
test.describe()

In [None]:
# Count the missing values in each column of the training set (`data`).
data.isna().sum()

In [None]:
# New_Price contains a large number of missing values, so the column is removed;
# afterwards every row that still has any missing value is discarded.
data = data.drop(columns=['New_Price']).dropna(how='any')


In [None]:
# Full car names that occur in the test set but not in the training set.
listtrain, listtest = data['Name'], test['Name']
names_only_in_test = set(listtest) - set(listtrain)
print("Missing values in first list:", names_only_in_test)

In [None]:
# Build a coarser 'Cars' label from the first two space-separated tokens of Name.
train_tokens = data['Name'].str.split(" ")
test_tokens = test['Name'].str.split(" ")
data['Cars'] = train_tokens.str[0] + ' ' + train_tokens.str[1]
test['Cars'] = test_tokens.str[0] + ' ' + test_tokens.str[1]

In [None]:
# True when every 'Cars' label of the test set also occurs in the training set.
set(test['Cars']).issubset(set(data['Cars']))

In [None]:
# Print the 'Cars' labels that occur in the test set but not in the training set.
listtrain = data['Cars']
listtest = test['Cars']
print("Missing values in first list:", (set(listtest).difference(listtrain))) 

In [None]:
# Remove test rows whose 'Cars' label never occurs in the training set: the label
# encoders are fitted on training labels only and cannot transform unseen values.
# The labels are derived from the data instead of being hard-coded, so the filter
# stays correct if either CSV changes.
unseen_cars = set(test['Cars']).difference(data['Cars'])
test.drop(test[test['Cars'].isin(unseen_cars)].index, inplace=True)

In [None]:
# Repeat the comparison after the drop: print the 'Cars' labels still found only in the test set.
listtrain = data['Cars']
listtest = test['Cars']
print("Missing values in first list:", (set(listtest).difference(listtrain))) 

In [None]:
# Strip the unit suffixes so Mileage, Engine and Power hold bare numeric strings.
for mileage_unit in (' kmpl', ' km/kg'):
    data['Mileage'] = data['Mileage'].str.replace(mileage_unit, '')
data['Engine'] = data['Engine'].str.replace(' CC', '')
# 'null bhp' entries are swapped for the fixed stand-in value 112 before the unit is removed.
power_text = data['Power'].str.replace('null bhp', '112')
data['Power'] = power_text.str.replace(' bhp', '')

In [None]:
# Missing-value count per column of the training set after the string clean-up.
data.isna().sum()

In [None]:
# Convert the cleaned text columns to float.
for numeric_column in ('Mileage', 'Engine', 'Power'):
    data[numeric_column] = data[numeric_column].astype(float)


In [None]:
# Missing-value count per column of the training set after the float conversion.
data.isna().sum()

In [None]:
# Remove the New_Price column from the test set, then discard every row with a missing value.
test = test.drop(columns=['New_Price']).dropna(how='any')

In [None]:
# Remove the 'Unnamed: 0' column from the test set.
# NOTE(review): presumably an index column written into the CSV — confirm.
test = test.drop('Unnamed: 0', axis=1)

In [None]:
# Strip the unit suffixes from the test set, mirroring the training-set clean-up.
for mileage_unit in (' kmpl', ' km/kg'):
    test['Mileage'] = test['Mileage'].str.replace(mileage_unit, '')
test['Engine'] = test['Engine'].str.replace(' CC', '')
# 'null bhp' entries are swapped for the fixed stand-in value 112 before the unit is removed.
test_power_text = test['Power'].str.replace('null bhp', '112')
test['Power'] = test_power_text.str.replace(' bhp', '')

In [None]:
# Missing-value count per column of the test set after the string clean-up.
test.isna().sum()

In [None]:
# Convert the cleaned text columns of the test set to float.
for numeric_column in ('Mileage', 'Engine', 'Power'):
    test[numeric_column] = test[numeric_column].astype(float)

In [None]:
# Show the dtypes of both frames side by side. A cell renders only its final
# expression, so writing `data.dtypes` and `test.dtypes` on separate lines
# displayed the test dtypes alone and silently discarded the training ones.
pd.concat([data.dtypes, test.dtypes], axis=1, keys=['data', 'test'])

In [None]:
# Columns kept for modelling; 'Price' is the target and exists only in the training set.
dffeature = [
    'Cars', 'Location', 'Year', 'Kilometers_Driven', 'Fuel_Type', 'Transmission',
    'Owner_Type', 'Mileage', 'Engine', 'Power', 'Seats', 'Price',
]
data = data.reindex(columns=dffeature)

# The test set uses the same columns in the same order, minus the target.
testfeature = [column for column in dffeature if column != 'Price']
test = test.reindex(columns=testfeature)

# Data Analysis

In [None]:
 # finding features relative to target Price
# Correlate the numeric columns only. The frame still holds text columns (Cars,
# Location, Fuel_Type, ...) and DataFrame.corr() raises on them in pandas >= 2.0
# instead of silently skipping them.
correlation = data.select_dtypes(include='number').corr()
# Order the features from the most positively to the most negatively correlated with Price.
correlation = correlation.sort_values('Price', ascending=False)
print(correlation['Price'])

In [None]:
# Line plot of the Price column (values plotted against the frame's index).
plt.plot(data['Price'])


In [None]:
# Probability plot of Price drawn through matplotlib; `res` keeps the values returned by probplot.
# NOTE(review): with no `dist` argument scipy compares against a normal distribution — confirm that is intended.
res = stats.probplot(data['Price'], plot=plt)

In [None]:
# Pie chart (hole=.3) with Fuel_Type as the labels and Price as the values.
import plotly.graph_objects as go
fuel_pie = go.Pie(labels=data['Fuel_Type'], values=data['Price'], hole=.3)
horizontal_legend = dict(orientation="h", yanchor="bottom", y=1.02, xanchor="right", x=1)
fig = go.Figure(data=[fuel_pie])
fig.update_layout(legend=horizontal_legend)
fig.show()

In [None]:
# Price distribution per year, split by transmission type.
plt.figure(figsize=(15,10))
xprop = 'Year'
yprop = 'Price'
sns.boxplot(data=data, x=xprop, y=yprop, hue='Transmission')
# The y axis carries Price values, not a count, so it is labelled with the column
# name; the title names the grouping so this figure can be told apart from the
# fuel-type one.
plt.xlabel(xprop, size=14)
plt.ylabel(yprop, size=15)
plt.title('Boxplot of {} by {} and Transmission'.format(yprop, xprop), size=15)
plt.show()

In [None]:
# Price distribution per year, split by fuel type.
plt.figure(figsize=(15,10))
xprop = 'Year'
yprop = 'Price'
sns.boxplot(data=data, x=xprop, y=yprop, hue='Fuel_Type')
# The y axis carries Price values, not a count, so it is labelled with the column
# name; the title names the grouping so this figure can be told apart from the
# transmission one.
plt.xlabel(xprop, size=14)
plt.ylabel(yprop, size=15)
plt.title('Boxplot of {} by {} and Fuel_Type'.format(yprop, xprop), size=15)
plt.show()

In [None]:
# Notched box plot of Price per Fuel_Type, coloured by Transmission.
fig = px.box(data, x='Fuel_Type',y='Price', color='Transmission', notched=True)
# Lay the legend out horizontally, anchored by its bottom-right corner at (x=1, y=1.02).
fig.update_layout(legend=dict(orientation="h",yanchor="bottom",y=1.02,xanchor="right",x=1))
fig.show()

# Model fitting 

In [None]:
import copy

# Work on independent copies so the cleaned frames stay untouched.
df_train = copy.deepcopy(data)
df_test = copy.deepcopy(test)

# Names of the non-object (numeric) columns of the training data.
cols = np.array(data.columns[data.dtypes != object])

# Every remaining column is treated as categorical and cast to str in both copies.
categorical_columns = [column for column in df_train.columns if column not in cols]
for column in categorical_columns:
    df_train[column] = df_train[column].map(str)
    df_test[column] = df_test[column].map(str)

# Leave only the categorical columns in the copies. The final entry of `cols` is
# skipped for the test copy.
df_train.drop(columns=cols, inplace=True)
df_test.drop(columns=cols[:-1], inplace=True)

In [None]:
from sklearn.preprocessing import LabelEncoder
from collections import defaultdict

# Names of the non-object (numeric) columns of `data`; they are copied back unencoded below.
cols=np.array(data.columns[data.dtypes != object])
# One LabelEncoder per column name, created on first access to d[<column name>].
d = defaultdict(LabelEncoder)

# Fit each column's encoder on the training values, then reuse the same fitted encoder
# (looked up by column name) for the test column. Order matters: the training line must
# run first so the encoders are fitted before transform() is called.
df_train = df_train.apply(lambda x: d[x.name].fit_transform(x))
df_test = df_test.apply(lambda x: d[x.name].transform(x))
# Re-attach the numeric columns from the cleaned frames.
df_train[cols] = data[cols]
# np.delete(cols, len(cols)-1) removes the last entry of cols before copying into the test frame.
# NOTE(review): presumably that last entry is 'Price', which the test frame lacks — this depends on column order; confirm.
df_test[np.delete(cols,len(cols)-1)]=test[np.delete(cols,len(cols)-1)]

In [None]:
# Column order used to build the model matrix; 'Price' is the regression target.
ftrain = ['Cars', 'Location', 'Year', 'Kilometers_Driven', 'Fuel_Type', 'Transmission', 
          'Owner_Type', 'Mileage', 'Engine', 'Power', 'Seats','Price']


def Definedata(frame=None, features=None):
    """Split a prepared frame into a feature matrix and the Price target.

    Parameters
    ----------
    frame : pandas.DataFrame, optional
        Frame holding every column named in `features`. Defaults to the
        notebook-level `df_train`.
    features : list of str, optional
        Columns to select, including 'Price'. Defaults to `ftrain`.

    Returns
    -------
    X : numpy.ndarray
        Values of the selected columns except 'Price', in `features` order.
    y : numpy.ndarray
        The 'Price' values, unchanged.

    Raises
    ------
    KeyError
        If a requested column (or 'Price') is missing from `frame`.
    """
    if frame is None:
        frame = df_train
    if features is None:
        features = ftrain
    selected = frame[features]
    X = selected.drop(columns=['Price']).values
    # Price is a continuous regression target. It was previously run through a
    # LabelEncoder, which replaced every price with the ordinal code of its
    # distinct value, so the regressors learned and predicted codes, not prices.
    y = selected['Price'].values
    return X, y

In [None]:
def Models(models):
    """Fit an estimator on a train split of the prepared data and predict with it.

    Parameters
    ----------
    models : estimator
        Object exposing ``fit(X, y)`` and ``predict(X)``. It is fitted in place.

    Returns
    -------
    tuple
        ``(model, y_pred, y_total)``: the fitted estimator, its predictions for the
        held-out split, and its predictions for the full feature matrix.
    """
    model = models
    X, y = Definedata()
    # Same split parameters as the model-comparison cell (33% held out, fixed seed).
    X_train, X_test, y_train, y_test = train_test_split(X, y, test_size = 0.33, random_state = 25)
    model.fit(X_train,y_train)
    y_pred = model.predict(X_test)
    y_total = model.predict(X)
    # The function previously ended without a return statement, so the fitted
    # model and both prediction arrays were lost to the caller.
    return model, y_pred, y_total
    

def Featureimportances(models, X=None, y=None, feature_names=None):
    """Fit an estimator and tabulate its per-feature importances.

    Parameters
    ----------
    models : estimator
        Object exposing ``fit(X, y)`` and, once fitted, ``feature_importances_``.
    X, y : array-like, optional
        Training data. Default to the notebook-level `X_train` / `y_train`.
    feature_names : sequence of str, optional
        One name per column of `X`. Defaults to the entries of `ftrain` without
        'Price', which is the column order `Definedata` uses for the model matrix.

    Returns
    -------
    pandas.DataFrame
        Columns 'Features', 'Importance' and 'Sum Importance' (running total taken
        in the original feature order), with rows sorted by ascending 'Importance'.
    """
    model = models
    if X is None:
        X = X_train
    if y is None:
        y = y_train
    if feature_names is None:
        # The original code referenced an undefined name (`ftest`) and computed an
        # unused 9-column slice of df_test, whose column order also differs from
        # the model matrix. The names must follow the order the matrix was built in.
        feature_names = [name for name in ftrain if name != 'Price']
    model.fit(X, y)
    importances = model.feature_importances_
    imp = pd.DataFrame({'Features': list(feature_names), 'Importance': importances})
    imp['Sum Importance'] = imp['Importance'].cumsum()
    imp = imp.sort_values(by = 'Importance')
    return imp

In [None]:
# Correlation heatmap of the encoded training frame, lower triangle only.
style.use('ggplot')
fig, ax = plt.subplots(figsize=(15, 5))
corr_matrix = df_train.corr()
# Mask the diagonal and everything above it so each pair is shown once.
mask = np.triu(np.ones(corr_matrix.shape, dtype=bool))
palette = sns.diverging_palette(20, 220, n=200)
sns.heatmap(corr_matrix, cmap=palette, annot=True, mask=mask, center=0, ax=ax);

In [None]:
# Empty results table for the model comparison.
Acc = pd.DataFrame(
    index=None,
    columns=[
        'model',
        'Root Mean Squared  Error',
        'Accuracy on Traing set',
        'Accuracy on Testing set',
    ],
)

In [None]:
#comparing accuracy of different models
X, y = Definedata()
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size = 0.33, random_state = 25)

# Every estimator is seeded so that re-running the notebook reproduces the table.
regressors = [['DecisionTreeRegressor', DecisionTreeRegressor(random_state=25)],
              ['XGBRegressor', XGBRegressor(random_state=25)],
              ['RandomForestRegressor', RandomForestRegressor(random_state=25)],
              ['MLPRegressor', MLPRegressor(random_state=25)],
              ['AdaBoostRegressor', AdaBoostRegressor(random_state=25)],
              ['ExtraTreesRegressor', ExtraTreesRegressor(random_state=25)]]

# Collect one record per model and build the table once at the end. Concatenating
# onto `Acc` inside the loop copied the frame on every iteration, started from an
# empty frame (deprecated concat behaviour in recent pandas) and appended duplicate
# rows whenever the cell was run again.
score_rows = []
for name, model in regressors:
    model.fit(X_train, y_train)
    y_pred = model.predict(X_test)

    RMSE = np.sqrt(metrics.mean_squared_error(y_test, y_pred))
    # For regressors score() is the coefficient of determination (R^2); the
    # 'Accuracy ...' column labels are kept because later cells read them.
    ATrS = model.score(X_train, y_train)
    ATeS = model.score(X_test, y_test)

    score_rows.append({'model': name,
                       'Root Mean Squared  Error': RMSE,
                       'Accuracy on Traing set': ATrS,
                       'Accuracy on Testing set': ATeS})

Acc = pd.DataFrame(score_rows, columns=['model', 'Root Mean Squared  Error',
                                        'Accuracy on Traing set', 'Accuracy on Testing set'])

In [None]:
# Comparison table ordered by test-set score, ascending (highest-scoring model last).
Acc.sort_values(by='Accuracy on Testing set')

In [None]:
#according to the accuracy of models predicting price of used cars
feature1 = ['Cars', 'Location', 'Year', 'Kilometers_Driven', 'Fuel_Type', 'Transmission', 
            'Owner_Type', 'Mileage', 'Engine', 'Power', 'Seats']

# The models are fitted on plain arrays, so predict on a plain array as well;
# passing a DataFrame here triggers feature-name mismatch warnings or errors.
X0 = df_test[feature1].values
X, y = Definedata()
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size = 0.25, random_state = 25)

# Choose the estimator with the highest test-set score from the comparison table.
# The cell used to rely on `model`, the loop variable left over from the
# comparison cell, i.e. whichever regressor happened to be listed last.
best_name = Acc.loc[Acc['Accuracy on Testing set'].astype(float).idxmax(), 'model']
model = dict(regressors)[best_name]
model.fit(X_train,y_train)
y_predicted = model.predict(X0)

# One row per remaining test car, keyed by its index label in `test`.
pdc = pd.DataFrame({'Car_id':test.index,'Price':y_predicted})
pdc.head(5)