In [660]:
import pandas as pd
import numpy as np
from sklearn.preprocessing import LabelEncoder

df = pd.read_csv('./input/cars_raw.csv')
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 9379 entries, 0 to 9378
Data columns (total 32 columns):
 #   Column                 Non-Null Count  Dtype  
---  ------                 --------------  -----  
 0   Year                   9379 non-null   int64  
 1   Make                   9379 non-null   object 
 2   Model                  9379 non-null   object 
 3   Used/New               9379 non-null   object 
 4   Price                  9379 non-null   object 
 5   ConsumerRating         9379 non-null   float64
 6   ConsumerReviews        9379 non-null   int64  
 7   SellerType             9379 non-null   object 
 8   SellerName             9379 non-null   object 
 9   SellerRating           9379 non-null   float64
 10  SellerReviews          9379 non-null   int64  
 11  StreetName             9379 non-null   object 
 12  State                  9379 non-null   object 
 13  Zipcode                9379 non-null   object 
 14  DealType               9157 non-null   object 
 15  Comf

In [661]:
print(df.shape)
df = df[df['Price'] != 'Not Priced' ]
df['Price'] = [price.replace('$','') for price in df['Price']]
df['Price'] = [int(price.replace(',','')) for price in df['Price']]

(9379, 32)


In [662]:
df['Used/New'] = ['Certified Pre-Owned' if 'certified' in item.casefold()
                  else 'Used' for item in df['Used/New']]

In [663]:
df['Used/New'].unique()

array(['Used', 'Certified Pre-Owned'], dtype=object)

In [664]:
df['Drivetrain'].unique()

array(['Front-wheel Drive', 'Four-wheel Drive', 'Rear-wheel Drive',
       'All-wheel Drive', '4WD', 'AWD', 'RWD', 'FWD', 'Front Wheel Drive',
       '–'], dtype=object)

In [665]:
df = df[df['Drivetrain'] != '–']

In [666]:
df['Drivetrain'] = ['4WD' if 'four' in item.casefold()
                   else 'AWD' if 'all' in item.casefold()
                   else 'RWD' if 'rear' in item.casefold()
                   else 'FWD' for item in df['Drivetrain']]

In [667]:
df['Drivetrain'].unique()

array(['FWD', '4WD', 'RWD', 'AWD'], dtype=object)

In [668]:

df = df[df['FuelType'] != '–']

df['FuelType'].unique()


array(['Gasoline', 'Gasoline Fuel', 'Electric Fuel System',
       'E85 Flex Fuel', 'Electric', 'Hybrid', 'Plug-In Electric/Gas',
       'Flex Fuel Capability', 'Diesel', 'Diesel Fuel',
       'Gasoline/Mild Electric Hybrid', 'Flexible Fuel'], dtype=object)

In [669]:
used_new_ec = LabelEncoder()
SellerType_ec = LabelEncoder()
df['Used/New'] = used_new_ec.fit_transform(df['Used/New'])
df['SellerType'] = SellerType_ec.fit_transform(df['SellerType'])

In [670]:
print(df['SellerType'].unique())
print(df['Used/New'].unique())
print(used_new_ec.inverse_transform([0,1]))
print(SellerType_ec.inverse_transform([0,1]))

[0 1]
[1 0]
['Certified Pre-Owned' 'Used']
['Dealer' 'Private']


In [671]:
df['FuelType'].unique()

df['FuelType'] = ['Hybrid' if 'hybrid' in item.casefold()
                 else 'Hybrid' if 'plug' in item.casefold()
                 else 'Flex' if 'flex' in item.casefold()
                 else 'Electric' if 'electric' in item.casefold()
                 else 'Diesel' if 'diesel' in item.casefold()
                 else 'Gas' for item in df['FuelType']]

In [672]:
print(df['DealType'].isnull().sum())
df = df[df['DealType'].isnull() == False]

df['Transmission'].unique()
df['Transmission'] = ['Automatic' if 'auto' in item.casefold() else 
                        'Manual' if 'manual' in item.casefold()
                        else 'Other' for item in df['Transmission']]
df['Transmission'].unique()

221


array(['Automatic', 'Other', 'Manual'], dtype=object)

In [673]:
make = pd.get_dummies(df['Make'])
drivetrain = pd.get_dummies(df['Drivetrain'])
fuelType = pd.get_dummies(df['FuelType'])

In [674]:
column_to_drop = ['Make','Drivetrain', 'FuelType', 'Transmission','SellerName'
,'StreetName','ExteriorColor','InteriorColor','Engine','VIN','Stock#','Model','State','MaxMPG','MinMPG','Zipcode']

In [675]:
df.drop(column_to_drop, axis=1, inplace=True)
pd.concat([df,make,drivetrain,fuelType], axis=1)

Unnamed: 0,Year,Used/New,Price,ConsumerRating,ConsumerReviews,SellerType,SellerRating,SellerReviews,DealType,ComfortRating,...,Volvo,4WD,AWD,FWD,RWD,Diesel,Electric,Flex,Gas,Hybrid
0,2019,1,39998,4.6,45,0,3.3,3,Great,4.7,...,0,0,0,1,0,0,0,0,1,0
1,2018,1,49985,4.8,817,0,4.8,131,Good,4.9,...,0,1,0,0,0,0,0,0,1,0
2,2017,1,41860,4.7,495,0,4.6,249,Good,4.8,...,0,1,0,0,0,0,0,0,1,0
4,2020,1,49000,4.8,76,0,4.8,4755,Good,4.9,...,0,0,0,1,0,0,0,0,1,0
5,2012,1,23541,4.7,34,0,4.4,1071,Fair,4.7,...,0,0,0,0,1,0,0,0,1,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
9374,2019,1,27374,4.7,205,0,4.4,443,Good,4.7,...,0,0,1,0,0,0,0,0,1,0
9375,2019,1,61998,4.8,27,0,4.8,1789,Fair,4.9,...,0,0,1,0,0,0,0,0,0,1
9376,2017,1,26944,4.8,137,0,4.7,831,Good,4.9,...,0,0,1,0,0,0,0,0,1,0
9377,2019,1,28568,4.7,279,0,4.4,680,Good,4.8,...,0,0,1,0,0,0,0,0,1,0


In [676]:
df.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 9116 entries, 0 to 9378
Data columns (total 16 columns):
 #   Column                 Non-Null Count  Dtype  
---  ------                 --------------  -----  
 0   Year                   9116 non-null   int64  
 1   Used/New               9116 non-null   int32  
 2   Price                  9116 non-null   int64  
 3   ConsumerRating         9116 non-null   float64
 4   ConsumerReviews        9116 non-null   int64  
 5   SellerType             9116 non-null   int32  
 6   SellerRating           9116 non-null   float64
 7   SellerReviews          9116 non-null   int64  
 8   DealType               9116 non-null   object 
 9   ComfortRating          9116 non-null   float64
 10  InteriorDesignRating   9116 non-null   float64
 11  PerformanceRating      9116 non-null   float64
 12  ValueForMoneyRating    9116 non-null   float64
 13  ExteriorStylingRating  9116 non-null   float64
 14  ReliabilityRating      9116 non-null   float64
 15  Mile

In [677]:
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import MinMaxScaler, StandardScaler
from sklearn.metrics import classification_report, accuracy_score,confusion_matrix


In [678]:
print(df.head())
X = np.array(df.drop('DealType', axis=1))
y = np.array(df['Price'])
print(y)


   Year  Used/New  Price  ConsumerRating  ConsumerReviews  SellerType  \
0  2019         1  39998             4.6               45           0   
1  2018         1  49985             4.8              817           0   
2  2017         1  41860             4.7              495           0   
4  2020         1  49000             4.8               76           0   
5  2012         1  23541             4.7               34           0   

   SellerRating  SellerReviews DealType  ComfortRating  InteriorDesignRating  \
0           3.3              3    Great            4.7                   4.6   
1           4.8            131     Good            4.9                   4.8   
2           4.6            249     Good            4.8                   4.7   
4           4.8           4755     Good            4.9                   4.8   
5           4.4           1071     Fair            4.7                   4.6   

   PerformanceRating  ValueForMoneyRating  ExteriorStylingRating  \
0           

In [683]:
from sklearn.linear_model import LinearRegression

X_train, X_test, y_train, y_test = train_test_split(X,y)
print(X_train[:5])


model = LinearRegression()


model.fit(X_train, y_train)

model.score(X_train, y_train)



[[2.0190e+03 1.0000e+00 4.2500e+04 4.7000e+00 6.9000e+01 0.0000e+00
  4.6000e+00 5.8000e+02 4.9000e+00 4.7000e+00 4.8000e+00 4.6000e+00
  4.8000e+00 4.6000e+00 1.9060e+04]
 [2.0160e+03 1.0000e+00 2.4995e+04 4.8000e+00 7.3000e+01 0.0000e+00
  4.5000e+00 1.1000e+02 4.8000e+00 4.8000e+00 4.8000e+00 4.5000e+00
  4.8000e+00 4.7000e+00 8.5748e+04]
 [2.0190e+03 1.0000e+00 2.3988e+04 4.7000e+00 2.7900e+02 0.0000e+00
  4.7000e+00 1.9970e+03 4.8000e+00 4.7000e+00 4.6000e+00 4.7000e+00
  4.7000e+00 4.8000e+00 5.7527e+04]
 [2.0210e+03 1.0000e+00 4.7998e+04 4.6000e+00 1.6000e+01 0.0000e+00
  4.0000e+00 4.0000e+00 4.6000e+00 4.4000e+00 4.7000e+00 4.2000e+00
  4.5000e+00 4.7000e+00 1.3923e+04]
 [2.0210e+03 1.0000e+00 3.2940e+04 4.8000e+00 2.0000e+01 0.0000e+00
  4.9000e+00 8.2100e+02 4.8000e+00 4.8000e+00 4.6000e+00 4.6000e+00
  4.9000e+00 4.8000e+00 3.9164e+04]]


1.0

In [684]:
y_pred = model.predict(X_test)

In [686]:
from sklearn.metrics import accuracy_score, confusion_matrix, r2_score
from sklearn.metrics import mean_absolute_error

r2_score(y_test, y_pred)


1.0