In [1]:
# importing libraries
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
import numpy as np

In [2]:
# Load the cleaned used-cars dataset from a CSV file in the working directory.
# NOTE(review): the file name carries no ".csv" extension — confirm it matches the file on disk.
df = pd.read_csv(r"Clean_Used_Cars")

In [3]:
# Keep an untouched copy of the loaded frame; later cells print its value
# shares next to the encoded / decoded results for comparison.
df_test = df.copy()

In [4]:
# Dataset dimensions as (rows, columns)
df.shape

(359724, 19)

In [5]:
# Column dtypes and non-null counts of the freshly loaded frame
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 359724 entries, 0 to 359723
Data columns (total 19 columns):
 #   Column        Non-Null Count   Dtype  
---  ------        --------------   -----  
 0   region        359724 non-null  object 
 1   price         359724 non-null  int64  
 2   year          359724 non-null  float64
 3   manufacturer  359724 non-null  object 
 4   model         359724 non-null  object 
 5   condition     227705 non-null  object 
 6   cylinders     359724 non-null  object 
 7   fuel          359724 non-null  object 
 8   odometer      359724 non-null  float64
 9   title_status  359724 non-null  object 
 10  transmission  359724 non-null  object 
 11  VIN           359724 non-null  int64  
 12  drive         359724 non-null  object 
 13  size          359724 non-null  object 
 14  type          359724 non-null  object 
 15  paint_color   359724 non-null  object 
 16  state         359724 non-null  object 
 17  lat           359724 non-null  float64
 18  long

In [6]:
# Missing values per column
df.isna().sum()

region               0
price                0
year                 0
manufacturer         0
model                0
condition       132019
cylinders            0
fuel                 0
odometer             0
title_status         0
transmission         0
VIN                  0
drive                0
size                 0
type                 0
paint_color          0
state                0
lat                  0
long                 0
dtype: int64

In [7]:
# Number of distinct values in each column
df.nunique()

region            404
price           14072
year              110
manufacturer       42
model           26093
condition           6
cylinders           8
fuel                5
odometer        93504
title_status        6
transmission        3
VIN                 2
drive               3
size                4
type               13
paint_color        12
state              51
lat             48854
long            49383
dtype: int64

In [8]:
# Column names, in frame order
df.columns

Index(['region', 'price', 'year', 'manufacturer', 'model', 'condition',
       'cylinders', 'fuel', 'odometer', 'title_status', 'transmission', 'VIN',
       'drive', 'size', 'type', 'paint_color', 'state', 'lat', 'long'],
      dtype='object')

In [9]:
# Downcast columns to smaller dtypes to reduce the frame's memory footprint.

# Label / text columns, stored as pandas categoricals
categories = [
    'region', 'manufacturer', 'model', 'condition',
    'cylinders', 'fuel', 'title_status', 'transmission',
    'drive', 'size', 'type', 'paint_color', 'state',
]

# Numeric measurements, stored as 32-bit floats
numeric = ['price', 'year', 'odometer', 'lat', 'long']

# One target dtype per column, applied in a single astype call
dtype_by_column = {'VIN': 'int32'}
dtype_by_column.update(dict.fromkeys(categories, 'category'))
dtype_by_column.update(dict.fromkeys(numeric, 'float32'))

df = df.astype(dtype_by_column)

df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 359724 entries, 0 to 359723
Data columns (total 19 columns):
 #   Column        Non-Null Count   Dtype   
---  ------        --------------   -----   
 0   region        359724 non-null  category
 1   price         359724 non-null  float32 
 2   year          359724 non-null  float32 
 3   manufacturer  359724 non-null  category
 4   model         359724 non-null  category
 5   condition     227705 non-null  category
 6   cylinders     359724 non-null  category
 7   fuel          359724 non-null  category
 8   odometer      359724 non-null  float32 
 9   title_status  359724 non-null  category
 10  transmission  359724 non-null  category
 11  VIN           359724 non-null  int32   
 12  drive         359724 non-null  category
 13  size          359724 non-null  category
 14  type          359724 non-null  category
 15  paint_color   359724 non-null  category
 16  state         359724 non-null  category
 17  lat           359724 non-null

In [10]:
# First rows of the frame before any encoding
df.head()

Unnamed: 0,region,price,year,manufacturer,model,condition,cylinders,fuel,odometer,title_status,transmission,VIN,drive,size,type,paint_color,state,lat,long
0,las vegas,7900.0,2002.0,acura,2002 rsx type s,good,4 cylinders,gas,95861.0,clean,manual,0,fwd,compact,coupe,black,nv,36.197292,-115.264389
1,norfolk / hampton roads,3500.0,1997.0,acura,3.0cl,excellent,6 cylinders,gas,190000.0,clean,automatic,0,fwd,compact,coupe,silver,va,36.858501,-76.0019
2,richmond,1000.0,1997.0,acura,3.0cl,fair,6 cylinders,gas,300000.0,clean,automatic,0,fwd,compact,coupe,red,va,37.1819,-77.385399
3,philadelphia,9999.0,2003.0,acura,3.2 cl type s,excellent,6 cylinders,gas,136371.0,clean,manual,1,fwd,full-size,coupe,grey,pa,40.017799,-75.0895
4,philadelphia,9999.0,2003.0,acura,3.2 cl type s,excellent,6 cylinders,gas,136371.0,clean,manual,1,fwd,full-size,coupe,grey,pa,40.017799,-75.0895


In [11]:
# Categorical columns that get label-encoded, in the order they are processed
columns_to_encode = [
    'condition', 'cylinders', 'fuel', 'title_status', 'transmission',
    'drive', 'size', 'type', 'region', 'model', 'paint_color',
    'manufacturer', 'state',
]

In [12]:
# Three parallel lists that together describe the encoding table
unique_values = []   # original label
encoded_values = []  # integer code assigned to that label
columns = []         # column the label belongs to

for column in columns_to_encode:
    # distinct non-null labels of the current column
    labels = df[column].dropna().unique()

    # the position of a label inside `labels` becomes its code
    for code, label in enumerate(labels):
        unique_values.append(label)
        encoded_values.append(code)
        columns.append(column)


In [15]:
# Lookup table: one row per (column, label) pair together with its integer code
df_encoded = pd.DataFrame(
    {
        'value': unique_values,
        'encoded': encoded_values,
        'column_name': columns,
    }
)

In [16]:
# First rows of the label -> code lookup table
df_encoded.head(5)

Unnamed: 0,value,encoded,column_name
0,good,0,condition
1,excellent,1,condition
2,fair,2,condition
3,like new,3,condition
4,new,4,condition


In [20]:
# Sanity check: the lookup table should hold one row per distinct label,
# so both printed numbers are expected to match
expected_rows = df[columns_to_encode].nunique().sum()
print(expected_rows)
print(len(df_encoded))

26650
26650


In [22]:
# Labels recorded in the lookup table for the 'size' column
size_rows = df_encoded['column_name'] == 'size'
list(df_encoded.loc[size_rows, 'value'].unique())

['compact', 'full-size', 'mid-size', 'sub-compact']

In [23]:
# Integer codes recorded in the lookup table for the 'size' column
size_rows = df_encoded['column_name'] == 'size'
list(df_encoded.loc[size_rows, 'encoded'].unique())

[0, 1, 2, 3]

In [24]:
# Replace every label with its integer code, one column at a time.
for column in columns_to_encode:
    # rows of the lookup table that belong to the current column
    lookup = df_encoded[df_encoded['column_name'] == column]

    # label -> code dictionary for the current column
    label_to_code = dict(zip(lookup['value'], lookup['encoded']))

    # Renaming the categories only touches the category index, whereas
    # Series.replace with two parallel lists compares the data once per label
    # (tens of thousands of labels for 'model') and is deprecated for
    # categorical data in recent pandas releases. Labels that have no entry in
    # the dictionary are passed through unchanged, so re-running this cell on
    # already encoded data is a no-op. Missing values stay missing.
    # astype('category') is a no-op for columns that are already categorical.
    df[column] = df[column].astype('category').cat.rename_categories(label_to_code)
        
        

In [25]:
# For every encoded column, print the value shares (in percent) of the encoded
# frame followed by those of the untouched copy
for column in columns_to_encode:
    for frame in (df, df_test):
        shares = frame[column].value_counts(normalize = True) * 100
        print(shares, '\n')

condition
0    50.591774
1    38.184054
3     8.198766
2     2.447904
4     0.401836
5     0.175666
Name: proportion, dtype: float64 

condition
good         50.591774
excellent    38.184054
like new      8.198766
fair          2.447904
new           0.401836
salvage       0.175666
Name: proportion, dtype: float64 

cylinders
1    36.390399
0    35.624534
4    24.775105
3     0.936551
2     0.916814
6     0.772259
7     0.496492
5     0.087845
Name: proportion, dtype: float64 

cylinders
6 cylinders     36.390399
4 cylinders     35.624534
8 cylinders     24.775105
other            0.936551
5 cylinders      0.916814
10 cylinders     0.772259
3 cylinders      0.496492
12 cylinders     0.087845
Name: proportion, dtype: float64 

fuel
0    84.950962
2     7.322280
3     5.987646
1     1.306001
4     0.433110
Name: proportion, dtype: float64 

fuel
gas         84.950962
other        7.322280
diesel       5.987646
hybrid       1.306001
electric     0.433110
Name: proportion, dtype: float64 


In [26]:
# Preview after encoding: the categorical columns now hold integer codes
df.head()

Unnamed: 0,region,price,year,manufacturer,model,condition,cylinders,fuel,odometer,title_status,transmission,VIN,drive,size,type,paint_color,state,lat,long
0,0,7900.0,2002.0,0,0,0,0,0,95861.0,0,0,0,0,0,0,0,0,36.197292,-115.264389
1,1,3500.0,1997.0,0,1,1,1,0,190000.0,0,1,0,0,0,0,1,1,36.858501,-76.0019
2,2,1000.0,1997.0,0,1,2,1,0,300000.0,0,1,0,0,0,0,2,1,37.1819,-77.385399
3,3,9999.0,2003.0,0,2,1,1,0,136371.0,0,0,1,0,1,0,3,2,40.017799,-75.0895
4,3,9999.0,2003.0,0,2,1,1,0,136371.0,0,0,1,0,1,0,3,2,40.017799,-75.0895


# Preprocessing 

In [27]:
# Labelled rows: 'condition' is known, so these rows are used to fit and
# evaluate the classifier
has_condition = df['condition'].notna()
df_classified = df[has_condition]

y = df_classified['condition']                  # target
x = df_classified.drop(columns = 'condition')   # features

In [28]:
# Unlabelled rows: 'condition' is missing here and gets predicted further down
missing_condition = df['condition'].isna()
x_unclassified = df[missing_condition].drop(columns = 'condition')

In [29]:
# Hold out 20% of the labelled rows to evaluate the classifier.
from sklearn.model_selection import train_test_split

# random_state pins the shuffle so the split, and the accuracy computed from
# it, is the same on every run. stratify keeps the share of each condition
# class equal in both parts; the rarest classes make up well under 1% of the
# labelled rows, so a plain random split could leave them badly represented.
x_train, x_test, y_train, y_test = train_test_split(
    x, y, test_size = .2, random_state = 42, stratify = y
)

In [30]:
from sklearn.preprocessing import StandardScaler

# Learn the scaling parameters from the training part only, then apply the
# same transformation to both parts
sc = StandardScaler()
sc.fit(x_train)

x_train = sc.transform(x_train)
x_test = sc.transform(x_test)

# Training the model

In [31]:
from sklearn.ensemble import RandomForestClassifier

# A fixed random_state makes the fitted forest, and therefore the accuracy
# measured on the held-out part, reproducible between runs.
rfc = RandomForestClassifier(random_state = 42)
rfc.fit(x_train, y_train)

In [32]:
# Accuracy of the forest on the held-out part
from sklearn.metrics import accuracy_score

rfc_pred = rfc.predict(x_test)

# accuracy_score is documented as (y_true, y_pred): ground truth goes first
accuracy_score(y_test, rfc_pred)

0.8228629147361718

# Refitting the model on all labelled rows to predict the missing 'condition' values

In [33]:
# Second forest, fitted on every labelled row (features as stored in x, not
# the scaled train/test arrays). random_state makes the predictions
# reproducible between runs.
rfc2 = RandomForestClassifier(random_state = 42)

# A later cell joins 'condition' back onto x; dropping it here (when present)
# keeps the target out of the features if this cell is re-run afterwards.
rfc2.fit(x.drop(columns = 'condition', errors = 'ignore'), y)

In [34]:
# Predict the missing 'condition' codes for the unlabelled rows.
# 'condition' is dropped when present so the cell still works if it is re-run
# after the predictions have been attached to x_unclassified.
result = rfc2.predict(x_unclassified.drop(columns = 'condition', errors = 'ignore'))
result

array([2, 1, 1, ..., 1, 1, 1], dtype=int64)

# Combining the data again

In [35]:
# Attach the predicted codes to the unlabelled rows as their 'condition' column
x_unclassified = x_unclassified.assign(condition = result)

In [36]:
# Preview of the unlabelled rows, now carrying the predicted 'condition' code
x_unclassified.head()

Unnamed: 0,region,price,year,manufacturer,model,cylinders,fuel,odometer,title_status,transmission,VIN,drive,size,type,paint_color,state,lat,long,condition
15,13,2300.0,2004.0,0,3,1,0,187000.0,0,1,0,0,2,1,5,12,39.766499,-105.020401,2
20,17,3000.0,2000.0,0,3,1,0,203000.0,0,1,0,0,2,1,4,6,38.553902,-121.369301,1
21,18,6750.0,2006.0,0,3,1,0,118624.0,0,1,0,0,2,1,4,4,26.13859,-81.759537,1
22,19,4500.0,2004.0,0,3,1,0,190000.0,0,1,0,0,2,1,4,3,33.553101,-84.400299,0
23,20,5500.0,2002.0,0,4,1,0,150195.0,0,1,1,0,2,1,4,2,39.903999,-76.0401,1


In [37]:
# Re-attach the original 'condition' labels to the labelled feature rows.
# The guard keeps the cell re-runnable: joining a second time would raise
# because 'condition' would already be a column of x.
if 'condition' not in x.columns:
    x = x.join(y)
x.head()

Unnamed: 0,region,price,year,manufacturer,model,cylinders,fuel,odometer,title_status,transmission,VIN,drive,size,type,paint_color,state,lat,long,condition
0,0,7900.0,2002.0,0,0,0,0,95861.0,0,0,0,0,0,0,0,0,36.197292,-115.264389,0
1,1,3500.0,1997.0,0,1,1,0,190000.0,0,1,0,0,0,0,1,1,36.858501,-76.0019,1
2,2,1000.0,1997.0,0,1,1,0,300000.0,0,1,0,0,0,0,2,1,37.1819,-77.385399,2
3,3,9999.0,2003.0,0,2,1,0,136371.0,0,0,1,0,1,0,3,2,40.017799,-75.0895,1
4,3,9999.0,2003.0,0,2,1,0,136371.0,0,0,1,0,1,0,3,2,40.017799,-75.0895,1


In [38]:
# Stack the labelled and the predicted rows into one frame. Both parts keep
# the row labels of the original frame, so sorting by index puts the rows back
# in their original order instead of "all labelled rows first".
new_df = pd.concat([x, x_unclassified]).sort_index()

In [39]:
# First rows of the combined frame (values are still integer codes here)
new_df.head()

Unnamed: 0,region,price,year,manufacturer,model,cylinders,fuel,odometer,title_status,transmission,VIN,drive,size,type,paint_color,state,lat,long,condition
0,0,7900.0,2002.0,0,0,0,0,95861.0,0,0,0,0,0,0,0,0,36.197292,-115.264389,0
1,1,3500.0,1997.0,0,1,1,0,190000.0,0,1,0,0,0,0,1,1,36.858501,-76.0019,1
2,2,1000.0,1997.0,0,1,1,0,300000.0,0,1,0,0,0,0,2,1,37.1819,-77.385399,2
3,3,9999.0,2003.0,0,2,1,0,136371.0,0,0,1,0,1,0,3,2,40.017799,-75.0895,1
4,3,9999.0,2003.0,0,2,1,0,136371.0,0,0,1,0,1,0,3,2,40.017799,-75.0895,1


In [41]:
# Translate the integer codes in every encoded column back to their labels.
for column in columns_to_encode:
    # rows of the lookup table that belong to the current column
    lookup = df_encoded[df_encoded['column_name'] == column]

    # code -> label dictionary (the encoding mapping, reversed)
    code_to_label = dict(zip(lookup['encoded'], lookup['value']))

    # A dictionary lookup per value replaces Series.replace with two long
    # parallel lists, which is slow for columns with tens of thousands of
    # labels and deprecated for categorical data in recent pandas releases.
    # Values without an entry (missing values, or labels already decoded by an
    # earlier run of this cell) are passed through unchanged.
    new_df[column] = new_df[column].map(lambda v: code_to_label.get(v, v))

In [42]:
# Print the 'condition' shares (in percent) of the completed frame, then of
# the untouched copy, for comparison
for frame in (new_df, df_test):
    print(frame['condition'].value_counts(normalize = True) * 100, '\n')

condition
excellent    50.722220
good         40.171354
like new      6.945047
fair          1.714926
new           0.334423
salvage       0.112030
Name: proportion, dtype: float64 

condition
good         50.591774
excellent    38.184054
like new      8.198766
fair          2.447904
new           0.401836
salvage       0.175666
Name: proportion, dtype: float64 



In [43]:
# Missing values per column in the completed frame
new_df.isna().sum()

region          0
price           0
year            0
manufacturer    0
model           0
cylinders       0
fuel            0
odometer        0
title_status    0
transmission    0
VIN             0
drive           0
size            0
type            0
paint_color     0
state           0
lat             0
long            0
condition       0
dtype: int64

In [44]:
# Write the completed frame to disk as CSV, without the index column.
# NOTE(review): the target name has no ".csv" extension — confirm this is intended.
new_df.to_csv('Used_Cars_Final_Results', index = False)