In [1]:
import pandas as pd
import numpy as np
from sklearn.cluster import KMeans
from sklearn.preprocessing import LabelEncoder, StandardScaler
from sklearn.metrics.pairwise import cosine_similarity

# location of the parsed outcomes CSV file
# NOTE(review): this is an absolute, machine-specific path, so the notebook
# only runs unchanged where this exact file location exists — consider
# deriving it from a configurable data directory
outcomes_path = "/home/joseph/Documents/XAI-ILDA/datasets/cars/parsed_data_outcomes.csv"

# load the parsed outcomes into a dataframe; index_col=0 makes the first
# CSV column the row index instead of a regular data column
df = pd.read_csv(outcomes_path, index_col=0)
# preview the first rows as a quick sanity check of the load
df.head()

Unnamed: 0,Make,Year,BasePrice,Transmission,Gears,Horsepower,Turbo,Liters/Cylinders,Body,Size,Type,City/Highway,Persons,Color,Images
0,Other,2019,med,Auto,9,vhigh,no,med,SUV,Mid-size,Luxury,vhigh,7,Red,"[""093263_2019_acura_MDX.jpg"", ""092423_2019_acu..."
1,Other,2019,high,Auto,8,high,no,med,Sedan,Compact,Luxury,med,5,White,"[""092540_2019_acura_TLX.jpg"", ""092541_2019_acu..."
2,Audi,2019,med,Auto,7,med,yes,small,SUV,Compact,Luxury,med,5,Gray,"[""092670_2019_audi_Q5.jpg"", ""092669_2019_audi_..."
3,Audi,2019,vhigh,Auto,7,vhigh,yes,small,Sedan,Mid-size,Luxury,med,5,Gray,"[""092382_2019_audi_A6.jpg"", ""092381_2019_audi_..."
4,Audi,2019,high,Auto,7,high,yes,small,Sedan,Sub-compact,Luxury,high,5,Red,"[""092661_2019_audi_A3.jpg"", ""092660_2019_audi_..."


In [2]:
# the 'Images' column is removed from the frame (in place) before encoding
df.drop(columns=['Images'], inplace=True)

In [3]:
# check data types: list each column's dtype to see which columns hold
# strings (dtype 'object') and which are already numeric
df.dtypes

Make                object
Year                 int64
BasePrice           object
Transmission        object
Gears                int64
Horsepower          object
Turbo               object
Liters/Cylinders    object
Body                object
Size                object
Type                object
City/Highway        object
Persons              int64
Color               object
dtype: object

In [4]:
# show (up to five) rows that contain at least one missing value;
# an empty result means no row has a null in any column
df.loc[df.isna().any(axis='columns')].head()

Unnamed: 0,Make,Year,BasePrice,Transmission,Gears,Horsepower,Turbo,Liters/Cylinders,Body,Size,Type,City/Highway,Persons,Color


In [5]:
# Encode the ordinal labels as integers, one lookup table per column.
#
# The previous version passed a single flat dict to df.replace(), which is
# applied to EVERY column of the frame: any other column that happened to
# contain one of the keys (e.g. 'Large', 'no', 'med') would have been silently
# rewritten too. The flat dict literal also listed 'med' twice. Scoping each
# table to the column(s) it belongs to removes both problems.
ordinal_cols = ['BasePrice', 'Horsepower', 'City/Highway']

level_codes = {'vlow': 0, 'low': 1, 'med': 2, 'high': 3, 'vhigh': 4}          # ordinal_cols
volume_codes = {'vsmall': 0, 'small': 1, 'med': 2, 'large': 3, 'vlarge': 4}   # Liters/Cylinders
size_codes = {'Sub-compact': 0, 'Compact': 1, 'Mid-size': 2, 'Large': 3}      # Size
turbo_codes = {'no': 0, 'yes': 1}                                             # Turbo

# Flat union of all tables, kept under its original name (and with the same
# contents as before) in case other cells refer to it.
ordinal_encoding = {**level_codes, **volume_codes, **size_codes, **turbo_codes}

# Column name -> lookup table used for that column.
column_codes = {col: level_codes for col in ordinal_cols}
column_codes.update({
    'Liters/Cylinders': volume_codes,
    'Size': size_codes,
    'Turbo': turbo_codes,
})

for col, codes in column_codes.items():
    # codes.get(value, value) leaves any value without an entry untouched,
    # exactly as replace() did; this also makes re-running the cell on
    # already-encoded integers a harmless no-op.
    # Series.map builds the encoded column directly instead of relying on
    # replace() downcasting an object column, which newer pandas deprecates.
    df[col] = df[col].map(lambda value, codes=codes: codes.get(value, value))
df.head()

Unnamed: 0,Make,Year,BasePrice,Transmission,Gears,Horsepower,Turbo,Liters/Cylinders,Body,Size,Type,City/Highway,Persons,Color
0,Other,2019,2,Auto,9,4,0,2,SUV,2,Luxury,4,7,Red
1,Other,2019,3,Auto,8,3,0,2,Sedan,1,Luxury,2,5,White
2,Audi,2019,2,Auto,7,2,1,1,SUV,1,Luxury,2,5,Gray
3,Audi,2019,4,Auto,7,4,1,1,Sedan,2,Luxury,2,5,Gray
4,Audi,2019,3,Auto,7,3,1,1,Sedan,0,Luxury,3,5,Red


In [6]:
# Integer-encode the remaining (non-ordinal) columns with scikit-learn's
# LabelEncoder: one freshly fitted encoder per column.
non_ordinal_cols = ['Make','Year','Transmission','Gears','Body','Type','Persons','Color']

# encoders[i] / labels[i] belong to non_ordinal_cols[i]; the fitted encoders
# are kept so the integer codes can be related back to the original values.
encoders, labels = [], []
for col in non_ordinal_cols:
    encoders.append(LabelEncoder())
    label = encoders[-1].fit_transform(df[col])
    labels.append(label)
    # overwrite the column with its integer codes
    df[col] = label
df.head()

Unnamed: 0,Make,Year,BasePrice,Transmission,Gears,Horsepower,Turbo,Liters/Cylinders,Body,Size,Type,City/Highway,Persons,Color
0,8,1,2,0,5,4,0,2,4,2,3,4,5,5
1,8,1,3,0,4,3,0,2,5,1,3,2,3,7
2,0,1,2,0,3,2,1,1,4,1,3,2,3,2
3,0,1,4,0,3,4,1,1,5,2,3,2,3,2
4,0,1,3,0,3,3,1,1,5,0,3,3,3,5


In [7]:
# 'Year' and 'Color' are treated as noisy attributes for now and removed
# from the frame (in place)
df.drop(columns=['Year', 'Color'], inplace=True)

In [12]:
# Pairwise cosine similarity between every pair of encoded rows; wrapping the
# result in a DataFrame labels rows/columns by position (0..n-1).
similarity = pd.DataFrame(cosine_similarity(df))

# Similarities of row 47 to all rows, least similar first (its similarity to
# itself, 1.0, therefore comes last).
similarity.loc[47].sort_values(ascending=True)

31    0.653730
17    0.664654
16    0.668854
24    0.682750
2     0.698163
12    0.715128
66    0.724051
3     0.727892
4     0.730423
6     0.736179
38    0.736207
13    0.741305
23    0.743345
7     0.743435
20    0.744387
57    0.749510
14    0.753403
22    0.754617
26    0.756460
15    0.757853
5     0.759078
32    0.772056
27    0.777579
30    0.789016
28    0.789377
9     0.803863
45    0.807609
44    0.818008
19    0.824483
21    0.824747
        ...   
67    0.877447
40    0.881969
39    0.881995
11    0.884281
56    0.893722
68    0.893905
53    0.896377
55    0.903694
10    0.903694
63    0.906544
8     0.907583
0     0.907583
34    0.915631
69    0.920461
18    0.922118
1     0.923524
70    0.924102
35    0.927623
52    0.930604
54    0.932412
29    0.945373
64    0.949399
33    0.952040
46    0.954201
43    0.955237
62    0.961939
61    0.974826
41    0.996538
60    0.997231
47    1.000000
Name: 47, Length: 71, dtype: float64