In [1]:
import pandas as pd
import numpy as np
from sklearn.cluster import KMeans
from sklearn.preprocessing import LabelEncoder
from sklearn.metrics.pairwise import cosine_similarity

# the parsed outcomes files
outcomes_path = "/home/joseph/Documents/XAI-ILDA/datasets/cars/parsed_data_outcomes.csv"

# load the parsed outcomes into a dataframe 
df = pd.read_csv(outcomes_path, index_col=0)
df.head()

Unnamed: 0,Make,Year,BasePrice,Transmission,Gears,Horsepower,Turbo,Liters/Cylinders,Body,Size,Type,City/Highway,Persons,Color,Images
0,Other,2019,med,Auto,9,vhigh,no,med,SUV,Mid-size,Luxury,vhigh,7,Red,"[""093263_2019_acura_MDX.jpg"", ""092423_2019_acu..."
1,Other,2019,high,Auto,8,high,no,med,Sedan,Compact,Luxury,med,5,White,"[""092540_2019_acura_TLX.jpg"", ""092541_2019_acu..."
2,Audi,2019,med,Auto,7,med,yes,small,SUV,Compact,Luxury,med,5,Gray,"[""092670_2019_audi_Q5.jpg"", ""092669_2019_audi_..."
3,Audi,2019,vhigh,Auto,7,vhigh,yes,small,Sedan,Mid-size,Luxury,med,5,Gray,"[""092382_2019_audi_A6.jpg"", ""092381_2019_audi_..."
4,Audi,2019,high,Auto,7,high,yes,small,Sedan,Sub-compact,Luxury,high,5,Red,"[""092661_2019_audi_A3.jpg"", ""092660_2019_audi_..."


In [2]:
# drop images column
df.drop('Images', axis=1, inplace=True)

In [3]:
# check data types
df.dtypes

Make                object
Year                 int64
BasePrice           object
Transmission        object
Gears                int64
Horsepower          object
Turbo               object
Liters/Cylinders    object
Body                object
Size                object
Type                object
City/Highway        object
Persons              int64
Color               object
dtype: object

In [4]:
# check for null values
df[df.isnull().any(axis=1)].head()

Unnamed: 0,Make,Year,BasePrice,Transmission,Gears,Horsepower,Turbo,Liters/Cylinders,Body,Size,Type,City/Highway,Persons,Color


In [5]:
# encode the ordinal labels using 'find and replace'
ordinal_cols = ['BasePrice','Horsepower','City/Highway']
ordinal_encoding = {'vlow':0,'low':1,'med':2,'high':3,'vhigh':4, # ordinal_cols
            'vsmall':0,'small':1,'med':2,'large':3,'vlarge':4, # Liters/Cylinders
            'Sub-compact':0,'Compact':1,'Mid-size':2,'Large':3, # Size
            'no':0, 'yes':1, } # Turbo
df.replace(ordinal_encoding, inplace=True)
df.head()

Unnamed: 0,Make,Year,BasePrice,Transmission,Gears,Horsepower,Turbo,Liters/Cylinders,Body,Size,Type,City/Highway,Persons,Color
0,Other,2019,2,Auto,9,4,0,2,SUV,2,Luxury,4,7,Red
1,Other,2019,3,Auto,8,3,0,2,Sedan,1,Luxury,2,5,White
2,Audi,2019,2,Auto,7,2,1,1,SUV,1,Luxury,2,5,Gray
3,Audi,2019,4,Auto,7,4,1,1,Sedan,2,Luxury,2,5,Gray
4,Audi,2019,3,Auto,7,3,1,1,Sedan,0,Luxury,3,5,Red


In [6]:
# encode the non-ordinal labels using scikit-learn's label encoder
non_ordinal_cols = ['Make','Year','Transmission','Gears','Body','Type','Persons','Color']
encoders = [LabelEncoder() for _ in non_ordinal_cols]
labels = [enc.fit_transform(df[col]) for enc,col in zip(encoders,non_ordinal_cols)] 
for col, label in zip(non_ordinal_cols, labels): df[col] = label
df.head()

Unnamed: 0,Make,Year,BasePrice,Transmission,Gears,Horsepower,Turbo,Liters/Cylinders,Body,Size,Type,City/Highway,Persons,Color
0,8,1,2,0,5,4,0,2,4,2,3,4,5,5
1,8,1,3,0,4,3,0,2,5,1,3,2,3,7
2,0,1,2,0,3,2,1,1,4,1,3,2,3,2
3,0,1,4,0,3,4,1,1,5,2,3,2,3,2
4,0,1,3,0,3,3,1,1,5,0,3,3,3,5


In [None]:
# drop noisy attributes for now
df.drop(['Make','Year','Transmission','Color','Gears'])

In [7]:
# now we have some similarities between objects
pd.DataFrame(cosine_similarity(df))

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,61,62,63,64,65,66,67,68,69,70
0,1.000000,0.958667,0.784330,0.750815,0.782386,0.827496,0.702151,0.774812,0.978700,0.897885,...,0.920512,0.929691,0.928032,0.954956,0.921126,0.821921,0.933546,0.932962,0.947023,0.936103
1,0.958667,1.000000,0.748331,0.724882,0.799796,0.798475,0.675136,0.705015,0.902494,0.922722,...,0.957314,0.950379,0.898087,0.976206,0.923631,0.883457,0.954320,0.914219,0.922806,0.881669
2,0.784330,0.748331,1.000000,0.962334,0.950019,0.986013,0.886360,0.945992,0.771845,0.709538,...,0.688089,0.661095,0.766273,0.723827,0.630484,0.429298,0.730095,0.734011,0.746447,0.747139
3,0.750815,0.724882,0.962334,1.000000,0.923026,0.943880,0.848485,0.903095,0.747659,0.669555,...,0.695795,0.675405,0.737125,0.715305,0.571150,0.395148,0.751007,0.737939,0.748347,0.764116
4,0.782386,0.799796,0.950019,0.923026,1.000000,0.929526,0.825866,0.844879,0.719166,0.697155,...,0.773389,0.720862,0.700387,0.787521,0.614981,0.506785,0.722387,0.678955,0.719828,0.698246
5,0.827496,0.798475,0.986013,0.943880,0.929526,1.000000,0.876460,0.963343,0.823564,0.767869,...,0.731022,0.720464,0.840000,0.766965,0.711228,0.498184,0.784293,0.794137,0.805629,0.799024
6,0.702151,0.675136,0.886360,0.848485,0.825866,0.876460,1.000000,0.865981,0.718339,0.641944,...,0.626216,0.619892,0.746115,0.689450,0.571150,0.403929,0.651384,0.737939,0.740300,0.748834
7,0.774812,0.705015,0.945992,0.903095,0.844879,0.963343,0.865981,1.000000,0.816941,0.684771,...,0.605989,0.657226,0.836732,0.696631,0.636870,0.397918,0.732080,0.766254,0.798272,0.804829
8,0.978700,0.902494,0.771845,0.747659,0.719166,0.823564,0.718339,0.816941,1.000000,0.856543,...,0.852749,0.899673,0.958922,0.913071,0.896978,0.751913,0.917573,0.948765,0.963480,0.970367
9,0.897885,0.922722,0.709538,0.669555,0.697155,0.767869,0.641944,0.684771,0.856543,1.000000,...,0.866428,0.859864,0.872299,0.871614,0.891284,0.792082,0.916444,0.909893,0.879812,0.819801
