In [68]:
import pandas as pd

# Columns that take part in the analysis; all of them must be numeric.
numeric_columns = ['mpg', 'cylinders', 'displacement', 'horsepower', 'weight', 'acceleration', 'year', 'origin']

# Load the raw table, discard the free-text name column and rows with missing cells.
raw = pd.read_csv('auto-mpg.csv', delimiter=',')
data = raw.drop(columns=['car name']).dropna()

# Force the analysis columns to numeric; unparseable entries become NaN
# and the affected rows are removed right after.
data[numeric_columns] = data[numeric_columns].apply(pd.to_numeric, errors='coerce')
data = data.dropna(subset=numeric_columns)

# z-score every column (subtract the column mean, divide by the column std).
numeric_part = data[numeric_columns]
data_standardized = (numeric_part - numeric_part.mean()) / numeric_part.std()

# Covariance of the standardized columns.
cov_matrix = data_standardized.cov()

print("Standardized Covariance Matrix:")
print(cov_matrix)


Standardized Covariance Matrix:
                   mpg  cylinders  displacement  horsepower    weight  \
mpg           1.000000  -0.777618     -0.805127   -0.778427 -0.832244   
cylinders    -0.777618   1.000000      0.950823    0.842983  0.897527   
displacement -0.805127   0.950823      1.000000    0.897257  0.932994   
horsepower   -0.778427   0.842983      0.897257    1.000000  0.864538   
weight       -0.832244   0.897527      0.932994    0.864538  1.000000   
acceleration  0.423329  -0.504683     -0.543800   -0.689196 -0.416839   
year          0.580541  -0.345647     -0.369855   -0.416361 -0.309120   
origin        0.565209  -0.568932     -0.614535   -0.455171 -0.585005   

              acceleration      year    origin  
mpg               0.423329  0.580541  0.565209  
cylinders        -0.504683 -0.345647 -0.568932  
displacement     -0.543800 -0.369855 -0.614535  
horsepower       -0.689196 -0.416361 -0.455171  
weight           -0.416839 -0.309120 -0.585005  
acceleration    

In [69]:
import altair as alt

# Reshape the square matrix to long format: one row per (Feature_Y, Feature_X) cell.
# reset_index() exposes the unnamed row labels as a column called "index".
cov_long = (
    cov_matrix
    .reset_index()
    .melt(id_vars="index", var_name="Feature_X", value_name="Covariance")
    .rename(columns={"index": "Feature_Y"})
)

# The values are signed, so the colour scale must be diverging and symmetric
# around zero; a sequential scheme would render strong negative values as the
# lightest cells, making them look like "no relationship".
max_abs_cov = float(cov_long["Covariance"].abs().max())

heatmap = alt.Chart(cov_long).mark_rect().encode(
    x=alt.X("Feature_X:N", title="Feature X"),
    y=alt.Y("Feature_Y:N", title="Feature Y"),
    color=alt.Color(
        "Covariance:Q",
        scale=alt.Scale(scheme="redblue", domain=[-max_abs_cov, max_abs_cov]),
        title="Covariance",
    ),
    tooltip=["Feature_X", "Feature_Y", "Covariance"]
).properties(
    width=300,
    height=300,
    title="Covariance Matrix"
)

# Cell labels: white text on strongly coloured cells, black on pale ones,
# so the numbers stay readable at both ends of the scale.
text = alt.Chart(cov_long).mark_text(size=10).encode(
    x=alt.X("Feature_X:N"),
    y=alt.Y("Feature_Y:N"),
    text=alt.Text("Covariance:Q", format=".2f"),  # two decimal places
    color=alt.condition(
        f"abs(datum.Covariance) > {0.5 * max_abs_cov}",
        alt.value("white"),
        alt.value("black"),
    ),
)

# Overlay the labels on the heatmap
final_chart = heatmap + text

# Display the final chart
final_chart


In [70]:
import pandas as pd
import numpy as np
from sklearn.decomposition import PCA
from sklearn.preprocessing import StandardScaler
import altair as alt

# Work on an independent copy. pd.DataFrame(data) does not copy DataFrame
# input by default, so the PC1/PC2/Origin_Label columns added below could
# leak into `data` and make the earlier cells non-repeatable.
df = data.copy()

# Standardize the features used for PCA ('origin' is kept out: it is only used as a label).
features = ['mpg', 'cylinders', 'displacement', 'horsepower', 'weight', 'acceleration', 'year']
X = df[features]
scaler = StandardScaler()
X_standardized = scaler.fit_transform(X)

# Project onto the first two principal components.
pca = PCA(n_components=2)
pca_result = pca.fit_transform(X_standardized)
df['PC1'] = pca_result[:, 0]
df['PC2'] = pca_result[:, 1]

# Human-readable labels for the numeric origin codes.
origin_map = {1: 'USA', 2: 'Europe', 3: 'Japanese'}
df['Origin_Label'] = df['origin'].map(origin_map)

scatter = alt.Chart(df).mark_circle(size=100).encode(
    x='PC1',
    y='PC2',
    color='Origin_Label',
    # Show the original feature values plus the origin on hover.
    tooltip=features + ['Origin_Label']
).properties(
    title='PCA Projection of Cars Dataset',
    width=600,
    height=400
)

scatter


The plot shows that a subset of the cars from the USA is very different from the rest; we can see this on the right side of the plot. On the left side, all three groups are mixed, meaning that the differences between them are small: the remaining cars from the USA, all Japanese cars and all European cars are fairly similar to one another. At the far left we can see a few cars from Europe that are unusual and do not cluster as tightly with the others.

In [80]:
# Share of total variance we want the retained components to explain.
VARIANCE_THRESHOLD = 0.9

# Fit PCA with all components (not just 2) to inspect the full variance spectrum.
pca = PCA()
pca.fit(X_standardized)

explained_variance_ratio = pca.explained_variance_ratio_

for i, var in enumerate(explained_variance_ratio, start=1):
    print(f"PC{i}: {var:.2%} of variance")

# Running total: variance explained by the first k components.
cumulative_explained_variance = np.cumsum(explained_variance_ratio)

pca_df = pd.DataFrame({
    'Number of Components': range(1, len(cumulative_explained_variance) + 1),
    'Cumulative Explained Variance': cumulative_explained_variance
})

chart = alt.Chart(pca_df).mark_line(point=True).encode(
    x=alt.X('Number of Components:Q', title='Number of Components'),
    y=alt.Y('Cumulative Explained Variance:Q', title='Cumulative Explained Variance'),
    tooltip=['Number of Components', 'Cumulative Explained Variance']
).properties(
    title='Cumulative Explained Variance by Number of Components',
    width=600,
    height=400
)
# Horizontal reference line at the variance threshold (90%).
threshold = alt.Chart(pd.DataFrame({'y': [VARIANCE_THRESHOLD]})).mark_rule(color='red').encode(
    y='y:Q'
)
chart + threshold


PC1: 71.58% of variance
PC2: 12.37% of variance
PC3: 10.41% of variance
PC4: 2.63% of variance
PC5: 1.74% of variance
PC6: 0.78% of variance
PC7: 0.50% of variance


From the plot we can see that we need at least 3 components to explain 90% of the variance (the curve crosses the 90% line at about 2.5, but only whole numbers of components are possible). For each point we can read off what percentage of the variance is explained by that number of components.

In [77]:
# Fit PCA with all components (not just 2) so every loading vector is available.
pca = PCA()
pca.fit(X_standardized)

# Label the components from the fitted model instead of hard-coding PC1..PC7,
# so the cell keeps working if the feature list changes.
component_labels = [f'PC{i}' for i in range(1, len(pca.components_) + 1)]
loadings = pd.DataFrame(pca.components_, columns=features, index=component_labels)
print(loadings)

# Long format for plotting: one row per (component, feature) pair.
# The colour uses the absolute loading because only the strength of the
# relationship matters here, not its sign; the signed value stays in the tooltip.
loadings_melted = (
    loadings
    .reset_index()
    .melt(id_vars='index', var_name='Feature', value_name='Loading')
    .rename(columns={'index': 'Principal Component'})
)
loadings_melted['Absolute Loading'] = loadings_melted['Loading'].abs()

heatmap = alt.Chart(loadings_melted).mark_rect().encode(
    x=alt.X('Feature:N', title='Feature'),
    # Explicit order keeps PC10+ after PC9 instead of alphabetical sorting.
    y=alt.Y('Principal Component:N', sort=component_labels, title='Principal Component'),
    # Legend names what is actually encoded: the absolute value of the loading.
    color=alt.Color('Absolute Loading:Q', scale=alt.Scale(scheme='blues'), title='Absolute Loading'),
    tooltip=['Principal Component', 'Feature', 'Loading']
).properties(
    title='PCA Loadings',
    width=600,
    height=400
)

heatmap

          mpg  cylinders  displacement  horsepower    weight  acceleration  \
PC1 -0.398135   0.416124      0.429283    0.422813  0.414046     -0.284897   
PC2  0.206759   0.198541      0.180362    0.085242  0.224675     -0.006972   
PC3 -0.257215   0.139159      0.100316   -0.169684  0.276103      0.893308   
PC4  0.750966   0.477306      0.297847   -0.042076 -0.107735      0.121124   
PC5  0.340776  -0.493222     -0.056581    0.711289  0.265158      0.230755   
PC6  0.209759  -0.332548      0.142967   -0.522803  0.696518     -0.223785   
PC7 -0.092212  -0.431716      0.812877   -0.064385 -0.367154      0.052799   

         year  
PC1 -0.229510  
PC2  0.909675  
PC3 -0.037246  
PC4 -0.302435  
PC5 -0.088961  
PC6 -0.128195  
PC7  0.051132  


In [74]:
# Loadings heatmap restricted to the first 2 principal components.

pca = PCA(n_components=2)  # keep only the first 2 components
pca.fit(X_standardized)

# Label the components from the fitted model rather than hard-coding them.
component_labels = [f'PC{i}' for i in range(1, len(pca.components_) + 1)]
loadings = pd.DataFrame(pca.components_, columns=features, index=component_labels)

# Long format for plotting: one row per (component, feature) pair.
# The colour uses the absolute loading because only the strength of the
# relationship matters here, not its sign; the signed value stays in the tooltip.
loadings_melted = (
    loadings
    .reset_index()
    .melt(id_vars='index', var_name='Feature', value_name='Loading')
    .rename(columns={'index': 'Principal Component'})
)
loadings_melted['Absolute Loading'] = loadings_melted['Loading'].abs()

heatmap = alt.Chart(loadings_melted).mark_rect().encode(
    x=alt.X('Feature:N', title='Feature'),
    y=alt.Y('Principal Component:N', sort=component_labels, title='Principal Component'),
    # Legend names what is actually encoded: the absolute value of the loading.
    color=alt.Color('Absolute Loading:Q', scale=alt.Scale(scheme='blues'), title='Absolute Loading'),
    tooltip=['Principal Component', 'Feature', 'Loading']
).properties(
    title='PCA Loadings',
    width=600,
    height=400
)

heatmap


From both heatmaps we can see how strong the relationship is between the features and the principal components. The darker the cell, the stronger the relationship (regardless of direction, because we used the absolute value). Keep in mind how large a share of the variance is explained by each component (see 3 cells above).
In our opinion, PCA is a good choice because many of the features are correlated with each other and add little beyond redundancy. Take displacement and horsepower as an example: if a car's engine displacement is high, its horsepower is most likely high as well. PCA reduces this redundancy in the information. It can also reduce the dimensionality to fewer components while still explaining a chosen share of the variance. Our variables are numerical, which also makes PCA a suitable choice. The downside is the reduced interpretability of the components: after applying PCA we can no longer refer to particular features such as MPG or the production year.