1. Data Loading and Initial Exploration       

Displaying the first few rows of the dataset.

In [34]:
import pandas as pd
column_names = ["symboling", "normalized-losses", "make", "fuel-type", "aspiration", "num-of-doors", "body-style", "drive-wheels", "engine-location", "wheel-base", "length", "width", "height", "curb-weight", "engine-type", "num-of-cylinders", "engine-size", "fuel-system", "bore", "stroke", "compression-ratio", "horsepower", "peak-rpm", "city-mpg", "highway-mpg", "price"]
df = pd.read_csv('imports-85.data.txt', names=column_names, sep=',')
df.to_csv('data.csv', index=False)

df.shape

(205, 26)

Checking for the presence of missing values.

In [35]:
import numpy as np
df.replace('?',np.nan,inplace=True )
df.head()

Unnamed: 0,symboling,normalized-losses,make,fuel-type,aspiration,num-of-doors,body-style,drive-wheels,engine-location,wheel-base,...,engine-size,fuel-system,bore,stroke,compression-ratio,horsepower,peak-rpm,city-mpg,highway-mpg,price
0,3,,alfa-romero,gas,std,two,convertible,rwd,front,88.6,...,130,mpfi,3.47,2.68,9.0,111,5000,21,27,13495
1,3,,alfa-romero,gas,std,two,convertible,rwd,front,88.6,...,130,mpfi,3.47,2.68,9.0,111,5000,21,27,16500
2,1,,alfa-romero,gas,std,two,hatchback,rwd,front,94.5,...,152,mpfi,2.68,3.47,9.0,154,5000,19,26,16500
3,2,164.0,audi,gas,std,four,sedan,fwd,front,99.8,...,109,mpfi,3.19,3.4,10.0,102,5500,24,30,13950
4,2,164.0,audi,gas,std,four,sedan,4wd,front,99.4,...,136,mpfi,3.19,3.4,8.0,115,5500,18,22,17450


In [36]:
df.isnull().sum()

symboling             0
normalized-losses    41
make                  0
fuel-type             0
aspiration            0
num-of-doors          2
body-style            0
drive-wheels          0
engine-location       0
wheel-base            0
length                0
width                 0
height                0
curb-weight           0
engine-type           0
num-of-cylinders      0
engine-size           0
fuel-system           0
bore                  4
stroke                4
compression-ratio     0
horsepower            2
peak-rpm              2
city-mpg              0
highway-mpg           0
price                 4
dtype: int64

Describing the basic statistics of numerical features.

In [37]:
df.describe()

Unnamed: 0,symboling,wheel-base,length,width,height,curb-weight,engine-size,compression-ratio,city-mpg,highway-mpg
count,205.0,205.0,205.0,205.0,205.0,205.0,205.0,205.0,205.0,205.0
mean,0.834146,98.756585,174.049268,65.907805,53.724878,2555.565854,126.907317,10.142537,25.219512,30.75122
std,1.245307,6.021776,12.337289,2.145204,2.443522,520.680204,41.642693,3.97204,6.542142,6.886443
min,-2.0,86.6,141.1,60.3,47.8,1488.0,61.0,7.0,13.0,16.0
25%,0.0,94.5,166.3,64.1,52.0,2145.0,97.0,8.6,19.0,25.0
50%,1.0,97.0,173.2,65.5,54.1,2414.0,120.0,9.0,24.0,30.0
75%,2.0,102.4,183.1,66.9,55.5,2935.0,141.0,9.4,30.0,34.0
max,3.0,120.9,208.1,72.3,59.8,4066.0,326.0,23.0,49.0,54.0


2. Handling Missing Data 

Identify and handle missing data in the dataset.

Identify columns with missing values.

In [38]:
df.isnull().sum()

symboling             0
normalized-losses    41
make                  0
fuel-type             0
aspiration            0
num-of-doors          2
body-style            0
drive-wheels          0
engine-location       0
wheel-base            0
length                0
width                 0
height                0
curb-weight           0
engine-type           0
num-of-cylinders      0
engine-size           0
fuel-system           0
bore                  4
stroke                4
compression-ratio     0
horsepower            2
peak-rpm              2
city-mpg              0
highway-mpg           0
price                 4
dtype: int64

Handling these missing values, whether by imputation or removal.

In [39]:
# here we are converting the categorical one to numerical
convert = {"num-of-doors": {"four": 4, "two": 2}}
df.replace(convert, inplace=True) 
df

  df.replace(convert, inplace=True)


Unnamed: 0,symboling,normalized-losses,make,fuel-type,aspiration,num-of-doors,body-style,drive-wheels,engine-location,wheel-base,...,engine-size,fuel-system,bore,stroke,compression-ratio,horsepower,peak-rpm,city-mpg,highway-mpg,price
0,3,,alfa-romero,gas,std,2.0,convertible,rwd,front,88.6,...,130,mpfi,3.47,2.68,9.0,111,5000,21,27,13495
1,3,,alfa-romero,gas,std,2.0,convertible,rwd,front,88.6,...,130,mpfi,3.47,2.68,9.0,111,5000,21,27,16500
2,1,,alfa-romero,gas,std,2.0,hatchback,rwd,front,94.5,...,152,mpfi,2.68,3.47,9.0,154,5000,19,26,16500
3,2,164,audi,gas,std,4.0,sedan,fwd,front,99.8,...,109,mpfi,3.19,3.40,10.0,102,5500,24,30,13950
4,2,164,audi,gas,std,4.0,sedan,4wd,front,99.4,...,136,mpfi,3.19,3.40,8.0,115,5500,18,22,17450
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
200,-1,95,volvo,gas,std,4.0,sedan,rwd,front,109.1,...,141,mpfi,3.78,3.15,9.5,114,5400,23,28,16845
201,-1,95,volvo,gas,turbo,4.0,sedan,rwd,front,109.1,...,141,mpfi,3.78,3.15,8.7,160,5300,19,25,19045
202,-1,95,volvo,gas,std,4.0,sedan,rwd,front,109.1,...,173,mpfi,3.58,2.87,8.8,134,5500,18,23,21485
203,-1,95,volvo,diesel,turbo,4.0,sedan,rwd,front,109.1,...,145,idi,3.01,3.40,23.0,106,4800,26,27,22470


In [40]:
columns_with_missing = df.columns[df.isnull().any()]
columns_with_missing

Index(['normalized-losses', 'num-of-doors', 'bore', 'stroke', 'horsepower',
       'peak-rpm', 'price'],
      dtype='object')

In [41]:
df["normalized-losses"].fillna(df["normalized-losses"].astype('float').mean(), inplace=True)
df["bore"].fillna(df["bore"].astype('float').mean(), inplace=True)
df["num-of-doors"].fillna(df["num-of-doors"].astype('float').mean(), inplace=True)
df["stroke"].fillna(df["stroke"].astype('float').mean(), inplace=True)
df["horsepower"].fillna(df["horsepower"].astype('float').mean(), inplace=True)
df["peak-rpm"].fillna(df["peak-rpm"].astype('float').mean(), inplace=True)

df.dropna(subset = ['price'], axis = 0, inplace = True)
df.shape

The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values always behaves as a copy.

For example, when doing 'df[col].method(value, inplace=True)', try using 'df.method({col: value}, inplace=True)' or df[col] = df[col].method(value) instead, to perform the operation inplace on the original object.


  df["normalized-losses"].fillna(df["normalized-losses"].astype('float').mean(), inplace=True)
The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values always behaves as a copy.

For example, when doing 'df[col].method(value, inplace=True)', try using 'df.method({col: value}, inplace=True)' or df[col] = df[col].method(value) instead, to perform the operation inplace on the original object.


  df["bore"].fillna(df["bore"].astype('float').mean(), inplace=True)
The behavior will change in pandas 3.0. This inplace method will never w

(201, 26)

In [42]:
df.isnull().any()

symboling            False
normalized-losses    False
make                 False
fuel-type            False
aspiration           False
num-of-doors         False
body-style           False
drive-wheels         False
engine-location      False
wheel-base           False
length               False
width                False
height               False
curb-weight          False
engine-type          False
num-of-cylinders     False
engine-size          False
fuel-system          False
bore                 False
stroke               False
compression-ratio    False
horsepower           False
peak-rpm             False
city-mpg             False
highway-mpg          False
price                False
dtype: bool

In [43]:
df.dtypes

symboling              int64
normalized-losses     object
make                  object
fuel-type             object
aspiration            object
num-of-doors         float64
body-style            object
drive-wheels          object
engine-location       object
wheel-base           float64
length               float64
width                float64
height               float64
curb-weight            int64
engine-type           object
num-of-cylinders      object
engine-size            int64
fuel-system           object
bore                  object
stroke                object
compression-ratio    float64
horsepower            object
peak-rpm              object
city-mpg               int64
highway-mpg            int64
price                 object
dtype: object

3. Data Transformation and Encoding

Identify columns with categorical data that need to be transformed into a numeric format. 
Apply suitable encoding techniques (e.g., one-hot encoding or label encoding) to convert categorical data into numeric data.

In [44]:
df[["bore", "stroke"]] = df[["bore", "stroke"]].astype("float")
df[["normalized-losses"]] = df[["normalized-losses"]].astype("float")
df[["price"]] = df[["price"]].astype("float")
df[["peak-rpm"]] = df[["peak-rpm"]].astype("float")
df.dtypes

symboling              int64
normalized-losses    float64
make                  object
fuel-type             object
aspiration            object
num-of-doors         float64
body-style            object
drive-wheels          object
engine-location       object
wheel-base           float64
length               float64
width                float64
height               float64
curb-weight            int64
engine-type           object
num-of-cylinders      object
engine-size            int64
fuel-system           object
bore                 float64
stroke               float64
compression-ratio    float64
horsepower            object
peak-rpm             float64
city-mpg               int64
highway-mpg            int64
price                float64
dtype: object

In [45]:
categorical_columns = df.select_dtypes(include=['object']).columns
categorical_columns

Index(['make', 'fuel-type', 'aspiration', 'body-style', 'drive-wheels',
       'engine-location', 'engine-type', 'num-of-cylinders', 'fuel-system',
       'horsepower'],
      dtype='object')

In [46]:
df = pd.get_dummies(df, columns=categorical_columns)
df.head()

Unnamed: 0,symboling,normalized-losses,num-of-doors,wheel-base,length,width,height,curb-weight,engine-size,bore,...,horsepower_82,horsepower_84,horsepower_85,horsepower_86,horsepower_88,horsepower_90,horsepower_92,horsepower_94,horsepower_95,horsepower_97
0,3,122.0,2.0,88.6,168.8,64.1,48.8,2548,130,3.47,...,False,False,False,False,False,False,False,False,False,False
1,3,122.0,2.0,88.6,168.8,64.1,48.8,2548,130,3.47,...,False,False,False,False,False,False,False,False,False,False
2,1,122.0,2.0,94.5,171.2,65.5,52.4,2823,152,2.68,...,False,False,False,False,False,False,False,False,False,False
3,2,164.0,4.0,99.8,176.6,66.2,54.3,2337,109,3.19,...,False,False,False,False,False,False,False,False,False,False
4,2,164.0,4.0,99.4,176.6,66.4,54.3,2824,136,3.19,...,False,False,False,False,False,False,False,False,False,False


4. Outlier Detection and Treatment 

Apply an outlier detection method (e.g., IQR or Z-score) to detect outliers.

In [47]:
numerical_columns = df.select_dtypes(include=['int64', 'float64'])
def detect_and_treat_outliers(df, column_name):
    Q1 = df[column_name].quantile(0.25)
    Q3 = df[column_name].quantile(0.75)
    IQR = Q3 - Q1

    lower_bound = Q1 - 1.5 * IQR
    upper_bound = Q3 + 1.5 * IQR

    outliers = df[(df[column_name] < lower_bound) | (df[column_name] > upper_bound)]
    return outliers

# Detect and treat outliers for each numerical column
outliers_dict = {}
for column in numerical_columns.columns:
    outliers = detect_and_treat_outliers(df, column)
    outliers_dict[column] = outliers

# Display the outliers in table format
for column, outliers in outliers_dict.items():
    if not outliers.empty:
        print(f"Outliers in {column}:")
        print(outliers)

Outliers in normalized-losses:
     symboling  normalized-losses  num-of-doors  wheel-base  length  width  \
10           2              192.0           2.0       101.2   176.8   64.8   
11           0              192.0           4.0       101.2   176.8   64.8   
104          3              194.0           2.0        91.3   170.7   67.9   
105          3              194.0           2.0        91.3   170.7   67.9   
106          1              231.0           2.0        99.2   178.5   67.9   
178          3              197.0           2.0       102.9   183.5   67.7   
179          3              197.0           2.0       102.9   183.5   67.7   
190          3              256.0           2.0        94.5   165.7   64.0   

     height  curb-weight  engine-size  bore  ...  horsepower_82  \
10     54.3         2395          108  3.50  ...          False   
11     54.3         2395          108  3.50  ...          False   
104    49.7         3071          181  3.43  ...          False  

5. Feature Scaling

Feature scaling to the numerical features to ensure that all features are on a similar scale. 

In [48]:
from sklearn.preprocessing import MinMaxScaler
min_max_scaler = MinMaxScaler()
data_normalized = min_max_scaler.fit_transform(numerical_columns)
data_normalized = pd.DataFrame(data_normalized, columns=numerical_columns.columns)
print(data_normalized)

     symboling  normalized-losses  num-of-doors  wheel-base    length  \
0          1.0           0.298429           0.0    0.058309  0.413433   
1          1.0           0.298429           0.0    0.058309  0.413433   
2          0.6           0.298429           0.0    0.230321  0.449254   
3          0.8           0.518325           1.0    0.384840  0.529851   
4          0.8           0.518325           1.0    0.373178  0.529851   
..         ...                ...           ...         ...       ...   
196        0.2           0.157068           1.0    0.655977  0.711940   
197        0.2           0.157068           1.0    0.655977  0.711940   
198        0.2           0.157068           1.0    0.655977  0.711940   
199        0.2           0.157068           1.0    0.655977  0.711940   
200        0.2           0.157068           1.0    0.655977  0.711940   

        width    height  curb-weight  engine-size      bore    stroke  \
0    0.324786  0.083333     0.411171     0.260377 

In [49]:
from sklearn.preprocessing import StandardScaler
scaler = StandardScaler()
df_scaled = scaler.fit_transform(numerical_columns)
print(df_scaled)

[[ 1.72504964  0.         -1.15189655 ... -0.65224901 -0.54228772
   0.03631396]
 [ 1.72504964  0.         -1.15189655 ... -0.65224901 -0.54228772
   0.41538505]
 [ 0.1271926   0.         -1.15189655 ... -0.96439676 -0.689386
   0.41538505]
 ...
 [-1.47066444 -0.84595589  0.87688728 ... -1.12047063 -1.13068086
   1.04422678]
 [-1.47066444 -0.84595589  0.87688728 ...  0.12812034 -0.54228772
   1.16848137]
 [-1.47066444 -0.84595589  0.87688728 ... -0.96439676 -0.83648429
   1.18803412]]
