In [54]:
import  pandas as pd 
import numpy as np
import matplotlib.pyplot as plt
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import classification_report,accuracy_score  
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler 



Load in the Car DataSet, We are going to create a multiclass classifier (Can we determine the maker of the car depending on the other features)
We are going to do the follow
1. Load the Data
2. Clean/Remove
3. Standarized (if needed)
4. Train Model
5. Compare Model against one another 

In [4]:
car_url = "http://archive.ics.uci.edu/ml/machine-learning-databases/autos/imports-85.data"
df_car = pd.read_csv(car_url,names=['symboling','normalized-losses','make','fuel-type','aspiration','num-of-doors','body-style','drive-wheels','engine-location','wheel-base','length','width','height','curb-weight','engine-type','num-of-cylinders','engine-size','fuel-system','bore','stroke','compression-ratio','horsepower','peak-rpm','city-mpg','highway-mpg','price'])

In [5]:
df_car

Unnamed: 0,symboling,normalized-losses,make,fuel-type,aspiration,num-of-doors,body-style,drive-wheels,engine-location,wheel-base,length,width,height,curb-weight,engine-type,num-of-cylinders,engine-size,fuel-system,bore,stroke,compression-ratio,horsepower,peak-rpm,city-mpg,highway-mpg,price
0,3,?,alfa-romero,gas,std,two,convertible,rwd,front,88.6,168.8,64.1,48.8,2548,dohc,four,130,mpfi,3.47,2.68,9.0,111,5000,21,27,13495
1,3,?,alfa-romero,gas,std,two,convertible,rwd,front,88.6,168.8,64.1,48.8,2548,dohc,four,130,mpfi,3.47,2.68,9.0,111,5000,21,27,16500
2,1,?,alfa-romero,gas,std,two,hatchback,rwd,front,94.5,171.2,65.5,52.4,2823,ohcv,six,152,mpfi,2.68,3.47,9.0,154,5000,19,26,16500
3,2,164,audi,gas,std,four,sedan,fwd,front,99.8,176.6,66.2,54.3,2337,ohc,four,109,mpfi,3.19,3.40,10.0,102,5500,24,30,13950
4,2,164,audi,gas,std,four,sedan,4wd,front,99.4,176.6,66.4,54.3,2824,ohc,five,136,mpfi,3.19,3.40,8.0,115,5500,18,22,17450
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
200,-1,95,volvo,gas,std,four,sedan,rwd,front,109.1,188.8,68.9,55.5,2952,ohc,four,141,mpfi,3.78,3.15,9.5,114,5400,23,28,16845
201,-1,95,volvo,gas,turbo,four,sedan,rwd,front,109.1,188.8,68.8,55.5,3049,ohc,four,141,mpfi,3.78,3.15,8.7,160,5300,19,25,19045
202,-1,95,volvo,gas,std,four,sedan,rwd,front,109.1,188.8,68.9,55.5,3012,ohcv,six,173,mpfi,3.58,2.87,8.8,134,5500,18,23,21485
203,-1,95,volvo,diesel,turbo,four,sedan,rwd,front,109.1,188.8,68.9,55.5,3217,ohc,six,145,idi,3.01,3.40,23.0,106,4800,26,27,22470


We need to remove the symboling column and normailized-losses, this was used in their paper ( we will not be using these here )

In [6]:
df_car.drop(columns=['symboling','normalized-losses'],inplace=True)
df_car

Unnamed: 0,make,fuel-type,aspiration,num-of-doors,body-style,drive-wheels,engine-location,wheel-base,length,width,height,curb-weight,engine-type,num-of-cylinders,engine-size,fuel-system,bore,stroke,compression-ratio,horsepower,peak-rpm,city-mpg,highway-mpg,price
0,alfa-romero,gas,std,two,convertible,rwd,front,88.6,168.8,64.1,48.8,2548,dohc,four,130,mpfi,3.47,2.68,9.0,111,5000,21,27,13495
1,alfa-romero,gas,std,two,convertible,rwd,front,88.6,168.8,64.1,48.8,2548,dohc,four,130,mpfi,3.47,2.68,9.0,111,5000,21,27,16500
2,alfa-romero,gas,std,two,hatchback,rwd,front,94.5,171.2,65.5,52.4,2823,ohcv,six,152,mpfi,2.68,3.47,9.0,154,5000,19,26,16500
3,audi,gas,std,four,sedan,fwd,front,99.8,176.6,66.2,54.3,2337,ohc,four,109,mpfi,3.19,3.40,10.0,102,5500,24,30,13950
4,audi,gas,std,four,sedan,4wd,front,99.4,176.6,66.4,54.3,2824,ohc,five,136,mpfi,3.19,3.40,8.0,115,5500,18,22,17450
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
200,volvo,gas,std,four,sedan,rwd,front,109.1,188.8,68.9,55.5,2952,ohc,four,141,mpfi,3.78,3.15,9.5,114,5400,23,28,16845
201,volvo,gas,turbo,four,sedan,rwd,front,109.1,188.8,68.8,55.5,3049,ohc,four,141,mpfi,3.78,3.15,8.7,160,5300,19,25,19045
202,volvo,gas,std,four,sedan,rwd,front,109.1,188.8,68.9,55.5,3012,ohcv,six,173,mpfi,3.58,2.87,8.8,134,5500,18,23,21485
203,volvo,diesel,turbo,four,sedan,rwd,front,109.1,188.8,68.9,55.5,3217,ohc,six,145,idi,3.01,3.40,23.0,106,4800,26,27,22470


We need to check if we have NULL values, In the dataset missing values are represent by a '?'

In [8]:
df_car_is_nan = df_car.replace('?',np.nan)
print(df_car_is_nan.isnull().sum(axis = 0))

make                 0
fuel-type            0
aspiration           0
num-of-doors         2
body-style           0
drive-wheels         0
engine-location      0
wheel-base           0
length               0
width                0
height               0
curb-weight          0
engine-type          0
num-of-cylinders     0
engine-size          0
fuel-system          0
bore                 4
stroke               4
compression-ratio    0
horsepower           2
peak-rpm             2
city-mpg             0
highway-mpg          0
price                4
dtype: int64


We see that we have 18 instances that are missing from our data, considering that some of these are categorical and some are numerial I think we can remove then (out of our 205X24 datapoints)

In [9]:
df_car = df_car_is_nan.dropna()
df_car

Unnamed: 0,make,fuel-type,aspiration,num-of-doors,body-style,drive-wheels,engine-location,wheel-base,length,width,height,curb-weight,engine-type,num-of-cylinders,engine-size,fuel-system,bore,stroke,compression-ratio,horsepower,peak-rpm,city-mpg,highway-mpg,price
0,alfa-romero,gas,std,two,convertible,rwd,front,88.6,168.8,64.1,48.8,2548,dohc,four,130,mpfi,3.47,2.68,9.0,111,5000,21,27,13495
1,alfa-romero,gas,std,two,convertible,rwd,front,88.6,168.8,64.1,48.8,2548,dohc,four,130,mpfi,3.47,2.68,9.0,111,5000,21,27,16500
2,alfa-romero,gas,std,two,hatchback,rwd,front,94.5,171.2,65.5,52.4,2823,ohcv,six,152,mpfi,2.68,3.47,9.0,154,5000,19,26,16500
3,audi,gas,std,four,sedan,fwd,front,99.8,176.6,66.2,54.3,2337,ohc,four,109,mpfi,3.19,3.40,10.0,102,5500,24,30,13950
4,audi,gas,std,four,sedan,4wd,front,99.4,176.6,66.4,54.3,2824,ohc,five,136,mpfi,3.19,3.40,8.0,115,5500,18,22,17450
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
200,volvo,gas,std,four,sedan,rwd,front,109.1,188.8,68.9,55.5,2952,ohc,four,141,mpfi,3.78,3.15,9.5,114,5400,23,28,16845
201,volvo,gas,turbo,four,sedan,rwd,front,109.1,188.8,68.8,55.5,3049,ohc,four,141,mpfi,3.78,3.15,8.7,160,5300,19,25,19045
202,volvo,gas,std,four,sedan,rwd,front,109.1,188.8,68.9,55.5,3012,ohcv,six,173,mpfi,3.58,2.87,8.8,134,5500,18,23,21485
203,volvo,diesel,turbo,four,sedan,rwd,front,109.1,188.8,68.9,55.5,3217,ohc,six,145,idi,3.01,3.40,23.0,106,4800,26,27,22470


We are going to have our target variable be make so we need to extract that


In [10]:
y_values = df_car['make']
x_values = df_car.drop(columns=['make'])
print(y_values)
print(x_values)

0      alfa-romero
1      alfa-romero
2      alfa-romero
3             audi
4             audi
          ...     
200          volvo
201          volvo
202          volvo
203          volvo
204          volvo
Name: make, Length: 193, dtype: object
    fuel-type aspiration num-of-doors  ... city-mpg highway-mpg  price
0         gas        std          two  ...       21          27  13495
1         gas        std          two  ...       21          27  16500
2         gas        std          two  ...       19          26  16500
3         gas        std         four  ...       24          30  13950
4         gas        std         four  ...       18          22  17450
..        ...        ...          ...  ...      ...         ...    ...
200       gas        std         four  ...       23          28  16845
201       gas      turbo         four  ...       19          25  19045
202       gas        std         four  ...       18          23  21485
203    diesel      turbo         four  ...

In [11]:
x_values.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 193 entries, 0 to 204
Data columns (total 23 columns):
 #   Column             Non-Null Count  Dtype  
---  ------             --------------  -----  
 0   fuel-type          193 non-null    object 
 1   aspiration         193 non-null    object 
 2   num-of-doors       193 non-null    object 
 3   body-style         193 non-null    object 
 4   drive-wheels       193 non-null    object 
 5   engine-location    193 non-null    object 
 6   wheel-base         193 non-null    float64
 7   length             193 non-null    float64
 8   width              193 non-null    float64
 9   height             193 non-null    float64
 10  curb-weight        193 non-null    int64  
 11  engine-type        193 non-null    object 
 12  num-of-cylinders   193 non-null    object 
 13  engine-size        193 non-null    int64  
 14  fuel-system        193 non-null    object 
 15  bore               193 non-null    object 
 16  stroke             193 non

We have an issue here some data is not being encoded, correctly price,peak-rpm,horsepower,bore are all numeric

In [24]:
for item in ['bore','stroke','horsepower','peak-rpm','price']:
  x_values[item] = pd.to_numeric(x_values[item])


In [26]:
x_values.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 193 entries, 0 to 204
Data columns (total 23 columns):
 #   Column             Non-Null Count  Dtype  
---  ------             --------------  -----  
 0   fuel-type          193 non-null    object 
 1   aspiration         193 non-null    object 
 2   num-of-doors       193 non-null    object 
 3   body-style         193 non-null    object 
 4   drive-wheels       193 non-null    object 
 5   engine-location    193 non-null    object 
 6   wheel-base         193 non-null    float64
 7   length             193 non-null    float64
 8   width              193 non-null    float64
 9   height             193 non-null    float64
 10  curb-weight        193 non-null    int64  
 11  engine-type        193 non-null    object 
 12  num-of-cylinders   193 non-null    object 
 13  engine-size        193 non-null    int64  
 14  fuel-system        193 non-null    object 
 15  bore               193 non-null    float64
 16  stroke             193 non

In [25]:
x_values.describe()

Unnamed: 0,wheel-base,length,width,height,curb-weight,engine-size,bore,stroke,compression-ratio,horsepower,peak-rpm,city-mpg,highway-mpg,price
count,193.0,193.0,193.0,193.0,193.0,193.0,193.0,193.0,193.0,193.0,193.0,193.0,193.0,193.0
mean,98.923834,174.326425,65.893782,53.869948,2561.507772,128.124352,3.330622,3.24886,10.143627,103.481865,5099.740933,25.326425,30.787565,13285.025907
std,6.152409,12.478593,2.137795,2.39477,526.700026,41.590452,0.272385,0.315421,3.977491,37.960107,468.694369,6.387828,6.81691,8089.082886
min,86.6,141.1,60.3,47.8,1488.0,61.0,2.54,2.07,7.0,48.0,4150.0,13.0,16.0,5118.0
25%,94.5,166.3,64.1,52.0,2145.0,98.0,3.15,3.11,8.5,70.0,4800.0,19.0,25.0,7738.0
50%,97.0,173.2,65.4,54.1,2414.0,120.0,3.31,3.29,9.0,95.0,5100.0,25.0,30.0,10245.0
75%,102.4,184.6,66.9,55.7,2952.0,146.0,3.59,3.41,9.4,116.0,5500.0,30.0,34.0,16515.0
max,120.9,208.1,72.0,59.8,4066.0,326.0,3.94,4.17,23.0,262.0,6600.0,49.0,54.0,45400.0


In [27]:
numerial_cols = ['wheel-base','length','width','height','curb-weight','engine-size','compression-ratio','city-mpg','highway-mpg','bore','stroke','horsepower','peak-rpm','price']
categorial_cols = []
for column in x_values.columns:
  if column not in numerial_cols:
    categorial_cols.append(column)

print(categorial_cols)

['fuel-type', 'aspiration', 'num-of-doors', 'body-style', 'drive-wheels', 'engine-location', 'engine-type', 'num-of-cylinders', 'fuel-system']


How does our basic classifier perform if we only include numerial x_values


In [28]:
x_numerical = x_values[numerial_cols]
x_numerical

Unnamed: 0,wheel-base,length,width,height,curb-weight,engine-size,compression-ratio,city-mpg,highway-mpg,bore,stroke,horsepower,peak-rpm,price
0,88.6,168.8,64.1,48.8,2548,130,9.0,21,27,3.47,2.68,111,5000,13495
1,88.6,168.8,64.1,48.8,2548,130,9.0,21,27,3.47,2.68,111,5000,16500
2,94.5,171.2,65.5,52.4,2823,152,9.0,19,26,2.68,3.47,154,5000,16500
3,99.8,176.6,66.2,54.3,2337,109,10.0,24,30,3.19,3.40,102,5500,13950
4,99.4,176.6,66.4,54.3,2824,136,8.0,18,22,3.19,3.40,115,5500,17450
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
200,109.1,188.8,68.9,55.5,2952,141,9.5,23,28,3.78,3.15,114,5400,16845
201,109.1,188.8,68.8,55.5,3049,141,8.7,19,25,3.78,3.15,160,5300,19045
202,109.1,188.8,68.9,55.5,3012,173,8.8,18,23,3.58,2.87,134,5500,21485
203,109.1,188.8,68.9,55.5,3217,145,23.0,26,27,3.01,3.40,106,4800,22470


In [69]:
X_train_num,X_test_num,y_train_num,y_test_num = train_test_split(x_numerical,y_values,test_size=.33,random_state=42 )

In [70]:
clf = LogisticRegression(multi_class='auto',random_state=42,solver='lbfgs')

In [71]:
clf.fit(X_train_num,y_train_num)

STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression


LogisticRegression(C=1.0, class_weight=None, dual=False, fit_intercept=True,
                   intercept_scaling=1, l1_ratio=None, max_iter=100,
                   multi_class='auto', n_jobs=None, penalty='l2',
                   random_state=42, solver='lbfgs', tol=0.0001, verbose=0,
                   warm_start=False)

In [72]:
y_hat_train = clf.predict(X_train_num)
y_hat_test = clf.predict(X_test_num)

In [73]:
print(classification_report(y_train_num,y_hat_train))

               precision    recall  f1-score   support

  alfa-romero       0.00      0.00      0.00         3
         audi       0.14      0.20      0.17         5
          bmw       0.75      0.75      0.75         4
    chevrolet       0.00      0.00      0.00         1
        dodge       0.12      0.14      0.13         7
        honda       0.33      0.50      0.40         8
       jaguar       0.00      0.00      0.00         2
        mazda       0.00      0.00      0.00         9
mercedes-benz       0.75      1.00      0.86         6
   mitsubishi       0.00      0.00      0.00         7
       nissan       0.14      0.20      0.16        15
       peugot       0.80      0.89      0.84         9
     plymouth       0.00      0.00      0.00         1
      porsche       1.00      0.67      0.80         3
         saab       0.33      0.17      0.22         6
       subaru       0.00      0.00      0.00         8
       toyota       0.35      0.76      0.48        17
   volksw

  _warn_prf(average, modifier, msg_start, len(result))


In [74]:
print(accuracy_score(y_train_num,y_hat_train))

0.3643410852713178


In [75]:
print(classification_report(y_test_num,y_hat_test))

               precision    recall  f1-score   support

         audi       0.25      1.00      0.40         1
          bmw       1.00      0.25      0.40         4
    chevrolet       0.00      0.00      0.00         2
        dodge       0.12      1.00      0.22         1
        honda       0.40      0.40      0.40         5
        isuzu       0.00      0.00      0.00         2
       jaguar       0.00      0.00      0.00         1
        mazda       0.00      0.00      0.00         3
mercedes-benz       0.25      0.50      0.33         2
      mercury       0.00      0.00      0.00         1
   mitsubishi       0.00      0.00      0.00         6
       nissan       0.00      0.00      0.00         3
       peugot       0.50      0.50      0.50         2
     plymouth       0.00      0.00      0.00         6
      porsche       1.00      1.00      1.00         1
       subaru       0.00      0.00      0.00         4
       toyota       0.41      0.73      0.52        15
   volksw

  _warn_prf(average, modifier, msg_start, len(result))


In [76]:
print(accuracy_score(y_test_num,y_hat_test))

0.3125


This does poorly, its get the answer right less than half of the time and we see that we don't acutally reach convergence without model. We can see how applying a scaler help our model


In [77]:
x_scaled_values = StandardScaler().fit_transform(x_numerical)

In [78]:
X_train_num_scaled,X_test_num_scaled,y_train_num_scaled,y_test_num_scaled = train_test_split(x_scaled_values,y_values,test_size=.33,random_state=42 )

In [79]:
clf_scaled = LogisticRegression(multi_class='auto',random_state=42,solver='lbfgs')

In [80]:
clf_scaled.fit(X_train_num_scaled,y_train_num_scaled)

STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression


LogisticRegression(C=1.0, class_weight=None, dual=False, fit_intercept=True,
                   intercept_scaling=1, l1_ratio=None, max_iter=100,
                   multi_class='auto', n_jobs=None, penalty='l2',
                   random_state=42, solver='lbfgs', tol=0.0001, verbose=0,
                   warm_start=False)

In [81]:
y_hat_train_scaled = clf_scaled.predict(X_train_num_scaled)
y_hat_test_scaled = clf_scaled.predict(X_test_num_scaled)

In [82]:
print(classification_report(y_train_num_scaled,y_hat_train_scaled))

               precision    recall  f1-score   support

  alfa-romero       1.00      1.00      1.00         3
         audi       0.80      0.80      0.80         5
          bmw       1.00      0.75      0.86         4
    chevrolet       1.00      1.00      1.00         1
        dodge       0.62      0.71      0.67         7
        honda       1.00      1.00      1.00         8
       jaguar       1.00      1.00      1.00         2
        mazda       1.00      0.89      0.94         9
mercedes-benz       1.00      1.00      1.00         6
   mitsubishi       0.86      0.86      0.86         7
       nissan       0.88      0.93      0.90        15
       peugot       1.00      0.89      0.94         9
     plymouth       0.00      0.00      0.00         1
      porsche       1.00      1.00      1.00         3
         saab       0.86      1.00      0.92         6
       subaru       1.00      1.00      1.00         8
       toyota       0.88      0.82      0.85        17
   volksw

  _warn_prf(average, modifier, msg_start, len(result))


In [83]:
print(accuracy_score(y_train_num_scaled,y_hat_train_scaled))

0.8837209302325582


In [84]:
print(accuracy_score(y_test_num_scaled,y_hat_test_scaled))

0.625


We see that by scaling our data we increased the accuracy of our model, we still see that our model doesn't converge but the accuracy has greaterly improved.