# Machine Learning - Multi-class Classification using Logistic Regression (One-Versus-All)
## Predictive model for the origin of a car

### -by Vinay Kumar Ranganath Babu

In [10]:
# Importing the libraries
import pandas as pd
import numpy as np
# Column names for the data file; they are supplied explicitly via `names=`
# because the read below does not take them from the file itself.
columns = ["mpg", "cylinders", "displacement", "horsepower", "weight", "acceleration", "model year", "origin", "car name"]
# Fields are separated by runs of whitespace. `sep=r"\s+"` is the documented
# equivalent of the deprecated `delim_whitespace=True` flag and works on both
# old and current pandas releases.
cars = pd.read_table("auto.txt", sep=r"\s+", names=columns)
cars.head()

Unnamed: 0,mpg,cylinders,displacement,horsepower,weight,acceleration,model year,origin,car name
0,18.0,8,307.0,130.0,3504.0,12.0,70,1,chevrolet chevelle malibu
1,15.0,8,350.0,165.0,3693.0,11.5,70,1,buick skylark 320
2,18.0,8,318.0,150.0,3436.0,11.0,70,1,plymouth satellite
3,16.0,8,304.0,150.0,3433.0,12.0,70,1,amc rebel sst
4,17.0,8,302.0,140.0,3449.0,10.5,70,1,ford torino


In [11]:
# Distinct values of the target column, in order of first appearance.
# These are the classes the one-vs-all models will have to tell apart.
unique_regions = pd.unique(cars["origin"])
print(unique_regions)

[1 3 2]


In [12]:
# One-hot encode the two categorical columns: "cylinders" -> cyl_*, "model year" -> year_*.
dummy_cylinders = pd.get_dummies(cars["cylinders"], prefix="cyl")
dummy_years = pd.get_dummies(cars["model year"], prefix="year")

# Append the indicator columns (cylinders first, then years) and remove the
# source columns they replace.
# NOTE: this rebinds `cars`, so re-running the cell without reloading the data
# fails with a KeyError because "cylinders" / "model year" are already gone.
cars = pd.concat([cars, dummy_cylinders, dummy_years], axis=1).drop(
    ["model year", "cylinders"], axis=1
)
cars.head()

Unnamed: 0,mpg,displacement,horsepower,weight,acceleration,origin,car name,cyl_3,cyl_4,cyl_5,...,year_73,year_74,year_75,year_76,year_77,year_78,year_79,year_80,year_81,year_82
0,18.0,307.0,130.0,3504.0,12.0,1,chevrolet chevelle malibu,0,0,0,...,0,0,0,0,0,0,0,0,0,0
1,15.0,350.0,165.0,3693.0,11.5,1,buick skylark 320,0,0,0,...,0,0,0,0,0,0,0,0,0,0
2,18.0,318.0,150.0,3436.0,11.0,1,plymouth satellite,0,0,0,...,0,0,0,0,0,0,0,0,0,0
3,16.0,304.0,150.0,3433.0,12.0,1,amc rebel sst,0,0,0,...,0,0,0,0,0,0,0,0,0,0
4,17.0,302.0,140.0,3449.0,10.5,1,ford torino,0,0,0,...,0,0,0,0,0,0,0,0,0,0


In [13]:
# Hold-out split: shuffle the rows, then keep the first 70% for training and
# the remaining 30% for testing. (This is a single train/test split, not
# k-fold cross validation.)
#
# A fixed seed makes the shuffle, and therefore every downstream model and
# prediction, reproducible under Restart Kernel -> Run All.
# The permutation is over row *positions* (0..n-1) because it is consumed by
# `.iloc`; permuting index labels is only correct for a default RangeIndex.
shuffled_rows = np.random.RandomState(seed=1).permutation(len(cars))
shuffled_cars = cars.iloc[shuffled_rows]
# Number of rows that go into the training set (truncated toward zero).
highest_train_row = int(cars.shape[0] * .70)
train = shuffled_cars.iloc[0:highest_train_row]
test = shuffled_cars.iloc[highest_train_row:]

In [14]:
# Machine Learning
from sklearn.linear_model import LogisticRegression

# Sorted array of the distinct origin codes; one binary classifier is fitted per code.
unique_origins = np.sort(cars["origin"].unique())

# Predictors are restricted to the one-hot encoded columns (cyl_* and year_*).
features = [c for c in train.columns if c.startswith(("cyl", "year"))]

# The feature matrix is identical for every one-vs-all model, so build it once.
X_train = train[features]

# One-versus-all: for each origin, the target is True where the row belongs to
# that origin and False otherwise. Fitted models are stored keyed by origin.
models = {}
for origin in unique_origins:
    y_train = train["origin"] == origin
    model = LogisticRegression()
    model.fit(X_train, y_train)
    models[origin] = model

In [15]:
# Score the test set with every one-vs-all model.
# The test feature matrix is the same for all models, so select it once.
X_test = test[features]

# One column per origin, holding the second column of `predict_proba`
# (the probability of the positive class, i.e. "row is from this origin").
# Rows follow the order of `test` but are indexed 0..n-1, not by `test.index`.
testing_probs = pd.DataFrame(
    {origin: models[origin].predict_proba(X_test)[:, 1] for origin in unique_origins}
)

In [16]:
# For each test row, pick the origin whose model reported the highest
# probability; the column label of that maximum is the predicted class.
predicted_origins = testing_probs.idxmax(axis="columns")
print(predicted_origins)

0      3
1      1
2      3
3      1
4      1
5      1
6      3
7      1
8      1
9      1
10     1
11     3
12     1
13     1
14     1
15     1
16     2
17     1
18     1
19     1
20     1
21     1
22     1
23     1
24     3
25     1
26     1
27     1
28     3
29     1
      ..
88     1
89     1
90     1
91     1
92     1
93     1
94     3
95     3
96     1
97     1
98     1
99     3
100    3
101    3
102    1
103    1
104    1
105    1
106    1
107    3
108    1
109    1
110    1
111    2
112    1
113    1
114    1
115    1
116    1
117    1
dtype: int64
