Data source: UCI Auto MPG dataset — https://archive.ics.uci.edu/ml/datasets/Auto+MPG

Using the vehicle attributes in this dataset, predict the origin of each vehicle: North America, Europe, or Asia.

In [35]:
import pandas as pd
import numpy as np

In [28]:
# Load the dataset from "auto.csv" (path is relative to the working directory)
# into a DataFrame; presumably this is the UCI Auto MPG data linked at the top.
cars = pd.read_csv("auto.csv")

In [29]:
# Preview only the first rows instead of rendering the entire DataFrame,
# which bloats the saved notebook and hides the narrative.
cars.head(10)

Unnamed: 0,mpg,cylinders,displacement,horsepower,weight,acceleration,year,origin,car_name
0,18,8,307,130,3504,12.0,70,1,chevrolet chevelle malibu
1,15,8,350,165,3693,11.5,70,1,buick skylark 320
2,18,8,318,150,3436,11.0,70,1,plymouth satellite
3,16,8,304,150,3433,12.0,70,1,amc rebel sst
4,17,8,302,140,3449,10.5,70,1,ford torino
5,15,8,429,198,4341,10.0,70,1,ford galaxie 500
6,14,8,454,220,4354,9.0,70,1,chevrolet impala
7,14,8,440,215,4312,8.5,70,1,plymouth fury iii
8,14,8,455,225,4425,10.0,70,1,pontiac catalina
9,15,8,390,190,3850,8.5,70,1,amc ambassador dpl


In [30]:
# Column names of the loaded frame, as a plain Python list.
cars.columns.tolist()

['mpg',
 'cylinders',
 'displacement',
 'horsepower',
 'weight',
 'acceleration',
 'year',
 'origin',
 'car_name']

In [37]:
# Distinct class labels present in the target column "origin".
unique_origions = cars["origin"].unique()
# print() call form works on both Python 2 and Python 3; the bare
# `print x` statement is a SyntaxError on Python 3.
print(unique_origions)
unique_origions

[1 3 2]


array([1, 3, 2], dtype=int64)

In [32]:
# Distinct cylinder counts, in order of first appearance.
cars.loc[:, "cylinders"].unique()

array([8, 4, 6, 3, 5], dtype=int64)

In [33]:
# One-hot encode the categorical "cylinders" and "year" columns (indicator
# columns are prefixed "cyl_" / "year_"), append them to the frame, and then
# remove the two source columns.
# NOTE: this cell rebinds `cars` and drops the columns it reads, so running it
# a second time without reloading the data raises a KeyError.
dummy_cylinders = pd.get_dummies(cars["cylinders"], prefix="cyl")
dummy_years = pd.get_dummies(cars["year"], prefix="year")
cars = pd.concat([cars, dummy_cylinders, dummy_years], axis=1).drop(
    ["year", "cylinders"], axis=1
)
print(cars.head())

   mpg  displacement horsepower  weight  acceleration  origin  \
0   18           307        130    3504          12.0       1   
1   15           350        165    3693          11.5       1   
2   18           318        150    3436          11.0       1   
3   16           304        150    3433          12.0       1   
4   17           302        140    3449          10.5       1   

                    car_name  cyl_3  cyl_4  cyl_5   ...     year_73  year_74  \
0  chevrolet chevelle malibu      0      0      0   ...           0        0   
1          buick skylark 320      0      0      0   ...           0        0   
2         plymouth satellite      0      0      0   ...           0        0   
3              amc rebel sst      0      0      0   ...           0        0   
4                ford torino      0      0      0   ...           0        0   

   year_75  year_76  year_77  year_78  year_79  year_80  year_81  year_82  
0        0        0        0        0        0      

In [36]:
# Shuffle the rows and split them into training and test sets.
RANDOM_SEED = 1        # fixed so that Restart & Run All reproduces the same split
TRAIN_FRACTION = 0.70  # share of the rows assigned to the training set

# Permute row *positions* (0..n-1) rather than index labels: .iloc is purely
# positional, so passing it labels is only correct for a default RangeIndex.
# A dedicated RandomState keeps the shuffle reproducible without touching
# NumPy's global random state.
shuffled_rows = np.random.RandomState(RANDOM_SEED).permutation(len(cars))
shuffled_cars = cars.iloc[shuffled_rows]

# The first TRAIN_FRACTION of the shuffled rows train the models; the rest are held out.
highest_train_row = int(cars.shape[0] * TRAIN_FRACTION)
train = shuffled_cars.iloc[:highest_train_row]
test = shuffled_cars.iloc[highest_train_row:]

In [38]:
from sklearn.linear_model import LogisticRegression

# One-vs-rest setup: fit one binary logistic regression per origin value.
#   X: the cylinder and year indicator columns of the training set.
#   y: Boolean Series, True where the row's origin equals the value being
#      modelled and False otherwise.

# Class labels in ascending order.
unique_origins = np.sort(cars["origin"].unique())

# Every column whose name starts with "cyl" or "year" is used as a feature.
features = [col for col in train.columns if col.startswith(("cyl", "year"))]

# The feature matrix is the same for every origin, so select it once.
X_train = train[features]

# Maps origin value -> fitted LogisticRegression instance.
models = {}
for origin in unique_origins:
    y_train = train["origin"] == origin

    model = LogisticRegression()
    model.fit(X_train, y_train)

    models[origin] = model

In [39]:
# Score the test set with every per-origin model and collect the results in
# testing_probs: one column per origin, holding the predicted probability
# that each test observation belongs to that origin.
# NOTE(review): the columns are assigned from plain arrays, so testing_probs
# ends up indexed 0..n-1 and not by test's (shuffled) index - keep that in
# mind before comparing against test["origin"].

# The test features do not depend on the origin, so select them once.
X_test = test[features]

testing_probs = pd.DataFrame(columns=unique_origins)
for origin in unique_origins:
    class_probs = models[origin].predict_proba(X_test)
    # Column 1 is the probability of the observation being in this origin.
    testing_probs[origin] = class_probs[:, 1]

In [40]:
# Classify every test observation as the origin whose model produced the
# highest probability, i.e. the label of the row-wise maximum of testing_probs.
predicted_origins = testing_probs.idxmax(axis="columns")
print(predicted_origins)


0      1
1      1
2      3
3      1
4      1
5      1
6      3
7      1
8      1
9      1
10     1
11     1
12     1
13     2
14     1
15     3
16     1
17     1
18     1
19     1
20     1
21     1
22     3
23     1
24     1
25     1
26     1
27     1
28     2
29     3
      ..
90     1
91     1
92     1
93     1
94     1
95     1
96     1
97     2
98     1
99     3
100    2
101    1
102    1
103    1
104    3
105    3
106    1
107    3
108    3
109    2
110    1
111    1
112    1
113    1
114    1
115    1
116    1
117    1
118    3
119    1
dtype: int64
