In [1]:
import pandas as pd
import numpy as np
from sklearn.linear_model import LogisticRegression
# Load the auto dataset; the `origin` column is the label predicted below.
cars = pd.read_csv('auto.csv')
# Distinct origin codes, in order of first appearance in the file.
unique_origins = pd.unique(cars['origin'])

unique_origins

array([1, 3, 2])

Treat the columns `cylinders`, `year`, and `origin` as categorical variables. Create dummy variables for the `cylinders` and `year` columns to help predict the label `origin`. Concatenate the new dummy columns to the `cars` dataframe and drop the original `year` and `cylinders` columns:

In [2]:
# One-hot encode the two categorical predictors, then swap them in for the
# original columns with a single concat (column order: remaining original
# columns, then cyl_*, then year_*).
dummy_cylinders = pd.get_dummies(cars["cylinders"], prefix="cyl")
dummy_years = pd.get_dummies(cars["year"], prefix="year")
cars = pd.concat(
    [cars.drop(["cylinders", "year"], axis=1), dummy_cylinders, dummy_years],
    axis=1,
)

cars.head(5)

Unnamed: 0,mpg,displacement,horsepower,weight,acceleration,origin,cyl_3,cyl_4,cyl_5,cyl_6,...,year_73,year_74,year_75,year_76,year_77,year_78,year_79,year_80,year_81,year_82
0,18.0,307.0,130.0,3504.0,12.0,1,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
1,15.0,350.0,165.0,3693.0,11.5,1,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
2,18.0,318.0,150.0,3436.0,11.0,1,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
3,16.0,304.0,150.0,3433.0,12.0,1,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
4,17.0,302.0,140.0,3449.0,10.5,1,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0


Randomize the rows of the dataframe and split it into training and test sets. Assign 70% of the rows to the training set and the remaining 30% to the test set.

In [3]:
# Shuffle by row position: `.iloc` is positional, so it must be given a
# permutation of positions rather than of index labels (the two only coincide
# when the frame has a default 0..n-1 index).
# A dedicated, seeded generator makes the split repeatable across kernel
# restarts without touching NumPy's global random state.
split_rng = np.random.RandomState(1)
shuffled_rows = split_rng.permutation(cars.shape[0])
shuffled_cars = cars.iloc[shuffled_rows]

# The first 70% of the shuffled rows train the models; the rest are held out.
last_train_row = int(cars.shape[0]*0.70)
train = shuffled_cars.iloc[0:last_train_row]
test = shuffled_cars.iloc[last_train_row:]

Train a logistic regression model for each of the unique origin categories:

In [9]:
# One-vs-rest setup: fit a separate binary classifier per origin code, using
# only the dummy-encoded cylinder and year columns as predictors.
unique_origins.sort()  # in place, so later cells see the sorted order too
models = {}
features = [c for c in train.columns if c.startswith(('cyl', 'year'))]
X_train = train[features]  # identical predictors for every model

for origin in unique_origins:
    # Binary target: True for rows of this origin, False for all others.
    y_train = train['origin'] == origin

    model = LogisticRegression()
    model.fit(X_train, y_train)
    models[origin] = model

In [10]:
# Score the held-out rows with every per-origin model. Each column holds one
# model's probability for its own origin; rows are numbered 0..n-1 in the
# order of `test`, not by `test`'s index labels.
X_test = test[features]
testing_probs = pd.DataFrame(columns=unique_origins)
for origin in unique_origins:
    # Column 1 of predict_proba is the second class, i.e. True for the
    # boolean targets the models were trained on.
    testing_probs[origin] = models[origin].predict_proba(X_test)[:, 1]

testing_probs

Unnamed: 0,1,2,3
0,0.809458,0.034388,0.190926
1,0.953841,0.035433,0.029263
2,0.955680,0.018471,0.051159
3,0.284773,0.476872,0.235638
4,0.971290,0.026578,0.022732
5,0.377629,0.333614,0.264661
6,0.270399,0.402469,0.318078
7,0.377629,0.333614,0.264661
8,0.875263,0.053164,0.090400
9,0.278879,0.256536,0.454825


In [11]:
# For each test row, predict the origin whose model gave the highest probability.
predicted_origins = testing_probs.idxmax(axis="columns")
predicted_origins

0      1
1      1
2      1
3      2
4      1
5      1
6      2
7      1
8      1
9      3
10     1
11     3
12     1
13     1
14     1
15     1
16     1
17     1
18     1
19     1
20     1
21     1
22     1
23     1
24     3
25     2
26     1
27     1
28     1
29     2
      ..
88     2
89     3
90     1
91     1
92     1
93     1
94     1
95     1
96     1
97     3
98     1
99     2
100    1
101    1
102    1
103    3
104    3
105    1
106    1
107    1
108    1
109    1
110    1
111    3
112    1
113    1
114    1
115    1
116    1
117    1
Length: 118, dtype: int64