In [31]:
import pandas as pd
import matplotlib.pyplot as plt
# Column names for auto-mpg.data, supplied explicitly via `names=` because the
# file is read without using a header row.
columns = [
    "mpg", "cylinders", "displacement", "horsepower", "weight",
    "acceleration", "year", "origin", "car name",
]

# Fields are separated by runs of whitespace. `sep=r"\s+"` is the documented
# equivalent of the deprecated `delim_whitespace=True` flag.
# NOTE(review): horsepower prints as unparsed text (e.g. "86.00") in the saved
# output, which suggests the column holds non-numeric entries -- confirm before
# using it as a numeric feature.
cars = pd.read_table("auto-mpg.data", sep=r"\s+", names=columns)

# Quick sanity check of both ends of the table.
print(cars.head(5))
print(cars.tail(5))

    mpg  cylinders  displacement horsepower  weight  acceleration  year  \
0  18.0          8         307.0      130.0  3504.0          12.0    70   
1  15.0          8         350.0      165.0  3693.0          11.5    70   
2  18.0          8         318.0      150.0  3436.0          11.0    70   
3  16.0          8         304.0      150.0  3433.0          12.0    70   
4  17.0          8         302.0      140.0  3449.0          10.5    70   

   origin                   car name  
0       1  chevrolet chevelle malibu  
1       1          buick skylark 320  
2       1         plymouth satellite  
3       1              amc rebel sst  
4       1                ford torino  
      mpg  cylinders  displacement horsepower  weight  acceleration  year  \
393  27.0          4         140.0      86.00  2790.0          15.6    82   
394  44.0          4          97.0      52.00  2130.0          24.6    82   
395  32.0          4         135.0      84.00  2295.0          11.6    82   
396  28

In [32]:
# One-hot encode the cylinder count: one "cyl_<n>" indicator column per
# distinct value, appended to the right of the existing columns.
dummy_cylinders = pd.get_dummies(cars["cylinders"], prefix="cyl")
cars = pd.concat([cars, dummy_cylinders], axis=1)
print(cars.head())

# Same treatment for the model year ("year_<n>" columns). Once both sets of
# indicators are in place, the original categorical columns are removed.
dummy_years = pd.get_dummies(cars["year"], prefix="year")
cars = pd.concat([cars, dummy_years], axis=1).drop(["year", "cylinders"], axis=1)
print(cars.head())

    mpg  cylinders  displacement horsepower  weight  acceleration  year  \
0  18.0          8         307.0      130.0  3504.0          12.0    70   
1  15.0          8         350.0      165.0  3693.0          11.5    70   
2  18.0          8         318.0      150.0  3436.0          11.0    70   
3  16.0          8         304.0      150.0  3433.0          12.0    70   
4  17.0          8         302.0      140.0  3449.0          10.5    70   

   origin                   car name  cyl_3  cyl_4  cyl_5  cyl_6  cyl_8  
0       1  chevrolet chevelle malibu      0      0      0      0      1  
1       1          buick skylark 320      0      0      0      0      1  
2       1         plymouth satellite      0      0      0      0      1  
3       1              amc rebel sst      0      0      0      0      1  
4       1                ford torino      0      0      0      0      1  
    mpg  displacement horsepower  weight  acceleration  origin  \
0  18.0         307.0      130.0  3504.

In [33]:
import numpy as np
# Fraction of rows assigned to the training split; the remainder becomes the test split.
TRAIN_FRACTION = .70
# Fixed seed so the shuffle, and therefore train/test membership, is identical on every run.
RANDOM_SEED = 1

# A private RandomState makes the shuffle reproducible without reseeding
# NumPy's global generator.
rng = np.random.RandomState(RANDOM_SEED)

# Permute row *positions* (0..n-1) rather than index labels, because the rows
# are selected with the positional indexer .iloc on the next line.
shuffled_rows = rng.permutation(cars.shape[0])
shuffled_cars = cars.iloc[shuffled_rows]

# First TRAIN_FRACTION of the shuffled rows (rounded down) -> train; the rest -> test.
highest_train_row = int(cars.shape[0] * TRAIN_FRACTION)
train = shuffled_cars.iloc[:highest_train_row]
test = shuffled_cars.iloc[highest_train_row:]

In [14]:
from sklearn.linear_model import LogisticRegression

# Distinct origin codes, sorted ascending (the array is sorted in place).
unique_origins = cars["origin"].unique()
unique_origins.sort()

# Predictors are the indicator columns whose names begin with "cyl" or "year".
features = [col for col in train.columns if col.startswith(("cyl", "year"))]

# The feature matrix is identical for every classifier; only the target changes.
X_train = train[features]

# One-vs-rest: fit one binary classifier per origin code, keyed by that code.
models = {}
for origin in unique_origins:
    # Boolean target: True where the row's origin equals the current code.
    y_train = train["origin"] == origin

    model = LogisticRegression()
    model.fit(X_train, y_train)
    models[origin] = model

In [35]:
# Table of predicted probabilities with one column per origin code. It starts
# empty and is filled column by column below; rows are numbered by position
# (0..n-1) because each column is assigned from a plain array.
testing_probs = pd.DataFrame(columns=unique_origins)
# print() as a function call works on both Python 2 and Python 3 and matches
# the other cells; the bare `print x` statement is a SyntaxError on Python 3.
print(testing_probs)

# Select testing features once; they are the same for every per-origin model.
X_test = test[features]

for origin in unique_origins:
    # Compute probability of observation being in the origin: column 1 of
    # predict_proba is kept as the probability for the "is this origin" class.
    testing_probs[origin] = models[origin].predict_proba(X_test)[:, 1]
print(testing_probs)

Empty DataFrame
Columns: [1, 2, 3]
Index: []
            1         2         3
0    0.449962  0.278233  0.257692
1    0.266463  0.420440  0.319938
2    0.306623  0.391842  0.299603
3    0.253376  0.362190  0.388915
4    0.360157  0.420710  0.216831
5    0.253376  0.362190  0.388915
6    0.266463  0.420440  0.319938
7    0.792845  0.094610  0.103560
8    0.983246  0.016008  0.020850
9    0.817616  0.101673  0.076423
10   0.258733  0.299579  0.440839
11   0.360157  0.420710  0.216831
12   0.958799  0.037250  0.022790
13   0.517603  0.248328  0.235285
14   0.253376  0.362190  0.388915
15   0.958799  0.037250  0.022790
16   0.968540  0.034528  0.018801
17   0.953930  0.032954  0.028759
18   0.896040  0.052606  0.078549
19   0.967178  0.018923  0.036452
20   0.983246  0.016008  0.020850
21   0.517603  0.248328  0.235285
22   0.274615  0.408990  0.299642
23   0.360157  0.420710  0.216831
24   0.967178  0.018923  0.036452
25   0.983246  0.016008  0.020850
26   0.960296  0.030752  0.028753
27 