In [1]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split, cross_val_score, KFold
from sklearn.linear_model import LinearRegression, Lasso, Ridge
from sklearn.ensemble import RandomForestRegressor


In [2]:
name = ["mpg", "cylinders","displacement","horsepower",
        "weight","acceleration","model_year", "origin","car_name"]

features = ["cylinders","displacement","horsepower",
        "weight","acceleration","model_year", "origin"]

data = pd.read_csv("auto-mpg.data", delimiter= "\s+", names = name)

In [3]:
# Change the object variable to numeric:

data["horsepower_int"] = data["horsepower"].apply(pd.to_numeric, errors = 'coerce')
### coerce will change non-numeric to NaNren
data.drop("horsepower", axis = 1, inplace=True)
data.rename(columns={"horsepower_int": "horsepower"}, inplace = True)
data.columns

Index([u'mpg', u'cylinders', u'displacement', u'weight', u'acceleration',
       u'model_year', u'origin', u'car_name', u'horsepower'],
      dtype='object')

In [4]:
# Deal with NaN

## method 1: drop
data_dropna = data.dropna()

## method 2: with mean
horsepower_mean = data.horsepower.mean()
data_fillmean = data.copy()
data_fillmean.horsepower = data["horsepower"].fillna(horsepower_mean)


In [5]:
##[[Q]]: failed attempt to name dataset using a loop

#datasets = [data_dropna, data_fillmean]
#names = [["X1", "Y1"], ["X2", "Y2"]]
#for name, dataset in zip(names,datasets):
#    name[0] = dataset[features]
#    name[1] = dataset["mpg"]


In [6]:
X = data_dropna[features]
y = data_dropna["mpg"]
X1 = data_fillmean[features]
y1 = data_fillmean["mpg"]

In [7]:
X_train, X_test, y_train, y_test = train_test_split(X, y, 
                                test_size=0.20, random_state=42)
X1_train, X1_test, y1_train, y1_test = train_test_split(X1, y1, 
                                test_size=0.20, random_state=42)

In [8]:
# different models
random_forest = RandomForestRegressor(random_state=0)
linear_reg = LinearRegression()
lasso = Lasso(random_state=0)
ridge = Ridge(random_state=0)

In [73]:
models = [random_forest, linear_reg, lasso, ridge]
k_fold = KFold(n_splits=5, shuffle = True, random_state=0)

def evaluate(X, y):
    scores = {}
    for model in models:
        score = cross_val_score(model, X, y, cv = k_fold)
        scores[model.__class__.__name__] =  [score.mean()]
        scores[model.__class__.__name__].append(score.std())
        
    return scores

In [74]:
# run and show result：
import operator
result = evaluate(X, y)
#[[Q]]: should I use training X and y here?

sorted_result = sorted(result.items(), key=operator.itemgetter(1), reverse = True)
for each in sorted_result:
    print each[0],"R2: %0.2f (+/- %0.2f)" %(each[1][0], each[1][1]/2)

RandomForestRegressor R2: 0.87 (+/- 0.02)
Ridge R2: 0.80 (+/- 0.02)
LinearRegression R2: 0.80 (+/- 0.02)
Lasso R2: 0.80 (+/- 0.02)


In [75]:
result1 = evaluate(X1, y1)
sorted_result1 = sorted(result1.items(), key=operator.itemgetter(1), reverse = True)
for each in sorted_result1:
    print each[0],"R2: %0.2f (+/- %0.2f)" %(each[1][0], each[1][1])

RandomForestRegressor R2: 0.87 (+/- 0.03)
Ridge R2: 0.81 (+/- 0.02)
LinearRegression R2: 0.81 (+/- 0.02)
Lasso R2: 0.80 (+/- 0.03)


Random Forest Regressor has the highest R2 score among the models we have tested. Randome Forest is an ensemble learning method, based on decision tree. It work by constructing multiple decision trees, which are trained on a random sample of the training set, and averaging the results from all the individual regression tree to get to the prediction. 
