# Import Dependencies

In [1]:
# Initial imports
import numpy as np
import pandas as pd
from path import Path
from sklearn.preprocessing import OneHotEncoder

# Import and Clean Data

In [2]:
# Read in data, treating '?' entries as missing values (NaN)
data = Path("../Resources/mpg.csv")
df = pd.read_csv(data, na_values='?')

# Drop rows with missing values.
# Note: dropna() keeps the surviving rows' original labels, so if any rows are
# removed the index is no longer a contiguous 0..n-1 range.
df = df.dropna()

# Drop 'car name' column.
# Use the `columns=` keyword: passing the axis positionally (`drop([...], 1)`)
# was deprecated in pandas 1.3 and raises a TypeError from pandas 2.0.
df = df.drop(columns=["car name"])

# Convert 'origin' to a string so it is handled as a categorical label
df['origin'] = df['origin'].astype(str)

df.dtypes

mpg             float64
cylinders         int64
displacement    float64
horsepower      float64
weight            int64
acceleration    float64
model year        int64
origin           object
dtype: object

# Encode Categorical Data

In [3]:
# Create a OneHotEncoder instance that returns a dense array.
# scikit-learn 1.2 renamed the `sparse` keyword to `sparse_output` and 1.4
# removed the old name; try the new keyword first and fall back to the old one
# so the cell runs on either side of that change.
try:
    enc = OneHotEncoder(sparse_output=False)
except TypeError:
    enc = OneHotEncoder(sparse=False)

In [4]:
# Fit and transform the OneHotEncoder using the categorical variable list.
# The encoder returns a plain array, so a DataFrame built from it would get a
# fresh 0..n-1 index. Reuse df's index instead: any later index-based join
# between df and encode_df then pairs each encoded row with the row it came
# from, even when df's index has gaps (e.g. after rows were dropped).
encode_df = pd.DataFrame(enc.fit_transform(df[["origin"]]), index=df.index)

In [5]:
# Add the encoded variable names to the dataframe.
# get_feature_names() was deprecated in scikit-learn 1.0 and removed in 1.2 in
# favour of get_feature_names_out(); use whichever this installation provides.
if hasattr(enc, "get_feature_names_out"):
    encode_df.columns = enc.get_feature_names_out(["origin"])
else:
    encode_df.columns = enc.get_feature_names(["origin"])
encode_df.head()

Unnamed: 0,origin_1,origin_2,origin_3
0,1.0,0.0,0.0
1,1.0,0.0,0.0
2,1.0,0.0,0.0
3,1.0,0.0,0.0
4,1.0,0.0,0.0


In [6]:
# Merge one-hot encoded features and drop the originals.
# This is an inner join on row labels (left_index/right_index), so rows are
# paired by index value, not by position; only labels present in both frames
# survive the merge.
df = df.merge(encode_df, left_index=True, right_index=True)

# Use the `columns=` keyword: passing the axis positionally (`drop("origin", 1)`)
# was deprecated in pandas 1.3 and raises a TypeError from pandas 2.0.
df = df.drop(columns="origin")
df.head()

Unnamed: 0,mpg,cylinders,displacement,horsepower,weight,acceleration,model year,origin_1,origin_2,origin_3
0,18.0,8,307.0,130.0,3504,12.0,70,1.0,0.0,0.0
1,15.0,8,350.0,165.0,3693,11.5,70,1.0,0.0,0.0
2,18.0,8,318.0,150.0,3436,11.0,70,1.0,0.0,0.0
3,16.0,8,304.0,150.0,3433,12.0,70,1.0,0.0,0.0
4,17.0,8,302.0,140.0,3449,10.5,70,1.0,0.0,0.0


In [7]:
# Display the dtype of every column in the merged frame
df.dtypes

mpg             float64
cylinders         int64
displacement    float64
horsepower      float64
weight            int64
acceleration    float64
model year        int64
origin_1        float64
origin_2        float64
origin_3        float64
dtype: object