使用 One-Hot Encoding，把類別型資料轉成 dummy 型態。
在這本筆記本中並未將所有欄位都轉型，因為有些欄位的唯一值過多，轉成 dummy 會導致欄位過多，難以進行機器學習。

In [1]:
import numpy as np
import pandas as pd

In [2]:
# Load the raw dataset from 'gurus_ML.csv' into a DataFrame.
data = pd.read_csv('gurus_ML.csv')

In [3]:
data.columns

Index(['Unnamed: 0', 'Title', 'Year', 'Brand', 'Location', 'State', 'Rental',
       'Price', 'Mileage', 'Transmission', 'FuelType', 'Engine', 'Drivetrain',
       'OptionCount', 'MajorOptions', 'Accident Check', 'OwnershipHistory'],
      dtype='object')

In [4]:
data.head(10)

Unnamed: 0.1,Unnamed: 0,Title,Year,Brand,Location,State,Rental,Price,Mileage,Transmission,FuelType,Engine,Drivetrain,OptionCount,MajorOptions,Accident Check,OwnershipHistory
0,2,Silverado 2500HD Work Truck Extended Cab LB 4WD,2010,Chevrolet,Frankfort,KY,False,15900.0,144000.0,Automatic,Gasoline,V8,Four-Wheel Drive,2,"['Steel Wheels', 'Tow Package']",0.0,2.0
1,3,Corvette Stingray 3LT Coupe RWD,2014,Chevrolet,Lewistown,MT,False,49995.0,2500.0,7-Speed Manual,Gasoline,V8,Rear-Wheel Drive,5,"['Bluetooth', 'Backup Camera', 'Aluminum Wheel...",0.0,1.0
2,4,Monte Carlo SS FWD,2002,Chevrolet,Delavan,IL,False,3440.0,174000.0,Automatic,Gasoline,V6,Front-Wheel Drive,2,"['Leather Seats', 'Aluminum Wheels']",0.0,2.0
3,5,Monte Carlo LS FWD,2004,Chevrolet,Delavan,IL,False,4890.0,115500.0,Automatic,Gasoline,V6,Front-Wheel Drive,1,['Steel Wheels'],0.0,3.0
4,7,Silverado 1500 LT Extended Cab RWD,2001,Chevrolet,Delavan,IL,False,6550.0,188583.0,Automatic,Gasoline,V8,Rear-Wheel Drive,3,"['Leather Seats', 'Suspension Package', 'Alumi...",0.0,1.0
5,8,Equinox 2LT FWD,2009,Chevrolet,Delavan,IL,True,9960.0,80124.0,Automatic,Gasoline,V6,Front-Wheel Drive,4,"['Leather Seats', 'Chrome Wheels', 'Bluetooth'...",0.0,2.0
6,9,Silverado 1500 LS Extended Cab LB RWD,2004,Chevrolet,Delavan,IL,False,9220.0,103000.0,Automatic,Gasoline,V8,Rear-Wheel Drive,4,"['Power Package', 'Suspension Package', 'Chrom...",0.0,1.0
7,11,Impala LS FWD,2005,Chevrolet,Delavan,IL,True,5230.0,171466.0,Automatic,Gasoline,V6,Front-Wheel Drive,2,"['Leather Seats', 'Aluminum Wheels']",0.0,4.0
8,12,Blazer LS 4-Door 4WD,2004,Chevrolet,Delavan,IL,False,3970.0,178000.0,Automatic,Gasoline,V6,Four-Wheel Drive,2,"['Power Package', 'Convenience Package']",0.0,3.0
9,14,Silverado 1500 1LT Crew Cab 4WD,2006,Chevrolet,Murdo,SD,False,16995.0,90200.0,Automatic,Flex Fuel Vehicle,V8 Flex Fuel Vehicle,Four-Wheel Drive,0,['empty'],0.0,4.0


------------------------

### 1.把每個欄位都有值的資料切出來

In [5]:
data.notnull().all(axis=1)

0           True
1           True
2           True
3           True
4           True
           ...  
2088473    False
2088474     True
2088475    False
2088476    False
2088477    False
Length: 2088478, dtype: bool

In [6]:
# data1 holds only the rows in which every column has a value; it is later
# split into two parts, a training set and a test set.
data1 = data.loc[data.notna().all(axis='columns')]

In [7]:
data1.isnull().sum()

Unnamed: 0          0
Title               0
Year                0
Brand               0
Location            0
State               0
Rental              0
Price               0
Mileage             0
Transmission        0
FuelType            0
Engine              0
Drivetrain          0
OptionCount         0
MajorOptions        0
Accident Check      0
OwnershipHistory    0
dtype: int64

---------------------

### 2.用 One-Hot Encoding 把類別型資料轉成數字型態

In [8]:
# from sklearn.preprocessing import OneHotEncoder

In [9]:
# Remove the 'Unnamed: 0' and 'Title' columns before encoding.
data1 = data1.drop(['Unnamed: 0', 'Title'], axis=1)

In [10]:
data1.dtypes

Year                  int64
Brand                object
Location             object
State                object
Rental                 bool
Price               float64
Mileage             float64
Transmission         object
FuelType             object
Engine               object
Drivetrain           object
OptionCount           int64
MajorOptions         object
Accident Check      float64
OwnershipHistory    float64
dtype: object

In [11]:
data1.columns

Index(['Year', 'Brand', 'Location', 'State', 'Rental', 'Price', 'Mileage',
       'Transmission', 'FuelType', 'Engine', 'Drivetrain', 'OptionCount',
       'MajorOptions', 'Accident Check', 'OwnershipHistory'],
      dtype='object')

In [12]:
# Categorical columns to one-hot encode. 'Location' and 'MajorOptions' are
# left out for now because they have too many unique values.
objectcolumn = [
    'Brand',
    'State',
    'Transmission',
    'FuelType',
    'Engine',
    'Drivetrain',
]

In [13]:
len(data1['Brand'].unique())

52

In [14]:
len(data1['Location'].unique())

6383

In [15]:
len(data1['State'].unique())

50

In [16]:
len(data1['Transmission'].unique())

45

In [17]:
len(data1['FuelType'].unique())

7

In [18]:
len(data1['Engine'].unique())

38

In [19]:
len(data1['Drivetrain'].unique())

5

In [20]:
def column_to_dummy(i):
    """One-hot encode the given columns and append the result to ``data1``.

    ``pd.get_dummies`` (sparse output) is applied to each named column of the
    module-level ``data1`` frame, and the resulting indicator columns are
    appended to the right of ``data1`` in the order given by ``i``.  The
    source columns themselves are kept.

    Args:
        i: Iterable of column names present in ``data1``.

    Returns:
        None.  The global ``data1`` is rebound to the widened frame; it is
        left untouched when ``i`` is empty.

    Raises:
        KeyError: If a name in ``i`` is not a column of ``data1``.
    """
    global data1
    # NOTE(review): no prefix is passed, so indicator columns are named after
    # the raw category values; the same value occurring in two source columns
    # would produce duplicate column names.
    dummy_frames = [pd.get_dummies(data1[k], sparse=True) for k in i]
    if not dummy_frames:
        return
    # Concatenate once instead of once per column, so the large base frame is
    # not re-copied on every iteration.
    data1 = pd.concat([data1, *dummy_frames], axis=1)

In [21]:
column_to_dummy(objectcolumn)

In [22]:
data1.columns

Index(['Year', 'Brand', 'Location', 'State', 'Rental', 'Price', 'Mileage',
       'Transmission', 'FuelType', 'Engine',
       ...
       'V8 Hybrid', 'V8 Propane', 'W12', 'W12 Flex Fuel Vehicle', 'W8', '4X2',
       'All-Wheel Drive', 'Four-Wheel Drive', 'Front-Wheel Drive',
       'Rear-Wheel Drive'],
      dtype='object', length=212)

In [23]:
from sklearn.model_selection import train_test_split

In [34]:
# Feature matrix: data1 without the listed raw columns and without 'Price'.
X = data1.drop(
    ['Brand', 'Location', 'State', 'Transmission', 'FuelType',
     'MajorOptions', 'Engine', 'Drivetrain', 'Price'],
    axis=1,
)

In [35]:
X.columns

Index(['Year', 'Rental', 'Mileage', 'OptionCount', 'Accident Check',
       'OwnershipHistory', 'Acura', 'Alfa Romeo', 'Aston Martin', 'Audi',
       ...
       'V8 Hybrid', 'V8 Propane', 'W12', 'W12 Flex Fuel Vehicle', 'W8', '4X2',
       'All-Wheel Drive', 'Four-Wheel Drive', 'Front-Wheel Drive',
       'Rear-Wheel Drive'],
      dtype='object', length=203)

In [36]:
# Target vector: select 'Price' by label rather than by position, so the
# selection does not depend on the column order of data1.
y = data1['Price']

In [37]:
# Hold out 30% of the rows as the test set; random_state fixes the shuffle.
(X_train, X_test,
 y_train, y_test) = train_test_split(X, y, test_size=0.3, random_state=1)

In [38]:
X_train.shape

(1389857, 203)

In [39]:
y_train.shape

(1389857,)

In [40]:
from sklearn.ensemble import RandomForestRegressor

In [41]:
# Squared error is the default split criterion, so it is not passed
# explicitly: the 'mse' spelling was deprecated in scikit-learn 1.0 and
# removed in 1.2, while omitting it works on both old and new releases.
model = RandomForestRegressor(n_estimators=10, random_state=0)

In [42]:
model.fit(X_train, y_train)

RandomForestRegressor(bootstrap=True, criterion='mse', max_depth=None,
                      max_features='auto', max_leaf_nodes=None,
                      min_impurity_decrease=0.0, min_impurity_split=None,
                      min_samples_leaf=1, min_samples_split=2,
                      min_weight_fraction_leaf=0.0, n_estimators=10,
                      n_jobs=None, oob_score=False, random_state=0, verbose=0,
                      warm_start=False)

In [43]:
y_pred = model.predict(X_test)

In [44]:
from sklearn.metrics import r2_score

In [45]:
r2_score(y_test, y_pred)

0.7870528537819138