In [1]:
#Read the data
import pandas as pd
file = 'data/adult.csv'
# Missing entries in this file are written as the literal string '?';
# have pandas parse them as NaN while reading.
df = pd.read_csv(file, na_values=['?'])
df.head()

Unnamed: 0,age,workclass,fnlwgt,education,education-num,marital-status,occupation,relationship,race,sex,capital-gain,capital-loss,hours-per-week,native-country,salary
0,39,State-gov,77516,Bachelors,13,Never-married,Adm-clerical,Not-in-family,White,Male,2174,0,40,United-States,<=50K
1,50,Self-emp-not-inc,83311,Bachelors,13,Married-civ-spouse,Exec-managerial,Husband,White,Male,0,0,13,United-States,<=50K
2,38,Private,215646,HS-grad,9,Divorced,Handlers-cleaners,Not-in-family,White,Male,0,0,40,United-States,<=50K
3,53,Private,234721,11th,7,Married-civ-spouse,Handlers-cleaners,Husband,Black,Male,0,0,40,United-States,<=50K
4,28,Private,338409,Bachelors,13,Married-civ-spouse,Prof-specialty,Wife,Black,Female,0,0,40,Cuba,<=50K


In [2]:
# Dimensions of the freshly loaded frame as (rows, columns), before any cleaning.
df.shape

(32561, 15)

We will remove any rows that have missing values.

In [3]:
# Keep only fully populated rows: a row is dropped if any of its columns is NaN.
df_clean = df.dropna(axis=0, how='any')

In [4]:
# Dimensions after the incomplete rows were dropped; compare with df.shape.
df_clean.shape

(30162, 15)

In [5]:
# Predictors: every column except the target. `columns=` already names the
# axis, so the redundant `axis=1` is not passed.
features = df_clean.drop(columns=['salary'])
# Target as a 1-D Series. A one-column DataFrame makes scikit-learn emit a
# DataConversionWarning (the `column_or_1d` lines) when the models are fitted.
response = df_clean['salary']

In [6]:
# Column labels of the predictors stored with an integer or float dtype.
num_cols = features.select_dtypes(
    include=['int16', 'int32', 'int64', 'float16', 'float32', 'float64']
).columns
# Column labels of the predictors stored as object, bool or category.
cat_cols = features.select_dtypes(
    include=['object', 'bool', 'category']
).columns

In [7]:
# Convert the column labels to a plain Python list. `list(...)` accepts both
# a pandas Index and an existing list, so re-running this cell is harmless
# (calling `.tolist()` on an already-converted list raises AttributeError).
num_cols = list(num_cols)
num_cols

['age',
 'fnlwgt',
 'education-num',
 'capital-gain',
 'capital-loss',
 'hours-per-week']

In [8]:
# Convert the column labels to a plain Python list. `list(...)` accepts both
# a pandas Index and an existing list, so re-running this cell is harmless
# (calling `.tolist()` on an already-converted list raises AttributeError).
cat_cols = list(cat_cols)
cat_cols

['workclass',
 'education',
 'marital-status',
 'occupation',
 'relationship',
 'race',
 'sex',
 'native-country']

Now, let's encode the categorical columns.

In [9]:
#Encoding categorical data values
from sklearn.preprocessing import LabelEncoder
from collections import defaultdict

# Replace every categorical predictor with integer codes, one column at a time:
# each column is cast to the 'category' dtype and then run through its own
# freshly fitted LabelEncoder.
# NOTE(review): scikit-learn documents LabelEncoder as an encoder for target
# labels; OrdinalEncoder is the feature-side counterpart. Confirm whether to switch.
features[cat_cols] = features[cat_cols].apply(
    lambda column: LabelEncoder().fit_transform(column.astype('category'))
)

#now data set is ready for fitting.

In [10]:
# Preview the predictors after the categorical columns were integer-encoded.
features.head()

Unnamed: 0,age,workclass,fnlwgt,education,education-num,marital-status,occupation,relationship,race,sex,capital-gain,capital-loss,hours-per-week,native-country
0,39,5,77516,9,13,4,0,1,4,1,2174,0,40,38
1,50,4,83311,9,13,2,3,0,4,1,0,0,13,38
2,38,2,215646,11,9,0,5,1,4,1,0,0,40,38
3,53,2,234721,1,7,2,5,0,2,1,0,0,40,38
4,28,2,338409,9,13,2,9,5,2,0,0,0,40,4


In [11]:
from sklearn.model_selection import train_test_split
# Hold out 20% of the rows for evaluation; random_state pins the shuffle.
# The raw 4-element result stays available as my_result_list.
features_train, features_test, response_train, response_test = my_result_list = train_test_split(
    features, response, test_size=0.20, random_state=0
)

## Evaluating Multiple Models for Classification

Let's compare **Random Forest**, **Decision Tree**, **Extreme Gradient Boosting**, **Categorical Gradient Boosting**, and **Light Gradient Boosting** models by fitting each one on the same training set and comparing their prediction accuracy on the same held-out test set.

In [12]:
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier
from xgboost import XGBClassifier
from catboost import CatBoostClassifier
from lightgbm import LGBMClassifier
from sklearn.metrics import accuracy_score 

In [13]:
# Seed shared by every stochastic learner so the accuracy table is identical
# on each Restart & Run All.
RANDOM_STATE = 0

# Candidate classifiers, otherwise left at their default hyper-parameters.
models_list = [RandomForestClassifier(random_state=RANDOM_STATE),
               DecisionTreeClassifier(random_state=RANDOM_STATE),
               XGBClassifier(random_state=RANDOM_STATE),
               CatBoostClassifier(silent=True, random_state=RANDOM_STATE),
               LGBMClassifier(random_state=RANDOM_STATE)]

# Display names, in the same order as models_list.
model_names = ['Random Forest',
               'Decision Tree',
               'Extreme Gradient Boosting',
               'Categorical Gradient Boosting',
               'Light Gradient Boosting']

# The estimators expect a 1-D target. Flattening here works for both a Series
# and a one-column DataFrame and avoids the DataConversionWarning
# (`y = column_or_1d(y, warn=True)`) raised for column-vector targets.
target_train = response_train.values.ravel()
target_test = response_test.values.ravel()

# Test-set accuracy of each classifier, in models_list order.
accuracy_list = []
for classifier in models_list:
    classifier.fit(features_train, target_train)
    response_pred = classifier.predict(features_test)
    # accuracy_score takes (y_true, y_pred), in that order.
    accuracy_list.append(accuracy_score(target_test, response_pred))

# Column-oriented summary consumed by the next cell to build the results table.
result_dict = {'Model Name': model_names, 'Accuracy': accuracy_list}

  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)


In [14]:
# One row per classifier: its display name next to its test-set accuracy.
results_df = pd.DataFrame.from_dict(result_dict)
results_df

Unnamed: 0,Model Name,Accuracy
0,Random Forest,0.852147
1,Decision Tree,0.804906
2,Extreme Gradient Boosting,0.866236
3,Categorical Gradient Boosting,0.869385
4,Light Gradient Boosting,0.869385


## Evaluating Multiple Models for Regression

We will evaluate the **Random Forest**, **Decision Tree**, **Multiple Linear Regression**, **Extreme Gradient Boosting**, **Categorical Gradient Boosting** and **Light Gradient Boosting Model** regressors using the Boston House Prices data.

In [16]:
from sklearn.datasets import load_boston
import pandas as pd
# Load the Boston house-price data bundled with scikit-learn.
# NOTE(review): load_boston is reported as deprecated and later removed in
# recent scikit-learn releases — confirm the installed version still ships it.
boston = load_boston()
# Feature matrix as a DataFrame, with columns labelled by boston.feature_names.
# Note that this rebinds `df`, which earlier held the adult census data.
df  = pd.DataFrame(boston.data, columns = boston.feature_names)
# Append the regression target as the last column.
df['MEDV'] = boston.target
df.head()

Unnamed: 0,CRIM,ZN,INDUS,CHAS,NOX,RM,AGE,DIS,RAD,TAX,PTRATIO,B,LSTAT,MEDV
0,0.00632,18.0,2.31,0.0,0.538,6.575,65.2,4.09,1.0,296.0,15.3,396.9,4.98,24.0
1,0.02731,0.0,7.07,0.0,0.469,6.421,78.9,4.9671,2.0,242.0,17.8,396.9,9.14,21.6
2,0.02729,0.0,7.07,0.0,0.469,7.185,61.1,4.9671,2.0,242.0,17.8,392.83,4.03,34.7
3,0.03237,0.0,2.18,0.0,0.458,6.998,45.8,6.0622,3.0,222.0,18.7,394.63,2.94,33.4
4,0.06905,0.0,2.18,0.0,0.458,7.147,54.2,6.0622,3.0,222.0,18.7,396.9,5.33,36.2


In [17]:
from sklearn.model_selection import train_test_split
# Predictors are the first 13 columns; the appended MEDV target comes after them.
features = df.iloc[:, :13]
features.head()  # still a DataFrame

Unnamed: 0,CRIM,ZN,INDUS,CHAS,NOX,RM,AGE,DIS,RAD,TAX,PTRATIO,B,LSTAT
0,0.00632,18.0,2.31,0.0,0.538,6.575,65.2,4.09,1.0,296.0,15.3,396.9,4.98
1,0.02731,0.0,7.07,0.0,0.469,6.421,78.9,4.9671,2.0,242.0,17.8,396.9,9.14
2,0.02729,0.0,7.07,0.0,0.469,7.185,61.1,4.9671,2.0,242.0,17.8,392.83,4.03
3,0.03237,0.0,2.18,0.0,0.458,6.998,45.8,6.0622,3.0,222.0,18.7,394.63,2.94
4,0.06905,0.0,2.18,0.0,0.458,7.147,54.2,6.0622,3.0,222.0,18.7,396.9,5.33


In [18]:
# Selecting with a one-element list keeps the target as a one-column DataFrame.
response = df.loc[:, ['MEDV']]
response.head()

Unnamed: 0,MEDV
0,24.0
1,21.6
2,34.7
3,33.4
4,36.2


In [19]:
# Hold out 20% of the rows for evaluation; random_state pins the shuffle.
my_result_list = train_test_split(features, response, test_size=0.2, random_state=0)

# train_test_split returns, in order: train/test predictors, then train/test target.
features_train, features_test, response_train, response_test = my_result_list

Now our data set is ready for model training. We will create a list of regressor models and fit them one by one.

In [16]:
from sklearn.tree import DecisionTreeRegressor
from sklearn.ensemble import RandomForestRegressor
import numpy as np
from sklearn import metrics
from sklearn.linear_model import LinearRegression
from xgboost import XGBRegressor
from catboost import CatBoostRegressor
from lightgbm import LGBMRegressor

# Seed shared by every stochastic learner so the error table is identical
# on each Restart & Run All. LinearRegression takes no seed.
RANDOM_STATE = 0

# Candidate regressors, otherwise left at their default hyper-parameters.
models_list = [RandomForestRegressor(random_state=RANDOM_STATE),
               DecisionTreeRegressor(random_state=RANDOM_STATE),
               LinearRegression(),
               XGBRegressor(random_state=RANDOM_STATE),
               CatBoostRegressor(silent=True, random_state=RANDOM_STATE),
               LGBMRegressor(random_state=RANDOM_STATE)]

# Display names, in the same order as models_list.
model_names = ['Random Forest',
               'Decision Tree',
               'Multiple Linear Regression',
               'Extreme Gradient Boosting',
               'Categorical Gradient Boosting',
               'Light Gradient Boosting Model']

# The estimators expect a 1-D target; flattening works for both a Series and a
# one-column DataFrame and keeps every model's predictions 1-D as well.
target_train = response_train.values.ravel()
target_test = response_test.values.ravel()

# Per-model test-set errors, in models_list order.
mae_list = []
mse_list = []
rmse_list = []

for regressor in models_list:
    regressor.fit(features_train, target_train)
    response_pred = regressor.predict(features_test)
    # sklearn metrics take (y_true, y_pred), in that order.
    mse = metrics.mean_squared_error(target_test, response_pred)
    mae_list.append(metrics.mean_absolute_error(target_test, response_pred))
    mse_list.append(mse)
    # RMSE is derived from the MSE computed above instead of recomputing it.
    rmse_list.append(np.sqrt(mse))

# Column-oriented summary consumed by the next cell to build the results table.
result_dict = {'Model Name': model_names,
               'Mean Absolute Error': mae_list,
               'Mean Squared Error': mse_list,
               'Root Mean Squared Error': rmse_list}



In [17]:
# One row per regressor: its display name next to its MAE, MSE and RMSE.
results_df = pd.DataFrame.from_dict(result_dict)
results_df

Unnamed: 0,Model Name,Mean Absolute Error,Mean Squared Error,Root Mean Squared Error
0,Random Forest,2.652529,19.249748,4.387453
1,Decision Tree,3.478431,30.738627,5.544243
2,Multiple Linear Regression,3.842909,33.44898,5.783509
3,Extreme Gradient Boosting,2.796177,21.366959,4.622441
4,Categorical Gradient Boosting,2.804644,24.367557,4.936351
5,Light Gradient Boosting Model,2.812486,24.498697,4.949616
