# <b>Kaggle Learn</b>
# 10. Intermediate Machine Learning

In [1]:
import numpy as np
import matplotlib.pyplot as plt
import pandas as pd
import seaborn as sns
import statsmodels as sm
pd.options.display.max_columns = 20
pd.options.display.max_rows = 20
pd.options.display.max_colwidth = 80
np.set_printoptions(precision = 4, suppress = True)
from pandas import Series, DataFrame
%matplotlib inline

## 2. Missing Values

In [2]:
import pandas as pd
from sklearn.model_selection import train_test_split

# Load the data
data = pd.read_csv('input/melb_data.csv')

# Select target
y = data['Price']

# To keep things simple, we'll use only numerical predictors
melb_predictors = data.drop(['Price'], axis = 1)
X = melb_predictors.select_dtypes(exclude = ['object'])

# Divide data into training and validation subsets
X_train, X_valid, y_train, y_valid = train_test_split(X, y, 
                                                      train_size = 0.8, 
                                                      test_size = 0.2,
                                                      random_state = 0)

In [3]:
from sklearn.ensemble import RandomForestRegressor
from sklearn.metrics import mean_absolute_error

# Function for comparing different approaches
def score_dataset(X_train, X_valid, y_train, y_valid):
    model = RandomForestRegressor(n_estimators = 10, random_state = 0)
    model.fit(X_train, y_train)
    preds = model.predict(X_valid)
    return mean_absolute_error(y_valid, preds)

In [4]:
# Get names of columns with missing values
cols_with_missing = [col for col in X_train.columns
                     if X_train[col].isnull().any()]

# Drop columns in training and validation data
reduced_X_train = X_train.drop(cols_with_missing, axis = 1)
reduced_X_valid = X_valid.drop(cols_with_missing, axis = 1)

print('MAE from approach 1 (drop columns with missing values):')
print(score_dataset(reduced_X_train, reduced_X_valid, y_train, y_valid))

MAE from approach 1 (drop columns with missing values):
183550.22137772635


In [5]:
from sklearn.impute import SimpleImputer

# Imputation
my_imputer = SimpleImputer()
imputed_X_train = pd.DataFrame(my_imputer.fit_transform(X_train))
imputed_X_valid = pd.DataFrame(my_imputer.transform(X_valid))

# Imputation removed column names; put them back
imputed_X_train.columns = X_train.columns
imputed_X_valid.columns = X_valid.columns

print('MAE from approach 2 (imputation):')
print(score_dataset(imputed_X_train, imputed_X_valid, y_train, y_valid))

MAE from approach 2 (imputation):
178166.46269899711


In [6]:
# Make a copy to avoid changing original data (when imputing)
X_train_plus = X_train.copy()
X_valid_plus = X_valid.copy()

# Make new columns indicating what will be imputed
for col in cols_with_missing:
    X_train_plus[col + '_was_missing'] = X_train_plus[col].isnull()
    X_valid_plus[col + '_was_missing'] = X_valid_plus[col].isnull()
    
# Imputation
my_imputer = SimpleImputer()
imputed_X_train_plus = pd.DataFrame(my_imputer.fit_transform(X_train_plus))
imputed_X_valid_plus = pd.DataFrame(my_imputer.transform(X_valid_plus))

# Imputation removed column names; put them back
imputed_X_train_plus.columns = X_train_plus.columns
imputed_X_valid_plus.columns = X_valid_plus.columns

print('MAE from approach 3 (an extension to imputation):')
print(score_dataset(imputed_X_train_plus, imputed_X_valid_plus, y_train, y_valid))

MAE from approach 3 (an extension to imputation):
178927.503183954


In [7]:
# Shape of training data (num_rows, num_columns)
print(X_train.shape)

# Number of missing values in each column of training data
missing_val_count_by_column = X_train.isnull().sum()
print(missing_val_count_by_column[missing_val_count_by_column > 0])

(10864, 12)
Car               49
BuildingArea    5156
YearBuilt       4307
dtype: int64


## 3. Categorical Variables

In [8]:
import pandas as pd
from sklearn.model_selection import train_test_split

# Read the data
data = pd.read_csv('input/melb_data.csv')

# Separate target from predictors
y = data['Price']
X = data.drop(['Price'], axis=1)

# Divide data into training and validation subsets
X_train_full, X_valid_full, y_train, y_valid = train_test_split(X, y,
                                                                train_size = 0.8,
                                                                test_size = 0.2,
                                                                random_state = 0)

# Drop columns with missing values (simplest approach)
cols_with_missing = [col for col in X_train_full.columns if X_train_full[col].isnull().any()] 
X_train_full.drop(cols_with_missing, axis = 1, inplace = True)
X_valid_full.drop(cols_with_missing, axis = 1, inplace = True)

# "Cardinality" means the number of unique values in a column
# Select categorical columns with relatively low cardinality (convenient but arbitrary)
low_cardinality_cols = [cname for cname in X_train_full.columns if X_train_full[cname].nunique() < 10 and 
                        X_train_full[cname].dtype == "object"]

# Select numerical columns
numerical_cols = [cname for cname in X_train_full.columns if X_train_full[cname].dtype in ['int64', 'float64']]

# Keep selected columns only
my_cols = low_cardinality_cols + numerical_cols
X_train = X_train_full[my_cols].copy()
X_valid = X_valid_full[my_cols].copy()

In [9]:
X_train.head()

Unnamed: 0,Type,Method,Regionname,Rooms,Distance,Postcode,Bedroom2,Bathroom,Landsize,Lattitude,Longtitude,Propertycount
12167,u,S,Southern Metropolitan,1,5.0,3182.0,1.0,1.0,0.0,-37.85984,144.9867,13240.0
6524,h,SA,Western Metropolitan,2,8.0,3016.0,2.0,2.0,193.0,-37.858,144.9005,6380.0
8413,h,S,Western Metropolitan,3,12.6,3020.0,3.0,1.0,555.0,-37.7988,144.822,3755.0
2919,u,SP,Northern Metropolitan,3,13.0,3046.0,3.0,1.0,265.0,-37.7083,144.9158,8870.0
6043,h,S,Western Metropolitan,3,13.3,3020.0,3.0,1.0,673.0,-37.7623,144.8272,4217.0


In [10]:
# Get list of categorical vairables
s = (X_train.dtypes == 'object')
object_cols = list(s[s].index)

print("Categorical variables:")
print(object_cols)

Categorical variables:
['Type', 'Method', 'Regionname']


In [11]:
from sklearn.ensemble import RandomForestRegressor
from sklearn.metrics import mean_absolute_error

# Function for comparing different approaches
def score_dataset(X_train, X_valid, y_train, y_valid):
    model = RandomForestRegressor(n_estimators = 100, random_state = 0)
    model.fit(X_train, y_train)
    preds = model.predict(X_valid)
    return mean_absolute_error(y_valid, preds)

In [12]:
drop_X_train = X_train.select_dtypes(exclude = ['object'])
drop_X_valid = X_valid.select_dtypes(exclude = ['object'])

print("MAE from approach 1 (drop categorical variables):")
print(score_dataset(drop_X_train, drop_X_valid, y_train, y_valid))

MAE from approach 1 (drop categorical variables):
175703.48185157913


In [13]:
from sklearn.preprocessing import OrdinalEncoder

# Make copy to avoid changing original data
label_X_train = X_train.copy()
label_X_valid = X_valid.copy()

# Apply ordinal encoder to each column with categorical data
ordinal_encoder = OrdinalEncoder()
label_X_train[object_cols] = ordinal_encoder.fit_transform(X_train[object_cols])
label_X_valid[object_cols] = ordinal_encoder.transform(X_valid[object_cols])

print("MAE from approach 2 (ordinal encoding):")
print(score_dataset(label_X_train, label_X_valid, y_train, y_valid))

MAE from approach 2 (ordinal encoding):
165936.40548390493


In [14]:
from sklearn.preprocessing import OneHotEncoder

# Apply one-hot encoder to each column with categorical data
# [handle_unknown = 'ignore']: avoid errors when the validation data contains classes
# that aren't represented in the training data
# [sparse = False]: ensure that the encoded columns are returned as a numpy array 
# (instead of a sparse matrix)
OH_encoder = OneHotEncoder(handle_unknown = 'ignore', sparse = False)
OH_cols_train = pd.DataFrame(OH_encoder.fit_transform(X_train[object_cols]))
OH_cols_valid = pd.DataFrame(OH_encoder.transform(X_valid[object_cols]))

# One-hot encoding removed index; put it back
OH_cols_train.index = X_train.index
OH_cols_valid.index = X_valid.index

# Remove categorical columns (will replace with one-hot encoding)
num_X_train = X_train.drop(object_cols, axis = 1)
num_X_valid = X_valid.drop(object_cols, axis = 1)

# Add one-hot encoded columns to numerical features
OH_X_train = pd.concat([num_X_train, OH_cols_train], axis = 1)
OH_X_valid = pd.concat([num_X_valid, OH_cols_valid], axis = 1)

# Ensure all columns have string type
OH_X_train.columns = OH_X_train.columns.astype(str)
OH_X_valid.columns = OH_X_valid.columns.astype(str)

print("MAE from approach 3 (one-hot encoding):")
print(score_dataset(OH_X_train, OH_X_valid, y_train, y_valid))

MAE from approach 3 (one-hot encoding):
166089.4893009678


## 4. Pipelines

In [28]:
import pandas as pd
from sklearn.model_selection import train_test_split

# Read the data
data = pd.read_csv('input/melb_data.csv')

# Separate target from predictors
y = data.Price
X = data.drop(['Price'], axis = 1)

# Divide data into training and validation subsets
X_train_full, X_valid_full, y_train, y_valid = train_test_split(X, y,
                                                                train_size = 0.8,
                                                                test_size = 0.2,
                                                                random_state = 0)

# "Cardinality" means the number of unique values in a column
# Select categorical columns with relatively low cardinality (convenient but arbitrary)
categorical_cols = [cname for cname in X_train_full.columns if X_train_full[cname].nunique() < 10 and 
                        X_train_full[cname].dtype == "object"]

# Select numerical columns
numerical_cols = [cname for cname in X_train_full.columns if X_train_full[cname].dtype in ['int64', 'float64']]

# Keep selected columns only
my_cols = categorical_cols + numerical_cols
X_train = X_train_full[my_cols].copy()
X_valid = X_valid_full[my_cols].copy()

In [16]:
# Sample data
data = {
    'Age': [25, 35, 28, 31, 22, 27, 32, 29, 26, 30],
    'Income': [50000, 60000, 48000, 70000, 45000, 52000, 75000, 49000, 51000, 68000]
}

df = pd.DataFrame(data)
print("Original Dataset:")
print(df)

Original Dataset:
   Age  Income
0   25   50000
1   35   60000
2   28   48000
3   31   70000
4   22   45000
5   27   52000
6   32   75000
7   29   49000
8   26   51000
9   30   68000


In [17]:
# Filter before split
filtered_df_before = df[df['Age'] < 30]

# Perform train/test split
train_df_before, test_df_before = train_test_split(filtered_df_before, test_size=0.2, random_state=0)

print("\nFiltered Dataset Before Splitting:")
print(filtered_df_before)
print("\nTraining Data Before Splitting:")
print(train_df_before)
print("\nTesting Data Before Splitting:")
print(test_df_before)


Filtered Dataset Before Splitting:
   Age  Income
0   25   50000
2   28   48000
4   22   45000
5   27   52000
7   29   49000
8   26   51000

Training Data Before Splitting:
   Age  Income
2   28   48000
5   27   52000
0   25   50000
7   29   49000

Testing Data Before Splitting:
   Age  Income
8   26   51000
4   22   45000


In [18]:
# Perform train/test split first
train_df_after, test_df_after = train_test_split(df, test_size=0.2, random_state=0)

# Filter after split
train_df_after = train_df_after[train_df_after['Age'] < 30]
test_df_after = test_df_after[test_df_after['Age'] < 30]

print("\nTraining Data After Splitting:")
print(train_df_after)
print("\nTesting Data After Splitting:")
print(test_df_after)


Training Data After Splitting:
   Age  Income
4   22   45000
7   29   49000
0   25   50000
5   27   52000

Testing Data After Splitting:
   Age  Income
2   28   48000
8   26   51000


In [19]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.linear_model import LogisticRegression

# Create a synthetic dataset
data = {
    'Feature': [10, 20, 30, 40, 50, 60],
    'Leakage': [100, 200, 300, 400, 500, 600],
    'Target': [0, 1, 0, 1, 0, 1]  # Binary target (0 or 1)
}
df = pd.DataFrame(data)

# Display the original dataset
print("Original Dataset:")
print(df)

Original Dataset:
   Feature  Leakage  Target
0       10      100       0
1       20      200       1
2       30      300       0
3       40      400       1
4       50      500       0
5       60      600       1


In [20]:
# Preprocessing step before splitting (data leakage)
scaler = StandardScaler()
df[['Feature', 'Leakage']] = scaler.fit_transform(df[['Feature', 'Leakage']])

# Perform train/test split after preprocessing (incorrect order)
X = df[['Feature', 'Leakage']]  # Features
y = df['Target']  # Target variable

# Incorrectly split data after preprocessing
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Train a model (e.g., Logistic Regression)
model = LogisticRegression()
model.fit(X_train, y_train)

# Evaluate the model on the test set
accuracy = model.score(X_test, y_test)
print(f"\nModel Accuracy (with data leakage): {accuracy:.2f}")


Model Accuracy (with data leakage): 0.50


In [21]:
# Correct order: Split data first, then preprocess
X = df[['Feature', 'Leakage']]  # Features
y = df['Target']  # Target variable

# Split the data into training and testing sets
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Preprocess the training data
scaler = StandardScaler()
X_train_scaled = scaler.fit_transform(X_train)  # Fit scaler on training data only

# Train a model using the preprocessed training data
model = LogisticRegression()
model.fit(X_train_scaled, y_train)

# Preprocess the test data using the same scaler
X_test_scaled = scaler.transform(X_test)  # Use scaler fitted on training data for test data

# Evaluate the model on the preprocessed test data
accuracy = model.score(X_test_scaled, y_test)
print(f"\nModel Accuracy (without data leakage): {accuracy:.2f}")


Model Accuracy (without data leakage): 0.50


In [22]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.linear_model import LogisticRegression

# Create a synthetic dataset with a meaningful relationship
data = {
    'Feature': [10, 20, 30, 40, 50, 60],
    'Leakage': [100, 200, 300, 400, 500, 600],
    'Target': [0, 1, 0, 1, 1, 1]  # Updated target variable
}
df = pd.DataFrame(data)

# Display the updated dataset
print("Updated Dataset:")
print(df)

# Preprocessing step before splitting (data leakage)
scaler = StandardScaler()
df[['Feature', 'Leakage']] = scaler.fit_transform(df[['Feature', 'Leakage']])

# Perform train/test split after preprocessing (incorrect order)
X = df[['Feature', 'Leakage']]  # Features
y = df['Target']  # Target variable

# Incorrectly split data after preprocessing
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Train a model (e.g., Logistic Regression)
model = LogisticRegression()
model.fit(X_train, y_train)

# Evaluate the model on the test set
accuracy = model.score(X_test, y_test)
print(f"\nModel Accuracy (with data leakage): {accuracy:.2f}")

Updated Dataset:
   Feature  Leakage  Target
0       10      100       0
1       20      200       1
2       30      300       0
3       40      400       1
4       50      500       1
5       60      600       1

Model Accuracy (with data leakage): 0.50


In [23]:
# Correct order: Split data first, then preprocess
X = df[['Feature', 'Leakage']]  # Features
y = df['Target']  # Target variable

# Split the data into training and testing sets
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Preprocess the training data
scaler = StandardScaler()
X_train_scaled = scaler.fit_transform(X_train)  # Fit scaler on training data only

# Train a model using the preprocessed training data
model = LogisticRegression()
model.fit(X_train_scaled, y_train)

# Preprocess the test data using the same scaler
X_test_scaled = scaler.transform(X_test)  # Use scaler fitted on training data for test data

# Evaluate the model on the preprocessed test data
accuracy = model.score(X_test_scaled, y_test)
print(f"\nModel Accuracy (without data leakage): {accuracy:.2f}")


Model Accuracy (without data leakage): 0.50


In [24]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.linear_model import LogisticRegression

# Create a synthetic dataset with a meaningful relationship and leakage
data = {
    'Feature': [10, 20, 30, 40, 50, 60],
    'Leakage': [0, 1, 0, 1, 1, 1],  # Direct correlation with Target
    'Target': [0, 1, 0, 1, 0, 1]
}
df = pd.DataFrame(data)

# Display the updated dataset
print("Updated Dataset:")
print(df)

# Preprocessing step before splitting (data leakage)
scaler = StandardScaler()
df[['Feature', 'Leakage']] = scaler.fit_transform(df[['Feature', 'Leakage']])

# Perform train/test split after preprocessing (incorrect order)
X = df[['Feature', 'Leakage']]  # Features including leakage
y = df['Target']  # Target variable

# Incorrectly split data after preprocessing
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Train a model (e.g., Logistic Regression)
model = LogisticRegression()
model.fit(X_train, y_train)

# Evaluate the model on the test set
accuracy = model.score(X_test, y_test)
print(f"\nModel Accuracy (with data leakage): {accuracy:.2f}")

Updated Dataset:
   Feature  Leakage  Target
0       10        0       0
1       20        1       1
2       30        0       0
3       40        1       1
4       50        1       0
5       60        1       1

Model Accuracy (with data leakage): 0.50


In [25]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.linear_model import LogisticRegression

# Create a synthetic dataset with a clear data leakage scenario
data = {
    'Feature': [10, 20, 30, 40, 50, 60],
    'Leakage': [0, 1, 0, 1, 1, 1],  # Direct correlation with Target
    'Target': [0, 1, 0, 1, 0, 1]
}
df = pd.DataFrame(data)

# Display the updated dataset
print("Updated Dataset:")
print(df)

# Incorrect approach: Preprocessing before splitting (data leakage)
scaler = StandardScaler()
df[['Feature', 'Leakage']] = scaler.fit_transform(df[['Feature', 'Leakage']])

# Perform train/test split after preprocessing (incorrect order)
X = df[['Feature', 'Leakage']]  # Features including leakage
y = df['Target']  # Target variable

# Incorrectly split data after preprocessing
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Train a model (e.g., Logistic Regression) on preprocessed data (incorrectly)
model = LogisticRegression()
model.fit(X_train, y_train)

# Evaluate the model on the test set (with data leakage)
accuracy_leakage = model.score(X_test, y_test)
print(f"\nModel Accuracy (with data leakage): {accuracy_leakage:.2f}")

# Correct approach: Split data first, then preprocess to avoid data leakage
X = df[['Feature', 'Leakage']]  # Features including leakage
y = df['Target']  # Target variable

# Split the data into training and testing sets
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Preprocess the training data only (without leakage)
scaler = StandardScaler()
X_train_scaled = scaler.fit_transform(X_train)

# Train a model on preprocessed training data (correct approach)
model = LogisticRegression()
model.fit(X_train_scaled, y_train)

# Preprocess the test data using the same scaler
X_test_scaled = scaler.transform(X_test)

# Evaluate the model on the preprocessed test data (without data leakage)
accuracy_no_leakage = model.score(X_test_scaled, y_test)
print(f"\nModel Accuracy (without data leakage): {accuracy_no_leakage:.2f}")

Updated Dataset:
   Feature  Leakage  Target
0       10        0       0
1       20        1       1
2       30        0       0
3       40        1       1
4       50        1       0
5       60        1       1

Model Accuracy (with data leakage): 0.50

Model Accuracy (without data leakage): 0.50


In [26]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.linear_model import LogisticRegression

# Create a synthetic dataset with data leakage scenario
data = {
    'Feature': [10, 20, 30, 40, 50, 60],
    'Target': [0, 1, 0, 1, 0, 1]
}
df = pd.DataFrame(data)

# Display the dataset
print("Original Dataset:")
print(df)

# Preprocessing before splitting (data leakage)
scaler = StandardScaler()
df['Feature_scaled'] = scaler.fit_transform(df[['Feature']])

# Perform train/test split after preprocessing (incorrect order)
X = df[['Feature_scaled']]  # Scaled feature
y = df['Target']  # Target variable

# Incorrectly split data after preprocessing
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Train a model on preprocessed data (incorrectly)
model = LogisticRegression()
model.fit(X_train, y_train)

# Evaluate the model on the test set (with data leakage)
accuracy_leakage = model.score(X_test, y_test)
print(f"\nModel Accuracy (with data leakage): {accuracy_leakage:.2f}")

Original Dataset:
   Feature  Target
0       10       0
1       20       1
2       30       0
3       40       1
4       50       0
5       60       1

Model Accuracy (with data leakage): 0.50


In [29]:
X_train.head()

Unnamed: 0,Type,Method,Regionname,Rooms,Distance,Postcode,Bedroom2,Bathroom,Car,Landsize,BuildingArea,YearBuilt,Lattitude,Longtitude,Propertycount
12167,u,S,Southern Metropolitan,1,5.0,3182.0,1.0,1.0,1.0,0.0,,1940.0,-37.85984,144.9867,13240.0
6524,h,SA,Western Metropolitan,2,8.0,3016.0,2.0,2.0,1.0,193.0,,,-37.858,144.9005,6380.0
8413,h,S,Western Metropolitan,3,12.6,3020.0,3.0,1.0,1.0,555.0,,,-37.7988,144.822,3755.0
2919,u,SP,Northern Metropolitan,3,13.0,3046.0,3.0,1.0,1.0,265.0,,1995.0,-37.7083,144.9158,8870.0
6043,h,S,Western Metropolitan,3,13.3,3020.0,3.0,1.0,2.0,673.0,673.0,1970.0,-37.7623,144.8272,4217.0


In [30]:
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import OneHotEncoder

# Preprocessing for numerical data
numerical_transformer = SimpleImputer(strategy = 'constant')

# Preprocessing for categorical data
categorical_transformer = Pipeline(steps = [
    ('imputer', SimpleImputer(strategy = 'most_frequent')),
    ('onehot', OneHotEncoder(handle_unknown = 'ignore'))
])

# Bundle preprocessing for numerical and categorical data
preprocessor = ColumnTransformer(
    transformers = [
        ('num', numerical_transformer, numerical_cols),
        ('cat', categorical_transformer, categorical_cols)])

In [31]:
from sklearn.ensemble import RandomForestRegressor

model = RandomForestRegressor(n_estimators = 100, random_state = 0)

In [32]:
from sklearn.metrics import mean_absolute_error

# Bundle preprocessing and modeling code in a pipeline
my_pipeline = Pipeline(steps = [('preprocessor', preprocessor),
                                ('model', model)
                               ])

# Preprocessing of training data, fit model
my_pipeline.fit(X_train, y_train)

# Preprocessing of validation data, get predictions
preds = my_pipeline.predict(X_valid)

# Evaluate the model
score = mean_absolute_error(y_valid, preds)
print('MAE:', score)

MAE: 160679.18917034855


## 5. Cross-Validation

In [2]:
import pandas as pd

# Read the data
data = pd.read_csv('input/melb_data.csv')

# Select subset of predictors
cols_to_use = ['Rooms', 'Distance', 'Landsize', 'BuildingArea', 'YearBuilt']
X = data[cols_to_use]

# Select target
y = data['Price']

In [3]:
from sklearn.ensemble import RandomForestRegressor
from sklearn.pipeline import Pipeline
from sklearn.impute import SimpleImputer

my_pipeline = Pipeline(steps = [('preprocessor', SimpleImputer()),
                                ('model', RandomForestRegressor(n_estimators = 50,
                                                                random_state = 0))
                               ])

In [4]:
from sklearn.model_selection import cross_val_score

# Multiply by -1 since sklearn calculates *negative* MAE
scores = -1 * cross_val_score(my_pipeline, X, y,
                              cv = 5,
                              scoring = 'neg_mean_absolute_error')

print('MAE scores:\n', scores)

MAE scores:
 [301628.7894 303164.4783 287298.3317 236061.8475 260383.4511]


In [5]:
print('Average MAE score (across experiments):')
print(scores.mean())

Average MAE score (across experiments):
277707.3795913405


## 6. XGBoost

In [7]:
import pandas as pd
from sklearn.model_selection import train_test_split

# Read the data
data = pd.read_csv('input/melb_data.csv')

# Select subset of predictors
cols_to_use = ['Rooms', 'Distance', 'Landsize', 'BuildingArea', 'YearBuilt']
X = data[cols_to_use]

# Select target
y = data['Price']

# Separate data into training and validation sets
X_train, X_valid, y_train, y_valid = train_test_split(X, y)

In [10]:
pip install xgboost

Collecting xgboost
  Downloading xgboost-2.0.3-py3-none-win_amd64.whl (99.8 MB)
     ---------------------------------------- 99.8/99.8 MB 2.7 MB/s eta 0:00:00
Installing collected packages: xgboost
Successfully installed xgboost-2.0.3
Note: you may need to restart the kernel to use updated packages.


In [11]:
from xgboost import XGBRegressor

my_model = XGBRegressor()
my_model.fit(X_train, y_train)

XGBRegressor(base_score=None, booster=None, callbacks=None,
             colsample_bylevel=None, colsample_bynode=None,
             colsample_bytree=None, device=None, early_stopping_rounds=None,
             enable_categorical=False, eval_metric=None, feature_types=None,
             gamma=None, grow_policy=None, importance_type=None,
             interaction_constraints=None, learning_rate=None, max_bin=None,
             max_cat_threshold=None, max_cat_to_onehot=None,
             max_delta_step=None, max_depth=None, max_leaves=None,
             min_child_weight=None, missing=nan, monotone_constraints=None,
             multi_strategy=None, n_estimators=None, n_jobs=None,
             num_parallel_tree=None, random_state=None, ...)

In [12]:
from sklearn.metrics import mean_absolute_error

predictions = my_model.predict(X_valid)
print('Mean Absolute Error: ' + str(mean_absolute_error(predictions, y_valid)))

Mean Absolute Error: 244967.28017765094


In [13]:
my_model = XGBRegressor(n_estimators = 500)
my_model.fit(X_train, y_train)

XGBRegressor(base_score=None, booster=None, callbacks=None,
             colsample_bylevel=None, colsample_bynode=None,
             colsample_bytree=None, device=None, early_stopping_rounds=None,
             enable_categorical=False, eval_metric=None, feature_types=None,
             gamma=None, grow_policy=None, importance_type=None,
             interaction_constraints=None, learning_rate=None, max_bin=None,
             max_cat_threshold=None, max_cat_to_onehot=None,
             max_delta_step=None, max_depth=None, max_leaves=None,
             min_child_weight=None, missing=nan, monotone_constraints=None,
             multi_strategy=None, n_estimators=500, n_jobs=None,
             num_parallel_tree=None, random_state=None, ...)

In [15]:
my_model = XGBRegressor(n_estimators = 500)
my_model.fit(X_train, y_train,
             early_stopping_rounds = 5,
             eval_set = [(X_valid, y_valid)],
             verbose = False)



XGBRegressor(base_score=None, booster=None, callbacks=None,
             colsample_bylevel=None, colsample_bynode=None,
             colsample_bytree=None, device=None, early_stopping_rounds=None,
             enable_categorical=False, eval_metric=None, feature_types=None,
             gamma=None, grow_policy=None, importance_type=None,
             interaction_constraints=None, learning_rate=None, max_bin=None,
             max_cat_threshold=None, max_cat_to_onehot=None,
             max_delta_step=None, max_depth=None, max_leaves=None,
             min_child_weight=None, missing=nan, monotone_constraints=None,
             multi_strategy=None, n_estimators=500, n_jobs=None,
             num_parallel_tree=None, random_state=None, ...)

In [None]:
stopped at learning_rate