In [5]:
### Library

# Data manipulation and visualization

import pandas as pd

pd.options.display.float_format = '{:,.2f}'.format

pd.options.display.max_rows = 20

pd.options.display.max_columns = 20

import numpy as np

import matplotlib.pyplot as plt

import seaborn as sns

from sklearn import preprocessing



# Modeling algorithms

# General

import statsmodels.api as sm

from scipy import stats



# Model selection

from sklearn.model_selection import train_test_split



def datasplit(df, Y_colname, test_size=0.2, random_state=123):
    """Split ``df`` into train/test feature and target sets.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame holding both the target column(s) and the feature columns.
    Y_colname : str or list of str
        Target column name(s). Every other column of ``df`` is used as a
        feature.
    test_size, random_state
        Forwarded unchanged to ``train_test_split``.

    Returns
    -------
    X_train, X_test, Y_train, Y_test
        The four pieces returned by ``train_test_split``. The target pieces
        come from ``df[Y_colname]``, so their type follows pandas indexing
        for whatever was passed as ``Y_colname``.

    Side effects
    ------------
    Prints the shapes of the train pair and then of the test pair.
    """
    # A bare string must be compared as a whole column name. Testing
    # `x not in "Price"` directly is a substring test and would also drop
    # feature columns such as "P" or "ice".
    if isinstance(Y_colname, str):
        target_names = [Y_colname]
    else:
        target_names = list(Y_colname)
    X_colname = [x for x in df.columns if x not in target_names]

    X_train, X_test, Y_train, Y_test = train_test_split(
        df[X_colname], df[Y_colname],
        test_size=test_size, random_state=random_state)
    print(X_train.shape, Y_train.shape)
    print(X_test.shape, Y_test.shape)

    return X_train, X_test, Y_train, Y_test



def scale(scaler, X_train, X_test):
    """Fit ``scaler`` on the training features and transform both splits.

    The scaler is fitted on ``X_train`` only; the object returned by
    ``scaler.fit`` is then used to transform ``X_train`` and ``X_test``.

    Parameters
    ----------
    scaler : object with ``fit`` and ``transform`` methods
    X_train, X_test : pandas.DataFrame

    Returns
    -------
    (X_train_scaling, X_test_scaling) : tuple of pandas.DataFrame
        Transformed values carrying the index and column labels of the
        corresponding input frame.
    """
    fitted = scaler.fit(X_train)

    def _as_frame(features):
        # Re-wrap the transformed values with the labels of the input frame.
        return pd.DataFrame(fitted.transform(features),
                            index=features.index, columns=features.columns)

    return _as_frame(X_train), _as_frame(X_test)









### Data Loading
# `load_boston` was deprecated in scikit-learn 1.0 and removed in 1.2, where
# importing it raises ImportError. Use it when it is available; otherwise
# rebuild the same arrays from the original StatLib file (requires network
# access). The parsing recipe follows scikit-learn's deprecation notice.
try:
    from sklearn.datasets import load_boston
    raw = load_boston()
except ImportError:
    from sklearn.utils import Bunch

    boston_txt = pd.read_csv('http://lib.stat.cmu.edu/datasets/boston',
                             sep=r'\s+', skiprows=22, header=None)
    # Keep `raw` attribute-compatible with the object `load_boston` returned.
    raw = Bunch(
        data=np.hstack([boston_txt.values[::2, :], boston_txt.values[1::2, :2]]),
        target=boston_txt.values[1::2, 2],
        feature_names=np.array(['CRIM', 'ZN', 'INDUS', 'CHAS', 'NOX', 'RM', 'AGE',
                                'DIS', 'RAD', 'TAX', 'PTRATIO', 'B', 'LSTAT']),
    )

df_X = pd.DataFrame(raw.data, columns=raw.feature_names)
df_Y = pd.DataFrame(raw.target, columns=['Price'])
# Target column first, followed by the feature columns.
df = pd.concat([df_Y, df_X], axis=1)

In [8]:
# Display the combined frame: the 'Price' target column followed by the features.
df

Unnamed: 0,Price,CRIM,ZN,INDUS,CHAS,NOX,RM,AGE,DIS,RAD,TAX,PTRATIO,B,LSTAT
0,24.00,0.01,18.00,2.31,0.00,0.54,6.58,65.20,4.09,1.00,296.00,15.30,396.90,4.98
1,21.60,0.03,0.00,7.07,0.00,0.47,6.42,78.90,4.97,2.00,242.00,17.80,396.90,9.14
2,34.70,0.03,0.00,7.07,0.00,0.47,7.18,61.10,4.97,2.00,242.00,17.80,392.83,4.03
3,33.40,0.03,0.00,2.18,0.00,0.46,7.00,45.80,6.06,3.00,222.00,18.70,394.63,2.94
4,36.20,0.07,0.00,2.18,0.00,0.46,7.15,54.20,6.06,3.00,222.00,18.70,396.90,5.33
5,28.70,0.03,0.00,2.18,0.00,0.46,6.43,58.70,6.06,3.00,222.00,18.70,394.12,5.21
6,22.90,0.09,12.50,7.87,0.00,0.52,6.01,66.60,5.56,5.00,311.00,15.20,395.60,12.43
7,27.10,0.14,12.50,7.87,0.00,0.52,6.17,96.10,5.95,5.00,311.00,15.20,396.90,19.15
8,16.50,0.21,12.50,7.87,0.00,0.52,5.63,100.00,6.08,5.00,311.00,15.20,386.63,29.93
9,18.90,0.17,12.50,7.87,0.00,0.52,6.00,85.90,6.59,5.00,311.00,15.20,386.71,17.10


In [9]:
# Bin edges for RAD. With np.digitize's default (right=False), a value v with
# interval[i-1] <= v < interval[i] maps to i, and every value >= 6 maps to 6.
interval = [1, 2, 3, 4, 5, 6]

# Guard the whole transformation on RAD still being present. The cell deletes
# the column at the end, so without this guard a second run would raise
# KeyError on `df['RAD']`.
if 'RAD' in df.columns:
    if df['RAD'].max() >= 6:
        df['RAD'] = np.digitize(df['RAD'], bins=interval)

    # One-hot encode the binned values; drop_first omits the first level so
    # the dummy columns are not perfectly collinear.
    df_dummy = pd.get_dummies(df['RAD'], prefix='RAD', drop_first=True)
    df = pd.concat([df, df_dummy], axis=1)
    del df['RAD']

In [10]:
# Display the frame again to inspect the current set of columns.
df

Unnamed: 0,Price,CRIM,ZN,INDUS,CHAS,NOX,RM,AGE,DIS,TAX,PTRATIO,B,LSTAT,RAD_2,RAD_3,RAD_4,RAD_5,RAD_6
0,24.00,0.01,18.00,2.31,0.00,0.54,6.58,65.20,4.09,296.00,15.30,396.90,4.98,0,0,0,0,0
1,21.60,0.03,0.00,7.07,0.00,0.47,6.42,78.90,4.97,242.00,17.80,396.90,9.14,1,0,0,0,0
2,34.70,0.03,0.00,7.07,0.00,0.47,7.18,61.10,4.97,242.00,17.80,392.83,4.03,1,0,0,0,0
3,33.40,0.03,0.00,2.18,0.00,0.46,7.00,45.80,6.06,222.00,18.70,394.63,2.94,0,1,0,0,0
4,36.20,0.07,0.00,2.18,0.00,0.46,7.15,54.20,6.06,222.00,18.70,396.90,5.33,0,1,0,0,0
5,28.70,0.03,0.00,2.18,0.00,0.46,6.43,58.70,6.06,222.00,18.70,394.12,5.21,0,1,0,0,0
6,22.90,0.09,12.50,7.87,0.00,0.52,6.01,66.60,5.56,311.00,15.20,395.60,12.43,0,0,0,1,0
7,27.10,0.14,12.50,7.87,0.00,0.52,6.17,96.10,5.95,311.00,15.20,396.90,19.15,0,0,0,1,0
8,16.50,0.21,12.50,7.87,0.00,0.52,5.63,100.00,6.08,311.00,15.20,386.63,29.93,0,0,0,1,0
9,18.90,0.17,12.50,7.87,0.00,0.52,6.00,85.90,6.59,311.00,15.20,386.71,17.10,0,0,0,1,0


In [11]:
# Name of the target column, as created when the frame was assembled. It was
# never defined before this call, which raised NameError. Passing a list keeps
# the target splits as DataFrames.
Y_colname = ['Price']
X_train, X_test, Y_train, Y_test = datasplit(df, Y_colname)