In [8]:
import pandas as pd
from sklearn.model_selection import train_test_split

# Load the dataset from the CSV file
data = pd.read_csv('melb_data.csv')

# Split the target column ('Price') from the predictor columns
y = data['Price']
x = data.drop(columns=['Price'])

# Hold out 20% of the rows for validation; the fixed random_state makes the
# split reproducible across runs
x_train_full, x_valid_full, y_train, y_valid = train_test_split(
    x,
    y,
    train_size=0.8,
    test_size=0.2,
    random_state=0,
)

# Drop columns with missing values (simplest approach).
# The columns are identified on the training split only, and the same list is
# removed from both splits so they keep identical columns. A column that is
# complete in training but has gaps in validation is therefore kept.
cols_with_missing = x_train_full.columns[x_train_full.isnull().any()].tolist()

# Rebind the names instead of using inplace=True: the frames returned by
# train_test_split are derived from `x`, and mutating them in place can raise
# SettingWithCopyWarning. Reassignment yields the same resulting frames.
x_train_full = x_train_full.drop(columns=cols_with_missing)
x_valid_full = x_valid_full.drop(columns=cols_with_missing)

# Select text ('object') columns with relatively low cardinality.
# The cutoff of fewer than 10 distinct values is convenient but arbitrary.
low_cardinality_cols = [
    cname
    for cname in x_train_full.columns
    if x_train_full[cname].dtype == 'object' and x_train_full[cname].nunique() < 10
]

# Show the selected column names
low_cardinality_cols


['Suburb', 'Type', 'Method', 'SellerG', 'Date', 'Regionname']

In [9]:
# Collect the names of the columns whose dtype is int64 or float64
numerical_cols = [
    cname
    for cname, col_dtype in x_train_full.dtypes.items()
    if col_dtype in ['int64', 'float64']
]
numerical_cols

['Rooms',
 'Distance',
 'Postcode',
 'Bedroom2',
 'Bathroom',
 'Landsize',
 'Lattitude',
 'Longtitude',
 'Propertycount']

In [10]:
# Restrict both splits to the selected low-cardinality and numerical columns.
# .copy() gives each result its own data, independent of the *_full frames.
my_cols = [*low_cardinality_cols, *numerical_cols]
x_train = x_train_full.loc[:, my_cols].copy()
x_valid = x_valid_full.loc[:, my_cols].copy()

In [11]:
# Preview the first five rows of the training predictors
x_train.head(n=5)

Unnamed: 0,Suburb,Type,Method,SellerG,Date,Regionname,Rooms,Distance,Postcode,Bedroom2,Bathroom,Landsize,Lattitude,Longtitude,Propertycount
12167,St Kilda,u,S,hockingstuart,29/07/2017,Southern Metropolitan,1,5.0,3182.0,1.0,1.0,0.0,-37.85984,144.9867,13240.0
6524,Williamstown,h,SA,Hunter,17/09/2016,Western Metropolitan,2,8.0,3016.0,2.0,2.0,193.0,-37.858,144.9005,6380.0
8413,Sunshine,h,S,Barry,8/04/2017,Western Metropolitan,3,12.6,3020.0,3.0,1.0,555.0,-37.7988,144.822,3755.0
2919,Glenroy,u,SP,Brad,18/06/2016,Northern Metropolitan,3,13.0,3046.0,3.0,1.0,265.0,-37.7083,144.9158,8870.0
6043,Sunshine North,h,S,First,22/05/2016,Western Metropolitan,3,13.3,3020.0,3.0,1.0,673.0,-37.7623,144.8272,4217.0


In [16]:
# Boolean mask indexed by column name: True where the dtype is 'object' (text)
s = x_train.dtypes == 'object'

# Names of the categorical columns, i.e. the entries where the mask is True
object_cols = [cname for cname, is_object in s.items() if is_object]


Suburb            True
Type              True
Method            True
SellerG           True
Date              True
Regionname        True
Rooms            False
Distance         False
Postcode         False
Bedroom2         False
Bathroom         False
Landsize         False
Lattitude        False
Longtitude       False
Propertycount    False
dtype: bool