In [37]:
import pandas as pd
import numpy as np





In [38]:
def ordinal_enc(data, cols):
    enc_maps = {}  # Store encoding maps for multiple columns
    for col in cols:
        unique_vals = data[col].unique()  # Get unique values for each column
        enc_map = {val: i for i, val in enumerate(unique_vals)}  # Create encoding map
        data[col] = data[col].map(enc_map)  # Apply the encoding
        enc_maps[col] = enc_map  # Store the encoding map
    return data, enc_maps

In [39]:
def one_hot_enc(data, cols):
    for col in cols:
        unique_vals = data[col].unique()
        one_hot_matrix = np.zeros((data.shape[0], len(unique_vals)))
        for i, val in enumerate(unique_vals):
            one_hot_matrix[:, i] = (data[col] == val).astype(int)
    one_hot_df = pd.DataFrame(one_hot_matrix, columns=[f"{col}_{val}" for val in unique_vals])
    data = pd.concat([data.drop(col, axis=1), one_hot_df], axis=1)
    return data

In [40]:
data=pd.read_csv("melb_data.csv")

In [49]:
data.head()
data.describe()

Unnamed: 0,Rooms,Price,Distance,Postcode,Bedroom2,Bathroom,Car,Landsize,BuildingArea,YearBuilt,Lattitude,Longtitude,Propertycount
count,13580.0,13580.0,13580.0,13580.0,13580.0,13580.0,13518.0,13580.0,7130.0,8205.0,13580.0,13580.0,13580.0
mean,2.937997,1075684.0,10.137776,3105.301915,2.914728,1.534242,1.610075,558.416127,151.96765,1964.684217,-37.809203,144.995216,7454.417378
std,0.955748,639310.7,5.868725,90.676964,0.965921,0.691712,0.962634,3990.669241,541.014538,37.273762,0.07926,0.103916,4378.581772
min,1.0,85000.0,0.0,3000.0,0.0,0.0,0.0,0.0,0.0,1196.0,-38.18255,144.43181,249.0
25%,2.0,650000.0,6.1,3044.0,2.0,1.0,1.0,177.0,93.0,1940.0,-37.856822,144.9296,4380.0
50%,3.0,903000.0,9.2,3084.0,3.0,1.0,2.0,440.0,126.0,1970.0,-37.802355,145.0001,6555.0
75%,3.0,1330000.0,13.0,3148.0,3.0,2.0,2.0,651.0,174.0,1999.0,-37.7564,145.058305,10331.0
max,10.0,9000000.0,48.1,3977.0,20.0,8.0,10.0,433014.0,44515.0,2018.0,-37.40853,145.52635,21650.0


In [42]:
from sklearn.model_selection import train_test_split
y = data.Price
X = data.drop(['Price'], axis=1)
cols_with_missing = [col for col in X_train_full.columns if X_train_full[col].isnull().any()] 
X_train_full.drop(cols_with_missing, axis=1, inplace=True)
X_valid_full.drop(cols_with_missing, axis=1, inplace=True)

In [43]:
X_train_full, X_valid_full, y_train, y_valid = train_test_split(X, y, train_size=0.8, test_size=0.2,
                                                                random_state=0)
cols_with_missing = [col for col in X_train_full.columns if X_train_full[col].isnull().any()] 
X_train_full.drop(cols_with_missing, axis=1, inplace=True)
X_valid_full.drop(cols_with_missing, axis=1, inplace=True)

# "Cardinality" means the number of unique values in a column
# Select categorical columns with relatively low cardinality (convenient but arbitrary)
low_cardinality_cols = [cname for cname in X_train_full.columns if X_train_full[cname].nunique() < 10 and 
                        X_train_full[cname].dtype == "object"]

# Select numerical columns
numerical_cols = [cname for cname in X_train_full.columns if X_train_full[cname].dtype in ['int64', 'float64']]

In [44]:
cols_with_missing = [col for col in X_train_full.columns if X_train_full[col].isnull().any()] 
X_train_full.drop(cols_with_missing, axis=1, inplace=True)
X_valid_full.drop(cols_with_missing, axis=1, inplace=True)

# "Cardinality" means the number of unique values in a column
# Select categorical columns with relatively low cardinality (convenient but arbitrary)
low_cardinality_cols = [cname for cname in X_train_full.columns if X_train_full[cname].nunique() < 10 and 
                        X_train_full[cname].dtype == "object"]

# Select numerical columns
numerical_cols = [cname for cname in X_train_full.columns if X_train_full[cname].dtype in ['int64', 'float64']]

In [45]:
X_train_full.head()

Unnamed: 0,Suburb,Address,Rooms,Type,Method,SellerG,Date,Distance,Postcode,Bedroom2,Bathroom,Landsize,Lattitude,Longtitude,Regionname,Propertycount
12167,St Kilda,11/22 Charnwood Cr,1,u,S,hockingstuart,29/07/2017,5.0,3182.0,1.0,1.0,0.0,-37.85984,144.9867,Southern Metropolitan,13240.0
6524,Williamstown,18 James St,2,h,SA,Hunter,17/09/2016,8.0,3016.0,2.0,2.0,193.0,-37.858,144.9005,Western Metropolitan,6380.0
8413,Sunshine,10 Dundalk St,3,h,S,Barry,8/04/2017,12.6,3020.0,3.0,1.0,555.0,-37.7988,144.822,Western Metropolitan,3755.0
2919,Glenroy,1/2 Prospect St,3,u,SP,Brad,18/06/2016,13.0,3046.0,3.0,1.0,265.0,-37.7083,144.9158,Northern Metropolitan,8870.0
6043,Sunshine North,35 Furlong Rd,3,h,S,First,22/05/2016,13.3,3020.0,3.0,1.0,673.0,-37.7623,144.8272,Western Metropolitan,4217.0


In [46]:
s = (X_train_full.dtypes == 'object')
object_cols = list(s[s].index)

print("Categorical variables:")
print(object_cols)

Categorical variables:
['Suburb', 'Address', 'Type', 'Method', 'SellerG', 'Date', 'Regionname']


In [47]:
ordinal_enc(X_train_full, object_cols)

(       Suburb  Address  Rooms  Type  Method  SellerG  Date  Distance  \
 12167       0        0      1     0       0        0     0       5.0   
 6524        1        1      2     1       1        1     1       8.0   
 8413        2        2      3     1       0        2     2      12.6   
 2919        3        3      3     0       2        3     3      13.0   
 6043        4        4      3     1       0        4     4      13.3   
 ...       ...      ...    ...   ...     ...      ...   ...       ...   
 13123     109    10738      3     1       2       63    54       5.2   
 3264       76    10739      3     1       0       42    27      10.5   
 9845       15    10740      4     1       4       11    24       6.7   
 10799      23    10741      3     1       0       22    16      12.0   
 2732       44     4487      4     1       2       13     9       6.4   
 
        Postcode  Bedroom2  Bathroom  Landsize  Lattitude  Longtitude  \
 12167    3182.0       1.0       1.0       0.0  -

In [48]:
one_hot_enc(X_train_full, np.array(object_cols))

Unnamed: 0,Suburb,Address,Rooms,Type,Method,SellerG,Date,Distance,Postcode,Bedroom2,...,Longtitude,Propertycount,Regionname_0,Regionname_1,Regionname_2,Regionname_3,Regionname_4,Regionname_5,Regionname_6,Regionname_7
12167,0.0,0.0,1.0,0.0,0.0,0.0,0.0,5.0,3182.0,1.0,...,144.9867,13240.0,,,,,,,,
6524,1.0,1.0,2.0,1.0,1.0,1.0,1.0,8.0,3016.0,2.0,...,144.9005,6380.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
8413,2.0,2.0,3.0,1.0,0.0,2.0,2.0,12.6,3020.0,3.0,...,144.8220,3755.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0
2919,3.0,3.0,3.0,0.0,2.0,3.0,3.0,13.0,3046.0,3.0,...,144.9158,8870.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0
6043,4.0,4.0,3.0,1.0,0.0,4.0,4.0,13.3,3020.0,3.0,...,144.8272,4217.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
10849,,,,,,,,,,,...,,,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
10855,,,,,,,,,,,...,,,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0
10856,,,,,,,,,,,...,,,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
10859,,,,,,,,,,,...,,,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0
