# Data Preprocessing in Python

## Importing the libraries

In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt

## Importing the datasets

In [2]:
df = pd.read_excel('innercity.xlsx')
df['dayhours'] = pd.to_datetime(df['dayhours'])
df['year'] = df['dayhours'].dt.year
df['month'] = df['dayhours'].dt.month
df['day'] = df['dayhours'].dt.day
df['hour'] = df['dayhours'].dt.hour
categorical_mask = (df.dtypes == object) 
# Remove the original 'dayhours' column
df = df.drop(columns=['dayhours'])
# Drop 'cid' column
df = df.drop(columns=['cid'])
df = df[~df.apply(lambda row: row.astype(str).str.contains('$',regex=False)).any(axis=1)]
df.head()

Unnamed: 0,price,room_bed,room_bath,gender,living_measure,lot_measure,ceil,coast,sight,condition,...,lat,long,living_measure15,lot_measure15,furnished,total_area,year,month,day,hour
0,600000,4.0,1.75,Female,3050.0,9440.0,1,0,0.0,3,...,47.7228,-122.183,2020.0,8660.0,0.0,12490,2015,4,27,0
1,190000,2.0,1.0,Male,670.0,3101.0,1,0,0.0,4,...,47.5546,-122.274,1660.0,4100.0,0.0,3771,2015,3,17,0
2,735000,4.0,2.75,Female,3040.0,2415.0,2,1,4.0,3,...,47.5188,-122.256,2620.0,2433.0,0.0,5455,2014,8,20,0
3,257000,3.0,2.5,Male,1740.0,3721.0,2,0,0.0,3,...,47.3363,-122.213,2030.0,3794.0,0.0,5461,2014,10,10,0
4,450000,2.0,1.0,Female,1120.0,4590.0,1,0,0.0,3,...,47.5663,-122.285,1120.0,5100.0,0.0,5710,2015,2,18,0


In [3]:
print(df.isnull().sum())
# df.fillna(df.mean(numeric_only=True), inplace=True)
# print(df.isnull().sum())

# numeric_columns = df.select_dtypes(include='number')
# df.fillna(numeric_columns.mean(), inplace=True)
# print(df.isnull().sum())

# df.dropna(inplace=True)
print(df.isnull().sum())

price                 0
room_bed             66
room_bath            66
gender                0
living_measure       17
lot_measure          42
ceil                 42
coast                 1
sight                57
condition            57
quality               1
ceil_measure          1
basement              1
yr_built              1
yr_renovated          0
zipcode               0
lat                   0
long                  0
living_measure15    124
lot_measure15        29
furnished            29
total_area           29
year                  0
month                 0
day                   0
hour                  0
dtype: int64
price                 0
room_bed             66
room_bath            66
gender                0
living_measure       17
lot_measure          42
ceil                 42
coast                 1
sight                57
condition            57
quality               1
ceil_measure          1
basement              1
yr_built              1
yr_renovated          0
zip

In [4]:
x = df.iloc[:, (df.columns != 'price') & (df.columns != 'dayhours') & (df.columns != 'cid')].values
print(x[0])

[4.0 1.75 'Female' 3050.0 9440.0 1 0 0.0 3 8.0 1800.0 1250.0 1966 0 98034
 47.7228 -122.183 2020.0 8660.0 0.0 12490 2015 4 27 0]


In [5]:
y = df.iloc[:, df.columns == 'price'].values
print(y)

[[ 600000]
 [ 190000]
 [ 735000]
 ...
 [ 998000]
 [ 262000]
 [1150000]]


## Taking care of missng values

In [6]:
df.isnull().sum()

price                 0
room_bed             66
room_bath            66
gender                0
living_measure       17
lot_measure          42
ceil                 42
coast                 1
sight                57
condition            57
quality               1
ceil_measure          1
basement              1
yr_built              1
yr_renovated          0
zipcode               0
lat                   0
long                  0
living_measure15    124
lot_measure15        29
furnished            29
total_area           29
year                  0
month                 0
day                   0
hour                  0
dtype: int64

In [7]:
check_nan = df.iloc[:, df.columns != 'price']
check_nan.isnull().sum()

room_bed             66
room_bath            66
gender                0
living_measure       17
lot_measure          42
ceil                 42
coast                 1
sight                57
condition            57
quality               1
ceil_measure          1
basement              1
yr_built              1
yr_renovated          0
zipcode               0
lat                   0
long                  0
living_measure15    124
lot_measure15        29
furnished            29
total_area           29
year                  0
month                 0
day                   0
hour                  0
dtype: int64

In [8]:
from sklearn.impute import SimpleImputer
ct = SimpleImputer(strategy = 'mean')
ct.fit(x[:, 0:2])
x[:, 0:2] = ct.transform(x[:, 0:2])

In [9]:
cts = SimpleImputer(strategy = 'mean')
cts.fit(x[:, 3:6])
x[:, 3:6] = cts.transform(x[:, 3:6])

In [10]:
# x[x == '$'] = '0'

In [11]:
ct.fit(x[:, 6:])
x[:, 6:] = ct.transform(x[:, 6:])

## Encoding the categorical Value

## Encoding the Independent Variable

In [12]:
x[x == 'Female'] = 0
x[x == 'Male'] = 1

In [13]:
# from sklearn.compose import ColumnTransformer
# from sklearn.preprocessing import OneHotEncoder
# ct = ColumnTransformer(transformers = [('encoder', OneHotEncoder(),categorical_mask )], remainder = 'passthrough')
# x_encoded = ct.fit_transform(x)
# x = np.array(ct.fit_transform(x))
# x_encoded

In [14]:
x[0]

array([4.0, 1.75, 0, 3050.0, 9440.0, 1.0, 0.0, 0.0, 3.0, 8.0, 1800.0,
       1250.0, 1966.0, 0.0, 98034.0, 47.7228, -122.183, 2020.0, 8660.0,
       0.0, 12490.0, 2015.0, 4.0, 27.0, 0.0], dtype=object)

## Splitting the dataset into the training set and testing set

In [15]:
from sklearn.model_selection import train_test_split
x_train, x_test, y_train, y_test = train_test_split(x,y, test_size=0.2, random_state =1)

In [16]:
x_train

array([[4.0, 1.0, 1, ..., 6.0, 19.0, 0.0],
       [4.0, 2.5, 0, ..., 7.0, 24.0, 0.0],
       [8.0, 5.0, 1, ..., 12.0, 4.0, 0.0],
       ...,
       [2.0, 1.75, 1, ..., 9.0, 15.0, 0.0],
       [4.0, 2.25, 1, ..., 12.0, 15.0, 0.0],
       [3.0, 1.5, 1, ..., 8.0, 15.0, 0.0]], dtype=object)

In [17]:
x_test

array([[2.0, 2.5, 0, ..., 2.0, 13.0, 0.0],
       [3.0, 2.5, 1, ..., 5.0, 16.0, 0.0],
       [3.0, 2.25, 0, ..., 11.0, 12.0, 0.0],
       ...,
       [2.0, 1.0, 0, ..., 8.0, 19.0, 0.0],
       [3.0, 3.5, 0, ..., 9.0, 12.0, 0.0],
       [3.0, 1.5, 0, ..., 3.0, 27.0, 0.0]], dtype=object)

In [18]:
y_train

array([[395000],
       [672000],
       [490000],
       ...,
       [766500],
       [734000],
       [259950]], dtype=int64)

In [19]:
y_test

array([[400000],
       [425000],
       [490000],
       ...,
       [255000],
       [496000],
       [552500]], dtype=int64)

## Feature Scaling

In [20]:
# from sklearn.preprocessing import StandardScaler
# sc = StandardScaler()
# x_train[:,:] = sc.fit_transform(x_train[:,:])
# x_test[:,:] = sc.fit_transform(x_test[:,:])

In [21]:
x_train

array([[4.0, 1.0, 1, ..., 6.0, 19.0, 0.0],
       [4.0, 2.5, 0, ..., 7.0, 24.0, 0.0],
       [8.0, 5.0, 1, ..., 12.0, 4.0, 0.0],
       ...,
       [2.0, 1.75, 1, ..., 9.0, 15.0, 0.0],
       [4.0, 2.25, 1, ..., 12.0, 15.0, 0.0],
       [3.0, 1.5, 1, ..., 8.0, 15.0, 0.0]], dtype=object)

In [22]:
x_test

array([[2.0, 2.5, 0, ..., 2.0, 13.0, 0.0],
       [3.0, 2.5, 1, ..., 5.0, 16.0, 0.0],
       [3.0, 2.25, 0, ..., 11.0, 12.0, 0.0],
       ...,
       [2.0, 1.0, 0, ..., 8.0, 19.0, 0.0],
       [3.0, 3.5, 0, ..., 9.0, 12.0, 0.0],
       [3.0, 1.5, 0, ..., 3.0, 27.0, 0.0]], dtype=object)

In [23]:
from sklearn.linear_model import LinearRegression
regressor = LinearRegression()
regressor.fit(x_train,y_train)

LinearRegression()

In [24]:
regressor.score(x_test,y_test)

0.6890590128087117

In [25]:
# Predict house prices on the test set
y_pred = regressor.predict(x_test)

In [26]:
# Evaluate the model using metrics
from sklearn.metrics import mean_squared_error, r2_score
mse = mean_squared_error(y_test, y_pred)
r2 = r2_score(y_test, y_pred)

In [27]:
print("Mean Squared Error(MSE):", mse)
print("R-squared (R2) Score:", r2)

Mean Squared Error(MSE): 38898381989.28538
R-squared (R2) Score: 0.6890590128087117


In [28]:
from sklearn.ensemble import RandomForestRegressor
regressor = RandomForestRegressor(n_estimators = 10, random_state = 0)
regressor.fit(x_train,y_train)

# rf_predictions = regressor.predict(x_test)

  regressor.fit(x_train,y_train)


RandomForestRegressor(n_estimators=10, random_state=0)

In [29]:
regressor.score(x_test, y_test)

0.8715925996928533

In [30]:
# Predict house prices on the test set
y_pred = regressor.predict(x_test)

In [31]:
# Evaluate the model using metrics
from sklearn.metrics import mean_squared_error, r2_score
mse = mean_squared_error(y_test, y_pred)
r2 = r2_score(y_test, y_pred)

In [32]:
print("Mean Squared Error(MSE):", mse)
print("R-squared (R2) Score:", r2)

Mean Squared Error(MSE): 16063627225.59214
R-squared (R2) Score: 0.8715925996928533
