# Data Preprocessing in Python

## Importing the libraries

In [286]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt

## Importing the datasets

In [287]:
df = pd.read_excel('innercity.xlsx')
df['dayhours'] = pd.to_datetime(df['dayhours'])
df['year'] = df['dayhours'].dt.year
df['month'] = df['dayhours'].dt.month
df['day'] = df['dayhours'].dt.day
df['hour'] = df['dayhours'].dt.hour
categorical_mask = (df.dtypes == object) 
# df = df[~df.apply(lambda row: row.astype(str).str.contains('$',regex=False)).any(axis=1)]
df.head()

Unnamed: 0,cid,dayhours,price,room_bed,room_bath,gender,living_measure,lot_measure,ceil,coast,...,lat,long,living_measure15,lot_measure15,furnished,total_area,year,month,day,hour
0,3876100940,2015-04-27,600000,4.0,1.75,Female,3050.0,9440.0,1,0,...,47.7228,-122.183,2020.0,8660.0,0.0,12490,2015,4,27,0
1,3145600250,2015-03-17,190000,2.0,1.0,Male,670.0,3101.0,1,0,...,47.5546,-122.274,1660.0,4100.0,0.0,3771,2015,3,17,0
2,7129303070,2014-08-20,735000,4.0,2.75,Female,3040.0,2415.0,2,1,...,47.5188,-122.256,2620.0,2433.0,0.0,5455,2014,8,20,0
3,7338220280,2014-10-10,257000,3.0,2.5,Male,1740.0,3721.0,2,0,...,47.3363,-122.213,2030.0,3794.0,0.0,5461,2014,10,10,0
4,7950300670,2015-02-18,450000,2.0,1.0,Female,1120.0,4590.0,1,0,...,47.5663,-122.285,1120.0,5100.0,0.0,5710,2015,2,18,0


In [288]:
print(df.isnull().sum())
df.fillna(df.mean(numeric_only=True), inplace=True)
print(df.isnull().sum())

numeric_columns = df.select_dtypes(include='number')
df.fillna(numeric_columns.mean(), inplace=True)
print(df.isnull().sum())

df.dropna(inplace=True)
print(df.isnull().sum())

cid                   0
dayhours              0
price                 0
room_bed            108
room_bath           108
gender                0
living_measure       17
lot_measure          42
ceil                 42
coast                 1
sight                57
condition            57
quality               1
ceil_measure          1
basement              1
yr_built              1
yr_renovated          0
zipcode               0
lat                   0
long                  0
living_measure15    166
lot_measure15        29
furnished            29
total_area           29
year                  0
month                 0
day                   0
hour                  0
dtype: int64
cid                  0
dayhours             0
price                0
room_bed             0
room_bath            0
gender               0
living_measure       0
lot_measure          0
ceil                42
coast                1
sight                0
condition           57
quality              0
ceil_measure    

In [289]:
x = df.iloc[:, (df.columns != 'price') & (df.columns != 'dayhours') & (df.columns != 'cid')].values
print(x[0])

[4.0 1.75 'Female' 3050.0 9440.0 1 0 0.0 3 8.0 1800.0 1250.0 1966 0 98034
 47.7228 -122.183 2020.0 8660.0 0.0 12490 2015 4 27 0]


In [290]:
y = df.iloc[:, df.columns == 'price'].values
print(y)

[[ 600000]
 [ 190000]
 [ 735000]
 ...
 [ 998000]
 [ 262000]
 [1150000]]


## Taking care of missng values

In [291]:
df.isnull().sum()

cid                 0
dayhours            0
price               0
room_bed            0
room_bath           0
gender              0
living_measure      0
lot_measure         0
ceil                0
coast               0
sight               0
condition           0
quality             0
ceil_measure        0
basement            0
yr_built            0
yr_renovated        0
zipcode             0
lat                 0
long                0
living_measure15    0
lot_measure15       0
furnished           0
total_area          0
year                0
month               0
day                 0
hour                0
dtype: int64

In [292]:
check_nan = df.iloc[:, df.columns != 'price']
check_nan.isnull().sum()

cid                 0
dayhours            0
room_bed            0
room_bath           0
gender              0
living_measure      0
lot_measure         0
ceil                0
coast               0
sight               0
condition           0
quality             0
ceil_measure        0
basement            0
yr_built            0
yr_renovated        0
zipcode             0
lat                 0
long                0
living_measure15    0
lot_measure15       0
furnished           0
total_area          0
year                0
month               0
day                 0
hour                0
dtype: int64

In [293]:
from sklearn.impute import SimpleImputer
ct = SimpleImputer(strategy = 'mean')
ct.fit(x[:, 0:2])
x[:, 0:2] = ct.transform(x[:, 0:2])

In [294]:
x[0]

array([4.0, 1.75, 'Female', 3050.0, 9440.0, 1, 0, 0.0, 3, 8.0, 1800.0,
       1250.0, 1966, 0, 98034, 47.7228, -122.183, 2020.0, 8660.0, 0.0,
       12490, 2015, 4, 27, 0], dtype=object)

In [295]:
x[x == '$'] = '0'

In [296]:
ct.fit(x[:, 6:])
x[:, 6:] = ct.transform(x[:, 6:])

## Encoding the categorical Value

## Encoding the Independent Variable

In [297]:
x[x == 'Female'] = 0
x[x == 'Male'] = 1

In [298]:
# from sklearn.compose import ColumnTransformer
# from sklearn.preprocessing import OneHotEncoder
# ct = ColumnTransformer(transformers = [('encoder', OneHotEncoder(),categorical_mask )], remainder = 'passthrough')
# x_encoded = ct.fit_transform(x)
# x = np.array(ct.fit_transform(x))
# x_encoded

In [299]:
x[0]

array([4.0, 1.75, 0, 3050.0, 9440.0, 1, 0.0, 0.0, 3.0, 8.0, 1800.0,
       1250.0, 1966.0, 0.0, 98034.0, 47.7228, -122.183, 2020.0, 8660.0,
       0.0, 12490.0, 2015.0, 4.0, 27.0, 0.0], dtype=object)

## Splitting the dataset into the training set and testing set

In [300]:
from sklearn.model_selection import train_test_split
x_train, x_test, y_train, y_test = train_test_split(x,y, test_size=0.2, random_state =1)

In [301]:
x_train

array([[3.0, 3.0, 1, ..., 11.0, 20.0, 0.0],
       [5.0, 3.0, 1, ..., 9.0, 9.0, 0.0],
       [3.0, 2.75, 0, ..., 3.0, 16.0, 0.0],
       ...,
       [4.0, 1.75, 1, ..., 12.0, 12.0, 0.0],
       [4.0, 2.0, 1, ..., 6.0, 24.0, 0.0],
       [4.0, 1.75, 1, ..., 5.0, 30.0, 0.0]], dtype=object)

In [302]:
x_test

array([[4.0, 2.5, 0, ..., 8.0, 25.0, 0.0],
       [3.0, 2.5, 0, ..., 4.0, 8.0, 0.0],
       [4.0, 2.0, 0, ..., 11.0, 17.0, 0.0],
       ...,
       [2.0, 1.75, 1, ..., 5.0, 5.0, 0.0],
       [3.0, 2.75, 0, ..., 12.0, 26.0, 0.0],
       [3.0, 2.5, 0, ..., 6.0, 9.0, 0.0]], dtype=object)

In [303]:
y_train

array([[ 649500],
       [ 559900],
       [1070000],
       ...,
       [ 289950],
       [ 339000],
       [ 715000]], dtype=int64)

In [304]:
y_test

array([[ 530000],
       [1010000],
       [ 295000],
       ...,
       [ 276200],
       [ 473000],
       [1150000]], dtype=int64)

## Feature Scaling

In [305]:
from sklearn.preprocessing import StandardScaler
sc = StandardScaler()
x_train[:,:] = sc.fit_transform(x_train[:,:])
x_test[:,:] = sc.fit_transform(x_test[:,:])

In [306]:
x_train

array([[-0.4141697560857475, 1.1466704029717916, 1.536117176922452, ...,
        1.4162344333243995, 0.4959457004242489, 0.0],
       [1.7988625150975472, 1.1466704029717916, 1.536117176922452, ...,
        0.7748390538982881, -0.7765645345943011, 0.0],
       [-0.4141697560857475, 0.8225998266238329, -0.6509920044015296,
        ..., -1.1493470843800457, 0.033214705872048916, 0.0],
       ...,
       [0.6923463795058998, -0.4736824787680018, 1.536117176922452, ...,
        1.736932123037455, -0.42951628868015107, 0.0],
       [0.6923463795058998, -0.14961190242004313, 1.536117176922452, ...,
        -0.18725401524087879, 0.9586766949764489, 0.0],
       [0.6923463795058998, -0.4736824787680018, 1.536117176922452, ...,
        -0.5079517049539345, 1.6527731868047488, 0.0]], dtype=object)

In [307]:
x_test

array([[0.6293226062333607, 0.512947906291363, -0.6443697545965218, ...,
        0.47392343413260873, 1.0958065744382937, 0.0],
       [-0.3501700222802705, 0.512947906291363, -0.6443697545965218, ...,
        -0.8132316495337115, -0.8850005101936641, 0.0],
       [0.6293226062333607, -0.14976549848559273, -0.6443697545965218,
        ..., 1.439289746882349, 0.16366206402325473, 0.0],
       ...,
       [-1.3296626507939018, -0.4811222008740706, 1.5519040005627571,
        ..., -0.4914428786171315, -1.2345547015993037, 0.0],
       [-0.3501700222802705, 0.8443046086798409, -0.6443697545965218,
        ..., 1.761078517798929, 1.2123246382401736, 0.0],
       [-0.3501700222802705, 0.512947906291363, -0.6443697545965218, ...,
        -0.16965410770055142, -0.7684824463917842, 0.0]], dtype=object)

In [308]:
from sklearn.linear_model import LinearRegression
regressor = LinearRegression()
regressor.fit(x_train,y_train)

LinearRegression()

In [309]:
rf_predictions = regressor.predict(x_test)

In [310]:
regressor.score(x_test,y_test)

0.651054679367552