In [0]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns

In [0]:
from pyspark.sql.types import DoubleType,StringType,StructField,StructType

# Explicit schema handed to the CSV reader: seven nullable double columns
# followed by one nullable string column, declared in this order.
#
# NOTE(review): the rows rendered by the cells that follow show values such as
# 8.3252 under `ocean_proximity`, and that string column later reports 12509
# distinct values. That suggests these names do not line up with the real
# columns of housing.csv (presumably the file has more columns and/or a
# different order, and the schema is being applied by position) -- confirm
# against the file's header row before trusting any column name below.
schema = (
    StructType()
    .add("longitude", DoubleType(), True)
    .add("latitude", DoubleType(), True)
    .add("price", DoubleType(), True)
    .add("total_bedrooms", DoubleType(), True)
    .add("total_rooms", DoubleType(), True)
    .add("median_income", DoubleType(), True)
    .add("median_house_value", DoubleType(), True)
    .add("ocean_proximity", StringType(), True)
)

In [0]:
# Load the housing CSV from DBFS with the explicit `schema`; the first line of
# the file is treated as a header row rather than data.
df = (
    spark.read.format("csv")
    .option("header", "true")
    .schema(schema)
    .load("dbfs:/FileStore/housing.csv")
)

In [0]:
# Show the freshly loaded DataFrame to eyeball the parsed rows.
display(df)

longitude,latitude,price,total_bedrooms,total_rooms,median_income,median_house_value,ocean_proximity
-122.23,37.88,41.0,880.0,129.0,322.0,126.0,8.3252
-122.22,37.86,21.0,7099.0,1106.0,2401.0,1138.0,8.3014
-122.24,37.85,52.0,1467.0,190.0,496.0,177.0,7.2574
-122.25,37.85,52.0,1274.0,235.0,558.0,219.0,5.6431
-122.25,37.85,52.0,1627.0,280.0,565.0,259.0,3.8462
-122.25,37.85,52.0,919.0,213.0,413.0,193.0,4.0368
-122.25,37.84,52.0,2535.0,489.0,1094.0,514.0,3.6591
-122.25,37.84,52.0,3104.0,687.0,1157.0,647.0,3.12
-122.26,37.84,42.0,2555.0,665.0,1206.0,595.0,2.0804
-122.25,37.84,52.0,3549.0,707.0,1551.0,714.0,3.6912


# 1 Missing value imputations

In [0]:
# Convert the Spark DataFrame into a pandas DataFrame; every later cell uses
# the pandas API on `df`.
# The isinstance guard keeps this cell re-runnable: after the first run `df`
# is already a pandas DataFrame, which has no `.toPandas()` method, so an
# unguarded second run would raise AttributeError.
if not isinstance(df, pd.DataFrame):
    df = df.toPandas()
# Number of missing values in each column.
df.isnull().sum()

longitude               0
latitude                0
price                   0
total_bedrooms          0
total_rooms           207
median_income           0
median_house_value      0
ocean_proximity         0
dtype: int64

In [0]:
# Inspect the pandas dtype of each column after the Spark -> pandas conversion.
df.dtypes

longitude             float64
latitude              float64
price                 float64
total_bedrooms        float64
total_rooms           float64
median_income         float64
median_house_value    float64
ocean_proximity        object
dtype: object

In [0]:
# Mean imputation: replace missing values in each numeric column with that
# column's mean. Non-numeric columns are left untouched because the means are
# computed with numeric_only=True.
df = df.fillna(value=df.mean(numeric_only=True))

In [0]:
# Re-count missing values per column to confirm the mean imputation worked.
df.isnull().sum()

longitude             0
latitude              0
price                 0
total_bedrooms        0
total_rooms           0
median_income         0
median_house_value    0
ocean_proximity       0
dtype: int64

In [0]:
# List the object-dtype (non-numeric) columns, which mean imputation skipped.
df.select_dtypes(include=['object']).columns

Index(['ocean_proximity'], dtype='object')

In [0]:
# Mode imputation for the string column: fill missing `ocean_proximity`
# entries with the column's most frequent value (the first mode if tied).
df = df.fillna({"ocean_proximity": df["ocean_proximity"].mode()[0]})

In [0]:
# Final missing-value count per column after both imputation steps.
df.isnull().sum()

longitude             0
latitude              0
price                 0
total_bedrooms        0
total_rooms           0
median_income         0
median_house_value    0
ocean_proximity       0
dtype: int64

# Outlier removal

## There are several ways of removing outliers and we shall be exploring the following

### 1 Zscore
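    This assumes that the data is approximately normally distributed. We compute each value's z-score (its distance from the column mean, measured in standard deviations) and remove any row in which a column's z-score lies beyond a chosen threshold (commonly 3).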
### 2 IQR (Inter Quartile Range)    
    We calculate the interquartile range (IQR = Q3 - Q1) for each column and then remove rows where any column is below the first quartile minus a multiple of the IQR, or above the third quartile plus the same multiple of the IQR (1.5 is the usual multiple).
### 3 Tukey's Fences Method
    We calculate the lower and upper fences from the first and third quartiles, then remove rows where any column has a value below the lower fence or above the upper fence.
### 4 Standard Deviation Method
    Calculate the mean and standard deviation of each column, then remove rows where any column value lies more than a chosen number of standard deviations above or below the mean.
### 5 Percentile Method
    Calculate the lower and upper percentiles for each column, then remove rows where any column value is below the lower percentile or above the upper percentile. This method makes no assumption about the distribution and still removes extreme values.

In [0]:
# IQR Method
# Columns that take part in outlier detection (and, later, in scaling).
numerical_columns = [
    "longitude",
    "latitude",
    "price",
    "total_bedrooms",
    "total_rooms",
    "median_income",
    "median_house_value",
]

numeric_values = df[numerical_columns]

# Per-column first and third quartiles.
Q1, Q3 = numeric_values.quantile(0.25), numeric_values.quantile(0.75)
# inter quartile range
IQR = Q3 - Q1
# Fences sit 1.5 IQRs outside the quartiles.
lower_bound, upper_bound = Q1 - (1.5 * IQR), Q3 + (1.5 * IQR)

# A row is an outlier if ANY of its numeric columns falls outside the fences.
too_low = numeric_values < lower_bound
too_high = numeric_values > upper_bound
has_outlier = (too_low | too_high).any(axis=1)

df_no_outliers = df[~has_outlier]

In [0]:
# (rows, columns) remaining after the IQR filter.
df_no_outliers.shape

(18770, 8)

In [0]:
# Filtered row count minus original row count; the result is negative, and
# its magnitude is the number of rows the IQR filter removed.
df_no_outliers.shape[0] - df.shape[0]

-1870

In [0]:
# Z score method
from scipy import stats

# Per-column z-scores: distance of each value from its column mean, measured
# in standard deviations.
z_scores = stats.zscore(df[numerical_columns])
# Rows with any |z| at or beyond this many standard deviations are outliers.
threshold = 3
# Compare the ABSOLUTE z-score so both tails are filtered. A plain
# `z_scores < threshold` only rejects large positive values and silently keeps
# rows that sit 3+ standard deviations BELOW the mean.
# Note: this overwrites the `df_no_outliers` produced by the IQR cell.
df_no_outliers = df[(np.abs(z_scores) < threshold).all(axis=1)]

In [0]:
# Filtered row count minus original row count; the result is negative, and
# its magnitude is the number of rows the z-score filter removed.
df_no_outliers.shape[0] - df.shape[0]

-561

In [0]:
# Continue the notebook with the outlier-free rows.
# The previous version inner-joined `df_no_outliers` with `df` on the index
# and then dropped every right-hand ("_duplicate") column. Because
# `df_no_outliers` is a row subset of `df` with the same columns, that
# round-trip returned exactly `df_no_outliers` while temporarily building a
# frame twice as wide. An explicit copy gives the same rows, columns and index,
# and keeps `df` independent of `df_no_outliers` for the in-place column
# assignments made further down.
df = df_no_outliers.copy()

# Display the DataFrame
display(df)

longitude,latitude,price,total_bedrooms,total_rooms,median_income,median_house_value,ocean_proximity
-122.23,37.88,41.0,880.0,129.0,322.0,126.0,8.3252
-122.22,37.86,21.0,7099.0,1106.0,2401.0,1138.0,8.3014
-122.24,37.85,52.0,1467.0,190.0,496.0,177.0,7.2574
-122.25,37.85,52.0,1274.0,235.0,558.0,219.0,5.6431
-122.25,37.85,52.0,1627.0,280.0,565.0,259.0,3.8462
-122.25,37.85,52.0,919.0,213.0,413.0,193.0,4.0368
-122.25,37.84,52.0,2535.0,489.0,1094.0,514.0,3.6591
-122.25,37.84,52.0,3104.0,687.0,1157.0,647.0,3.12
-122.26,37.84,42.0,2555.0,665.0,1206.0,595.0,2.0804
-122.25,37.84,52.0,3549.0,707.0,1551.0,714.0,3.6912


In [0]:
# (rows, columns) of the working frame after outlier removal.
df.shape

(20079, 8)

In [0]:
# Preview the first five rows of the filtered frame.
df.head()

Unnamed: 0,longitude,latitude,price,total_bedrooms,total_rooms,median_income,median_house_value,ocean_proximity
0,-122.23,37.88,41.0,880.0,129.0,322.0,126.0,8.3252
1,-122.22,37.86,21.0,7099.0,1106.0,2401.0,1138.0,8.3014
2,-122.24,37.85,52.0,1467.0,190.0,496.0,177.0,7.2574
3,-122.25,37.85,52.0,1274.0,235.0,558.0,219.0,5.6431
4,-122.25,37.85,52.0,1627.0,280.0,565.0,259.0,3.8462


# Feature Normalization and Standardization
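## Normalization rescales a feature to a fixed range such as [-1, 1] or [0, 1]
## Standardization rescales a feature to zero mean and unit variance; like converting between mph and kph, it changes the units of measurement, not the information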

In [0]:
from sklearn.preprocessing import StandardScaler

# Standardize every column listed in `numerical_columns` and write the scaled
# values back into `df` (the object-dtype column is left as is).
# The stray bare `numerical_columns` expression that used to open this cell was
# removed: it was not the last expression, so it displayed nothing.
# NOTE(review): `numerical_columns` contains "median_house_value", which is
# used as the target `y` further down, so the target is standardized together
# with the features -- confirm that is intended.
scaler = StandardScaler()
df[numerical_columns] = scaler.fit_transform(df[numerical_columns])

In [0]:
# Preview the first five rows after standardization.
df.head()

Unnamed: 0,longitude,latitude,price,total_bedrooms,total_rooms,median_income,median_house_value,ocean_proximity
0,-1.319774,1.043893,0.962113,-1.05156,-1.255708,-1.274857,-1.243722,8.3252
1,-1.314781,1.034547,-0.644336,3.257221,2.123891,1.417317,2.540976,8.3014
2,-1.324768,1.029874,1.845661,-0.644862,-1.0447,-1.049538,-1.052991,7.2574
3,-1.329761,1.029874,1.845661,-0.77858,-0.889037,-0.969252,-0.895918,5.6431
4,-1.329761,1.029874,1.845661,-0.534007,-0.733375,-0.960188,-0.746325,3.8462


# One hot encoding


In [0]:
# List the object-dtype columns, i.e. the candidates for one-hot encoding.
# NOTE(review): no cell in this section actually performs the encoding.
df.select_dtypes(include=['object']).columns

Index(['ocean_proximity'], dtype='object')

In [0]:
# Frequency of each distinct value in `ocean_proximity`.
# NOTE(review): the rendered result lists numeric-looking values with 12509
# distinct entries rather than a handful of category labels, which suggests
# this column does not hold the categorical data its name implies -- check the
# schema used to read the CSV.
df["ocean_proximity"].value_counts()

15.0001    49
3.125      48
2.625      44
2.875      44
4.125      44
           ..
4.4015      1
5.2287      1
3.8621      1
4.8508      1
2.3661      1
Name: ocean_proximity, Length: 12509, dtype: int64

In [0]:
# Distinct values present in `ocean_proximity`.
df["ocean_proximity"].unique()

array(['8.3252', '8.3014', '7.2574', ..., '2.4695', '2.3598', '2.3661'],
      dtype=object)

# Feature selection 



In [0]:
# Split the frame into the regression target and the candidate features:
# `y` is the `median_house_value` column, `X` is every other column.
y = df["median_house_value"]
X = df.drop(columns=["median_house_value"])

In [0]:
# Sanity check: the target should be a pandas Series.
type(y)

pandas.core.series.Series

In [0]:
# Sanity check: the feature matrix should be a pandas DataFrame.
type(X)

pandas.core.frame.DataFrame

In [0]:
from sklearn.feature_selection import RFE
from sklearn.linear_model import LinearRegression

# Recursive feature elimination driven by a plain linear regression, keeping
# the five features that survive elimination.
# NOTE(review): `X` still contains the object-dtype `ocean_proximity` column;
# presumably the fit only succeeds because its string values can be parsed as
# numbers -- confirm that is intended before relying on the selection.
estimator = LinearRegression()
rfe = RFE(estimator=estimator, n_features_to_select=5)
rfe.fit(X, y)

# `support_` is a boolean mask over X's columns marking the kept features.
keep_mask = rfe.support_
selected_features = X.columns[keep_mask]
print(selected_features)

Index(['longitude', 'latitude', 'total_bedrooms', 'total_rooms',
       'median_income'],
      dtype='object')
