# Notebook

# Use the housing data

In [None]:
import pandas as pd
# Load the housing CSV into a DataFrame; the path is relative to the working directory.
housing = pd.read_csv('sample_data/housing.csv')

In [None]:
# Work on a copy so the frame loaded from disk (`housing`) is left untouched.
final_housing = housing.copy()

In [None]:
# Preview the first rows to check that the columns loaded as expected.
final_housing.head()

Unnamed: 0,longitude,latitude,housing_median_age,total_rooms,total_bedrooms,population,households,median_income,median_house_value,ocean_proximity
0,-122.23,37.88,41.0,880.0,129.0,322.0,126.0,8.3252,452600.0,NEAR BAY
1,-122.22,37.86,21.0,7099.0,1106.0,2401.0,1138.0,8.3014,358500.0,NEAR BAY
2,-122.24,37.85,52.0,1467.0,190.0,496.0,177.0,7.2574,352100.0,NEAR BAY
3,-122.25,37.85,52.0,1274.0,235.0,558.0,219.0,5.6431,341300.0,NEAR BAY
4,-122.25,37.85,52.0,1627.0,280.0,565.0,259.0,3.8462,342200.0,NEAR BAY


In [None]:
# Column dtypes and non-null counts; used here to spot columns with missing values.
final_housing.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 20640 entries, 0 to 20639
Data columns (total 10 columns):
 #   Column              Non-Null Count  Dtype  
---  ------              --------------  -----  
 0   longitude           20640 non-null  float64
 1   latitude            20640 non-null  float64
 2   housing_median_age  20640 non-null  float64
 3   total_rooms         20640 non-null  float64
 4   total_bedrooms      20433 non-null  float64
 5   population          20640 non-null  float64
 6   households          20640 non-null  float64
 7   median_income       20640 non-null  float64
 8   median_house_value  20640 non-null  float64
 9   ocean_proximity     20640 non-null  object 
dtypes: float64(9), object(1)
memory usage: 1.6+ MB


Fill the missing values using the median value

In [None]:
# Impute missing `total_bedrooms` values with the column median.
# The result is assigned back instead of calling `fillna(..., inplace=True)` on
# the selected column: the in-place form mutates an intermediate object and,
# under pandas Copy-on-Write, leaves `final_housing` unchanged.
# NOTE(review): the median is computed over all rows, i.e. before the train/test
# split further down, so test rows influence the fill value. Consider imputing
# inside the modelling pipeline instead.
bedrooms_median = final_housing['total_bedrooms'].median()
final_housing['total_bedrooms'] = final_housing['total_bedrooms'].fillna(bedrooms_median)

Import all the necessary modules

In [None]:
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import PolynomialFeatures
from sklearn.preprocessing import StandardScaler, OneHotEncoder
from sklearn.linear_model import LinearRegression
from sklearn.model_selection import train_test_split

Add the Preprocessing Steps

In [None]:
# Numeric columns that receive polynomial expansion followed by standardisation.
numeric_features = [
    'longitude', 'latitude', 'housing_median_age',
    'total_rooms', 'total_bedrooms',
    'population', 'households', 'median_income',
]

# Step order matters: degree-3 polynomial terms are generated first, then scaled.
numeric_steps = [
    ('poly', PolynomialFeatures(degree=3)),
    ('scaler', StandardScaler()),
]
numeric_transformer = Pipeline(steps=numeric_steps)

In [None]:
# Categorical column(s) to be one-hot encoded.
categorical_features = ['ocean_proximity']

# handle_unknown='ignore': categories not seen during fit do not raise at transform time.
onehot_encoder = OneHotEncoder(handle_unknown='ignore')
categorical_transformer = Pipeline(steps=[('onehot', onehot_encoder)])

Combine the two preprocessing steps (numeric and categorical)

In [None]:
# Route each group of columns to its own transformer.
column_transformers = [
    ('num', numeric_transformer, numeric_features),
    ('cat', categorical_transformer, categorical_features),
]
preprocessor = ColumnTransformer(transformers=column_transformers)

Integrate the Preprocessed Features and Linear Regression Model

In [None]:
# Full model: preprocessing followed by a linear regression.
# The final step keeps the name 'classifier' even though the estimator is a regressor.
model_steps = [
    ('preprocessor', preprocessor),
    ('classifier', LinearRegression()),
]
clf = Pipeline(steps=model_steps)

Training the model

In [None]:
# The target is the median house value; every other column is a feature.
target_column = 'median_house_value'
y = final_housing[target_column]
X = final_housing.drop(columns=target_column)

In [None]:
# Hold out part of the data for testing; random_state is fixed so the split is repeatable.
X_train, X_test, y_train, y_test = train_test_split(X, y, random_state = 0 )

In [None]:
# Fit the preprocessing steps and the regression on the training split only.
clf.fit(X_train, y_train)

Evaluate the model on the training dataset

In [None]:
# Score on the training split (for a regressor, scikit-learn's default score is R^2).
clf.score(X_train, y_train)

0.7443783826983963

Evaluate with the test dataset

In [None]:
# Score on the held-out test split; a value well below the training score suggests overfitting.
clf.score(X_test, y_test)

0.6739208059759456

# Use the diamonds data

In [None]:
# Load the diamonds CSV; index_col=0 uses the file's first column as the row index.
diamonds = pd.read_csv('sample_data/diamonds.csv', index_col=0)

In [None]:
# Work on a copy so the frame loaded from disk (`diamonds`) is left untouched.
final_diamonds = diamonds.copy()

In [None]:
# Preview the first rows to check that the columns loaded as expected.
final_diamonds.head()

Unnamed: 0,carat,cut,color,clarity,depth,table,price,x,y,z
1,0.23,Ideal,E,SI2,61.5,55.0,326,3.95,3.98,2.43
2,0.21,Premium,E,SI1,59.8,61.0,326,3.89,3.84,2.31
3,0.23,Good,E,VS1,56.9,65.0,327,4.05,4.07,2.31
4,0.29,Premium,I,VS2,62.4,58.0,334,4.2,4.23,2.63
5,0.31,Good,J,SI2,63.3,58.0,335,4.34,4.35,2.75


Check for missing values

In [None]:
# Column dtypes and non-null counts; used here to check whether any column has missing values.
final_diamonds.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 53940 entries, 1 to 53940
Data columns (total 10 columns):
 #   Column   Non-Null Count  Dtype  
---  ------   --------------  -----  
 0   carat    53940 non-null  float64
 1   cut      53940 non-null  object 
 2   color    53940 non-null  object 
 3   clarity  53940 non-null  object 
 4   depth    53940 non-null  float64
 5   table    53940 non-null  float64
 6   price    53940 non-null  int64  
 7   x        53940 non-null  float64
 8   y        53940 non-null  float64
 9   z        53940 non-null  float64
dtypes: float64(6), int64(1), object(3)
memory usage: 4.5+ MB


There are no missing values, so nothing needs to be replaced. The integer `price` column is still converted to float, because the workflow did not work otherwise (the cause has not been identified).

In [201]:
# Cast the integer price column to float before modelling.
final_diamonds['price'] = final_diamonds['price'].astype(float)

# --- Preprocessing: numeric columns ---
# Degree-3 polynomial expansion first, then standardisation.
numeric_features = ['carat', 'depth', 'table', 'x', 'y', 'z']
numeric_transformer = Pipeline(
    steps=[
        ('poly', PolynomialFeatures(degree=3)),
        ('scaler', StandardScaler()),
    ]
)

# --- Preprocessing: categorical columns ---
# One-hot encode; categories unseen during fit do not raise at transform time.
categorical_features = ['cut', 'color', 'clarity']
categorical_transformer = Pipeline(
    steps=[('onehot', OneHotEncoder(handle_unknown='ignore'))]
)

# --- Combine both transformers and attach the regression model ---
preprocessor = ColumnTransformer(
    transformers=[
        ('num', numeric_transformer, numeric_features),
        ('cat', categorical_transformer, categorical_features),
    ]
)
clf = Pipeline(
    steps=[
        ('preprocessor', preprocessor),
        ('classifier', LinearRegression()),
    ]
)

# --- Feature/target split and train/test split ---
target_column = 'price'
y = final_diamonds[target_column]
X = final_diamonds.drop(columns=target_column)
X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=0)

# Fit on the training split; kept as the last expression so the cell displays the fitted pipeline.
clf.fit(X_train, y_train)



Evaluate the model

In [202]:
# Score on the training split (for a regressor, scikit-learn's default score is R^2).
clf.score(X_train, y_train)

0.9354372025977855

In [203]:
# Score on the held-out test split; a value well below the training score suggests overfitting.
clf.score(X_test, y_test)

0.8953051958188195