In [1]:
#import modules 
import numpy as np
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt
from sklearn.pipeline import Pipeline
from sklearn.impute import SimpleImputer
from sklearn.compose import ColumnTransformer
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import OrdinalEncoder, OneHotEncoder, StandardScaler

from sklearn.svm import SVR
from sklearn.tree import DecisionTreeRegressor
from sklearn.neighbors import KNeighborsRegressor
from sklearn.ensemble import RandomForestRegressor
from sklearn.linear_model import SGDRegressor, LinearRegression
from sklearn.metrics import mean_absolute_error, r2_score, mean_squared_error

def load_data(path='housing.csv'):
    """Read the housing CSV and split it into a feature frame and a target.

    Parameters
    ----------
    path : str or path-like, default 'housing.csv'
        Location of the CSV file. The default keeps the original behaviour
        of reading 'housing.csv' from the current working directory.

    Returns
    -------
    X : pandas.DataFrame
        Every column of the file except 'median_house_value'.
    y : pandas.Series
        The 'median_house_value' column.

    Raises
    ------
    KeyError
        If the file has no 'median_house_value' column.
    """
    target='median_house_value'
    data=pd.read_csv(path)
    # drop() already returns a new frame, so no extra .copy() is required.
    X=data.drop(columns=target)
    y=data[target]
    return X,y

def rmse(y,y_hat):
    """Return the root-mean-squared error between `y` and `y_hat`.

    Both arguments are handed unchanged to ``mean_squared_error``; the
    result is the square root of the value it returns.
    """
    mse=mean_squared_error(y,y_hat)
    return np.sqrt(mse)

def model_report(models,X_train,X_test,y_train,y_test):
    """Fit each model and print its train/test RMSE and R^2 scores.

    Parameters
    ----------
    models : iterable of (str, estimator)
        Display name paired with an object exposing ``fit`` and ``predict``.
        Each estimator is fitted in place on the training data.
    X_train, X_test : feature data passed to ``fit`` / ``predict``.
    y_train, y_test : targets matching ``X_train`` / ``X_test``.

    Returns
    -------
    None
        The report is written to standard output only. The lines labelled
        "Accuracy" hold the value returned by ``r2_score``.
    """
    divider='-'*120
    for label,estimator in models:
        estimator.fit(X_train,y_train)
        pred_train=estimator.predict(X_train)
        pred_test=estimator.predict(X_test)

        # Header: model name centred in a 50-character field, then a blank line.
        print(label.center(50))
        print()

        print('Training RMSE Error :',rmse(y_train,pred_train))
        print('Test RMSE Error :',rmse(y_test,pred_test))
        print()

        print('Train Accuracy :', r2_score(y_train,pred_train))
        print('Test Accuracy :', r2_score(y_test,pred_test))

        # Two identical rules separate consecutive model sections.
        print(divider,divider,sep='\n')
        
# Candidate regressors as (display name, estimator) pairs; model_report
# walks the list in this order.
# Estimators whose training involves randomness get a fixed random_state so
# that a fresh run reproduces the same scores (123 is the seed already used
# for the train/test split in this cell).
models=[('Linear Regression', LinearRegression()),
        ('SGD', SGDRegressor(random_state=123)),
        ('Decision Tree', DecisionTreeRegressor(random_state=123)),
        ('Support Vector Machines', SVR()),
        ('Random Forest', RandomForestRegressor(random_state=123)),
        ('K-Nearest Neighbour', KNeighborsRegressor()),
       ]

if __name__=='__main__':
    # Load the data and hold out 20% of the rows for testing (fixed seed).
    X,y=load_data()
    X_train,X_test,y_train,y_test=train_test_split(X,y,test_size=0.2,random_state=123)

    # Column groups, one per preprocessing treatment.
    # (The spelling of `num_featuers` is kept: inside a notebook this name is
    # a global that other cells may reference.)
    num_featuers=['housing_median_age', 'total_rooms',
           'total_bedrooms', 'population', 'households']

    cat_features=['ocean_proximity']

    drop_features=['longitude', 'latitude']

    pass_through=['median_income']

    # Numeric pipeline: fill missing values with the column median, then
    # standardise.
    num_pipeline=Pipeline([
        ('imputer',SimpleImputer(strategy='median')),
        ('scaler',StandardScaler())
    ])

    # Full preprocessing step.
    # - handle_unknown='ignore': a category that was not seen while fitting is
    #   encoded as all zeros at transform time instead of raising.
    # - sparse_threshold=0: always return a dense array, which the DataFrame
    #   construction below relies on.
    final_pipeline=ColumnTransformer([
        ('numerical pipeline', num_pipeline, num_featuers),
        ('categorical pipeline', OneHotEncoder(handle_unknown='ignore'), cat_features),
        ('remove features', 'drop', drop_features),
        ('pass through', 'passthrough',pass_through)
    ], sparse_threshold=0)

    X_train_tr=final_pipeline.fit_transform(X_train)

    # Take the one-hot column labels from the fitted encoder rather than a
    # hard-coded list, so they always match the categories (and their order)
    # actually found in the training split.
    cat_col=final_pipeline.named_transformers_['categorical pipeline'].categories_[0].tolist()

    # Output columns follow the transformer order above; dropped columns
    # contribute nothing.
    columns=num_featuers+cat_col+pass_through

    X_train_tr=pd.DataFrame(X_train_tr,columns=columns)

    # The test split is only transformed, never fitted on.
    X_test_tr=final_pipeline.transform(X_test)

    X_test_tr=pd.DataFrame(X_test_tr,columns=columns)

    model_report(models,X_train_tr,X_test_tr,y_train,y_test)

                Linear Regression                 

Training RMSE Error : 69850.99879115926
Test RMSE Error : 69850.63044502078

Train Accuracy : 0.6336904838792504
Test Accuracy : 0.63309831091288
------------------------------------------------------------------------------------------------------------------------
------------------------------------------------------------------------------------------------------------------------
                       SGD                        

Training RMSE Error : 70413.68637213121
Test RMSE Error : 70304.86857848929

Train Accuracy : 0.6277650708043956
Test Accuracy : 0.628310877095114
------------------------------------------------------------------------------------------------------------------------
------------------------------------------------------------------------------------------------------------------------
                  Decision Tree                   

Training RMSE Error : 0.0
Test RMSE Error : 84395.40879499703

Trai



               K-Nearest Neighbour                

Training RMSE Error : 54164.530256727514
Test RMSE Error : 64671.936101181105

Train Accuracy : 0.7797413756974221
Test Accuracy : 0.6854854249418978
------------------------------------------------------------------------------------------------------------------------
------------------------------------------------------------------------------------------------------------------------




In [58]:
#import modules 
import numpy as np
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt
from sklearn.pipeline import Pipeline
from sklearn.impute import SimpleImputer
from sklearn.compose import ColumnTransformer
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import OrdinalEncoder, OneHotEncoder, StandardScaler

from sklearn.svm import SVR
from sklearn.tree import DecisionTreeRegressor
from sklearn.neighbors import KNeighborsRegressor
from sklearn.ensemble import RandomForestRegressor
from sklearn.linear_model import SGDRegressor, LinearRegression
from sklearn.metrics import mean_absolute_error, r2_score, mean_squared_error

def load_data(path='housing.csv'):
    """Read the housing CSV and split it into a feature frame and a target.

    Parameters
    ----------
    path : str or path-like, default 'housing.csv'
        Location of the CSV file. The default keeps the original behaviour
        of reading 'housing.csv' from the current working directory.

    Returns
    -------
    X : pandas.DataFrame
        Every column of the file except 'median_house_value'.
    y : pandas.Series
        The 'median_house_value' column.

    Raises
    ------
    KeyError
        If the file has no 'median_house_value' column.
    """
    target='median_house_value'
    data=pd.read_csv(path)
    # drop() already returns a new frame, so no extra .copy() is required.
    X=data.drop(columns=target)
    y=data[target]
    return X,y

def rmse(y,y_hat):
    """Compute the root-mean-squared error of `y_hat` against `y`.

    The mean squared error comes from ``mean_squared_error(y, y_hat)``;
    this function returns its square root.
    """
    mean_sq_err=mean_squared_error(y,y_hat)
    root=np.sqrt(mean_sq_err)
    return root

def model_report(models,X_train,X_test,y_train,y_test):
    """Train every model and print RMSE and R^2 for both data splits.

    Parameters
    ----------
    models : iterable of (str, estimator)
        Name/estimator pairs; each estimator must provide ``fit`` and
        ``predict`` and is fitted in place.
    X_train, X_test : feature data used for fitting and prediction.
    y_train, y_test : target values for the corresponding split.

    Returns
    -------
    None
        Results are printed, not returned. The "Accuracy" lines show the
        output of ``r2_score``.
    """
    rule='-'*120
    for title,regressor in models:
        regressor.fit(X_train,y_train)
        fitted_train=regressor.predict(X_train)
        fitted_test=regressor.predict(X_test)

        # Section title, centred in 50 characters, followed by a blank line.
        print(title.center(50))
        print()

        print('Training RMSE Error :',rmse(y_train,fitted_train))
        print('Test RMSE Error :',rmse(y_test,fitted_test))
        print()

        print('Train Accuracy :', r2_score(y_train,fitted_train))
        print('Test Accuracy :', r2_score(y_test,fitted_test))

        # Close the section with a double horizontal rule.
        for _ in range(2):
            print(rule)
        
# Candidate regressors as (display name, estimator) pairs; model_report
# walks the list in this order.
# Estimators whose training involves randomness get a fixed random_state so
# that a fresh run reproduces the same scores (123 is the seed already used
# for the train/test split in this cell).
models=[('Linear Regression', LinearRegression()),
        ('SGD', SGDRegressor(random_state=123)),
        ('Random Forest', RandomForestRegressor(random_state=123)),
        ('K-Nearest Neighbour', KNeighborsRegressor()),
       ]

if __name__=='__main__':
    # Load the data and hold out 20% of the rows for testing (fixed seed).
    X,y=load_data()
    X_train,X_test,y_train,y_test=train_test_split(X,y,test_size=0.2,random_state=123)

    # Column groups: every numeric column is imputed and scaled, and the one
    # categorical column is one-hot encoded.
    # (The spelling of `num_featuers` is kept: inside a notebook this name is
    # a global that other cells may reference.)
    num_featuers=['longitude', 'latitude','housing_median_age', 'total_rooms',
           'total_bedrooms', 'population', 'households','median_income']

    cat_features=['ocean_proximity']

    # Numeric pipeline: fill missing values with the column median, then
    # standardise.
    num_pipeline=Pipeline([
        ('imputer',SimpleImputer(strategy='median')),
        ('scaler',StandardScaler())
    ])

    # Full preprocessing step.
    # - handle_unknown='ignore': a category that was not seen while fitting is
    #   encoded as all zeros at transform time instead of raising.
    # - sparse_threshold=0: always return a dense array, which the DataFrame
    #   construction below relies on.
    final_pipeline=ColumnTransformer([
        ('numerical pipeline', num_pipeline, num_featuers),
        ('categorical pipeline', OneHotEncoder(handle_unknown='ignore'), cat_features),
    ], sparse_threshold=0)

    X_train_tr=final_pipeline.fit_transform(X_train)

    # Take the one-hot column labels from the fitted encoder rather than a
    # hard-coded list, so they always match the categories (and their order)
    # actually found in the training split.
    cat_col=final_pipeline.named_transformers_['categorical pipeline'].categories_[0].tolist()

    # Output columns follow the transformer order above.
    columns=num_featuers+cat_col

    X_train_tr=pd.DataFrame(X_train_tr,columns=columns)

    # The test split is only transformed, never fitted on.
    X_test_tr=final_pipeline.transform(X_test)

    X_test_tr=pd.DataFrame(X_test_tr,columns=columns)

    model_report(models,X_train_tr,X_test_tr,y_train,y_test)

                Linear Regression                 

Training RMSE Error : 68730.76314789179
Test RMSE Error : 68734.74913831857

Train Accuracy : 0.6453456488677727
Test Accuracy : 0.6447273670935495
------------------------------------------------------------------------------------------------------------------------
------------------------------------------------------------------------------------------------------------------------
                       SGD                        

Training RMSE Error : 68831.38637462868
Test RMSE Error : 68767.10484590383

Train Accuracy : 0.6443064464377458
Test Accuracy : 0.6443928113374651
------------------------------------------------------------------------------------------------------------------------
------------------------------------------------------------------------------------------------------------------------
                  Random Forest                   

Training RMSE Error : 18228.748158244376
Test RMSE Error : 47509