In [1]:
#  Import Libraries

import pandas as pd
import numpy as np


from sklearn.linear_model import LinearRegression
from sklearn.metrics import mean_squared_error
from sklearn.model_selection import train_test_split


In [2]:

# Load the housing data from a CSV whose path is relative to the current working
# directory, then print the first rows as a quick sanity check of the columns.
housing_df = pd.read_csv('../download/HousingData.csv')
print(housing_df.head())



      CRIM    ZN  INDUS  CHAS    NOX     RM   AGE     DIS  RAD  TAX  PTRATIO  \
0  0.00632  18.0   2.31   0.0  0.538  6.575  65.2  4.0900    1  296     15.3   
1  0.02731   0.0   7.07   0.0  0.469  6.421  78.9  4.9671    2  242     17.8   
2  0.02729   0.0   7.07   0.0  0.469  7.185  61.1  4.9671    2  242     17.8   
3  0.03237   0.0   2.18   0.0  0.458  6.998  45.8  6.0622    3  222     18.7   
4  0.06905   0.0   2.18   0.0  0.458  7.147  54.2  6.0622    3  222     18.7   

        B  LSTAT  MEDV  
0  396.90   4.98  24.0  
1  396.90   9.14  21.6  
2  392.83   4.03  34.7  
3  394.63   2.94  33.4  
4  396.90    NaN  36.2  




# Using Linear Regression to Predict the Median Home Values (MEDV) in Our Dataset



In [3]:

#  Remove all null values from the DataFrame: dropna() discards every row that
#  contains at least one missing value (e.g. the NaN in LSTAT in the preview).
#  This rebinds housing_df, so the unfiltered frame is no longer available.
housing_df = housing_df.dropna()



In [4]:
#  Declare X (features) and y (target) by column position
X = housing_df.iloc[:, :-1] # Feature matrix: every column except the last
y = housing_df.iloc[:, -1]  # Target vector: the last column only (MEDV in the preview)



In [5]:
#  Create Training And Test Sets
#  80% of the rows are used for training and 20% are held out for testing.
#  A fixed random_state makes the shuffle, and therefore the fitted model and
#  the RMSE computed from this split, reproducible across kernel restarts.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=42,
)



In [6]:

#  Create an empty (not yet fitted) linear regression model with default settings
reg = LinearRegression()



In [7]:

#  Fit the regressor to the training data. As the last expression in the cell,
#  the call's return value is echoed, which is the LinearRegression repr below.
reg.fit(X_train, y_train)



LinearRegression(copy_X=True, fit_intercept=True, n_jobs=None, normalize=False)

In [8]:

#  Predict the target for the held-out test features
y_pred = reg.predict(X_test)



In [9]:

#  Root-mean-square error of the test-set predictions: the square root of the
#  mean squared error, so it is expressed in the same units as the target.
rmse = np.sqrt(mean_squared_error(y_true=y_test, y_pred=y_pred))
print("Root Mean Square Error is ", rmse)



Root Mean Square Error is  5.3796535818066



# Linear Regression Function
 
 

In [35]:
#  Create a function that fits and scores a model on a single train/test split
def regression_model(model, test_size=0.2, random_state=None):
    """Fit ``model`` on one train/test split of ``X``/``y`` and print its test RMSE.

    The feature matrix ``X`` and target ``y`` are read from the notebook's
    global namespace; they are not passed in.

    Parameters
    ----------
    model : estimator
        Unfitted regressor exposing ``fit(X, y)`` and ``predict(X)``.
        It is fitted in place on the training portion of the split.
    test_size : float, default=0.2
        Forwarded to ``train_test_split``; fraction of rows held out for testing.
    random_state : int or None, default=None
        Forwarded to ``train_test_split``. With ``None`` every call draws a
        different random split, so the printed RMSE changes from run to run;
        pass an integer to make the result reproducible.

    Returns
    -------
    None
        The RMSE is printed, not returned.
    """
    # Create training and test sets
    X_train, X_test, y_train, y_test = train_test_split(
        X, y, test_size=test_size, random_state=random_state
    )

    # Fit the regressor to the training data
    model.fit(X_train, y_train)

    # Predict on the held-out test data
    y_pred = model.predict(X_test)

    # Compute and print RMSE (square root of the mean squared error)
    rmse = np.sqrt(mean_squared_error(y_test, y_pred))

    print("Root mean square error ", rmse)
    


In [42]:
regression_model(LinearRegression())
#  The score differs on every run because each call draws a new random train/test split.

#  So apply cross-validation to get a more stable estimate.



Root mean square error  7.261436497505474




# Cross Validation



In [12]:
from sklearn.model_selection import cross_val_score



In [13]:
#  Create a function for regression model CV

def regression_model_cv(model, k=5):
    """Cross-validate ``model`` on ``X``/``y`` and print per-fold and mean RMSE.

    ``X`` and ``y`` are taken from the notebook's global namespace.

    Parameters
    ----------
    model : estimator
        Regressor handed to ``cross_val_score``.
    k : int, default=5
        Passed as ``cv`` to ``cross_val_score``.

    Returns
    -------
    None
        The RMSE array and its mean are printed, not returned.
    """
    neg_mse_per_fold = cross_val_score(
        model,
        X,
        y,
        scoring='neg_mean_squared_error',
        cv=k,
    )
    # The scorer reports negated MSE, so flip the sign before taking the root.
    fold_rmse = np.sqrt(-neg_mse_per_fold)

    print("REG RMSE ", fold_rmse)
    print("REG Mean ", fold_rmse.mean())
    
    

In [14]:
#  Now call the function: cross-validated RMSE of plain linear regression (default k=5)

regression_model_cv(LinearRegression())


REG RMSE  [3.26123843 4.42712448 5.66151114 8.09493087 5.24453989]
REG Mean  5.337868962878373


In [33]:
# Same model evaluated with only 3 folds, to see how the fold count changes the estimate
regression_model_cv(LinearRegression(), k=3)

REG RMSE  [ 3.72504914  6.01655701 23.20863933]
REG Mean  10.983415161090695


In [35]:
# Same model evaluated with 6 folds
regression_model_cv(LinearRegression(), k=6)

REG RMSE  [3.23879491 3.97041949 5.58329663 3.92861033 9.88399671 3.91442679]
REG Mean  5.08659081080109






# Regularization: Ridge and Lasso






In [37]:
from sklearn.linear_model import Ridge


In [38]:

# Ridge regression with default hyper-parameters, scored with the default k=5 folds
regression_model_cv(Ridge())


REG RMSE  [3.17202127 4.54972372 5.36604368 8.03715216 5.03988501]
REG Mean  5.232965166251768


In [40]:


#  Using Lasso

from sklearn.linear_model import Lasso



In [41]:
# Lasso regression with default hyper-parameters, scored with the default k=5 folds
regression_model_cv(Lasso())

REG RMSE  [3.52318747 5.70083491 7.82318757 6.9878025  3.97229348]
REG Mean  5.601461185384289





# Using K-Nearest Neighbors to Predict the Median Home Value (MEDV)



In [42]:
from sklearn.neighbors import KNeighborsRegressor

In [43]:
# K-nearest-neighbors regressor with its default settings, scored with the default k=5 folds
regression_model_cv(KNeighborsRegressor())

REG RMSE  [ 8.24568226  8.81322798 10.58043836  8.85643441  5.98100069]
REG Mean  8.495356738515685


In [44]:
# Try a manually chosen neighbor count (4) before searching for the best one systematically
regression_model_cv(KNeighborsRegressor(n_neighbors=4))

REG RMSE  [ 8.44659788  8.99814547 10.97170231  8.86647969  5.72114135]
REG Mean  8.600813339223432




# K-Nearest Neighbors with GridSearchCV to Find the Optimal Number of Neighbors



In [45]:
from sklearn.model_selection import GridSearchCV


In [48]:
# Candidate neighbor counts: 20 evenly spaced values from 1 to 20 inclusive.
# np.linspace yields floats, so the values are cast to integers in the next cell.
neighbors = np.linspace(1, 20, 20)


In [51]:
# Convert the candidates to integers; this array is what the parameter grid is built from
k = neighbors.astype(int)


In [55]:
# Hyper-parameter grid for the search: every candidate value of n_neighbors held in k
param_grid = {'n_neighbors': k}

In [58]:
# Grid search over n_neighbors for a KNN regressor: each candidate is scored
# with 5-fold cross-validation using negated mean squared error.
knn = KNeighborsRegressor()
knn_tuned = GridSearchCV(
    estimator=knn,
    param_grid=param_grid,
    scoring='neg_mean_squared_error',
    cv=5,
)


In [59]:
# Run the search on the full X/y; each candidate in param_grid is scored with the cv=5 folds configured above
knn_tuned.fit(X, y)


GridSearchCV(cv=5, error_score=nan,
             estimator=KNeighborsRegressor(algorithm='auto', leaf_size=30,
                                           metric='minkowski',
                                           metric_params=None, n_jobs=None,
                                           n_neighbors=5, p=2,
                                           weights='uniform'),
             iid='deprecated', n_jobs=None,
             param_grid={'n_neighbors': array([ 1,  2,  3,  4,  5,  6,  7,  8,  9, 10, 11, 12, 13, 14, 15, 16, 17,
       18, 19, 20])},
             pre_dispatch='2*n_jobs', refit=True, return_train_score=False,
             scoring='neg_mean_squared_error', verbose=0)

In [77]:
# Report the winning hyper-parameters and the corresponding cross-validated RMSE.
# Distinct names are used so the candidate array `k` that feeds `param_grid` is
# not overwritten with a dict, which would break a re-run of the param_grid cell.
best_params = knn_tuned.best_params_
print("Best n_neighbors: {}".format(best_params))
score = knn_tuned.best_score_
# best_score_ is a negated MSE (scoring='neg_mean_squared_error'), so flip the
# sign before taking the square root to obtain an RMSE.
best_rmse = np.sqrt(-score)
print("Best score: {}".format(best_rmse))


Best n_neighbors: {'n_neighbors': 7}
Best score: 8.516767055977628





#  Decision Trees and Random Forests




In [78]:
from sklearn import tree


In [79]:
# Single decision tree regressor with default settings, scored with the default k=5 folds
regression_model_cv(tree.DecisionTreeRegressor())

REG RMSE  [3.95258288 7.40015395 7.41662518 6.42968787 5.89310766]
REG Mean  6.218431508396617


In [80]:
from sklearn.ensemble import RandomForestRegressor
# Random forest regressor with default settings, scored with the default k=5 folds
regression_model_cv(RandomForestRegressor())


REG RMSE  [3.26314545 3.7065341  5.17038978 6.45643455 4.02954982]
REG Mean  4.525210738983679
