In [None]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
%matplotlib inline

## Let's load the Boston Housing Pricing Dataset

In [None]:
from sklearn.datasets import load_boston

In [None]:
# Load the dataset object whose fields (data, target, feature_names, DESCR) are used below.
# NOTE(review): load_boston was deprecated in scikit-learn 1.0 and removed in 1.2,
# so this notebook needs scikit-learn < 1.2 — confirm the pinned version.
boston = load_boston()

In [None]:
# List the top-level fields available on the loaded dataset object.
boston.keys()

In [None]:
# Print the description text shipped with the dataset (its DESCR field).

print(boston.DESCR)

In [None]:
# Feature names; used below as the DataFrame column labels.
print(boston.feature_names)

In [None]:
# Raw feature values; these become the body of the DataFrame below.
print(boston.data)

In [None]:
# Target values; attached to the DataFrame as the 'Price' column below.
print(boston.target)

## Preparing the dataset

In [None]:
# Wrap the raw feature values in a DataFrame labelled with the feature names.
df = pd.DataFrame(
    data=boston.data,
    columns=boston.feature_names,
)

In [None]:
# Append the regression target as a new 'Price' column (it becomes the last column).
df["Price"] = boston.target

In [None]:
# Preview the first rows to sanity-check the columns and values.
df.head()

In [None]:
# Column dtypes and non-null counts.
df.info()

In [None]:
# Summarizing the dataset: per-column descriptive statistics.

df.describe()

In [None]:
# Count missing entries per column (isna is the canonical spelling of isnull).

df.isna().sum()

# Exploratory Data Analysis

## Correlation

In [None]:
# Pairwise correlation matrix across the columns, including 'Price'.
df.corr()

In [None]:
# Scatter-plot matrix over every pair of columns; one panel per pair, so this can be slow.
sns.pairplot(df)

In [None]:
# Crime rate against house price, one point per row.
plt.scatter(x=df['CRIM'], y=df['Price'])
plt.xlabel(xlabel='Crime Rate')
plt.ylabel(ylabel='Price')

In [None]:
# LSTAT (lower status of population) against price, with a fitted regression line.
sns.regplot(x='LSTAT', y='Price', data=df)
plt.xlabel('lower status of population')  # axis label typo ("staus") corrected
plt.ylabel('Price of the house')

In [None]:
# Lower status of the population and price of the house are negatively correlated

In [None]:
# RM (rooms per dwelling) against price, with a fitted regression line.
sns.regplot(data=df, x='RM', y='Price')
plt.xlabel(xlabel='number of rooms per dwelling')
plt.ylabel(ylabel='Price of the house')

In [None]:
# Number of rooms per dwelling and price of the house are positively correlated

In [None]:
# CHAS against price, with a fitted regression line (axes keep their default column-name labels).
# NOTE(review): CHAS is presumably a binary indicator, which would make the fitted line of limited use — confirm.
sns.regplot(x= 'CHAS', y = 'Price', data = df)

In [None]:
# PTRATIO (pupil-teacher ratio) against price, with a fitted regression line.
sns.regplot(data=df, x='PTRATIO', y='Price')
plt.xlabel(xlabel='pupil teacher ratio')
plt.ylabel(ylabel='Price of the house')

In [None]:
# Pupil teacher ratio and price of the house are somewhat negatively correlated

In [None]:
## Independent features (every column but the last) and dependent feature (the last column, 'Price')

X, Y = df.iloc[:, :-1], df.iloc[:, -1]

In [None]:
# Preview the feature matrix (all columns of df except the last).
X.head()

In [None]:
# Target values (the last column of df).
Y

In [None]:
# Train/test split: hold out 30% of the rows; the fixed seed keeps the split reproducible.
from sklearn.model_selection import train_test_split

X_train, X_test, Y_train, Y_test = train_test_split(
    X,
    Y,
    test_size=0.3,
    random_state=42,
)

In [None]:
# Training features (the rows not held out by test_size=0.3).
X_train

In [None]:
# Held-out test features (test_size=0.3 of the rows).
X_test

In [None]:
# Training targets, row-aligned with X_train.
Y_train

In [None]:
# Held-out test targets, row-aligned with X_test.
Y_test

In [None]:
# Standardizing the dataset: create the scaler here; it is fit on the training
# split below and then reused, without refitting, for the test split and new samples.

from sklearn.preprocessing import StandardScaler

scaler = StandardScaler()

In [None]:
# Fit the scaler on the training features only, then scale them.
# NOTE(review): this rebinds X_train to the scaled output, so re-running this cell
# would refit the scaler on already-scaled data — use Restart & Run All instead.
X_train = scaler.fit_transform(X_train)

In [None]:
# Apply the scaling learned from the training split to the test features (transform only, no refit).
X_test = scaler.transform(X_test)

# Model Training

In [None]:
from sklearn.linear_model import LinearRegression

In [None]:
# Linear regression estimator with default settings.
regression = LinearRegression()

In [None]:
# Fit on the standardized training features and their targets.
regression.fit(X_train,Y_train)

In [None]:
# Learned coefficients (expressed on the standardized feature scale).
print(regression.coef_)

In [None]:
# Learned intercept term.
print(regression.intercept_)

In [None]:
## Settings the estimator was configured with (its constructor parameters, not the learned weights)
regression.get_params()

In [None]:
## Predict on the standardized test features
reg_pred = regression.predict(X_test)

In [None]:
# Inspect the predicted values.
reg_pred

In [None]:
## Scatter plot of actual vs. predicted values on the test set.
## Axis labels added so the figure can be read on its own.
plt.scatter(Y_test, reg_pred)
plt.xlabel('Actual price')
plt.ylabel('Predicted price')

In [None]:
## Residuals: actual test targets minus predictions, element-wise
residuals = Y_test.sub(reg_pred)
residuals

In [None]:
# Kernel-density estimate of the residual distribution.
sns.displot(data=residuals, kind='kde')

In [None]:
# Scatter plot of predicted values against residuals.
# What we hope to see is an evenly spread cloud with no visible pattern;
# clear structure would suggest the linear model is missing something.
plt.scatter(reg_pred, residuals)

In [None]:
from sklearn.metrics import mean_absolute_error, mean_squared_error

# Test-set errors, one per line: MAE, MSE, then RMSE (square root of the MSE).
print(
    mean_absolute_error(Y_test, reg_pred),
    mean_squared_error(Y_test, reg_pred),
    np.sqrt(mean_squared_error(Y_test, reg_pred)),
    sep="\n",
)

# R square and adjusted R square

In [None]:
from sklearn.metrics import r2_score

# R-squared of the predictions on the held-out test set.
score = r2_score(y_true=Y_test, y_pred=reg_pred)
print(score)

In [None]:
# Adjusted R-squared: 1 - (1 - R^2) * (n - 1) / (n - p - 1),
# with n = number of test samples and p = number of features.
adj_r = 1 - (
    (1 - score)
    * (len(Y_test) - 1)
    / (len(Y_test) - X_test.shape[1] - 1)
)
print(adj_r)

# New Data Prediction

In [None]:
# Take the first row of the raw data and reshape it to one row by n columns (a single-sample 2-D input).
boston.data[0].reshape(1,-1)

In [None]:
# Scale the sample with the already-fitted scaler, i.e. the same transformation applied to the training data
scaler.transform(boston.data[0].reshape(1,-1))

In [None]:
# Predict for that single sample: scale first, then feed it to the fitted model.
regression.predict(scaler.transform(boston.data[0].reshape(1,-1)))

# Pickling the Model for Deployment

In [None]:
import pickle

In [None]:
# Serialize the fitted model. The context manager guarantees the file is flushed and
# closed (the bare open() call previously left the handle dangling).
# NOTE: only the regressor is saved; the fitted scaler is applied separately before predict.
with open('regmodel.pkl', 'wb') as model_file:
    pickle.dump(regression, model_file)

In [None]:
# Reload the saved model, closing the file handle once it has been read.
# Only unpickle files you created yourself: pickle.load can execute arbitrary code
# embedded in an untrusted file.
with open('regmodel.pkl', 'rb') as model_file:
    pickled_model = pickle.load(model_file)

In [None]:
# Sanity check: the reloaded model should give the same prediction as the in-memory model for the same scaled sample.
pickled_model.predict(scaler.transform(boston.data[0].reshape(1,-1)))