In [None]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import piplite # important to have seaborn in jupyter lite
await piplite.install('seaborn') # important to have seaborn in jupyter lite
%matplotlib inline


## Let's load the Boston dataset

In [None]:
# Read the Boston housing data from a CSV file in the working directory.
boston_df = pd.read_csv(filepath_or_buffer='Boston.csv')
# Show the type of the loaded object as the cell output.
type(boston_df)

In [None]:
# Preview the first five rows of the dataset.
boston_df.head(n=5)

In [None]:
# Display the 'medv' column, which is excluded from the feature frame below.
boston_df.loc[:, 'medv']

In [None]:
# Keep every column except 'medv' as the feature frame.
X = boston_df.loc[:, boston_df.columns != 'medv']

In [None]:
# Preview the first five rows of the feature frame.
X.head(n=5)

In [None]:
# Print a per-column summary (non-null counts and dtypes) to check for
# missing values and unexpected data types.
boston_df.info() 

In [None]:
## Summarize the descriptive statistics of the data
boston_df.describe()

In [None]:
## Check missing values
# Count the missing entries per column; a total of 0 means the column is complete.
boston_df.isnull().sum()

### Exploratory Data Analysis (EDA)

In [None]:
## Correlation (mainly relevant for regression)
## A strong correlation between a feature and the target suggests the feature
## is useful to a linear model; it does not by itself guarantee good performance.
boston_df.corr(method='pearson')

## If independent features are highly correlated with each other, the redundant ones can be removed (multicollinearity)

## Also check the correlation between each independent feature and the target variable

In [None]:
import seaborn as sns
#sns.pairplot(boston_df)

In [None]:
#sns.pairplot(boston_df.loc[1:3])

In [None]:
# Pairwise plots restricted to the 'crim' and 'medv' columns.
sns.pairplot(data=boston_df.loc[:, ["crim", "medv"]])

In [None]:
# Scatter plot of 'crim' (x-axis) against 'medv' (y-axis).
plt.scatter(x=boston_df["crim"], y=boston_df["medv"])
plt.xlabel(xlabel="Crime rate")
plt.ylabel(ylabel="Price")

In [None]:
# Scatter plot of 'rm' (x-axis) against 'medv' (y-axis).
plt.scatter(x=boston_df["rm"], y=boston_df["medv"])
plt.xlabel(xlabel="Average Room")
plt.ylabel(ylabel="Price")

In [None]:
# Regression plot: 'medv' against 'rm' with a fitted line.
sns.regplot(data=boston_df, x="rm", y="medv")

In [None]:
# Regression plot: 'medv' against 'lstat' with a fitted line.
sns.regplot(data=boston_df, x="lstat", y="medv")

In [None]:
# Regression plot: 'medv' against 'chas' with a fitted line.
sns.regplot(data=boston_df, x="chas", y="medv")

In [None]:
# Regression plot: 'medv' against 'ptratio' (pupil-teacher ratio) with a fitted line.
sns.regplot(data=boston_df, x="ptratio", y="medv")

## Independent and Dependent Features

In [None]:
# Features: every column except the target 'medv'.
X = boston_df.loc[:, boston_df.columns != 'medv']
# Target: selected by name, consistent with how X excludes it, so the split
# stays correct even if 'medv' is not the last column of the CSV.
# (Positional alternatives such as boston_df.iloc[:, :-1] / boston_df.iloc[:, -1]
# are only equivalent when 'medv' is the final column.)
y = boston_df['medv']

In [None]:
# Preview the first five rows of the independent features.
X.head(n=5)

In [None]:
# Preview the first five values of the dependent variable.
y.head(n=5)

In [None]:
## Train/test split
from sklearn.model_selection import train_test_split

# Hold out 30% of the rows for testing; the fixed random_state makes the split
# reproducible. The outputs keep the input types (pandas objects here), which is
# why X_train.head() works in the next cell.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.3,
    random_state=42,
)

In [None]:
# Preview the first five rows of the training features.
X_train.head(n=5)

In [None]:
# Display the training features before they are standardized.
X_train

In [None]:
## Standardize the dataset
from sklearn.preprocessing import StandardScaler

# Create the scaler with its default behaviour spelled out: centre each feature
# on its mean and scale it to unit variance.
scaler = StandardScaler(with_mean=True, with_std=True)

In [None]:
# Fit the scaler on the training features and replace X_train with the scaled
# values. NOTE: this cell rebinds its own input, so running it a second time
# without re-running the train/test split would rescale already-scaled data.
X_train=scaler.fit_transform(X_train)

In [None]:
# Scale the test features with the statistics learned from the training set
# (transform only, no refit). NOTE: this cell rebinds its own input, so
# re-running it without re-running the split would scale X_test twice.
X_test=scaler.transform(X_test)  # apply the training-set fit to the test data; never fit on test data

In [None]:
# Display the training features after standardization.
X_train

## Model Training

In [None]:
from sklearn.linear_model import LinearRegression

In [None]:
# Ordinary least-squares linear regression with an intercept term (the default).
regression = LinearRegression(fit_intercept=True)

In [None]:
# Fit the model on the training features and training targets.
regression.fit(X=X_train, y=y_train)

In [None]:
## Print the fitted coefficients (weights); the intercept (bias) is printed in the next cell
print(regression.coef_)

In [None]:
# Print the fitted intercept (bias) term.
print(regression.intercept_)

In [None]:
# Show the estimator's constructor settings (hyperparameters), not the learned weights.
regression.get_params(deep=True)

## Prediction

In [None]:
# Predict the target for the scaled test features.
reg_pred = regression.predict(X=X_test)

In [None]:
# Display the predictions for the test set.
reg_pred

In [None]:
## Scatter plot of predicted values against actual values
# Points lying close to a straight diagonal mean the predictions track the
# actual values well.
plt.scatter(x=reg_pred, y=y_test)
plt.xlabel(xlabel="Predicted Value")
plt.ylabel(ylabel="Actual")

In [None]:
# Residual = actual value minus predicted value, i.e. the prediction error per test row.
residuals=y_test-reg_pred

In [None]:
## Plot the distribution of the residuals
# A kernel density estimate of the errors: a roughly bell-shaped curve centred
# near zero is what we hope for; long tails point to outliers.
sns.displot(data=residuals, kind="kde")

In [None]:
## Scatter plot of predictions against residuals
# Used to check whether the residuals show any relationship with the predictions.
plt.scatter(x=reg_pred, y=residuals)

## Performance Metrics

In [None]:
from sklearn.metrics import mean_absolute_error, mean_squared_error

# Mean absolute error first, then mean squared error, both on the test set.
print(mean_absolute_error(y_true=y_test, y_pred=reg_pred))
print(mean_squared_error(y_true=y_test, y_pred=reg_pred))

In [None]:
# Root mean squared error: the square root of the test-set MSE.
print(np.sqrt(mean_squared_error(y_true=y_test, y_pred=reg_pred)))

# R square and adjusted R square


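Formula

$$R^2 = 1 - \frac{SS_{res}}{SS_{tot}}$$

where $SS_{res}$ is the sum of squared residuals and $SS_{tot}$ is the total sum of squares.

$$\text{Adjusted } R^2 = 1 - \frac{(1 - R^2)(n - 1)}{n - k - 1}$$

where $n$ is the number of samples and $k$ is the number of features.
Adjusted $R^2$ is never greater than $R^2$, and is strictly smaller whenever $k \ge 1$ and $R^2 < 1$.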

In [None]:
from sklearn.metrics import r2_score

# R^2 on the test set; the closer to 1, the better the fit.
score = r2_score(y_true=y_test, y_pred=reg_pred)
print(score)

In [None]:
# Adjusted R^2 is computed by hand from the formula:
#   1 - (1 - R^2) * (n - 1) / (n - k - 1)
# with n = number of test samples and k = number of features.
ad_r2_score = 1 - (
    (1 - score) * (len(y_test) - 1) / (len(y_test) - X_test.shape[1] - 1)
)
print(ad_r2_score)

# New data prediction

In [None]:
# (number of rows, number of columns) of the full dataset.
boston_df.shape

In [None]:
# Preview the first feature row, i.e. the same row (X.head(1)) that is
# standardized and passed to the model in the following cell.
X.iloc[0]

In [None]:
# New data must be standardized with the already-fitted scaler before prediction.
X_first = scaler.transform(X.iloc[:1])

# Predict the target for that single standardized row.
regression.predict(X=X_first)

# Pickling the Model file for Deployment

In [51]:
import pickle 
import builtins

In [53]:
# Serialize the trained model to a .pkl file so it can be stored locally or
# shipped to a server for deployment.
with builtins.open('regmodel.pkl', mode='wb') as model_file:
    pickle.dump(regression, model_file)

In [54]:
# Reload the serialized model from disk; the restored object carries the fitted
# state that was saved. Only unpickle files from a trusted source.
with builtins.open('regmodel.pkl', mode='rb') as model_file:
    pickled_model = pickle.load(model_file)

In [55]:
# Predict with the reloaded model on the same standardized row as before.
pickled_model.predict(X=X_first)

array([30.08649576])