# Notebook for multiple linear regression. Dataset: Boston House Prices. 


## Load the basic libraries for data processing

In [1]:
import numpy as np
import pandas as pd 
import matplotlib.pyplot as plt 
import seaborn as sns 

## Import the dataset, which is available through the scikit-learn library.

In [2]:
# Load the Boston House Prices dataset through scikit-learn's loader.
# NOTE(review): load_boston was deprecated in scikit-learn 1.0 and removed in 1.2,
# so this cell presumably requires scikit-learn < 1.2 — confirm the environment pin.
from sklearn.datasets import load_boston
dataset = load_boston()

# **Section 1. Overview of the Dataset**

In [None]:
# Display the raw object returned by load_boston().
dataset

In [4]:
# List the fields available on the loaded dataset object.
print(dataset.keys())

dict_keys(['data', 'target', 'feature_names', 'DESCR', 'filename'])


###  Look into the description field (DESCR)

In [None]:
# Print the dataset's DESCR (description) field.
print(dataset.DESCR)

### Convert the dataset into tabular format (a pandas DataFrame)

In [6]:
# Build the feature table: one column per entry in dataset.feature_names.
df = pd.DataFrame(
    data=dataset.data,
    columns=dataset.feature_names,
)

In [None]:
# Preview the first 10 rows of the feature table.
df.head(10)

### Add the target or dependent feature column to the matrix or dataframe

In [8]:
# Attach the target values as a 'Cost' column alongside the features.
df = df.assign(Cost=dataset.target)


In [None]:
# Preview the first 10 rows again, now that df also carries the Cost column.
df.head(10)

# **Section 2. Preprocess the data and EDA**

### 1. Check for any missing values

In [None]:
# Count the null entries in each column of df.
df.isnull().sum()


### 2. Let's see how the dependent variable is distributed

In [None]:
# Plot the distribution of the dependent variable (Cost) as a density
# histogram with a KDE curve overlaid.
sns.set(rc={'figure.figsize':(10,10)})
# sns.distplot is deprecated in seaborn; histplot with stat="density" and
# kde=True is its replacement. cut=3 lets the KDE curve extend past the data
# range the way distplot's curve did.
sns.histplot(df['Cost'], bins=10, kde=True, stat="density", kde_kws=dict(cut=3))
plt.show()

### It looks like the dependent variable follows a bell curve (normal distribution), though there seems to be an outlier around Cost = 50.
### 3. Let's see how the cost changes with rooms and crime



In [None]:
# Scatter plot of Cost against RM (the rooms feature).
sns.scatterplot(data=df, x="RM", y="Cost")


In [None]:
# Scatter plot of Cost against CRIM (the crime feature).
sns.scatterplot(data=df, x="CRIM", y="Cost")


### This makes sense: as the number of rooms increases, the cost also increases. We also see that houses in high-crime areas cost less. By "makes sense" I mean correlation! It may or may not be causation.

### 4. Let's make a correlation plot to see how the features are correlated.

In [None]:
# Annotated heatmap of the pairwise correlations between all columns of df.
sns.heatmap(data=df.corr(), annot=True,cmap="vlag")


### Correlation varies from -1 to 1. Correlation values close to 1 mean high positive correlation, and values close to -1 mean high negative correlation. Values close to 0 mean there is little or no correlation. Correlation does not mean causation, but it serves as an indicator of possible influence among features.

## Let's set a threshold. 
### Correlation >= 0.7 is considered highly positive correlation
### Correlation <= -0.7 is considered highly negative correlation


### **Observations for high correlation with dependent feature Cost**
1. Cost and LSTAT have high negative correlation
2. Cost and RM have high positive correlation

### **Observations for high correlation among the independent features**
1. INDUS and NOX 
2. INDUS and DIS
3. INDUS and TAX
4. NOX and AGE
5. NOX and DIS
6. AGE and DIS
7. RAD and TAX



### 5. Keep independent features that are highly correlated with the dependent feature.
### Remove independent features that are highly correlated with each other, to avoid multicollinearity issues.

In [15]:
# Baseline feature set: every column except the target.
# Experiment idea: also drop other variables (e.g. INDUS and RAD) and compare the effect on R^2.
X_indep_feat = df.drop(columns=['Cost'])
# Target (dependent) variable.
Y_dep_feat = df['Cost']

# **Section 3 Split the data into training and testing set**

In [None]:
from sklearn.model_selection import train_test_split

# Hold out 20% of the rows for testing; random_state is fixed so the split is repeatable.
X_train, X_test, Y_train, Y_test = train_test_split(
    X_indep_feat,
    Y_dep_feat,
    test_size=0.2,
    random_state=2,
)

# Report the shape of each split, in the order: X_train, X_test, Y_train, Y_test.
for split in (X_train, X_test, Y_train, Y_test):
    print(split.shape)

# **Section 4 Build Multiple Linear Regression Model**
---



In [17]:
from sklearn.linear_model import LinearRegression
from sklearn.metrics import mean_squared_error
from sklearn.metrics import r2_score

In [None]:
# Ordinary least-squares linear regression fitted on the training split.
# The `normalize` keyword was deprecated and later removed from scikit-learn's
# LinearRegression; False was its default, so omitting it keeps the same model
# while remaining compatible with current releases.
lm = LinearRegression()
lm.fit(X_train, Y_train)

In [20]:
def adjusted_r2 (R2,n,p):
    """Return the adjusted R^2 of a regression model that has an intercept.

    Adjusted R^2 penalises R^2 for the number of predictors:

        adj_R2 = 1 - (1 - R2) * (n - 1) / (n - p - 1)

    Parameters
    ----------
    R2 : float
        Ordinary coefficient of determination.
    n : int
        Sample size (number of observations scored).
    p : int
        Number of independent features (the intercept is NOT counted).

    Returns
    -------
    float
        The adjusted R^2.

    Raises
    ------
    ValueError
        If n <= p + 1, i.e. there are no residual degrees of freedom and the
        statistic is undefined.
    """
    # Residual degrees of freedom: n observations minus p slopes minus 1 intercept.
    residual_dof = n - p - 1
    if residual_dof <= 0:
        raise ValueError(
            "adjusted R^2 is undefined when n <= p + 1 (n={}, p={})".format(n, p)
        )
    return 1 - (1 - R2) * (n - 1) / residual_dof


### Evaluate the model on the training data

In [21]:
# Score the fitted model on the same data it was trained on.
# Rows of X_train are samples; columns are the independent features.
n_training, p = X_train.shape

y_train_predict = lm.predict(X_train)
r2_training = r2_score(Y_train, y_train_predict)
mse_training = mean_squared_error(Y_train, y_train_predict)
adj_r2_traing = adjusted_r2(r2_training, n_training, p)


In [None]:
# Print the training-set metrics, one per line, followed by a blank separator.
training_report = (
    "The model performance for training set",
    "-----------------------------------",
    'MSE is {}'.format(mse_training),
    'R2 score is {}'.format(r2_training),
    'Adj R2 score is {}'.format(adj_r2_traing),
    "\n",
)
for report_line in training_report:
    print(report_line)





### Evaluate the model on Testing Data

In [23]:
# Score the model on the held-out test split.
# `p` (the feature count) is reused from the training-evaluation cell.
n_test = X_test.shape[0]

y_test_predict = lm.predict(X_test)
r2_test = r2_score(Y_test, y_test_predict)
mse_test = mean_squared_error(Y_test, y_test_predict)
adj_r2_test = adjusted_r2(r2_test, n_test, p)


In [None]:


# Print the test-set metrics, one per line, followed by a blank separator.
test_report = (
    "The model performance for testing set",
    "-----------------------------------",
    'MSE is {}'.format(mse_test),
    'R2 score is {}'.format(r2_test),
    'Adj R2 score is {}'.format(adj_r2_test),
    "\n",
)
for report_line in test_report:
    print(report_line)