In [None]:
import numpy as np
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt
from sklearn.model_selection import KFold, train_test_split
from sklearn.feature_selection import SelectKBest, chi2
from sklearn.preprocessing import MinMaxScaler #For feature Scaling
from sklearn.preprocessing import StandardScaler
from sklearn.preprocessing import Normalizer
from sklearn.linear_model import LinearRegression
% matplotlib inline 

# List to denote custom field names
fields = ['CRIM','ZN','INDUS','CHAS','NOX','RM','AGE','DIS','RAD','TAX','PTRATIO','B','LSTAT','MEDV']

# read csv and apply custom names to it 
# 'delim_whitespace' separates the field by a whitespace when set to True
data = pd.read_csv('housing.csv',delim_whitespace=True, names = fields)
data.head()

### Preprocess data

In [None]:
# Feature matrix: every column except the target (MEDV) and the CHAS dummy variable.
# `columns=` is used because passing the axis positionally (`drop(labels, 1)`)
# is no longer accepted by pandas 2.x.
X = data.drop(columns=['MEDV', 'CHAS'])
# Target: MEDV
y = data['MEDV']

# Visualize the pairwise correlation between the features
correlations = X.corr()

# Plot the correlation matrix on a fixed [-1, 1] colour scale
fig, ax = plt.subplots()
cax = ax.matshow(correlations, vmin=-1, vmax=1)
fig.colorbar(cax)

# Label both axes with the feature names so the matrix is readable on its own
feature_names = correlations.columns
tick_positions = np.arange(len(feature_names))
ax.set_xticks(tick_positions)
ax.set_yticks(tick_positions)
ax.set_xticklabels(feature_names, rotation=90)
ax.set_yticklabels(feature_names)
plt.show()

### Let's see how the average number of rooms affects the price of the house
From the plot below, we can easily tell that as the average number of rooms increases, the value of the house increases as well.

In [None]:
# Scatter plot of MEDV against RM with a fitted regression line.
sns.lmplot(
    data=data,
    x='RM',
    y='MEDV',
    palette='Set1',
)

# Relabel the axes of the figure that lmplot just created.
rm_axes = plt.gca()
rm_axes.set_xlabel('average number of rooms per dwelling')
rm_axes.set_ylabel('Median value of owner-occupied homes in ($1000)')
plt.show()

### Let's see how the crime rate by town affects the price of the house
From the plot below, we can see that with the crime rate in the range of 0 - 15, the values of the houses were still increasing, but as the crime rate increased further, the values dropped. From this information, we can conclude that the higher the crime rate, the lower the house is valued, and vice versa.

In [None]:
# Scatter plot of MEDV against CRIM with a fitted regression line.
sns.lmplot(
    data=data,
    x='CRIM',
    y='MEDV',
    palette='Set1',
)

# Relabel the axes of the figure that lmplot just created.
crim_axes = plt.gca()
crim_axes.set_xlabel('Crime rate per town')
crim_axes.set_ylabel('Value of House')
plt.show()

### Here, we visualize how the Charles River dummy variable (CHAS; = 1 if tract bounds river, 0 otherwise) affects the value of the house
We can see that the plot doesn't necessarily make sense, so we drop the entire column.

**Note:** After dropping the CHAS column, the score of our model (the R² returned by `LR.score`, shown as a percentage) increased from 79% to 80.39%.

In [None]:
# Scatter plot of MEDV against the CHAS dummy variable with a fitted regression line.
sns.lmplot(
    data=data,
    x='CHAS',
    y='MEDV',
    palette='Set1',
)

# Relabel the axes of the figure that lmplot just created.
chas_axes = plt.gca()
chas_axes.set_xlabel('CHAS')
chas_axes.set_ylabel('Value of House')
plt.show()

### Let's see how the age of the house affects the price of the house
From the plot below, we can see that as the age of the house increases, its value decreases.

In [None]:
# Scatter plot of MEDV against AGE with a fitted regression line.
sns.lmplot(
    data=data,
    x='AGE',
    y='MEDV',
    palette='Set1',
)

# Relabel the axes of the figure that lmplot just created.
age_axes = plt.gca()
age_axes.set_xlabel('The Age of The House')
age_axes.set_ylabel('Value of House')
plt.show()

### Hold-out validation using train_test_split

In [None]:
# Rescale every feature into the [0, 1] range.
# NOTE(review): the scaler is fitted on the full data set before the split, so the
# test rows influence the min/max used for scaling (mild data leakage). Fitting on
# the training rows only would be cleaner; `scaledX` and `norm` are left unchanged
# here in case other cells reuse them.
scaler = MinMaxScaler(feature_range = (0,1))
scaledX = scaler.fit_transform(X)

# Rescale each sample (row) to unit norm (Normalizer is imported at the top of the notebook).
norm = Normalizer().fit_transform(scaledX)

# Hold out 20% of the rows for testing; the fixed seed keeps the split reproducible.
X_train, X_test, y_train, y_test = train_test_split(norm, y, test_size = 0.2, random_state = 3)

LR = LinearRegression()
LR.fit(X_train, y_train)

# LinearRegression.score returns the coefficient of determination (R^2),
# not a classification accuracy, so it is labelled as such.
print('R^2 Score (%): ', LR.score(X_test, y_test) * 100.00)

# Predict a few test rows ...
pred = LR.predict(X_test[6:10])
print('Prediction: ', pred)
print('Coeffitients: \n', LR.coef_)
print('')
# ... and show the true targets for the SAME test rows. The previous `y[6:10]`
# printed rows 6-9 of the full data set, which are unrelated to X_test[6:10].
print(y_test.iloc[6:10])