# Linear Regression
## Finding meaningful dependencies

In [None]:
# Import libraries
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sb
import pandas as pd

In [None]:
# Boston Housing dataset.
# More information on dataset: https://archive.ics.uci.edu/ml/machine-learning-databases/housing/housing.names
DATA_URL = 'https://archive.ics.uci.edu/ml/machine-learning-databases/housing/housing.data'

# The raw file carries no header row and separates its fields with runs of
# whitespace, so both have to be spelled out for read_csv.
df = pd.read_csv(DATA_URL, header=None, sep=r'\s+')

feature_names = ['CRIM', 'ZN', 'INDUS', 'CHAS', 
                 'NOX', 'RM', 'AGE', 'DIS', 'RAD', 
                 'TAX', 'PTRATIO', 'B', 'LSTAT', 'MEDV']

# Replace the default integer column labels with the feature names
df.columns = feature_names

In [None]:
# Draw every column against every other one to eyeball pairwise relationships
sb.set(context='notebook', style='whitegrid')
sb.pairplot(data=df)
plt.show()

In [None]:
# Find the columns that are most linearly dependent on the MEDV column

# Rank every column by the absolute value of its Pearson correlation with
# MEDV (the sign only tells the direction of the dependency, not its strength).
# MEDV itself ranks first with a correlation of 1.0 and is kept on purpose so
# that the target appears in the plots.
N_COLS = 5
medv_corr = df.corr()['MEDV'].abs()
cols = medv_corr.nlargest(N_COLS).index.tolist()

linear_columns = df[cols]
sb.pairplot(linear_columns)
plt.show()

In [None]:
# Find the coefficient of correlation between values
# Note: numpy corrcoef treats every ROW as one variable by default, whereas in
# dataframe.values every COLUMN is one variable - hence rowvar=False.
# Link: https://numpy.org/doc/stable/reference/generated/numpy.corrcoef.html

# Pearson product-moment correlation matrix of the selected columns
cm = np.corrcoef(df[cols].values, rowvar=False)

sb.set(font_scale=1.5)
heatmap = sb.heatmap(cm, cbar=True, annot=True, square=True,
                     fmt='.2f', annot_kws={'size':15},
                     yticklabels=cols, xticklabels=cols)
plt.show() 
# Restore the default plotting parameters changed by sb.set above
sb.reset_orig()

#### We can see that the house value (MEDV) depends strongly on the number of rooms (RM)

# Let's implement our own linear regression!

Basically, the equation of a line in a two-dimensional plane looks like this:

$$y = x*w_{1}+w_{0}$$

where $w_{1}$ is the slope coefficient and $w_{0}$ is the intercept (the shift along the y-axis).

In [None]:
# Parameters of the regression line: slope w1 and shift w0.
# Both start out as None, i.e. no value has been assigned to them yet.
w1 = w0 = None

Now we will write a function that computes y for a given input x:

In [None]:
def line(x):
    """Evaluate the regression line ``y = x * w1 + w0`` at ``x``.

    The slope ``w1`` and the shift ``w0`` are read from the module-level
    variables of the same name.

    Args:
        x: A number or a numpy array of inputs.

    Returns:
        ``x * w1 + w0``; for an array input the result is element-wise.

    Raises:
        RuntimeError: If ``w1`` or ``w0`` is still ``None``.
    """
    # The globals are only read here, so no ``global`` declaration is needed.
    if w1 is None or w0 is None:
        raise RuntimeError('w1 and w0 must be set before calling line()')

    return x * w1 + w0

The core part of the linear regression algorithm is a method that finds the coefficients w1 and w0. These may be found in different ways (for example, by gradient descent), but we will use the standard statistical method.

What should we do:

1. Calculate mean of all x
2. Calculate mean of all y
3. Find the variance of x: the mean of the squared differences between x and the mean of x
4. Find the covariance between x and y: the mean of the products of (x minus its mean) and (y minus its mean)
5. w1 is the covariance divided by x's variance
6. w0 is the difference between y's mean and w1 multiplied by x's mean 

In [None]:
def core(x, y):
    """Compute the least-squares slope and shift of the line through (x, y).

    Follows the statistical recipe: the slope is the covariance of x and y
    divided by the variance of x, and the shift is the mean of y minus the
    slope times the mean of x.

    Args:
        x: Sequence or numpy array of inputs (flattened to 1-D).
        y: Sequence or numpy array of targets (flattened to 1-D), with as
            many elements as ``x``.

    Returns:
        Tuple ``(w1, w0)`` - slope and shift.

    Raises:
        ValueError: If the inputs are empty, differ in size, or all ``x``
            values are equal (zero variance, slope undefined).
    """
    # Flatten both inputs: an (n, 1) column combined with an (n,) vector would
    # otherwise silently broadcast to an (n, n) matrix in the products below.
    x = np.asarray(x, dtype=float).ravel()
    y = np.asarray(y, dtype=float).ravel()
    if x.size == 0 or x.size != y.size:
        raise ValueError('x and y must be non-empty and of equal size')

    mean_x = np.mean(x)
    mean_y = np.mean(y)

    dev_x = x - mean_x
    var_x = np.mean(dev_x ** 2)
    if var_x == 0:
        raise ValueError('x has zero variance, the slope is undefined')
    cov_xy = np.mean(dev_x * (y - mean_y))

    w1 = cov_xy / var_x
    w0 = mean_y - w1 * mean_x

    return w1, w0

In machine learning, almost all developers who implement ML algorithms use the methods fit and predict as a standard API. If you look at the implementations of algorithms in sklearn, you will see that the developers stick to this rule.

Now let's create a method fit. This method usually takes two arguments - x and y - and returns the object itself. As we write it outside a class, there is no object to return, so our fit returns the coefficients w1 and w0 instead.

In [None]:
def fit(x, y):
    """Fit the model to the samples ``x`` and ``y``.

    All of the work is delegated to ``core``; whatever it returns is handed
    back to the caller unchanged.
    """
    coefficients = core(x, y)
    return coefficients

Finally, make some predictions:

In [None]:
def predict(x):
    """Return the model output for ``x`` by evaluating ``line`` on it."""
    y_pred = line(x)
    return y_pred

# Standard scaler

Here is a class that should help you to standardize your data (remove mean and divide by standard deviation)

In [None]:
class Scaler:
    """Standardize data: subtract the mean, divide by the standard deviation.

    The statistics are scalars computed over ALL elements of the fitted data
    (population standard deviation, i.e. division by N, not N - 1).

    Attributes set by ``fit``:
        mean: Mean of the fitted data.
        N: Number of elements the statistics were computed from.
        std_dev: Population standard deviation of the fitted data.
    """

    def fit(self, data):
        """Compute and store mean, element count and standard deviation.

        Args:
            data: Sequence or numpy array of numbers, of any dimensionality.
        """
        values = np.asarray(data, dtype=float)
        self.mean = values.mean()
        sum_sq = ((values - self.mean) ** 2).sum()
        # Count every element, not just the length of the first axis: the
        # mean and the squared deviations above run over all elements, so
        # dividing by len(data) would inflate std_dev for 2-D input.
        self.N = values.size
        self.std_dev = np.sqrt(sum_sq / self.N)

    def fit_std(self, data):
        """Fit the scaler on ``data`` and return the standardized ``data``."""
        self.fit(data)
        return self.standardize(data)

    def standardize(self, data):
        """Map ``data`` to standardized units using the fitted statistics."""
        return (data - self.mean)/self.std_dev

    def revert(self, data_std):
        """Map standardized values back to the original units."""
        return data_std*self.std_dev+self.mean

# Now we will put everything together

In [None]:
# Start by creating a standard scaler instance for every column - x and y.
# Two separate scalers are needed because each one stores the mean and the
# standard deviation of the column it was fitted on.
x_sc = Scaler()
y_sc = Scaler()
# Select the columns that we are interested in and take their values by
# reading the .values attribute - this returns the numpy arrays we work with.
# x is the average number of rooms (RM), y is the price (MEDV).
x_train = df['RM'].values
y_train = df['MEDV'].values
# Standardize the data: fit each scaler on its column and transform it
x_std = x_sc.fit_std(x_train)
y_std = y_sc.fit_std(y_train)

In [None]:
# Fit the model on the standardized data and keep the resulting coefficients
w1, w0 = fit(x=x_std, y=y_std)

# Visualize results

In [None]:
# Standardized samples as points, with the fitted line drawn on top of them
plt.scatter(x=x_std, y=y_std, color='lightblue')
plt.plot(x_std, predict(x_std), color='orange')
# Axis labels and title
plt.xlabel('Average number of rooms (std)')
plt.ylabel('Price*1000$ (std)')
plt.title('Linear regression')
plt.show()

# How much will 5 rooms cost?

In [None]:
# The model was fitted on standardized data, so the query (5 rooms) has to be
# converted to standardized units with the x scaler first.
input_std = x_sc.standardize(np.array([5.0]))
pred_lr = predict(input_std)
# The prediction is a standardized PRICE, so it must be mapped back to the
# original units with the y scaler - reverting with the x scaler would yield
# a value measured in rooms.
pred_lr = y_sc.revert(pred_lr)[0]

print(f'Linear regression predicted cost: {pred_lr*1000:.2f}$')