In [10]:
import pandas as pd 
from sklearn import linear_model
from sklearn.metrics import mean_squared_error
from sklearn.model_selection import train_test_split

dataset = pd.read_csv("./datasets/tips.csv")

X = dataset[["total_bill"]]
y = dataset[["tip"]]

X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=0, train_size=0.7)

reg = linear_model.LinearRegression()
reg.fit(X_train, y_train)
y_pred = reg.predict(X_test)

print("The MSE on test set is {0:.4f}".format(mean_squared_error(y_test, y_pred)))

The MSE on test set is 0.8712


In [9]:
import pandas as pd 
from sklearn import linear_model
from sklearn.metrics import mean_squared_error
from sklearn.model_selection import train_test_split



X = dataset[["total_bill"]]
y = dataset[["tip"]]

X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=0, train_size=0.7)

reg = linear_model.SGDRegressor(loss='squared_error', max_iter=1000)
reg.fit(X_train, y_train)
y_pred = reg.predict(X_test)

print("The MSE on test set is {0:.4f}".format(mean_squared_error(y_test, y_pred)))

The MSE on test set is 2.1533


  y = column_or_1d(y, warn=True)


In [12]:
'''
Ridge Regression L2 Regularization

'''

import pandas as pd 
from sklearn import linear_model
from sklearn.metrics import mean_squared_error
from sklearn.model_selection import train_test_split


dataset = pd.read_csv("./datasets/tips.csv")

X = dataset[["total_bill"]]
y = dataset[["tip"]]

X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=0, train_size=0.7)

reg = linear_model.Ridge(alpha=0.9)
reg.fit(X_train, y_train)
y_pred = reg.predict(X_test)

print("The MSE on test set is {0:.4f}".format(mean_squared_error(y_test, y_pred)))

The MSE on test set is 0.8712


In [16]:
'''
Lasso Regression L1 Regularization

'''

import pandas as pd 
from sklearn import linear_model
from sklearn.metrics import mean_squared_error
from sklearn.model_selection import train_test_split


dataset = pd.read_csv("./datasets/tips.csv")

X = dataset[["total_bill"]]
y = dataset[["tip"]]

X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=0, train_size=0.7)

reg = linear_model.Lasso(alpha=0.1)
reg.fit(X_train, y_train)
y_pred = reg.predict(X_test)

print("The MSE on test set is {0:.4f}".format(mean_squared_error(y_test, y_pred)))

The MSE on test set is 0.8730


In [29]:
"""
Elastic Net Regression

Elastic-Net Regression incorporates both L1-Regularization and L2-Regularization while building the Linear Regression Model. This allows us

Learn a sparse model where few of the weights or parameters are non-zero like Lasso (L1-Regularization).

Maintain the properties of Ridge Regression (L2-Regularization).

l1_ratio is the ElasticNet mixing parameter with a value between 0 and 1.

For l1_ratio = 0, L2-Regularization is used.

For l1_ratio = 1, L1-Regularization is used.

For 0 < l1_ratio < 1, the combination of both L1 and L2 are used.
"""

import pandas as pd 
from sklearn import linear_model
from sklearn.metrics import mean_squared_error
from sklearn.model_selection import train_test_split

dataset = pd.read_csv("./datasets/tips.csv")

X = dataset[["total_bill"]]
y = dataset[["tip"]]

X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=0, train_size=0.7)

reg = linear_model.ElasticNet(l1_ratio=0.5)
reg.fit(X_train, y_train)
y_pred = reg.predict(X_test)

print("The MSE on test set is {0:.4f}".format(mean_squared_error(y_test, y_pred)))

The MSE on test set is 0.8833


In [40]:
'''Support Vector Regression
The idea of Support Vector Regression has been borrowed from Support Vector Machines. In classification, we predict a discrete-valued output. Here are some things to note:

As the name suggests Support Vector Regression is used for predicting the real-valued output.

The model produced by Support Vector Regression depends only on a subset of the training data.

There is a concept of Kernel, which involves mapping the features or columns or dimensions to higher dimensions to make the problem solvable.'''

import pandas as pd 
from sklearn.svm import SVR
from sklearn.metrics import mean_squared_error
from sklearn.model_selection import train_test_split

dataset = pd.read_csv("./datasets/tips.csv")

X = dataset[["total_bill"]]
y = dataset[["tip"]]

X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=0, train_size=0.7)

for k in ['rbf', 'linear', 'sigmoid', 'poly']:
    reg =  SVR(kernel=k)
    reg.fit(X_train, y_train)
    y_pred = reg.predict(X_test)

    print("The MSE on test set is {0:.4f}  ".format(mean_squared_error(y_test, y_pred))+" for kernal "+str(k))

The MSE on test set is 1.0160   for kernal rbf
The MSE on test set is 0.9205   for kernal linear
The MSE on test set is 18.3361   for kernal sigmoid
The MSE on test set is 1.2018   for kernal poly


  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)


In [4]:
'''
Nearest Neighbors Regression
The idea of Nearest Neighbor Regression has been borrowed from Nearest Neighbors Classification. Note that:

The principle behind the nearest neighbors algorithm in regression is to find the nearest, let’s say, k neighbors. The neighbors are calculated based on some measure of similarity or distance calculation. Based on the value K chosen and neighbors retrieved, this algorithm is also called K-Nearest neighbors.

K
K
 is a parameter that can be tuned.

The output value for a new instance is returned by taking the mean of its nearest neighbors in case of Regression. The important thing to remember is that no equation is constructed and no parameters are optimized.

The nearest neighbors algorithms remembers all the training dataset and comes under the category of non-generalizing algorithms of Machine Learning. It stores the training dataset in some efficient data structure.

Implementation in Scikit Learn
The KNeighborsRegressor class implements the KNN algorithm.
'''

import pandas as pd 
from sklearn.metrics import mean_squared_error
from sklearn.model_selection import train_test_split
from sklearn.neighbors import KNeighborsRegressor

dataset = pd.read_csv("./datasets/tips.csv")

X = dataset[["total_bill"]]
y = dataset[["tip"]]

X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=0, train_size=0.7)


for i in range(2,10):
    reg =  KNeighborsRegressor(n_neighbors=i)
    reg.fit(X_train, y_train)
    y_pred = reg.predict(X_test)

    print("The MSE on test set is {0:.4f}".format(mean_squared_error(y_test, y_pred))+" num nearest neighbours "+str(i))

The MSE on test set is 1.1852 num nearest neighbours 2
The MSE on test set is 1.0105 num nearest neighbours 3
The MSE on test set is 1.0105 num nearest neighbours 4
The MSE on test set is 0.9872 num nearest neighbours 5
The MSE on test set is 0.9542 num nearest neighbours 6
The MSE on test set is 1.0054 num nearest neighbours 7
The MSE on test set is 1.0365 num nearest neighbours 8
The MSE on test set is 0.9841 num nearest neighbours 9


In [7]:
'''
The idea of the Decision Tree Regression has been borrowed from Decision Tree Classification. Note that:

Decision Trees create a tree structure from the dataset at hand. It learns the if/else structure from the dataset.

We will look into the key algorithms to make a Decision Tree in the Classification section. In the case of Regression, it just returns the continuous valued output for an instance.
'''

import pandas as pd 
from sklearn.metrics import mean_squared_error
from sklearn.model_selection import train_test_split
from sklearn import tree

dataset = pd.read_csv("./datasets/tips.csv")

X = dataset[["total_bill"]]
y = dataset[["tip"]]

X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=0, train_size=0.7)

reg =  tree.DecisionTreeRegressor()
reg.fit(X_train, y_train)
y_pred = reg.predict(X_test)

print("The MSE on test set is {0:.4f}".format(mean_squared_error(y_test, y_pred)))

The MSE on test set is 2.3152


<p>
Some advanced Regression algorithms are mentioned below.

Least Angle Regression (LARS)
Polynomial Regression
Bayesian Regression
Robustness Regression
Isotonic Regression


</p>