In [1]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt

In [2]:
dataset = pd.read_csv('data/50_Startups.csv')

In [3]:
dataset.head()

Unnamed: 0,R&D Spend,Administration,Marketing Spend,State,Profit
0,165349.2,136897.8,471784.1,New York,192261.83
1,162597.7,151377.59,443898.53,California,191792.06
2,153441.51,101145.55,407934.54,Florida,191050.39
3,144372.41,118671.85,383199.62,New York,182901.99
4,142107.34,91391.77,366168.42,Florida,166187.94


In [4]:
dataset.describe()

Unnamed: 0,R&D Spend,Administration,Marketing Spend,Profit
count,50.0,50.0,50.0,50.0
mean,73721.6156,121344.6396,211025.0978,112012.6392
std,45902.256482,28017.802755,122290.310726,40306.180338
min,0.0,51283.14,0.0,14681.4
25%,39936.37,103730.875,129300.1325,90138.9025
50%,73051.08,122699.795,212716.24,107978.19
75%,101602.8,144842.18,299469.085,139765.9775
max,165349.2,182645.56,471784.1,192261.83


In [5]:
dataset.count()

R&D Spend          50
Administration     50
Marketing Spend    50
State              50
Profit             50
dtype: int64

In [6]:
dataset.isnull().sum()

R&D Spend          0
Administration     0
Marketing Spend    0
State              0
Profit             0
dtype: int64

In [7]:
X = dataset.iloc[:, :4].values
y = dataset.iloc[:, 4].values

In [8]:
from sklearn.preprocessing import LabelEncoder, OneHotEncoder

In [9]:
state_label_encoder = LabelEncoder()

In [10]:
X[:, 3] = state_label_encoder.fit_transform(X[:, 3])

In [11]:
X[:4, :]

array([[165349.2, 136897.8, 471784.1, 2],
       [162597.7, 151377.59, 443898.53, 0],
       [153441.51, 101145.55, 407934.54, 1],
       [144372.41, 118671.85, 383199.62, 2]], dtype=object)

In [12]:
state_onehot_encoder = OneHotEncoder(categorical_features = [3], dtype = np.int)
X = state_onehot_encoder.fit_transform(X).toarray()

In [13]:
X[:4, :]

array([[0.0000000e+00, 0.0000000e+00, 1.0000000e+00, 1.6534920e+05,
        1.3689780e+05, 4.7178410e+05],
       [1.0000000e+00, 0.0000000e+00, 0.0000000e+00, 1.6259770e+05,
        1.5137759e+05, 4.4389853e+05],
       [0.0000000e+00, 1.0000000e+00, 0.0000000e+00, 1.5344151e+05,
        1.0114555e+05, 4.0793454e+05],
       [0.0000000e+00, 0.0000000e+00, 1.0000000e+00, 1.4437241e+05,
        1.1867185e+05, 3.8319962e+05]])

In [14]:
from sklearn.model_selection import train_test_split

In [15]:
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size = 0.10, random_state = 0)

In [16]:
from sklearn.linear_model import LinearRegression

In [17]:
linear = LinearRegression()

In [18]:
linear.fit(X_train, y_train)



LinearRegression(copy_X=True, fit_intercept=True, n_jobs=1, normalize=False)

In [19]:
y_test_pred = linear.predict(X_test)

In [20]:
linear.coef_

array([ 1.24850276e+02, -1.02835632e+03,  9.03506043e+02,  7.78395354e-01,
        5.70695437e-03,  3.21344322e-02])

In [21]:
linear.intercept_

46250.73721995708

In [22]:
from sklearn.metrics import mean_absolute_error

In [23]:
mean_absolute_error(y_test, y_test_pred)

8830.917535943

# Using Gradient Descent

In [24]:
from sklearn.linear_model import SGDRegressor 

In [25]:
sgd_linear = SGDRegressor(max_iter = 1000)

In [26]:
sgd_linear.fit(X_train, y_train)

SGDRegressor(alpha=0.0001, average=False, epsilon=0.1, eta0=0.01,
       fit_intercept=True, l1_ratio=0.15, learning_rate='invscaling',
       loss='squared_loss', max_iter=1000, n_iter=None, penalty='l2',
       power_t=0.25, random_state=None, shuffle=True, tol=None, verbose=0,
       warm_start=False)

In [27]:
y_test_pred = sgd_linear.predict(X_test)

In [28]:
sgd_linear.coef_

array([-9.62986520e+10,  1.50228357e+10,  1.30173608e+11, -1.28471941e+14,
       -1.35080445e+14, -1.40445567e+14])

In [29]:
sgd_linear.intercept_

array([4.8963973e+10])

In [30]:
from sklearn.metrics import mean_absolute_error

In [31]:
mean_absolute_error(y_test, y_test_pred)

5.983391769859658e+19