In [1]:
import pandas as pd

In [2]:
# Load the wine data set from wine.csv (path is relative to the working directory)
# into the DataFrame that every later cell reads from.
wine = pd.read_csv("wine.csv")

In [3]:
# Preview the first rows to confirm the columns were parsed as expected.
wine.head()

Unnamed: 0,Year,Price,WinterRain,AGST,HarvestRain,Age,FrancePop
0,1952,7.495,600,17.1167,160,31,43183.569
1,1953,8.0393,690,16.7333,80,30,43495.03
2,1955,7.6858,502,17.15,130,28,44217.857
3,1957,6.9845,420,16.1333,110,26,45152.252
4,1958,6.7772,582,16.4167,187,25,45653.805


In [4]:
# Summary statistics (count, mean, std, min, quartiles, max) for each numeric column.
wine.describe()

Unnamed: 0,Year,Price,WinterRain,AGST,HarvestRain,Age,FrancePop
count,25.0,25.0,25.0,25.0,25.0,25.0,25.0
mean,1965.8,7.067224,605.28,16.509336,148.56,17.2,49694.43676
std,7.691987,0.650341,132.277965,0.675397,74.419464,7.691987,3665.270243
min,1952.0,6.2049,376.0,14.9833,38.0,5.0,43183.569
25%,1960.0,6.5188,536.0,16.2,89.0,11.0,46583.995
50%,1966.0,7.1211,600.0,16.5333,130.0,17.0,50254.966
75%,1972.0,7.495,697.0,17.0667,187.0,23.0,52894.183
max,1978.0,8.4937,830.0,17.65,292.0,31.0,54602.193


In [5]:
# Column dtypes and non-null counts; used here to check for missing values.
wine.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 25 entries, 0 to 24
Data columns (total 7 columns):
Year           25 non-null int64
Price          25 non-null float64
WinterRain     25 non-null int64
AGST           25 non-null float64
HarvestRain    25 non-null int64
Age            25 non-null int64
FrancePop      25 non-null float64
dtypes: float64(3), int64(4)
memory usage: 1.4 KB


In [7]:
from sklearn import linear_model

In [8]:
# Least-squares linear regression estimator created with its default settings.
# The same object is re-fitted several times further down the notebook.
linearRegression = linear_model.LinearRegression()

In [9]:
# Display the estimator's repr to show the default hyper-parameters it was created with.
linearRegression

LinearRegression(copy_X=True, fit_intercept=True, n_jobs=1, normalize=False)

#### Here we use `wine[['AGST']]` (a 2-D, one-column DataFrame) instead of `wine['AGST']` (a 1-D Series) because passing a 1-D array as the feature matrix raises a ValueError in scikit-learn 0.19 and later.

In [12]:
# Double brackets select a one-column DataFrame (2-D) rather than a 1-D Series.
X_train_AGST = wine[['AGST']]
# The target is selected as a 2-D frame as well, which is why coef_ and intercept_
# are printed as nested/1-element arrays in the cells that follow.
y_train_price = wine[['Price']]

In [13]:
# Fit the simple (one-predictor) model: Price as a linear function of AGST.
linearRegression.fit(X_train_AGST, y_train_price)

LinearRegression(copy_X=True, fit_intercept=True, n_jobs=1, normalize=False)

#### Here the coefficient 0.63509431 is for AGST (independent variable)

#### Simple Linear Regression, Y = a + bX
#### where 
- **a is the y-intercept (linearRegression.intercept_)**, 
- **Y is the dependent variable (Price in this case)**
- **X is the independent variable (AGST in this case)**
- **b is the slope of the line (linearRegression.coef_)**

In [14]:
# Slope b of the fitted line (the coefficient of AGST).
print("Coefficients: ", linearRegression.coef_)

Coefficients:  [[ 0.63509431]]


In [15]:
# Display the estimator again after fitting; the repr only lists hyper-parameters,
# so it looks the same as before the fit.
linearRegression

LinearRegression(copy_X=True, fit_intercept=True, n_jobs=1, normalize=False)

In [16]:
# y-intercept a of the fitted line.
print("Intercept: ", linearRegression.intercept_)

Intercept:  [-3.41776131]


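#### Sum of Squared Errors (SSE) of the fit on the training data. scikit-learn versions before 0.19 exposed it as the `residues_` attribute; in later versions it has to be computed from the predictions.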

In [17]:
# Sum of squared errors (SSE) of the current fit on the training data.
# It is computed from the predictions because the `residues_` attribute was deprecated
# and then removed in scikit-learn 0.19, where accessing it raises AttributeError.
# `.values` yields an (n_samples, 1) array, so summing over axis 0 keeps the
# one-element array form that `residues_` used to return.
((y_train_price.values - linearRegression.predict(X_train_AGST)) ** 2).sum(axis=0)



array([ 5.73487515])

#### coef_ : array, shape (n_features, ) or (n_targets, n_features)
#### Estimated coefficients for the linear regression problem. If multiple targets are passed during the fit (y 2D), this is a 2D array of shape (n_targets, n_features), while if only one target is passed, this is a 1D array of length n_features.

#### intercept_ : array
#### Independent term in the linear model.

In [18]:
# Add HarvestRain as a second predictor alongside AGST.
X_train_AGST_HarvestRain = wine[['AGST', 'HarvestRain']]

In [19]:
# Re-fit the same estimator object on two predictors; the coefficients from the
# AGST-only fit are replaced.
linearRegression.fit(X_train_AGST_HarvestRain, y_train_price)

LinearRegression(copy_X=True, fit_intercept=True, n_jobs=1, normalize=False)

In [20]:
# One coefficient per predictor, in the column order of X: AGST, then HarvestRain.
print("Coefficients of AGST and HarvestRain",linearRegression.coef_)

Coefficients of AGST and HarvestRain [[ 0.60261691 -0.00457006]]


In [21]:
# Intercept of the two-predictor model.
print("Intercept with two independent variables", linearRegression.intercept_)

Intercept with two independent variables [-2.2026536]


#### Note that the SSE drops from 5.73487515 to 2.97037334 after adding the 'HarvestRain' variable.

In [22]:
# SSE of the two-predictor fit on the training data.
# It is computed from the predictions because the `residues_` attribute was deprecated
# and then removed in scikit-learn 0.19, where accessing it raises AttributeError.
# `.values` yields an (n_samples, 1) array, so summing over axis 0 prints as a
# one-element array, as `residues_` did.
print("Sum of Squared Errors (SSE) with two independent variables",
      ((y_train_price.values - linearRegression.predict(X_train_AGST_HarvestRain)) ** 2).sum(axis=0))

Sum of Squared Errors (SSE) with two independent variables [ 2.97037334]




In [23]:
# Use five predictors. Year is left out; in the head() preview Age drops by one
# for every one-year increase in Year, so it carries the same information.
X_train_all_independent_variables = wine[['AGST', 'HarvestRain', 'WinterRain', 'Age', 'FrancePop']]

In [24]:
# Re-fit the same estimator object on all five predictors; the two-predictor
# coefficients are replaced.
linearRegression.fit(X_train_all_independent_variables, y_train_price)

LinearRegression(copy_X=True, fit_intercept=True, n_jobs=1, normalize=False)

In [25]:
# Coefficients follow the column order of X:
# AGST, HarvestRain, WinterRain, Age, FrancePop.
print("Coefficients of all the variables ", linearRegression.coef_)

Coefficients of all the variables  [[  6.01223884e-01  -3.95812450e-03   1.04250681e-03   5.84748489e-04
   -4.95273038e-05]]


In [26]:
# Intercept of the five-predictor model.
print("Intercept with all the independent variables: ", linearRegression.intercept_)

Intercept with all the independent variables:  [-0.45039886]


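#### Note that the SSE is reduced further, from 2.97037334 to 1.73211272, after adding all the independent variables. The training SSE of a least-squares fit can never increase when predictors are added, so a lower SSE alone does not show that the extra variables are useful.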

In [27]:
# SSE of the five-predictor fit on the training data.
# It is computed from the predictions because the `residues_` attribute was deprecated
# and then removed in scikit-learn 0.19, where accessing it raises AttributeError.
# `.values` yields an (n_samples, 1) array, so summing over axis 0 prints as a
# one-element array, as `residues_` did.
print("Sum of Squared Errors (SSE) with all the independent variables",
      ((y_train_price.values - linearRegression.predict(X_train_all_independent_variables)) ** 2).sum(axis=0))

Sum of Squared Errors (SSE) with all the independent variables [ 1.73211272]


