In [None]:
import math

import pandas as pd
import matplotlib.pyplot as plt

from sklearn.linear_model import LinearRegression
from sklearn import metrics

import statsmodels.api as sm

In [None]:
# Load the height/weight dataset; the first CSV column is used as the row index.
df = pd.read_csv(
    '../data/weight_height.csv',
    index_col=0,
)
df  # last expression of the cell -> rendered as the cell output

In [None]:
# Pairwise correlation matrix of the columns (Pearson is the default method).
df.corr()

In [None]:
# scikit-learn
# The estimator expects a 2-D feature array, because a model may have several
# independent variables. Simple linear regression has exactly one, so the 1-D
# Height column is reshaped into a 2-D array whose second dimension has size 1.
x = df['Height'].to_numpy().reshape(-1, 1)
y = df['Weight'].to_numpy()

# Fit on 100% of the dataset (no train/test split), then predict on the same rows.
lr = LinearRegression(fit_intercept=True).fit(x, y)
y_pred = lr.predict(x)

print('Coefficients = ', lr.coef_)

In [None]:
# Intercept term of the fitted line (the predicted Weight at Height = 0).
print('Intercept = ', lr.intercept_)

In [None]:
# Coefficient of determination on the same data the model was fitted on;
# a larger value, i.e. closer to 1.0, is better.
print('R^2 = ', lr.score(x, y))

In [None]:
# Root-mean-squared error of the in-sample predictions;
# a smaller value, i.e. closer to 0.0, is better.
mse = metrics.mean_squared_error(y, y_pred)
print('Root MSE = ', math.sqrt(mse))

In [None]:
# statsmodels
# Use dedicated names for the pandas Series so that the NumPy arrays `x`/`y`
# used by the scikit-learn cells are not overwritten: with a 1-D Series bound
# to `x`, re-running `lr.score(x, y)` would fail because it needs 2-D input.
height = df['Height']
weight = df['Weight']

x2 = sm.add_constant(height)  # add the constant (intercept) column; OLS does not add one itself
ols = sm.OLS(weight, x2)
est = ols.fit()
est.summary()  # nicely formatted, comprehensive regression report

In [None]:
# Plot the observations together with the fitted regression line.
# Height/Weight are read straight from `df`, so this cell does not depend on
# whatever the loose `x`/`y` variables currently hold.
heights = df['Height']
weights = df['Weight']

fig, ax = plt.subplots()
ax.set_title('Linear Regression Line')
ax.set_xlabel('Height')
ax.set_ylabel('Weight')
ax.scatter(heights, weights, color='black')
# `y_pred` was produced by lr.predict on the Height column, so it is
# row-aligned with `heights`.
ax.plot(heights, y_pred, color='blue', linewidth=3)
plt.show()  # render the figure and keep the Line2D repr out of the cell output