# Simple Linear Regression

## Import the relevant libraries

In [2]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
sns.set()

from sklearn.linear_model import LinearRegression

## Load the data

In [3]:
# Read the SAT/GPA sample into a DataFrame; the path is relative to the
# directory the notebook kernel is running in.
csv_path = 'reference/S4_L38/1.01. Simple linear regression.csv'
data = pd.read_csv(csv_path)

In [4]:
# Preview the first five rows to check the columns that were loaded.
data.head(n=5)

Unnamed: 0,SAT,GPA
0,1714,2.4
1,1664,2.52
2,1760,2.54
3,1685,2.74
4,1693,2.83


## Create the regression

### Declare the dependent and independent variables

In [5]:
# SAT is the independent variable (input); GPA is the dependent variable (target).
x, y = data['SAT'], data['GPA']

In [6]:
# Inspect the shape of the input Series before it is reshaped into a 2-D array.
x.shape

(84,)

In [7]:
# Inspect the shape of the target Series; it is passed to fit() without reshaping.
y.shape

(84,)

In [8]:
# Turn the 1-D SAT values into a single-column 2-D array (one row per sample),
# which is the input layout handed to the regression below.
x_matrix = x.values[:, np.newaxis]

In [9]:
# Check that the reshape produced a two-dimensional, single-column array.
x_matrix.shape

(84, 1)

### Regression

In [10]:
# Create the (still unfitted) linear regression estimator.
# NOTE(review): n_jobs=2 requests two parallel jobs; for a single-feature,
# single-target fit this presumably gives no speed-up — confirm against the
# scikit-learn documentation before relying on it.
reg = LinearRegression(n_jobs=2)

In [11]:
# Fit the model: x_matrix holds the inputs (one column), y holds the targets.
reg.fit(X=x_matrix, y=y)

LinearRegression(copy_X=True, fit_intercept=True, n_jobs=2, normalize=False)

### $R^2$

In [12]:
# Score the fitted model on the same data it was trained on.
reg.score(X=x_matrix, y=y)

0.40600391479679754

### Coefficients

In [13]:
# Fitted slope(s) as an array; there is one input column here, so one entry.
reg.coef_

array([0.00165569])

### Intercept

In [14]:
# Fitted constant term of the regression line.
reg.intercept_

0.2750402996602799

### Making Predictions

In [15]:
# Predict GPA for two SAT scores. The model was fitted on a plain 2-D array
# (x_matrix), so the new inputs are passed in the same single-column array
# layout; a named DataFrame here would not match how the model was trained.
reg.predict(np.array([1740, 1760]).reshape(-1, 1))

array([3.15593751, 3.18905127])

In [16]:
# Two SAT scores to run through the fitted model, held in a one-column frame.
new_data = pd.DataFrame({'SAT': [1740, 1760]})
new_data

Unnamed: 0,SAT
0,1740
1,1760


In [17]:
# Predict from the SAT column only, as a 2-D array like the training input.
# Selecting the column explicitly keeps this cell working even after other
# columns have been added to new_data.
reg.predict(new_data[['SAT']].values)

array([3.15593751, 3.18905127])

In [18]:
# Store the predictions next to their inputs. Only the SAT column is fed to
# the model: passing the whole frame would break on a second run of this
# cell, because new_data would then already contain Predicted_GPA as an
# extra column while the model expects a single feature.
new_data['Predicted_GPA'] = reg.predict(new_data[['SAT']].values)
new_data

Unnamed: 0,SAT,Predicted_GPA
0,1740,3.155938
1,1760,3.189051


In [19]:
# Scatter plot of the observed (SAT, GPA) pairs.
plt.scatter(x, y)
plt.xlabel('SAT', fontsize=20)
plt.ylabel('GPA', fontsize=20)

# Fitted line evaluated at every observed SAT score:
#   yhat = coef * SAT + intercept   (roughly 0.0017 * SAT + 0.2750 for this fit)
yhat = reg.coef_*x_matrix + reg.intercept_
# plt.plot returns a list of Line2D artists (not a Figure); the name `fig`
# is kept unchanged for compatibility.
fig = plt.plot(x, yhat, lw=4, c='orange', label='regression line')

# Draw the legend so the 'regression line' label is actually shown.
plt.legend()

plt.show()