In [None]:
import numpy as np
import seaborn as sns
from matplotlib import pyplot as plt
import pandas as pd

# Predicting the weight of ROUSes using Linear Regression
Using other data we have in the table, we want to predict the weight of ROUSes.

In [None]:
# Read the ROUS table from the CSV next to this notebook, report its
# (rows, columns) shape, and preview the first five rows.
rouses = pd.read_csv('ROUSes.csv')
print(rouses.shape)
rouses.head(n=5)

# Exploratory analysis
First, let's look at some scatterplots of the numeric variables vs. weight.

In [None]:
# Age on the x-axis against Weight on the y-axis, one point per row.
sns.scatterplot(x='Age', y='Weight', data=rouses)

In [None]:
# Length on the x-axis against Weight on the y-axis, one point per row.
sns.scatterplot(x='Length', y='Weight', data=rouses)

Age seems to have a decently linear correlation with weight.  Length looks more like a quadratic (squared) relationship.  So we'll try using Age to predict Weight.  Let's drop the other columns.

In [None]:
# Drop the columns we are not using as predictors: 'Temperament' and 'Length'.
# errors='ignore' keeps this cell safe to re-run: `rouses` is overwritten here,
# so on a second execution the columns are already gone and a plain drop()
# would raise a KeyError.
rouses = rouses.drop(columns=['Temperament', 'Length'], errors='ignore')
rouses.head()

We will **randomly split the sample into two groups, and reserve the samples in the test set for evaluating the model**. 

Think of it as a professor reserving some questions in the question bank for the actual test (test) and releasing the rest as practice questions (train). Why would giving all the questions ahead of time be an inaccurate way to evaluate a student's understanding (model)?

In [None]:
# Fix the shuffle with random_state so the 80/20 split -- and therefore the
# train/test scores computed from it -- is identical on every run. Without it,
# each execution draws a different random partition.
train = rouses.sample(frac=0.8, random_state=1234)  # 80% of rows for training
test = rouses.drop(index=train.index)  # remaining rows for testing
print(train.shape, test.shape)

The next thing to do is to separate out the target data `Weight` from the predictor data (everything else; in this case just Age is left).

In [None]:
# Split each partition into predictors (X) and target (y): 'Weight' is the
# target, and every other remaining column is kept as a predictor.
X_train = train.drop('Weight', axis='columns')
y_train = train['Weight']
print(X_train.shape, y_train.shape)

X_test = test.drop('Weight', axis='columns')
y_test = test['Weight']
print(X_test.shape, y_test.shape)

## Training or fitting a model 

To use a machine learning model from `scikit-learn`, you should import the relevant model.

In [None]:
from sklearn.linear_model import LinearRegression

# Linear regression with an intercept term; fit() returns the fitted
# estimator, so construction and fitting can be chained.
lr = LinearRegression(fit_intercept=True).fit(X_train, y_train)

# score() is evaluated once on the training split and once on the test split.
print('Train score:', lr.score(X_train, y_train))
print('Test score:', lr.score(X_test, y_test))

# One row per predictor, labelled with the column names seen during fit().
coefs = pd.DataFrame(
    data=lr.coef_,
    index=lr.feature_names_in_,
    columns=['Coefficient vals'],
)
coefs



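For linear regression the "score" is the $R^2$ value. A score of 1 is a perfect fit, a score of 0 means the model does no better than always predicting the mean Weight, and the score can even be negative if the model does worse than that.  A score near 1 is what we are hoping for.  So our model is doing a very good job at predicting the data!  The coefficient tells us how much change in Weight to expect with every one-unit change in Age.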

To visualize our line, we can plug some values in and have their outputs predicted:

In [None]:
# Two Age values (1 and 55) to feed through the fitted model; the frame uses
# the same 'Age' column name the model was trained on.
predict_inputs = pd.DataFrame(data=[1, 55], columns=['Age'])
predictions = lr.predict(predict_inputs)
predictions

In [None]:
fig, ax = plt.subplots(nrows=1)

# Red segment joining the model's predictions at Age 1 and Age 55, drawn
# before the observed (Age, Weight) points.
ax.plot([1, 55], predictions[:2], color="red", label="linear regression model")
ax.scatter(rouses['Age'], rouses['Weight'], label="data")
ax.set_xlabel('Age')
ax.set_ylabel('Weight')
ax.legend()

Indeed, it looks like this line is a very good fit to our data!

# Another example
Here is another linear regression example, this time using synthetically generated data for which we actually know the **true** model.

In [None]:
# Seed NumPy's global random generator so the same data set is produced
# on every run.
np.random.seed(1234)

# The true model is the line Y = a*X + b with a = 1 and b = 1.
a, b = 1, 1

n_points = 100

# X is drawn first, then the noise, so the random draws happen in a fixed
# order. The noise is randn() output scaled by 0.2 and added to the true line.
X = np.random.rand(n_points)
noise = 0.2 * np.random.randn(n_points)
Y = a * X + b + noise

In [None]:
fig, ax = plt.subplots(nrows=1)

# Black segment from (0, 1) to (1, 2): the true line Y = X + 1 over [0, 1],
# drawn before the noisy samples.
ax.plot([0, 1], [1, 2], color="black", label="true model")
ax.scatter(X, Y, label="data")
ax.set_xlabel('X')
ax.set_ylabel('Y')
ax.legend()

In [None]:
# Put the synthetic samples in a DataFrame and split them 80/20.
# random_state is passed explicitly so the split does not depend on whatever
# state NumPy's global generator happens to be in; re-running this cell on
# its own now yields the same partition every time.
df = pd.DataFrame(data={'Y': Y, 'X': X})
train = df.sample(frac=0.8, random_state=1234)  # 80% of rows for training
test = df.drop(index=train.index)  # remaining rows for testing
print(train.shape, test.shape)
df


In [None]:
# Split each partition into predictors (X) and target (y): 'Y' is the target,
# and the remaining column is kept as the predictor frame.
X_train = train.drop('Y', axis='columns')
y_train = train['Y']
print(X_train.shape, y_train.shape)

X_test = test.drop('Y', axis='columns')
y_test = test['Y']
print(X_test.shape, y_test.shape)

In [None]:
from sklearn.linear_model import LinearRegression

# Fit a linear regression (with intercept) on the synthetic training split.
# fit() returns the estimator itself, so ending the cell with `lr` displays
# the same object the fit() call would.
lr = LinearRegression(fit_intercept=True).fit(X_train, y_train)
lr



In [None]:
# R2 value of the fitted model on the training split.
train_r2 = lr.score(X_train, y_train)
train_r2

In [None]:
lr.score(X_test, y_test) 
# R2 value for our held-out test data (rows the model was not fitted on)

In [None]:
# Tabulate the fitted coefficients: one row per predictor, labelled with the
# column names the model saw during fit().
coefs = pd.DataFrame(
    data=lr.coef_,
    index=lr.feature_names_in_,
    columns=['Coefficient vals'],
)
coefs



In [None]:
# Evaluate the fitted model at X = 0 and X = 1, the two ends of the segment
# used to draw the true model.
prediction_inputs = pd.DataFrame(data=[0, 1], columns=['X'])
predictions = lr.predict(prediction_inputs)
predictions

In [None]:
fig, ax = plt.subplots(nrows=1)

# Draw order is kept: true line (black), fitted line (red) through the
# predictions at X = 0 and X = 1, then the noisy samples.
ax.plot([0, 1], [1, 2], color="black", label="true model")
ax.plot([0, 1], predictions[:2], color="red", label="linear regression model")
ax.scatter(X, Y, label="data")
ax.set_xlabel('X')
ax.set_ylabel('Y')
ax.legend()