## Import the necessary Python packages

In [None]:
import pandas as pd
import matplotlib.pyplot as plt
from sklearn import linear_model
from sklearn.metrics import mean_squared_error, r2_score
from sklearn.model_selection import train_test_split

## load data

In [None]:
# Load the dataset; the path is resolved relative to the current working directory.
data_path = "./data.csv"
df = pd.read_csv(data_path)

### show the first 5 samples

In [None]:
# Preview the first five rows of the loaded table.
df.head(n=5)

## 1. Workflow 

### 1.1 get the independent (X) and dependent (y) variables

In [None]:
# Selecting with a list of column names keeps each result two-dimensional
# (one row per sample, one column per variable).
feature_columns = ["Solids_Reduction"]
target_columns = ["Reduction_in_Oxygen_Demand"]

X = df[feature_columns].values
y = df[target_columns].values

In [None]:
# Axis titles do not depend on the plotted data, so set them up front.
plt.xlabel("Solid Reduction (%)")
plt.ylabel("Reduction in Oxygen Demand (%)")

# Raw observations as red markers.
plt.scatter(x=X, y=y, c="red")
plt.show()

### 1.2 create a linear regression object, then train the model

In [None]:
# fit() returns the estimator itself, so construction and training are chained.
regr = linear_model.LinearRegression().fit(X, y)
print("We have trained a linear model!")

#### show the parameters

In [None]:
# The fitted intercept is stored as a NumPy value; .item() unwraps it into a
# plain Python scalar (this requires it to hold exactly one element).
fitted_intercept = regr.intercept_
beta_0 = fitted_intercept.item()
print("beta_0 (intercept) is:", beta_0)

In [None]:
# With a single feature there is exactly one coefficient; .item() unwraps it
# from its NumPy container into a plain Python scalar.
fitted_coefficients = regr.coef_
beta_1 = fitted_coefficients.item()
print("beta_1 (slope) is:", beta_1)

### 1.3 apply the model

#### 1.3.1 based on parameters

In [None]:
# Evaluate the fitted line by hand: f(X) = slope * X + intercept, element-wise over X.
y_hat_equation = beta_1 * X + beta_0

In [None]:
# Observations (red) versus values computed by hand from the fitted parameters (blue).
plt.scatter(X, y, c="red", label="true value")
plt.scatter(X, y_hat_equation, c="blue", label="calculate based on parameters")

# Draw the fitted line between the smallest and largest X values. Using the
# first and last rows instead would only span the full data range if the rows
# happened to be sorted by X.
x_line = [X.min(), X.max()]
y_line = [beta_0 + beta_1 * x for x in x_line]
plt.plot(x_line, y_line, label="linear function f(X)")

plt.xlabel("Solid Reduction (%)")
plt.ylabel("Reduction in Oxygen Demand (%)")
plt.legend()
plt.show()

#### 1.3.2 use the method "predict"

In [None]:
# Let the fitted estimator compute the predictions for every sample in X.
y_hat = regr.predict(X=X)

In [None]:
# Observations (red) versus values computed by hand from the fitted parameters (blue).
plt.scatter(X, y, c="red", label="true value")
plt.scatter(X, y_hat_equation, c="blue", label="calculate based on parameters")

# Draw the fitted line between the smallest and largest X values. Using the
# first and last rows instead would only span the full data range if the rows
# happened to be sorted by X.
x_line = [X.min(), X.max()]
y_line = [beta_0 + beta_1 * x for x in x_line]
plt.plot(x_line, y_line, label="linear function f(X)")

# Predictions returned by the estimator (black), for comparison with the
# hand-computed values plotted in blue.
plt.scatter(X, y_hat, c="black", label='use the method "predict"')

plt.xlabel("Solid Reduction (%)")
plt.ylabel("Reduction in Oxygen Demand (%)")
plt.legend()
plt.show()

### 1.4 model evaluation

In [None]:
# The `squared=False` argument of mean_squared_error was deprecated in
# scikit-learn 1.4 and removed in 1.6. Taking the square root of the MSE
# explicitly gives the same RMSE for a single target and works on every version.
rmse = mean_squared_error(y_true=y, y_pred=y_hat) ** 0.5
print("root-mean-square-error:", rmse)
print("r2:", r2_score(y_true=y, y_pred=y_hat))

## 2. If we don't have an independent testing dataset

### 2.1 split the whole data into training and testing data

In [None]:
# Hold out 33% of the samples for testing; the fixed seed makes the split reproducible.
split = train_test_split(X, y, test_size=0.33, random_state=520)
X_train, X_test, y_train, y_test = split

# Train a fresh model on the training portion only (fit() returns the estimator).
regr = linear_model.LinearRegression().fit(X_train, y_train)
print("We have trained a linear model!")

### 2.2 training results

In [None]:
# Score the model on the same samples it was trained on.
y_pred = regr.predict(X_train)

# The `squared=False` argument of mean_squared_error was deprecated in
# scikit-learn 1.4 and removed in 1.6, so the square root is taken explicitly.
rmse_train = mean_squared_error(y_true=y_train, y_pred=y_pred) ** 0.5
print("root mean square error:", rmse_train)
print("r2:", r2_score(y_true=y_train, y_pred=y_pred))

### 2.3 testing results

In [None]:
# Score the model on the held-out samples it did not see during training.
y_pred = regr.predict(X_test)

# The `squared=False` argument of mean_squared_error was deprecated in
# scikit-learn 1.4 and removed in 1.6, so the square root is taken explicitly.
rmse_test = mean_squared_error(y_true=y_test, y_pred=y_pred) ** 0.5
print("root mean square error:", rmse_test)
print("r2:", r2_score(y_true=y_test, y_pred=y_pred))

In [None]:
# Plot the held-out observations and the model's predictions for them against
# the same X values; each entry is (y values, marker colour, legend label).
series_to_plot = (
    (y_test, "red", "true value"),
    (y_pred, "blue", "predicted value"),
)
for y_values, colour, caption in series_to_plot:
    plt.scatter(X_test, y_values, c=colour, label=caption)

plt.xlabel("Solid Reduction (%)")
plt.ylabel("Reduction in Oxygen Demand (%)")
plt.legend()
plt.show()