Simple Linear Regression Problem with 2 Variables
Predict the percentage of marks that a student is expected to score based on the number of hours they studied.

In [13]:
# Import python libraries
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt

In [None]:
# Load the CSV dataset into a pandas DataFrame
dataset = pd.read_csv("./data/student_scores.csv")
# Preview the first few rows to sanity-check that the file was parsed as expected
dataset.head()

In [None]:
# Display the dataset dimensions as a (number of rows, number of columns) tuple
dataset.shape

In [None]:
# Display summary statistics (count, mean, std, min, quartiles, max) for the numeric columns
dataset.describe()

In [None]:
# Scatter the raw observations: style="o" draws one marker per row with no connecting line.
# DataFrame.plot returns the matplotlib Axes it drew on, so label that Axes directly.
scatter_ax = dataset.plot(x="Hours", y="Scores", style="o")
scatter_ax.set_title("Hours vs Percentage")
scatter_ax.set_xlabel("Hours studied")
scatter_ax.set_ylabel("Percentage score")
plt.show()

# The plot shows a positive linear relationship between the number of hours studied and the
# percentage score.
# In a positive linear relationship, the dependent variable increases as the independent
# variable increases. In a negative linear relationship, the dependent variable moves in the
# opposite direction to the independent variable.

In [18]:
# Extract attributes and labels
# X = attributes. The ":-1" column slice keeps every column except the last one, which is
# "Scores". Slicing (rather than picking a single column) keeps X two-dimensional, which is
# the feature-matrix shape scikit-learn estimators expect.
X = dataset.iloc[:, :-1].values

# y = labels. The last column ("Scores") is selected with "-1" instead of a hard-coded "1",
# so the label selection always agrees with the "all but the last column" slice used for X.
y = dataset.iloc[:, -1].values

In [19]:
# Partition the samples into a training portion and a held-out test portion
from sklearn.model_selection import train_test_split

# test_size is the proportion of samples reserved for the test set (0.2 -> 20%);
# random_state is fixed so that the same split is produced on every run.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=0,
)

In [None]:
# Train the model
from sklearn.linear_model import LinearRegression

# Create the estimator and fit it on the training data in one step;
# fit() returns the estimator itself, so the name is bound to the fitted model.
regressor = LinearRegression().fit(X_train, y_train)

# Echo the fitted estimator as the cell output
regressor

In [None]:
# Show the fitted line's parameters, one per line:
# first the intercept, then the slope (coefficient array)
print(regressor.intercept_, regressor.coef_, sep="\n")

In [22]:
# Making predictions
# y_pred is a numpy array that contains the predicted value for each input row in the X_test array.
y_pred = regressor.predict(X_test)

In [None]:
# Put the true test labels next to the model's predictions for a side-by-side comparison
df = pd.DataFrame(
    data={
        "Actual": y_test,
        "Predicted": y_pred,
    }
)
df

In [None]:
# Evaluate the algorithm using the three metrics commonly reported for regression

from sklearn import metrics

# Mean Absolute Error (MAE): the mean of the absolute values of the errors
mae_value = metrics.mean_absolute_error(y_test, y_pred)
print("Mean Absolute Error:", mae_value)

# Mean Squared Error (MSE): the mean of the squared errors
mse_value = metrics.mean_squared_error(y_test, y_pred)
print("Mean Squared Error:", mse_value)

# Root Mean Squared Error (RMSE): the square root of the MSE computed above
print("Root Mean Squared Error:", np.sqrt(mse_value))