# Overview of Regression


## From scratch example
Source: https://machinelearningmastery.com/implement-simple-linear-regression-scratch-python/

You can use something like this in the homework, which I will publish in a few weeks.

In the homework, you will need to add a measurement of R^2 (the coefficient of determination).


In [None]:
import pandas as pd

In [None]:
# I downloaded the Swedish Insurance dataset, converted it to CSV, and put it on GitHub
url = "https://raw.githubusercontent.com/willwilson-sdsu/CS550/main/insurance.csv"
# Read the CSV straight from the URL into a DataFrame (requires network access)
insurance_original = pd.read_csv(url)
# Preview the first rows, then the per-column summary statistics
print(insurance_original.head())
print(insurance_original.describe())


In [None]:
# Work on a copy so I don't need to keep pulling the data from the web when I break things
# (even though the dataset is pretty small)
insurance = insurance_original.copy()

In [None]:
# Example from https://machinelearningmastery.com/implement-simple-linear-regression-scratch-python/ that computes the statistics from scratch
# We will use a very basic example for this one, then move to the insurance example and use pandas methods for that
# Arithmetic mean of a list of numbers
def mean(values):
    """Return the arithmetic mean of ``values`` as a float.

    ``values`` must support ``sum()`` and ``len()``. An empty sequence
    raises ``ZeroDivisionError`` because the count is zero.
    """
    total = sum(values)
    count = float(len(values))
    return total / count
 
# Sum of squared deviations of a list of numbers from a given mean
def variance(values, mean):
    """Return the sum of squared deviations of ``values`` from ``mean``.

    The total is NOT divided by the number of values, so this is an
    un-normalized variance. Inside this function the ``mean`` parameter
    (a number) shadows the module-level ``mean()`` function.
    """
    squared_deviations = [(value - mean) ** 2 for value in values]
    return sum(squared_deviations)
 
# Toy dataset of [x, y] pairs used to exercise the hand-written statistics
dataset = [[1.0, 1.0], [2.0, 3.0], [4.0, 3.0], [3.0, 2.0], [5.0, 5.0]]

# Split the pairs into separate x and y columns
x = [float(pair[0]) for pair in dataset]
y = [float(pair[1]) for pair in dataset]

# Means first, because variance() needs the mean as its second argument
mean_x = mean(x)
mean_y = mean(y)
var_x = variance(x, mean_x)
var_y = variance(y, mean_y)

# Note: the value labelled "variance" is the un-normalized sum of squared deviations
print('x stats: mean=%.3f variance=%.3f' % (mean_x, var_x))
print('y stats: mean=%.3f variance=%.3f' % (mean_y, var_y))

In [None]:
# Un-normalized covariance between x and y
def covariance(x, mean_x, y, mean_y):
    """Return the sum of products of the paired deviations of ``x`` and ``y``.

    ``x`` and ``y`` are indexed position by position over ``len(x)``, so
    ``y`` must have at least as many items as ``x`` (otherwise indexing
    raises ``IndexError``). The total is not divided by the number of
    samples.
    """
    total = 0.0
    for idx in range(len(x)):
        deviation_x = x[idx] - mean_x
        deviation_y = y[idx] - mean_y
        total += deviation_x * deviation_y
    return total

In [None]:
# Covariance term for the toy x/y data (covariance() does not divide by the sample count)
covar = covariance(x, mean_x, y, mean_y)
print('Covariance: %.3f' % (covar))

In [None]:
# Least-squares intercept and slope for a dataset of [x, y] rows
def coefficients(dataset):
    """Return ``[b0, b1]`` for the line ``y = b0 + b1 * x`` fitted to ``dataset``.

    ``dataset`` is a sequence of rows whose first item is x and second
    item is y. The slope is covariance(x, y) / variance(x); both helpers
    return un-normalized sums, so the missing 1/N factors cancel in the
    ratio.
    """
    xs = [pair[0] for pair in dataset]
    ys = [pair[1] for pair in dataset]
    mean_of_x = mean(xs)
    mean_of_y = mean(ys)
    slope = covariance(xs, mean_of_x, ys, mean_of_y) / variance(xs, mean_of_x)
    intercept = mean_of_y - slope * mean_of_x
    return [intercept, slope]

# Fit the toy dataset: b0 is the intercept, b1 is the slope
b0, b1 = coefficients(dataset)
print('Coefficients: B0=%.3f, B1=%.3f' % (b0, b1))

In [None]:
# Sanity check with scikit-learn
from sklearn.linear_model import LinearRegression
import numpy as np
# scikit-learn expects a 2-D feature array, so turn each list into a single column.
# reshape(-1, 1) lets numpy infer the number of rows instead of hard-coding the
# sample count, so this cell keeps working if the toy dataset changes size.
x_np = np.array(x).reshape(-1, 1)
y_np = np.array(y).reshape(-1, 1)
model = LinearRegression()
model.fit(x_np, y_np)
# These should match the B1 (slope) and B0 (intercept) computed from scratch above
print("Coefficient:", model.coef_)
print("Y intercept:", model.intercept_)
# score() on a regressor returns the coefficient of determination, R^2
print("Model R^2 score:", model.score(x_np, y_np))


In [None]:
import matplotlib.pyplot as plt

# 10 evenly spaced x values between 0 and 10 at which to draw the fitted line
x_new = np.linspace(0, 10, 10)
# predict() needs a 2-D input, so add a column axis to the 1-D x values (see the sklearn docs)
y_new = model.predict(x_new[:, np.newaxis])
plt.figure(figsize=(4, 3))
# Create the axes
ax = plt.axes()
ax.scatter(x_np, y_np)   # Add the original data points to the graph
ax.plot(x_new, y_new) # Add the fitted line to the graph

ax.set_xlabel('x') # Set the axis labels
ax.set_ylabel('y')
# Look at the matplotlib documentation to see other options

# Set the axis limits just large enough to show all of the plotted data
ax.axis('tight')

# Show the plot
plt.show()

In [None]:
# Now we will do this again using the insurance data
# Use Pandas to get stats

def panda_coefficient(dataset):
    """Return ``[b0, b1]`` for the line ``Y = b0 + b1 * X`` using pandas statistics.

    Parameters
    ----------
    dataset : pandas.DataFrame
        Must contain the numeric columns ``'X'`` and ``'Y'``. Any other
        columns are ignored, so extra (even non-numeric) columns do not
        affect the result.

    Returns
    -------
    list
        ``[b0, b1]`` where ``b1 = cov(X, Y) / var(X)`` is the slope and
        ``b0 = mean(Y) - b1 * mean(X)`` is the intercept.
    """
    x_col = dataset['X']
    y_col = dataset['Y']
    x_mean = x_col.mean()
    y_mean = y_col.mean()
    # Note - pandas normalizes variance/covariance by N-1 by default; ddof=0 switches
    # to N. The same ddof is used for both so the normalization cancels in the ratio.
    # Working on the two Series directly avoids computing statistics for every
    # column of the frame (which also fails if a column is not numeric).
    var_x = x_col.var(ddof=0)
    cov_x_y = x_col.cov(y_col, ddof=0)
    b1 = cov_x_y / var_x
    b0 = y_mean - b1 * x_mean
    return [b0, b1]
    
# Fit the insurance data; this rebinds b0 and b1 from the toy example
b0, b1 = panda_coefficient(insurance)
print('Coefficients: B0=%.3f, B1=%.3f' % (b0, b1))

In [None]:
# Test with sklearn
from sklearn.linear_model import LinearRegression
import numpy as np  
# Take the first column as the feature and the second as the target, by position.
# NOTE(review): panda_coefficient() selects the columns by name ('X', 'Y') - confirm the CSV column order matches.
x_np = insurance.iloc[:, 0].values.reshape(-1, 1)  # .values converts the column into a numpy array
y_np = insurance.iloc[:, 1].values.reshape(-1, 1)  # -1 means "infer the number of rows"; the 1 means a single column
model_insurance = LinearRegression()
model_insurance.fit(x_np,y_np)
print("Coefficient:",model_insurance.coef_)
print("Y intercept:",model_insurance.intercept_)
# score() is evaluated on the same data the model was fitted on
print("Model score:",model_insurance.score(x_np,y_np))

In [None]:
# 400 evenly spaced x values between 0 and 120 at which to draw the fitted line
x_new = np.linspace(0, 120, 400)
# predict() needs a 2-D input, so add a column axis to the 1-D x values (see the sklearn docs)
y_new = model_insurance.predict(x_new[:, np.newaxis])
plt.figure(figsize=(4, 3))
# Create the axes
ax = plt.axes()
ax.scatter(x_np, y_np)   # Add the original data points to the graph
ax.plot(x_new, y_new) # Add the fitted line to the graph

ax.set_xlabel('x') # Set the axis labels
ax.set_ylabel('y')
# Look at the matplotlib documentation to see other options

# Set the axis limits just large enough to show all of the plotted data
ax.axis('tight')

# Show the plot
plt.show()


In [None]:
#pip install --user yellowbrick

In [None]:
# Residuals
# https://www.scikit-yb.org/en/latest/api/regressor/residuals.html

# Create the train and test data
#from sklearn.linear_model import Ridge
from sklearn.model_selection import train_test_split
from yellowbrick.regressor import ResidualsPlot

# Hold out 20% of the rows for testing; random_state fixes the shuffle so the split is repeatable
X_train, X_test, y_train, y_test = train_test_split(x_np, y_np, test_size=0.2, random_state=42)
# Not much data here, but we will see what we get
# A fresh, unfitted model: the visualizer is given the training data to fit below
model_ins2 =  LinearRegression()
visualizer = ResidualsPlot(model_ins2)
visualizer.fit(X_train, y_train)  # Fit the training data to the visualizer
visualizer.score(X_test, y_test)  # Evaluate the model on the test data
visualizer.show()                 # Finalize and render the figure

# Logistic Regression
Now we want to classify data

If you feel up to it, read through https://towardsdatascience.com/logistic-regression-detailed-overview-46c4da4303bc and manually set up the cost and activation functions.
I am going to show a basic example using sklearn.
For the homework (once assigned), I encourage you to look at the parameters in sklearn to improve your results.

In [None]:
# Pull in the Iris dataset
# This time we will grab it directly from sklearn
from sklearn import datasets
iris = datasets.load_iris()
# Keep the feature matrix and the class labels in separate variables
iris_values = iris['data']
iris_target = iris['target']
# The target is used as loaded; no reshape(-1, 1) is applied to it


In [None]:
# Using this example https://medium.com/@kgpvijaybg/logistic-regression-on-iris-dataset-48b2ecdfb6d3
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import train_test_split
# Hold out 25% of the samples for testing; random_state fixes the shuffle so the split is repeatable.
# This rebinds X_train/X_test/y_train/y_test from the insurance example.
X_train, X_test, y_train, y_test = train_test_split(iris_values, iris_target, test_size = 0.25, random_state = 0)
# This uses multiple input features, so we standardize them with a scaler
from sklearn.preprocessing import StandardScaler
sc = StandardScaler()
# Fit the scaler on the training split only, then apply that same transform to the test split
X_train = sc.fit_transform(X_train)
X_test = sc.transform(X_test)

In [None]:
# multi_class is intentionally not passed: 'auto' (the value used before) is already
# the default, and the parameter is deprecated in recent scikit-learn releases, so
# passing it explicitly only triggers a warning (and an error once it is removed).
classifier = LogisticRegression(random_state=0, solver='lbfgs')
# Fit on the standardized training split
classifier.fit(X_train, y_train)


In [None]:
# Predicting the Test set results
y_pred = classifier.predict(X_test)
# Predict the per-class probabilities (one column per class)
probs_y=classifier.predict_proba(X_test)
### Print results
# Round to 2 decimals for display.
# NOTE: the header labels these columns "(%)", but the values are the rounded
# predict_proba outputs as-is; they are not multiplied by 100.
probs_y = np.round(probs_y, 2)
# Header row; the trailing newline is embedded in the last column label
res = "{:<10} | {:<10} | {:<10} | {:<13} | {:<5}".format("y_test", "y_pred", "Setosa(%)", "versicolor(%)", "virginica(%)\n")
res += "-"*65+"\n"
# One row per test sample: true label, predicted label, then the three class probabilities.
# The x and y names here are local to the generator expression.
res += "\n".join("{:<10} | {:<10} | {:<10} | {:<13} | {:<10}".format(x, y, a, b, c) for x, y, a, b, c in zip(y_test, y_pred, probs_y[:,0], probs_y[:,1], probs_y[:,2]))
res += "\n"+"-"*65+"\n"
print(res)

In [None]:
# Very basic evaluation: a confusion matrix of the true test labels against the predicted labels
from sklearn.metrics import confusion_matrix
cm = confusion_matrix(y_test, y_pred)
print(cm)

In [None]:
# Plot confusion matrix
import seaborn as sns
import pandas as pd
# Draw the confusion matrix as a seaborn heatmap
ax = plt.axes()
# Despite the df_ prefix this is the same object as cm, not a DataFrame
df_cm = cm
# annot=True writes the count in each cell; fmt='d' formats the counts as integers
sns.heatmap(df_cm, annot=True, annot_kws={"size": 30}, fmt='d',cmap="Blues", ax = ax )
ax.set_title('Confusion Matrix')
plt.show()