## Aprendizado de Máquina
Implementando Regressão Múltipla do Zero 

In [1]:
from numpy import append, genfromtxt, array
from sklearn import linear_model
import pandas as pd


### <font color="blue">Minha implementação</font>

In [2]:
def compute_error_for_given_function(w_array, array_points):
    total_error = 0
    for i in range(len(array_points)):
        tuple_len = len(array_points[i])
        x_array = append(array(1), array_points[i, 0:tuple_len - 1])
        y = array_points[i, tuple_len - 1]

        total_error += (y - (hypothesis(w_array, x_array))) ** 2

    return total_error / float(len(array_points))

def compute_error_for_given_function_r2(w_array, array_points):
    total_error = 0
    for i in range(len(array_points)):
        tuple_len = len(array_points[i])
        x_array = append(array(1), array_points[i, 0:tuple_len - 1])
        y = array_points[i, tuple_len - 1]

        total_error += (y - (hypothesis(w_array, x_array))) ** 2
            
    y_array = array_points[:, [array_points.shape[1] - 1]]
    y_mean = sum(y_array) / len(y_array)
        
    v = 0
    for y in y_array[:,0]:
        v += (y - y_mean) **2
        
    return 1 - (total_error / v)

def hypothesis(w_array, x_array):
    assert len(w_array) == len(x_array)

    total = 0
    for i in range(len(w_array)):
        total += w_array[i] * x_array[i]

    return total

def step_gradient(current_w_array, array_points, learning_rate):
    #gradient_descent
    w_gradient_array = [0] * array_points.shape[1]
    N = float(len(array_points))

    for i in range(len(array_points)):
        tuple_len = len(array_points[i])

        x_array = append(array(1), array_points[i, 0:tuple_len - 1])
        y = array_points[i, tuple_len - 1]

        for i in range(len(w_gradient_array)):
            w_gradient_array[i] += gradiente_descent_calc(current_w_array, x_array, N, y, x_array[i])

    #update coefficients
    new_w_array = [None] * len(current_w_array)
    for i in range(len(current_w_array)):
        new_w_array[i] = current_w_array[i] - (2 * learning_rate * w_gradient_array[i])

    return new_w_array

def gradiente_descent_calc(current_w_array, x_array, N, y, x):
    assert len(current_w_array) == len(x_array)

    return -1 * (y - (hypothesis(current_w_array, x_array))) * x

def gradient_descent_runner(points, initial_w_array, learning_rate, num_iterations, cost_tolerance, verbosity=False):
    w_array = initial_w_array
    
    rss = compute_error_for_given_function(w_array, array_points=points)
    
    iterations_count = 0
    while rss >= cost_tolerance and iterations_count <= num_iterations:
        iterations_count += 1    
        
        w_array = step_gradient(w_array, array(points), learning_rate)
        
        rss = compute_error_for_given_function(w_array, array_points=points)
        
        if verbosity:
            print("Current RSS:", rss) #item 2

    print()
    print("---\nFinal RSS:", rss)
    return w_array, rss

def run(data, learning_rate=0.0001, num_iterations=1000, cost_tolerance=float("-inf"), verbosity=False):
    initial_w_array = [0] * data.shape[1]

    w_array, rss = gradient_descent_runner(data, initial_w_array, learning_rate, num_iterations, cost_tolerance, verbosity)

    print("")
    for i in range(len(w_array)):
        print("w{}:".format(i), w_array[i])
    
    return w_array, rss

In [3]:
def load_data(data_filename, header=False):
    points = genfromtxt(data_filename, delimiter=",")
    if header:
        return points[1:]
    else:
        return points

In [4]:
data = load_data("data/sample_treino.csv", header=True)

coeffs, rss = run(data, learning_rate=0.00002, num_iterations=5000)


---
Final RSS: 0.427271889165

w0: 0.228735663008
w1: 0.130081796764
w2: 0.115498350182
w3: 0.160663125794
w4: 0.457792130014
w5: 0.0388481221501


### <font color="blue">Implementação do sklearn</font>

In [5]:
model = linear_model.LinearRegression()
y = data[:, [data.shape[1] - 1]]
X = data[:, 0:data.shape[1] - 1]
model.fit(X=X, y=y)

print("Final Score:", model.score(X=X, y=y), "\n")

sklearn_coeffs = model.intercept_.tolist() + model.coef_[0].tolist()

for i in range(len(sklearn_coeffs)):
        print("w{}:".format(i), sklearn_coeffs[i])

Final Score: 0.423803240951 

w0: 1.737711513794439
w1: 0.10304143246259931
w2: 0.046436700850734314
w3: 0.16409834419165825
w4: 0.3811784266558142
w5: 0.020278157624843418


Há alguma diferença entre os coeficientes da minha implementação e o do **sklearn**.
É bem provável que esta discrepância seja devido à métrica de *score* trabalhada pelo modelo **LinearRegression**. Sua documentação fala que é usado o R2.<br>
http://scikit-learn.org/stable/modules/generated/sklearn.linear_model.LinearRegression.html#sklearn.linear_model.LinearRegression.score

In [6]:
pd.DataFrame({"sklearn": sklearn_coeffs, "Minha": coeffs}, index=["w0", "w1", "w2", "w3", "w4", "w5"])

Unnamed: 0,Minha,sklearn
w0,0.228736,1.737712
w1,0.130082,0.103041
w2,0.115498,0.046437
w3,0.160663,0.164098
w4,0.457792,0.381178
w5,0.038848,0.020278


In [7]:
data = pd.read_csv("data/sample_treino.csv")
data.head()

Unnamed: 0,Cálculo1,LPT,P1,IC,Cálculo2,cra
0,8.7,10.0,9.0,9.1,8.4,8.477647
1,7.0,7.0,7.7,7.0,6.2,6.851724
2,8.6,9.8,7.9,9.6,8.7,9.090588
3,7.8,8.3,6.8,8.2,8.0,7.283516
4,5.2,9.3,5.0,8.5,5.0,7.205747
