# Task 3

In [1]:
import pandas as pd
import numpy as np

from math import sqrt
from scipy.stats import skew

# Raise the pandas display limits used when frames are rendered in cell
# output: up to 500 columns, 100 characters wide.
pd.set_option('display.max_columns', 500)
pd.set_option('display.width', 100)

In [2]:
from bokeh.charts import Scatter, Histogram, output_notebook, show
from bokeh.sampledata.autompg import autompg as df
from bokeh.layouts import gridplot
from bokeh.plotting import figure

# Configure Bokeh to render plots inline in the notebook's cell output.
output_notebook()

The bokeh.charts API has moved to a separate 'bkcharts' package.

This compatibility shim will remain until Bokeh 1.0 is released.
After that, if you want to use this API you will have to install
the bkcharts package explicitly.

  warn(message)


In [3]:
# Columns used as model inputs (presumably per-course grades; verify against
# data/treino.csv).
features = ["Vetorial","LPT","P1","IC","LP1","Cálculo2","Discreta","P2","Grafos","Fís.Clássica","LP2","Cálculo1"]
# Column the models are trained to predict.
target = "cra"

In [4]:
def partition(df, train_proportion, random_state=None):
    """Randomly split ``df`` into a training frame and a validation frame.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame to split.
    train_proportion : float
        Fraction of rows assigned to the training frame. The row count is
        ``round(train_proportion * len(df))``.
    random_state : int or numpy.random.RandomState, optional
        Forwarded to ``DataFrame.sample``. Pass a value to make the split
        reproducible; the default ``None`` keeps the draw unseeded.

    Returns
    -------
    tuple of (pandas.DataFrame, pandas.DataFrame)
        ``(train, valid)``. ``valid`` holds every row whose index label was
        not drawn for ``train``, in the original row order.
    """
    n_train = int(round(train_proportion * len(df)))

    train = df.sample(n_train, random_state=random_state)
    # Dropping the sampled labels yields the complement directly. Indexing
    # ``.loc`` with a Python set is rejected by current pandas releases.
    valid = df.drop(train.index)
    return train, valid

def score_rmsd(real, pred):
    """Return the root-mean-square deviation between two equal-length sequences.

    Values are compared by position, so pandas Series with arbitrary index
    labels (for example a sampled train/validation split) are handled the
    same way as plain lists or arrays.

    Parameters
    ----------
    real : sequence of numbers
        Observed values.
    pred : sequence of numbers
        Predicted values; must have the same length as ``real``.

    Returns
    -------
    float
        ``sqrt(sum((real - pred) ** 2) / len(real))``.

    Raises
    ------
    AssertionError
        If the two inputs differ in length.
    ZeroDivisionError
        If the inputs are empty.
    """
    assert len(real) == len(pred)

    # Convert to plain arrays first: ``series[i]`` is a label lookup, which
    # fails or misaligns whenever the index is not 0..n-1.
    real_values = np.asarray(real, dtype=float)
    pred_values = np.asarray(pred, dtype=float)

    total = float(np.sum((real_values - pred_values) ** 2))
    return sqrt(total / len(real))

def scatter_target(df, feature_name):
    """Build a scatter chart of ``feature_name`` against the ``target`` column.

    ``feature_name`` goes on the x axis and the module-level ``target`` on
    the y axis. The chart is returned, not shown, so callers can arrange
    several of them in a grid.
    """
    chart_options = dict(
        x=feature_name,
        y=target,
        title="Scatter Plot: " + feature_name,
        xlabel=feature_name,
        ylabel=target,
        width=300,
        height=300,
        tools=["reset", "pan", "wheel_zoom"],
        color="blue",
    )
    return Scatter(df, **chart_options)

### <font color="blue">Loading data</font>

In [5]:
# Keep only the modelling columns (features + target) from the raw training file.
data = pd.read_csv("data/treino.csv")[features + [target]]
# Persist the reduced frame; the MyRegression cell later in the notebook reads this file.
data.to_csv("data/treino_clean.csv", index=False)
# Working copy for the feature transforms, so `data` keeps the original values.
use_data = data.copy()
print(data.shape)
data.head()

(88, 13)


Unnamed: 0,Vetorial,LPT,P1,IC,LP1,Cálculo2,Discreta,P2,Grafos,Fís.Clássica,LP2,Cálculo1,cra
0,8.6,10.0,9.0,9.1,8.6,8.4,8.3,8.8,8.2,7.9,9.4,8.7,8.477647
1,5.6,7.0,7.7,7.0,8.1,6.2,7.3,8.2,5.4,7.7,8.9,7.0,6.851724
2,10.0,9.8,7.9,9.6,8.3,8.7,8.8,9.5,9.2,8.6,9.7,8.6,9.090588
3,6.1,8.3,6.8,8.2,7.1,8.0,6.3,8.9,7.0,8.5,9.0,7.8,7.283516
4,8.8,9.3,5.0,8.5,5.1,5.0,5.8,7.1,5.4,8.7,8.2,5.2,7.205747


### <font color="blue">Understanding the data</font>

**Visão geral**

In [6]:
# Per-column summary statistics (count, mean, std, min, quartiles, max).
data.describe()

Unnamed: 0,Vetorial,LPT,P1,IC,LP1,Cálculo2,Discreta,P2,Grafos,Fís.Clássica,LP2,Cálculo1,cra
count,88.0,88.0,88.0,88.0,88.0,88.0,88.0,88.0,88.0,88.0,88.0,88.0,88.0
mean,7.280682,8.480114,7.407955,8.172727,7.597727,6.323864,6.764773,7.941364,7.196591,7.107955,8.631818,7.2,7.332535
std,1.404169,0.984522,1.346278,0.894007,1.371799,1.293662,1.228403,0.990478,1.27797,0.908987,0.969008,1.228493,0.849758
min,5.0,6.2,5.0,5.9,5.0,5.0,5.0,5.3,5.0,5.0,5.0,5.0,4.874468
25%,6.275,7.7,6.5,7.5,6.6,5.1,5.675,7.3,6.3,7.0,8.2,6.275,6.841484
50%,7.1,8.5,7.75,8.2,7.8,5.8,6.75,7.95,7.2,7.0,8.9,7.2,7.274746
75%,8.325,9.3,8.325,8.8,8.6,7.5,7.6,8.8,8.2,7.5,9.2,8.125,7.883292
max,10.0,10.0,10.0,10.0,10.0,9.3,9.6,9.6,10.0,9.1,9.7,9.8,9.090588


**Distribuições**

In [7]:
# Number of histograms displayed side by side in each grid row.
n_cols = 3

plots = []
for feature in features:
    plots.append(Histogram(data, feature, title="Distribution: " + feature, height=300, width=300))

    # Compare the skewness of the raw values with their log and exp
    # transforms to decide which version of the feature to model with.
    log_applied = np.log(data[feature])
    exp_applied = np.exp(data[feature])

    print("Skewness on " + feature)
    print("Original:", round(skew(data[feature]), 3), ",  Log:", round(skew(log_applied), 3), ",  Exp:", round(skew(exp_applied), 3))
    print()

# Show every histogram, one grid row at a time. Slicing by position keeps
# this in step with `features`, so no plot is silently left out.
for start in range(0, len(plots), n_cols):
    show(gridplot([plots[start:start + n_cols]]))

Skewness on Vetorial
Original: 0.1 ,  Log: -0.217 ,  Exp: 2.354

Skewness on LPT
Original: -0.311 ,  Log: -0.508 ,  Exp: 0.923

Skewness on P1
Original: -0.371 ,  Log: -0.651 ,  Exp: 2.329

Skewness on IC
Original: -0.077 ,  Log: -0.383 ,  Exp: 1.699

Skewness on LP1
Original: -0.327 ,  Log: -0.578 ,  Exp: 1.721

Skewness on Cálculo2
Original: 0.53 ,  Log: 0.374 ,  Exp: 2.543

Skewness on Discreta
Original: 0.347 ,  Log: 0.088 ,  Exp: 2.639

Skewness on P2
Original: -0.445 ,  Log: -0.741 ,  Exp: 0.983

Skewness on Grafos
Original: 0.067 ,  Log: -0.276 ,  Exp: 2.607

Skewness on Fís.Clássica
Original: -0.564 ,  Log: -1.015 ,  Exp: 2.089

Skewness on LP2
Original: -1.905 ,  Log: -2.413 ,  Exp: 0.172

Skewness on Cálculo1
Original: 0.028 ,  Log: -0.316 ,  Exp: 2.444



Mudanças a serem aplicadas nas features, baseadas em suas distribuições:
- **Log**: Cálculo2, Discreta
- **Exp**: LP2

In [8]:
# Transform chosen for each feature from the skewness comparison:
# log for Cálculo2 and Discreta, exp for LP2. The inputs are read from the
# untouched `data` frame and the results written to the `use_data` copy.
for column, transform in (("Cálculo2", np.log), ("Discreta", np.log), ("LP2", np.exp)):
    use_data[column] = data[column].apply(transform)

**Relações**

In [9]:
# Number of scatter plots displayed side by side in each grid row.
n_cols = 3

# One feature-vs-target scatter plot per feature, on the transformed data.
plots = [scatter_target(use_data, feature) for feature in features]

# Chunk the plots into rows by position so the grid always matches the
# length of `features` instead of relying on hard-coded indices.
grid = gridplot([plots[start:start + n_cols] for start in range(0, len(plots), n_cols)])

show(grid)

In [10]:
# Random 70/30 train/validation split.
# NOTE(review): this splits the untransformed `data`, not `use_data` (which
# carries the log/exp columns), and the draw is unseeded — confirm both are intended.
train, valid = partition(data, 0.7)

In [11]:
# Sanity check: the two shapes' row counts should add up to the rows in `data`.
print("train:", train.shape)
print("valid:", valid.shape)

train: (62, 13)
valid: (26, 13)


### Models

In [12]:
from my_regression import MyRegression
from sklearn.linear_model import Ridge, RidgeCV, ElasticNet, LassoCV, LassoLarsCV
from sklearn.model_selection import cross_val_score

In [35]:
def plot_lines(x, y, title=""):
    """Draw ``y`` against ``x`` as a single navy line and display the figure.

    ``title`` is optional and defaults to an empty string. The figure is
    shown immediately; nothing is returned.
    """
    line_figure = figure(
        title=title,
        plot_width=600,
        plot_height=250,
        tools=["reset", "pan", "xwheel_zoom"],
    )
    line_figure.line(x, y, line_width=2, color="navy")
    show(line_figure)

In [14]:
def rmse_cv(model, x, y, cv=5):
    """Return the cross-validated RMSE of ``model``, one value per fold.

    Parameters
    ----------
    model : estimator
        Unfitted scikit-learn style estimator, passed to ``cross_val_score``.
    x : array-like
        Feature matrix.
    y : array-like
        Target values.
    cv : int or cross-validation splitter, optional
        Forwarded to ``cross_val_score``. Defaults to 5, the value that was
        previously hard-coded.

    Returns
    -------
    numpy.ndarray
        Square root of the per-fold mean squared error.
    """
    # The scorer reports the *negated* MSE, so flip the sign before the root.
    neg_mse = cross_val_score(model, x, y, scoring="neg_mean_squared_error", cv=cv)
    return np.sqrt(-neg_mse)

In [15]:
# Separate the target from the feature matrix. The target is read from the
# untouched `data` frame (it is never transformed) and removed from
# `use_data` with errors="ignore", so re-running this cell is safe. The
# previous `del use_data[target]` raised KeyError on a second run.
y = data[target]
use_data = use_data.drop(target, axis=1, errors="ignore")

**Ridge**

In [27]:
# Candidate `alpha` values tried in the Ridge sweep in the next cell.
alphas = [0.05, 0.1, 0.3, 1, 3, 5, 10, 15, 20, 25, 30, 35, 50, 75, 100]

In [37]:
model_ridge = Ridge()
# Mean cross-validated RMSE for each candidate alpha. `.values` is used
# because `DataFrame.as_matrix()` was deprecated and later removed from pandas.
cv_ridge = [rmse_cv(Ridge(alpha=alpha), use_data.values, y.values).mean() for alpha in alphas]

plot_lines(alphas, cv_ridge)
print("min rmse:", min(cv_ridge))

min rmse: 0.57034263684


**Lasso**

In [39]:
# Fit a Lasso whose alpha is chosen by built-in cross-validation over the
# listed candidates. `.values` is used because `DataFrame.as_matrix()` was
# deprecated and later removed from pandas.
model_lasso = LassoCV(alphas=[1, 0.1, 0.001, 0.0005]).fit(use_data.values, y.values)

In [18]:
# Run the project-local MyRegression on the cleaned CSV written by the
# loading cell.
# NOTE(review): `learning_rate` suggests gradient descent, and the returned
# pair looks like (weights, final RSS) — confirm against my_regression.py.
mr = MyRegression("data/treino_clean.csv", header=True)
w_array, rss = mr.run(learning_rate=0.0000001, verbosity="v")


---
Final RSS: 0.326764706

w0: 0.010776940833
w1: 0.0791133046615
w2: 0.0900471562901
w3: 0.0778791693115
w4: 0.0885023295072
w5: 0.0803464175717
w6: 0.0654059858306
w7: 0.0750617752168
w8: 0.0864577487894
w9: 0.079041725702
w10: 0.0764539904215
w11: 0.0914668376727
w12: 0.0768106150714
