# Task 3

In [1]:
import pandas as pd
import numpy as np

from math import sqrt
from scipy.stats import skew

pd.set_option('display.max_columns', 500)
pd.set_option('display.width', 100)

In [2]:
from bokeh.charts import Scatter, Histogram, output_notebook, show
from bokeh.sampledata.autompg import autompg as df
from bokeh.layouts import gridplot
from bokeh.plotting import figure

output_notebook()

The bokeh.charts API has moved to a separate 'bkcharts' package.

This compatibility shim will remain until Bokeh 1.0 is released.
After that, if you want to use this API you will have to install
the bkcharts package explicitly.

  warn(message)


In [3]:
features = ["Vetorial","LPT","P1","IC","LP1","Cálculo2","Discreta","P2","Grafos","Fís.Clássica","LP2","Cálculo1"]
target = "cra"

In [4]:
def partition(df, train_proportion):
    n_train = round(train_proportion * len(df))
    n_valid = len(df) - n_train
    
    train = df.sample(n_train)
    test_indexes = set(df.index.tolist()) - set(train.index.tolist())
    valid = df.loc[test_indexes]
    return train, valid

def score_rmsd(real, pred):
    assert len(real) == len(pred)
    total = 0
    for i in range(len(real)):
        total += (real[i] - pred[i]) **2
        
    return sqrt(total / len(real))

def scatter_target(df, feature_name):
    p = Scatter(df, x=feature_name, y=target, title="Scatter Plot: " + feature_name, xlabel=feature_name, ylabel=target,
               width=200, height=200, tools=["reset", "pan", "wheel_zoom"], color="blue")

    return p

### <font color="blue">Loading data</font>

In [5]:
data = pd.read_csv("data/treino.csv")[features + [target]]
data.to_csv("data/treino_clean.csv", index=False)
use_data = data.copy()
print(data.shape)
data.head()

(88, 13)


Unnamed: 0,Vetorial,LPT,P1,IC,LP1,Cálculo2,Discreta,P2,Grafos,Fís.Clássica,LP2,Cálculo1,cra
0,8.6,10.0,9.0,9.1,8.6,8.4,8.3,8.8,8.2,7.9,9.4,8.7,8.477647
1,5.6,7.0,7.7,7.0,8.1,6.2,7.3,8.2,5.4,7.7,8.9,7.0,6.851724
2,10.0,9.8,7.9,9.6,8.3,8.7,8.8,9.5,9.2,8.6,9.7,8.6,9.090588
3,6.1,8.3,6.8,8.2,7.1,8.0,6.3,8.9,7.0,8.5,9.0,7.8,7.283516
4,8.8,9.3,5.0,8.5,5.1,5.0,5.8,7.1,5.4,8.7,8.2,5.2,7.205747


### <font color="blue">Compreendendo os dados</font>

**Visão geral**

In [6]:
data.describe()

Unnamed: 0,Vetorial,LPT,P1,IC,LP1,Cálculo2,Discreta,P2,Grafos,Fís.Clássica,LP2,Cálculo1,cra
count,88.0,88.0,88.0,88.0,88.0,88.0,88.0,88.0,88.0,88.0,88.0,88.0,88.0
mean,7.280682,8.480114,7.407955,8.172727,7.597727,6.323864,6.764773,7.941364,7.196591,7.107955,8.631818,7.2,7.332535
std,1.404169,0.984522,1.346278,0.894007,1.371799,1.293662,1.228403,0.990478,1.27797,0.908987,0.969008,1.228493,0.849758
min,5.0,6.2,5.0,5.9,5.0,5.0,5.0,5.3,5.0,5.0,5.0,5.0,4.874468
25%,6.275,7.7,6.5,7.5,6.6,5.1,5.675,7.3,6.3,7.0,8.2,6.275,6.841484
50%,7.1,8.5,7.75,8.2,7.8,5.8,6.75,7.95,7.2,7.0,8.9,7.2,7.274746
75%,8.325,9.3,8.325,8.8,8.6,7.5,7.6,8.8,8.2,7.5,9.2,8.125,7.883292
max,10.0,10.0,10.0,10.0,10.0,9.3,9.6,9.6,10.0,9.1,9.7,9.8,9.090588


**Distribuições**

In [7]:
plots = []
for feature in features:
    p = Histogram(data, feature, title="Distribution: " + feature, height=200, width=300)
    plots.append(p)
    
    log_applied = data[feature].apply(np.log)
    exp_applied = data[feature].apply(np.exp)
    
    print("Skewness on " + feature)
    print("Original:", round(skew(data[feature]), 3), ",  Log:", round(skew(log_applied), 3), ",  Exp:", round(skew(exp_applied), 3))
    print()
    
grid = gridplot([plots])
show(gridplot([[plots[0], plots[1], plots[2]]]))
show(gridplot([[plots[3], plots[4], plots[5]]]))
show(gridplot([[plots[6], plots[7], plots[8]]]))

Skewness on Vetorial
Original: 0.1 ,  Log: -0.217 ,  Exp: 2.354

Skewness on LPT
Original: -0.311 ,  Log: -0.508 ,  Exp: 0.923

Skewness on P1
Original: -0.371 ,  Log: -0.651 ,  Exp: 2.329

Skewness on IC
Original: -0.077 ,  Log: -0.383 ,  Exp: 1.699

Skewness on LP1
Original: -0.327 ,  Log: -0.578 ,  Exp: 1.721

Skewness on Cálculo2
Original: 0.53 ,  Log: 0.374 ,  Exp: 2.543

Skewness on Discreta
Original: 0.347 ,  Log: 0.088 ,  Exp: 2.639

Skewness on P2
Original: -0.445 ,  Log: -0.741 ,  Exp: 0.983

Skewness on Grafos
Original: 0.067 ,  Log: -0.276 ,  Exp: 2.607

Skewness on Fís.Clássica
Original: -0.564 ,  Log: -1.015 ,  Exp: 2.089

Skewness on LP2
Original: -1.905 ,  Log: -2.413 ,  Exp: 0.172

Skewness on Cálculo1
Original: 0.028 ,  Log: -0.316 ,  Exp: 2.444



Mudanças a serem aplicadas nas features baseadas em suas nas distribuições:
- **Log**: Cálculo2, Discreta
- **Exp**: LP2

<font color="red">Devido a resultados muito ruins estas operações foram canceladas</font>

In [8]:
# use_data["Cálculo2"] = data["Cálculo2"].apply(np.log)
# use_data["Discreta"] = data["Discreta"].apply(np.log)
# use_data["LP2"] = data["LP2"].apply(np.exp)

**Relações**

In [9]:
plots = []

for feature in features:
    p = scatter_target(use_data, feature)
    plots.append(p)
    print("Corr " + feature + ":", use_data[feature].corr(use_data["cra"]))

grid = gridplot([[plots[0], plots[1], plots[2], plots[3]],
                 [plots[4], plots[5], plots[6], plots[7]],
                 [plots[8], plots[9], plots[10], plots[11]]])

show(grid)

Corr Vetorial: 0.55824517165
Corr LPT: 0.259328001461
Corr P1: 0.49118783327
Corr IC: 0.571361431792
Corr LP1: 0.486700062033
Corr Cálculo2: 0.229566329562
Corr Discreta: 0.665672447754
Corr P2: 0.683019383483
Corr Grafos: 0.701973414646
Corr Fís.Clássica: 0.347942579773
Corr LP2: 0.401984833865
Corr Cálculo1: 0.308342507601


Eliminando variáveis pouco correlacionadas com o CRA.

In [10]:
del use_data["Fís.Clássica"]
del use_data["Cálculo2"]
del use_data["LPT"]

### Models

In [11]:
from my_regression import MyRegression
from sklearn.linear_model import Ridge, RidgeCV, ElasticNet, Lasso, LassoCV, LassoLarsCV, LinearRegression, RANSACRegressor  
from sklearn.neighbors import KNeighborsRegressor
from sklearn.model_selection import cross_val_score
from sklearn.preprocessing import PolynomialFeatures
from sklearn.pipeline import make_pipeline
from sklearn.metrics import mean_squared_error

In [12]:
def plot_lines(x, y, title=""):
    p = figure(plot_width=600, plot_height=250, title=title, tools=["reset", "pan", "xwheel_zoom"])
    p.line(x, y, color="navy", line_width=2)
    show(p)

In [13]:
def run_cv(model, model_name, x, y, alphas):
    rmse_list = [rmse_cv(model(alphas=[alpha]), x, y).mean() for alpha in alphas]

    plot_lines(alphas, rmse_list, model_name)
    print("min rmse:", min(rmse_list))
    
def run_ordinary(model, model_name, x, y, alphas):
    rmse_list = [rmse_cv(model(alpha=alpha), x, y).mean() for alpha in alphas]

    plot_lines(alphas, rmse_list, model_name)
    print("min rmse:", min(rmse_list))

In [46]:
def rmse_cv(model, x, y):
    rmse= np.sqrt(-cross_val_score(model, x, y, scoring="neg_mean_squared_error", cv = 5))
    return(rmse)

#Cross validação para MyRegression
def my_rmse_cv(x, y, learning_rate=0.00001):
    df = x.copy()
    df["cra"] = y
    df = df.sample(frac=1)
    
    rmses = []
    
    partitions = [df.loc[range(0,18)], df.loc[range(18, 36)], df.loc[range(36, 54)], df.loc[range(54, 71)], df.loc[range(71, 88)]]
    
    for i in range(len(partitions)):
        indexes = [0,1,2,3,4]
        valid = partitions[i]
        
        indexes.remove(i)
        train = pd.DataFrame()
        for j in indexes:
            train = train.append(partitions[j])
        
        model = MyRegression(train.as_matrix(), header=True)
        model.run(learning_rate=learning_rate, num_iterations=5000, verbosity=False)
        
        preds = model.predict(valid)
    
        rmses.append(mean_squared_error(valid["cra"], preds))
    
    return pd.Series(rmses)

In [15]:
y = use_data[target]
del use_data[target]

**Ridge**

In [17]:
def run_ridge(X, y):
    run_cv(RidgeCV, "RidgeCV", X.as_matrix(), y.as_matrix(),
           alphas=[0.05, 0.1, 0.3, 1.0, 3.0, 5.0, 10.0, 15.0, 20.0, 25.0, 30.0, 35.0, 40.0, 50.0, 75.0, 100.0])

    ridge = Ridge(alpha=40)
    ridge.fit(X=X.as_matrix(), y=y.as_matrix())

    preds = ridge.predict(X.as_matrix())
    mean_squared_error(y, preds)
    
run_ridge(use_data, y)

min rmse: 0.527438701898


**Lasso**

In [18]:
def run_lasso(X, y):
    run_cv(LassoCV, "LassoCV", X.as_matrix(), y.as_matrix(), alphas=[0.0001, 0.001, 0.1, 0.2, 0.3, 0.5, 1])
    lasso = Lasso(alpha=0.1)
    lasso.fit(X=X.as_matrix(), y=y.as_matrix())

    preds = lasso.predict(X.as_matrix())
    mean_squared_error(y, preds)

run_lasso(use_data, y)

min rmse: 0.530576513121


**RANSACRegressor**

In [19]:
def run_ransac(X, y):
    alphas = [5, 10, 15, 20, 25, 30, 35, 40, 50]
    rmse_list = [rmse_cv(RANSACRegressor(Ridge(alpha=alpha)), X.as_matrix(), y.as_matrix()).mean() for alpha in alphas]

    plot_lines(alphas, rmse_list, "model_name")
    print("min rmse:", min(rmse_list))

    ransac = RANSACRegressor(Ridge(alpha=25))
    ransac.fit(X=X.as_matrix(), y=y.as_matrix())

    preds = ransac.predict(X.as_matrix())
    mean_squared_error(y, preds)
    
run_ransac(use_data, y)

min rmse: 0.543436019946


**Regressão Polinomial**

In [20]:
import warnings
warnings.filterwarnings('ignore')

def run_poli(X, y):
    degrees = [1,2,3,4,5,6,7,8,9,10]

    cv_lasso_poli_5 = [rmse_cv(make_pipeline(PolynomialFeatures(degree=degree), Ridge(alpha=50)), X.as_matrix(), y.as_matrix()).mean() for degree in degrees]

    plot_lines(degrees, cv_lasso_poli_5, "Polinomial")
    print("min rmse:", min(cv_lasso_poli_5))

    poli = make_pipeline(PolynomialFeatures(degree=2), Ridge(alpha=50))
    poli.fit(X=X.as_matrix(), y=y.as_matrix())

    preds = poli.predict(X.as_matrix())
    mean_squared_error(y, preds)
    
run_poli(use_data, y)

min rmse: 0.52764636478


**KNN**

In [21]:
def run_knn(X, y):
    ns = [1,2,3,4,5,6,7,8,9,10,15,20,40]

    rmse_list = [rmse_cv(KNeighborsRegressor(n_neighbors=n), X.as_matrix(), y.as_matrix()).mean() for n in ns]

    plot_lines(ns, rmse_list, "KNN")
    print("min rmse:", min(rmse_list))

    knn = KNeighborsRegressor(n_neighbors=4)
    knn.fit(X=X.as_matrix(), y=y.as_matrix())

    preds = knn.predict(X.as_matrix())
    mean_squared_error(y, preds)
    
run_knn(use_data, y)

min rmse: 0.552366861933


**MyRegression**

In [29]:
def run_my_regression(X, y):
    alphas = [0.0000010, 
              0.0000015]

    rmse_list = [my_rmse_cv(X, y, learning_rate=alpha).mean() for alpha in alphas]

    plot_lines(alphas, rmse_list, "MyRegression")
    print("min rmse:", min(rmse_list))

    mr = MyRegression(X.as_matrix(), header=True)
    mr.run(learning_rate=0.000001, num_iterations=5000)

    preds = mr.predict(use_data)
    mean_squared_error(y, preds)
    
run_my_regression(use_data, y)

min rmse: 0.3183237600255543


O melhor resultado foi econtrado com a Regressão Linear feita na primeira tarefa da disciplina.

In [44]:
test = pd.read_csv("data/teste.csv")[features + [target]]
#serão consideradas no teste apenas as colunas que foram usadas no treino (use_data)
use_cols = use_data.columns.tolist()
test = test[use_cols + [target]]

y_test = test["cra"]
del test["cra"]

mr = MyRegression(use_data.as_matrix(), header=True)
mr.run(learning_rate=0.0000010, num_iterations=5000)

preds_myreg = mr.predict(test)
print("Resultado MyRegression:", mean_squared_error(y_test, preds_myreg))

Resultado MyRegression: 0.278179824103


In [45]:
ridge = Ridge(alpha=40)
ridge.fit(X=use_data.as_matrix(), y=y.as_matrix())

preds_ridge = ridge.predict(test.as_matrix())
print("Resultado Ridge:", mean_squared_error(y_test, preds_ridge))

Resultado Ridge: 0.147510716201


Como seria a combinação de resultados?

In [49]:
comb = (pd.Series(preds_myreg) + pd.Series(preds_ridge)) / 2

In [50]:
print("Resultado MyRegression + Ridge:", mean_squared_error(y_test, comb))

Resultado MyRegression + Ridge: 0.169619210052
