In [None]:
import numpy as np
import matplotlib.pyplot as plt
from mpl_toolkits.mplot3d import Axes3D
%matplotlib notebook

# Outline 

* sklearn 
    * StandardScaler
    * Train-test split and K-fold cross validation
    * One-hot Encoder
    * Multivariate linear regression
* Q&A on HW3

## Normalize your dataset using sklearn

In [None]:
import pandas as pd

# Read the wine dataset (path is relative to this notebook) and preview its first rows.
data=pd.read_csv(filepath_or_buffer="../../Datasets/wines.csv")
data.head(n=5)

In HW2, we normalized our data with our own code. This is how I did it:

In [None]:
# Manual z-score normalization: separate the labels from the measurements,
# then centre every feature column on its mean and divide by its standard deviation.
df=data
rankings=df['ranking']
feats=df.drop(columns=['Start assignment','ranking'])
avg=np.average(feats,axis=0)
std=np.std(feats,axis=0)
feats=(feats-avg)/std
feats

But we can also do it with the sklearn package, using the [StandardScaler](https://scikit-learn.org/stable/modules/generated/sklearn.preprocessing.StandardScaler.html) class

In [None]:
from sklearn.preprocessing import StandardScaler

# Separate the feature matrix from the labels as plain NumPy arrays.
x=data.drop(["Start assignment","ranking"],axis=1).values
y=data['ranking'].values

# Standardize every feature column (zero mean, unit variance).
# x_norm is read later as the continuous features in the one-hot encoder section.
scaler=StandardScaler()
x_norm=scaler.fit_transform(x)



## Train-test split and K-fold cross validation
Documentation for [train_test_split](https://scikit-learn.org/stable/modules/generated/sklearn.model_selection.train_test_split.html)<br>
Split our dataset into train and test

In [None]:
import sklearn.model_selection as skl_model

# Hold out 20% of the wines for testing; a fixed random_state keeps the split
# reproducible across kernel restarts.
train_feat,test_feat,train_ranking,test_ranking=skl_model.train_test_split(
    x,y,test_size=0.2,random_state=42)

In [None]:
# Sanity check: report how many samples ended up in the training and test splits.
print(f'{len(train_ranking)} wines for training and {len(test_ranking)} for testing')

Documentation on [KFold](https://scikit-learn.org/stable/modules/generated/sklearn.model_selection.KFold.html)

In [None]:
# 5-fold splitter; shuffling prevents the folds from following the file's row order,
# and the fixed random_state makes the fold assignment reproducible.
kf = skl_model.KFold(n_splits=5,shuffle=True,random_state=42)

Framework for doing K-fold cross validation

In [None]:
def Kfold(k,Xs,ys):
    """Skeleton for K-fold cross validation with an inner train/validation split.

    For each of the `k` shuffled folds, the data is split into a training part
    and a test part; the training part is split again into training and
    validation subsets. The model-fitting steps are left as `...` placeholders
    to be filled in.

    Parameters
    ----------
    k : number of folds, passed to `KFold(n_splits=k)`.
    Xs : input examples; indexed with the index arrays produced by `kf.split`
        (presumably a NumPy array -- confirm).
    ys : targets, indexed the same way as `Xs`.

    Notes
    -----
    `model` and `max_epoch` are not defined inside this function; they must be
    available in the surrounding scope (or be created where the placeholders
    are) before the function can run.
    Nothing is returned; train and test errors are printed for every fold.
    """
    # The total number of examples for training the network
    total_num=len(Xs)
    # Built in K-fold function in Sci-Kit Learn
    kf=skl_model.KFold(n_splits=k,shuffle=True)
    
    # kf.split: Generate indices to split data into training and test set.
    for train_selector,test_selector in kf.split(range(total_num)):
        # Decide training examples and testing examples for this fold
        train_Xs=Xs[train_selector]
        test_Xs=Xs[test_selector]
        train_ys=ys[train_selector]
        test_ys=ys[test_selector]
        
        # NOTE(review): val_array is initialized but never used in the visible skeleton;
        # presumably meant to collect per-epoch validation errors -- confirm.
        val_array=[]
        # Split training examples further into training and validation
        # (no split size is given, so train_test_split's default proportions apply)
        train_in,val_in,train_real,val_real=skl_model.train_test_split(train_Xs,train_ys)
        
        # Fit the data to your model
        # Train the model on your data
        ...
        for _ in range(max_epoch):
            # Train model on a number of epochs, and test performance in the validation set
            ...


        # Report result for the fold with minimum error in validation set
        # NOTE(review): this runs once per fold; presumably the model state with the
        # lowest validation error should be the one evaluated here -- confirm.
        train_error=model.evaluate(train_Xs,train_ys)
        test_error=model.evaluate(test_Xs,test_ys)
        print("Train error:",train_error)
        print("Test error:",test_error)
        

L. Prechelt, "Early Stopping -- but when?", Neural Networks: Tricks of the trade. Springer, Berlin, Heidelberg, 1998. 55-69.
[Link](https://link.springer.com/content/pdf/10.1007%2F978-3-642-35289-8_5.pdf)

## One hot Encoder

Let's pretend for a minute that, in our wine dataset, the ranks and start assignments are not labels but two categorical features, and that we want to describe them with one-hot encoding.

Split the features into categorical features and continuous features.

In [None]:
# Treat the two label columns as categorical inputs.
categorical_feats=df.loc[:,['Start assignment','ranking']]
# NOTE(review): x_norm is not assigned anywhere earlier in the visible notebook;
# presumably it is the normalized feature matrix from the StandardScaler section -- confirm.
continuous_feats=x_norm

Use [one-hot encoder](https://scikit-learn.org/stable/modules/generated/sklearn.preprocessing.OneHotEncoder.html) to transform the categorical features

In [None]:
from sklearn.preprocessing import OneHotEncoder


Then you can stack categorical and continuous features together for prediction

Similarly, we can use one-hot encoders to encode the output

In [None]:
# One-hot encode the output labels. The encoder is fitted on a 2-D column
# (one row per sample), so the 1-D ranking vector gets a trailing axis.
y = np.array(df['ranking'])[:,np.newaxis]
output_encoder=OneHotEncoder().fit(y)
print(output_encoder.transform(y).toarray())

Then we can decode the prediction

## Multivariate linear regression

Let's try fitting a linear regression to the function
$$f(x,y)=3x+2y-5$$

In [None]:
def generate_X(number):
    """Draw `number` random 2-D points with each coordinate uniform in [-10, 10).

    Returns an array of shape (number, 2): column 0 holds the x values and
    column 1 the y values. Samples come from NumPy's global random state.
    """
    # random() is uniform on [0, 1); *2-1 maps it to [-1, 1) and *10 to [-10, 10).
    first_coord=(np.random.random(number)*2-1)*10
    second_coord=(np.random.random(number)*2-1)*10
    return np.column_stack((first_coord,second_coord))
    
def generate_data(number,stochascity=0.05):
    """Sample noisy observations of f(x, y) = 3x + 2y - 5.

    Inputs are drawn with `generate_X(number)`. Each exact function value is
    multiplied by a random factor uniform in [1 - stochascity, 1 + stochascity),
    so the noise is relative to the magnitude of the value.

    Returns a tuple `(X, noisy_values)`: the (number, 2) input array and the
    length-`number` array of perturbed function values.
    """
    X=generate_X(number)
    exact=3*X[:,0]+2*X[:,1]-5
    # Multiplicative noise factor centred on 1.
    noise_factor=(np.random.random(number)*2-1)*stochascity+1
    return X,exact*noise_factor

In [None]:
%matplotlib widget
x,y=generate_data(5000,0.1)
fig=plt.figure()
ax=fig.gca(projection='3d')
ax.scatter(x[:,0],x[:,1],y,s=0.1)

In [None]:
from sklearn.linear_model import LinearRegression
X,y=generate_data(1000)
# Fit an ordinary least-squares plane to the noisy samples.
reg=LinearRegression().fit(X,y)
# score() is the R^2 of the fit on the given data.
print(reg.score(X,y))
# The data come from 3x + 2y - 5, so the coefficients are expected to be
# close to (3, 2) and the intercept close to -5.
print(reg.coef_,reg.intercept_)

In [None]:
X=generate_X(5000)
# Predict on fresh inputs with the regression model `reg` from the previous cell.
y=reg.predict(X)
fig=plt.figure()
# Figure.gca() no longer accepts keyword arguments (removed in Matplotlib 3.6),
# so the 3D axes are created explicitly with add_subplot.
ax=fig.add_subplot(111,projection='3d')
ax.scatter(X[:,0],X[:,1],y,s=0.1)
ax.set_xlabel('x')
ax.set_ylabel('y')
ax.set_zlabel('predicted f(x, y)');