# ML for Finance
## Fall 2020
## Lesson 7
---

In [None]:
# TECHNICAL CELL
%matplotlib inline
import matplotlib.pyplot as plt

import numpy as np
import pandas as pd

import os
import warnings
warnings.filterwarnings('ignore')

In [None]:
# Import Transactions Data (https://www.dropbox.com/s/3oyrho6agd7clpm/transactions.csv?dl=1)
transactions = pd.read_csv('transactions.csv')

In [None]:
# Extract day number
transactions['day'] = transactions.tr_datetime.apply(lambda x: int(x.split(' ')[0]))

In [None]:
# Transactions per day
gr = transactions.groupby('day')['amount'].count()

In [None]:
# Plot Time Series
plt.figure(figsize=(10, 5))
plt.plot(np.array(gr.index), np.array(gr))
plt.xlabel('day')
plt.ylabel('count')
plt.show()

In [None]:
# Find minimum
np.argmin(gr)

In [None]:
# Zoom in on days 10..63 (both bounds inclusive)
part_transaction = transactions.loc[transactions.day.between(10, 63)]

gr = part_transaction.groupby('day')['amount'].count()

plt.figure(figsize=(10, 5))
plt.plot(gr.index.values, gr.values, 'o-')
# Vertical marker at day 31
plt.plot([31, 31], [10000, 15000])
plt.xlabel('day')
plt.ylabel('count')
plt.show()

In [None]:
# Zoom in on days 120..135 (31+28+31+30 = 120, bounds inclusive).
# NOTE(review): presumably the early-May run of non-working days, assuming
# day 0 is 1 January - confirm against the data description.
part_transaction = transactions.loc[
    transactions.day.between(31+28+31+30, 31+28+31+30+15)
]

gr = part_transaction.groupby('day')['amount'].count()

plt.figure(figsize=(10, 5))
plt.plot(gr.index.values, gr.values, 'o-')
plt.xlabel('day')
plt.ylabel('count')
plt.show()

In [None]:
# Housing Data (https://www.kaggle.com/c/house-prices-advanced-regression-techniques/data)
# The file is also place in MS Teams (Files)
import seaborn as sns
sns.set(style="darkgrid")

data = pd.read_csv("train.csv")
data.head()

In [None]:
from sklearn.model_selection import train_test_split

data = data.drop(columns=["Id"])

y = data["SalePrice"]
X = data.drop(columns=["SalePrice"])

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=10)

In [None]:
sns.distplot(y_train);

In [None]:
# Modelling: polynomial regression of increasing degree on noisy samples
from sklearn.linear_model import LinearRegression
from sklearn.preprocessing import PolynomialFeatures

np.random.seed(36)

# Dense grid on [0, 1] and the noise-free target function on it
x = np.linspace(0, 1, 100)
y = np.cos(1.5 * np.pi * x)

# 30 random points of the same function with Gaussian noise added
x_objects = np.random.uniform(0, 1, size=30)
y_objects = np.cos(1.5 * np.pi * x_objects) + np.random.normal(scale=0.1, size=x_objects.shape)

fig, axs = plt.subplots(figsize=(16, 4), ncols=3)
for i, degree in enumerate([1, 4, 20]):
    # One transformer per degree, shared by the samples and the grid
    poly = PolynomialFeatures(degree)
    X_objects = poly.fit_transform(x_objects.reshape(-1, 1))
    X = poly.transform(x.reshape(-1, 1))

    regr = LinearRegression().fit(X_objects, y_objects)
    y_pred = regr.predict(X)

    ax = axs[i]
    ax.plot(x, y, label="Real function")
    ax.scatter(x_objects, y_objects, label="Data")
    ax.plot(x, y_pred, label="Prediction")
    if i == 0:
        # A single legend is enough: all three panels share the same entries
        ax.legend()
    ax.set_title("Degree = %d" % degree)
    ax.set_xlabel("$x$")
    ax.set_ylabel("$f(x)$")
    ax.set_ylim(-2, 2)

In [None]:
# Features: scatter each selected feature against the sale price
fig, axs = plt.subplots(figsize=(16, 5), ncols=3)
for ax, feature in zip(axs, ["GrLivArea", "GarageArea", "TotalBsmtSF"]):
    ax.scatter(X_train[feature], y_train, alpha=0.2)
    ax.set_xlabel(feature)
    ax.set_ylabel("SalePrice")
plt.tight_layout()

In [None]:
# Correlations
numeric_data = X_train.select_dtypes([np.number])
numeric_features = numeric_data.columns
numeric_data_mean = numeric_data.mean()

# Fill missing numeric values with the training-set column means
# (the same training means are applied to the test set)
X_train = X_train.fillna(numeric_data_mean)
X_test = X_test.fillna(numeric_data_mean)

# Correlation coefficient of every numeric feature with the target
correlations = {}
for feature in numeric_features:
    correlations[feature] = np.corrcoef(X_train[feature], y_train)[0, 1]

# Strongest positive correlation first
sorted_correlations = sorted(correlations.items(), key=lambda pair: pair[1], reverse=True)
features_order = [name for name, _ in sorted_correlations]
correlations = [value for _, value in sorted_correlations]

plot = sns.barplot(y=features_order, x=correlations)
plot.figure.set_size_inches(15, 10)

In [None]:
# Mean Squared Error of a ridge regression on the numeric features
from sklearn.linear_model import Ridge
from sklearn.metrics import mean_squared_error

model = Ridge()
model.fit(X_train[numeric_features], y_train)

# Predictions on both samples, to compare train and test error
y_train_pred = model.predict(X_train[numeric_features])
y_pred = model.predict(X_test[numeric_features])

print("Test MSE = %.4f" % mean_squared_error(y_test, y_pred))
print("Train MSE = %.4f" % mean_squared_error(y_train, y_train_pred))

In [None]:
# More on linear models
df = pd.read_csv('data/dataset.csv', sep = '\t')
df.index = range(len(df))
df.head()

In [None]:
# Train vs. Test
df_train, df_test = train_test_split(df, test_size = 0.2, random_state = 42)

In [None]:
df_train.head()

In [None]:
# Plot Scatter
plt.style.use('ggplot')
plt.rcParams['figure.figsize'] = (12,8)

df_train.plot(x='CRIM', y='MEDV', kind='scatter', s=120);

In [None]:
df_train.plot(x='RM', y='MEDV', kind='scatter', s=120);

In [None]:
# OLS

class SimpleLinearRegression():
    """Ordinary least squares linear regression.

    Parameters
    ----------
    fit_intercept : bool, default True
        When True the data are centred before solving and the intercept is
        recovered from the column means; when False the model is forced
        through the origin and ``intercept_`` is 0.0.

    Attributes
    ----------
    coef_ : numpy.ndarray, shape = (n_features,)
        Fitted weights (an empty list until ``fit`` is called).
    intercept_ : float
        Fitted bias term.
    """

    def __init__(self, fit_intercept = True):
        self.coef_ = []
        self.intercept_ = 0.0
        self.fit_intercept = fit_intercept

    def fit(self, X, y):
        """Fit the model on ``X`` (n_objects, n_features) and ``y`` (n_objects,).

        The inputs are copied and converted to float, so the caller's arrays
        are never modified and integer-typed data are accepted.
        """
        self._solve(np.array(X, dtype=float), np.array(y, dtype=float))

    def predict(self, X):
        """Return ``X @ coef_ + intercept_`` for every row of ``X``."""
        return np.asarray(X).dot(self.coef_) + self.intercept_

    def _solve(self, X, y):
        """Compute ``coef_`` and ``intercept_`` by least squares."""
        if self.fit_intercept:
            # Centring X and y removes the intercept from the problem.
            # The subtraction is not in place, so integer inputs are safe.
            X_offset = np.mean(X, axis=0)
            y_offset = np.mean(y)
            X = X - X_offset
            y = y - y_offset

        # lstsq equals inv(X.T @ X) @ X.T @ y when X has full column rank,
        # but is numerically stabler and returns the minimum-norm solution
        # instead of failing when features are collinear.
        self.coef_ = np.linalg.lstsq(X, y, rcond=None)[0]

        if self.fit_intercept:
            self._set_intercept(X_offset, y_offset)
        else:
            # Do not keep a stale intercept from an earlier fit
            self.intercept_ = 0.0

    def _set_intercept(self, X_offset, y_offset):
        """Recover the intercept of the un-centred problem from the means."""
        self.intercept_ = y_offset - X_offset.dot(self.coef_)

In [None]:
X_train = df_train.RM.values.reshape(-1, 1)
y_train = df_train.MEDV.values

In [None]:
# Train the Model
model = SimpleLinearRegression()
model.fit(X_train, y_train)

In [None]:
print('Model:\nprice = %.2f + (%.2f)*rooms' % (model.intercept_, model.coef_[0]))

In [None]:
print('intercept = {}'.format(model.intercept_), 'coef = {}'.format(model.coef_[0]))

In [None]:
# Plot the Model
df_train.plot(x = 'RM', y = 'MEDV', kind = 'scatter', s=120)

#predict values
y_hat = model.predict(X_train)

plt.plot(X_train, y_hat);

In [None]:
X_train = df_train.drop(columns = ['MEDV']).values
y_train = df_train.MEDV.values

In [None]:
model = SimpleLinearRegression()
model.fit(X_train, y_train)

In [None]:
from sklearn.metrics import mean_absolute_error
from sklearn.metrics import median_absolute_error
from sklearn.metrics import mean_squared_error
from sklearn.metrics import r2_score

* $\frac{1}{N} \sum_n |\hat{y}_{n}-y_n|$ - mean absolute error
* $\frac{1}{N} \sum_n (\hat{y}_{n}-y_n)^2$ - mean squared error

In [None]:
y_test = df_test.MEDV.values
y_hat = model.predict(df_test.drop(columns = ['MEDV']).values)

In [None]:
print('MAE %.2f' % mean_absolute_error(y_test, y_hat))
print('MSE %.2f' % mean_squared_error(y_test, y_hat))

In [None]:
# Linear from sklearn
from sklearn.linear_model import LinearRegression
model = LinearRegression()
#fit Linear model
model.fit(X_train, y_train)

In [None]:
y_test = df_test.MEDV.values
y_hat = model.predict(df_test.drop(columns = ['MEDV']).values)
print('MAE %.2f' % mean_absolute_error(y_test, y_hat))
print('MSE %.2f' % mean_squared_error(y_test, y_hat))

In [None]:
# Outliers tracking
X_train = df_train.RM.values.reshape(-1, 1)
y_train = df_train.MEDV.values
n = y_train.shape[0]

In [None]:
## Add random outliers
for i in range(10):
    X_train = np.r_[X_train, [[np.random.rand()*20]]]
    y_train = np.r_[y_train, np.random.randn()*10]

In [None]:
# Plot It
plt.scatter(X_train, y_train);

In [None]:
# Model It
model = SimpleLinearRegression(fit_intercept=True)
model.fit(X_train[:n], y_train[:n])

model_ouliers = SimpleLinearRegression(fit_intercept=True)
model_ouliers.fit(X_train, y_train)

In [None]:
# Done Finally
x = np.linspace(0, max(X_train), 100).reshape(-1, 1)
y_hat = model.predict(x)
y_hat_outliers = model_ouliers.predict(x)

fig, ax = plt.subplots(1, 1, figsize=(10,5))
ax.scatter(X_train, y_train)

ax.plot(x, y_hat, c='red', label='good model')
ax.plot(x, y_hat_outliers, c='green', label='biased model')
plt.legend();

### KNN Simulation

In [None]:
# KNN
from sklearn.datasets import make_moons, load_iris # import function from the library
N = 1000
X, y = make_moons(n_samples=N, noise=0.2, random_state=11) # generate data sample

# Create a figure with a custom size
plt.figure(figsize=(9, 6))

# One scatter call per class so each class gets its own colour and legend entry
for label, color in ((0, 'r'), (1, 'b')):
    mask = y == label              # boolean selector of the objects of this class
    plt.scatter(X[mask, 0],        # 1st column of X for the selected objects
                X[mask, 1],        # 2nd column of X for the selected objects
                color=color,       # points color
                label=str(label))  # label for the plot legend

plt.xlabel('X1') # set up X-axis label
plt.ylabel('X2') # set up Y-axis label

plt.legend(loc='best') # create the plot legend and set up its position
# The `b=` keyword was renamed to `visible=` in Matplotlib 3.5 and later
# removed; passing the flag positionally works on every version
plt.grid(True) # create grid on the plot

plt.show() # display the plot

In [None]:
# import train_test_split function to split the sample
from sklearn.model_selection import train_test_split

X_train, X_test, y_train, y_test = train_test_split(X, y, 
                                                    test_size=0.5,    # 20% for test, 80% for train
                                                    random_state=123) # shuffle objects before split

In [None]:
print("X, y shapes: ", X.shape, y.shape)
print("X_train, y_train shapes: ", X_train.shape, y_train.shape)
print("X_test, y_test shapes: ", X_test.shape, y_test.shape)

In [None]:
class KNNClassifier(object):
    """k-nearest-neighbours classifier (Euclidean distance, majority vote).

    Parameters
    ----------
    k_neighbors : int, default 1
        Number of nearest training objects that vote for the label.

    Raises
    ------
    ValueError
        If ``k_neighbors`` is smaller than 1.
    """

    def __init__(self, k_neighbors=1):
        if k_neighbors < 1:
            raise ValueError("k_neighbors must be >= 1, got %r" % (k_neighbors,))

        # Make this parameter visible in all methods of the class
        self.k_neighbors = k_neighbors

        # Training data, filled in by fit() and used by predict()
        self.X_train = None
        self.y_train = None

    def fit(self, X, y):
        """
        Remember the training data; KNN has no real training procedure.

        Parameters
        ----------
        X : array-like, shape = (n_objects, n_features)
            Matrix of objects that are described by their input features.
        y : array-like, shape = (n_objects)
            1D array with the object labels.
            For the classification labels are integers in {0, 1, 2, ...}.

        Raises
        ------
        ValueError
            If X and y contain a different number of objects.
        """
        # Convert once so that lists work too: predict() needs
        # array arithmetic and fancy indexing
        X = np.asarray(X)
        y = np.asarray(y)
        if X.shape[0] != y.shape[0]:
            raise ValueError(
                "X and y have different numbers of objects: %d != %d"
                % (X.shape[0], y.shape[0])
            )

        self.X_train = X
        self.y_train = y

    def calculate_distances(self, X, one_x):
        """
        Calculate Euclidean distances between one object and all rows of X.

        Parameters
        ----------
        X : array-like, shape = (n_objects, n_features)
            Matrix of objects that are described by their input features.
        one_x : array-like, shape = (n_features)
            The single object the distances are measured from.

        Returns
        -------
        dists : numpy.array, shape = (n_objects)
            Distance from ``one_x`` to every row of ``X``.
        """
        dists = np.sqrt(np.sum((np.asarray(X) - one_x) ** 2, axis=1))
        return dists

    def predict(self, X):
        """
        Predict labels for new objects.

        Parameters
        ----------
        X : array-like, shape = (n_objects, n_features)
            Matrix of objects that are described by their input features.

        Returns
        -------
        y_predicted : numpy.array, shape = (n_objects)
            1D array with predicted labels. When several labels share the
            maximum count, the smallest label is returned.

        Raises
        ------
        RuntimeError
            If the classifier has not been fitted yet.
        """
        if self.X_train is None or self.y_train is None:
            raise RuntimeError("KNNClassifier.fit must be called before predict")

        y_predicted = []

        for one_x in np.asarray(X):
            # one_x = [0.2, 0.57] (example)

            # Distances between the object and all objects of the train sample
            distances = self.calculate_distances(self.X_train, one_x)
            # distances = [0.25, 0.10, 0.32, 0.05] (example)

            # Indices of the train objects ordered by distance; a stable sort
            # makes ties between equidistant neighbours follow training order
            sorted_indices = np.argsort(distances, kind='stable')
            # sorted_indices = [3, 1, 0, 2] (example)

            # Keep the k_neighbors closest train objects
            k_neighbors_indices = sorted_indices[:self.k_neighbors]
            # k_neighbors_indices = [3, 1, 0], for k_neighbors=3 (example)

            # Labels of these neighbours
            k_neighbors_labels = self.y_train[k_neighbors_indices]
            # k_neighbors_labels = [0, 1, 0] (example)

            # Unique labels (sorted ascending) with the count of each
            unique_labels, label_counts = np.unique(k_neighbors_labels, return_counts=True)
            # unique_labels = [0, 1] (example)
            # label_counts  = [2, 1] (example)

            # argmax returns the first maximum, i.e. the smallest label on ties
            label_max_count = unique_labels[np.argmax(label_counts)]
            # label_max_count = 0 (example)

            y_predicted.append(label_max_count)

        return np.array(y_predicted) # return numpy.array

In [None]:
# Create a KNN classifier object.
# With k_neighbors=2 a 1-vs-1 vote is possible; KNNClassifier.predict then
# returns the first of the tied labels in the sorted order given by np.unique.
knn = KNNClassifier(k_neighbors=2)

# "Train" the classifier: KNN has no training procedure, fit only stores the data
knn.fit(X_train, y_train)

# Use the classifier to predict labels of the test objects
y_test_predict = knn.predict(X_test)

In [None]:
# Show the first 20 predicted labels
y_test_predict[:20]

In [None]:
# Show the first 20 true test labels, for comparison with the predictions
y_test[:20]

In [None]:
# Create a figure with a custom size
plt.figure(figsize=(9, 6))

# One scatter call per class of the test sample: own colour and legend entry
for label, color in ((0, 'r'), (1, 'b')):
    mask = y_test == label              # boolean selector of the objects of this class
    plt.scatter(X_test[mask, 0],        # 1st column of X_test for the selected objects
                X_test[mask, 1],        # 2nd column of X_test for the selected objects
                color=color,            # points color
                label=str(label))       # label for the plot legend


# Plot the decision boundary: classify every node of a regular grid that
# covers the data (with a 0.5 margin) and fill the regions by predicted class
h=0.1 # grid step
x_min, x_max = X[:, 0].min() - .5, X[:, 0].max() + .5
y_min, y_max = X[:, 1].min() - .5, X[:, 1].max() + .5
xx, yy = np.meshgrid(np.arange(x_min, x_max, h),
                     np.arange(y_min, y_max, h))

# Each grid node becomes one (x1, x2) row; predictions are reshaped back to the grid
Z = knn.predict(np.c_[xx.ravel(), yy.ravel()])
Z = Z.reshape(xx.shape)
plt.contourf(xx, yy, Z, cmap=plt.cm.RdBu, alpha=.6, levels=1)


plt.xlabel('X1') # set up X-axis label
plt.ylabel('X2') # set up Y-axis label

plt.legend(loc='best') # create the plot legend and set up its position
# The `b=` keyword was renamed to `visible=` in Matplotlib 3.5 and later
# removed; passing the flag positionally works on every version
plt.grid(True) # create grid on the plot

plt.show() # display the plot

Let's measure the quality using **accuracy score**:

$$
\text{Accuracy }(y\_true, y\_predict) = \frac{1}{N} \sum_{i=1}^{N} I(y\_predict_{i} == y\_true_{i})
$$

To calculate the accuracy we use the **accuracy_score** function from the scikit-learn library.

In [None]:
# Import accuracy_score function
from sklearn.metrics import accuracy_score

# Accuracy on the test sample: true labels first, predicted labels second
accuracy_test = accuracy_score(y_test, y_test_predict)

print("Test accuracy of KNN classifier: ", accuracy_test)

In [None]:
# Logistic
#generate two linear separated samples
np.random.seed(0)
X = np.r_[np.random.randn(20, 2) + [2, 2],
          np.random.randn(20, 2) + [-2, -2]]
y = [-1] * 20 + [1] * 20

In [None]:
fig, ax = plt.subplots(figsize=(7, 7))
ax.scatter(X[:, 0],
           X[:, 1],
           c=y,
           cmap=plt.cm.Paired);

In [None]:
class SimpleLogisticRegression:
    """Binary logistic regression for labels in {-1, +1}, fitted by batch gradient descent.

    The minimised objective is

        mean(log(1 + exp(-y * (X @ coef_ + intercept_)))) + penalty / (C * n_objects)

    where ``penalty`` is the squared L2 norm (``'l2'``) or the L1 norm
    (``'l1'``) of the weights; the intercept is part of the penalty when
    ``fit_intercept`` is True.

    Parameters
    ----------
    C : float, default 1.0
        Inverse regularisation strength (larger means weaker penalty).
    fit_intercept : bool, default True
        Whether the intercept is updated during gradient descent.
    penalty : {'l2', 'l1'}, default 'l2'
        Kind of regularisation.
    max_iter : int, default 5000
        Number of gradient descent steps.

    Attributes
    ----------
    coef_ : numpy.ndarray, shape = (n_features,)
    intercept_ : float
    qual_ : numpy.ndarray, shape = (max_iter,)
        Value of the objective after every gradient step.
    """

    def __init__(self, C = 1.0, fit_intercept = True, penalty = 'l2', max_iter = 5000):
        self.C = C
        self.fit_intercept = fit_intercept
        self.penalty = penalty
        self.max_iter = max_iter

    def fit(self, X, y):
        """Fit on ``X`` (n_objects, n_features; dense or scipy sparse) and labels ``y`` in {-1, +1}.

        Raises
        ------
        NotImplementedError
            If ``penalty`` is neither 'l1' nor 'l2'.
        """
        # Fail before doing any work instead of inside the first iteration
        if self.penalty not in ('l1', 'l2'):
            raise NotImplementedError("unsupported penalty: %r" % (self.penalty,))

        # zero initialization
        self.coef_ = np.zeros(X.shape[1])
        self.intercept_ = 0.0

        # run gradient descent with a fixed step of 0.05
        self.qual_ = self.grad_descent(X, y, 0.05)

    def predict_proba(self, X):
        """Return class probabilities as an array of shape (2, n_objects).

        Row 0 is P(y = -1 | x), row 1 is P(y = +1 | x). Note the layout:
        classes are on the first axis, objects on the second.
        """
        z = self.decision_function(X)
        # exp(-logaddexp(0, z)) == 1 / (1 + exp(z)), computed without overflow
        return np.array([np.exp(-np.logaddexp(0.0, z)),
                         np.exp(-np.logaddexp(0.0, -z))])

    def predict(self, X):
        """Return labels in {-1, +1}: +1 where P(y = +1 | x) > 0.5, else -1."""
        return np.where(self.predict_proba(X)[1, :] > 0.5, 1, -1)

    def decision_function(self, X):
        """Return the linear score ``X @ coef_ + intercept_`` of every object."""
        return X.dot(self.coef_) + self.intercept_

    #labels of classes {1, -1}
    def loss(self, X, y):
        """Return the regularised log-loss of the current weights on (X, y).

        Raises
        ------
        NotImplementedError
            If ``penalty`` is neither 'l1' nor 'l2'.
        """
        # margin = y * (w.x + b); the intercept belongs inside the negated
        # score, i.e. the loss is log(1 + exp(-margin))
        margin = np.asarray(y, dtype=float) * self.decision_function(X)
        # logaddexp(0, -m) == log(1 + exp(-m)) without overflow
        loss = np.mean(np.logaddexp(0.0, -margin))

        if self.penalty == 'l2':
            reg = np.sum(self.coef_ ** 2)
            if self.fit_intercept:
                reg += self.intercept_ ** 2
        elif self.penalty == 'l1':
            reg = np.sum(np.abs(self.coef_))
            if self.fit_intercept:
                reg += np.abs(self.intercept_)
        else:
            raise NotImplementedError("unsupported penalty: %r" % (self.penalty,))

        return loss + reg / (self.C * X.shape[0])

    def grad_descent(self, X, y, eta ):
        """Run ``max_iter`` full-batch gradient steps of size ``eta``.

        Updates ``coef_`` (and ``intercept_`` when ``fit_intercept`` is True)
        and returns the objective value after every step as an array of
        shape (max_iter,).
        """
        y = np.asarray(y, dtype=float)
        n_samples = X.shape[0]
        reg_step = eta / (self.C * n_samples)
        qual = np.zeros(self.max_iter)

        for iteration in range(self.max_iter):
            margin = y * (X.dot(self.coef_) + self.intercept_)
            # Derivative of log(1 + exp(-margin)) w.r.t. the score:
            # -y * sigmoid(-margin), with sigmoid(-m) = exp(-logaddexp(0, m))
            grad_score = -y * np.exp(-np.logaddexp(0.0, margin))

            # Mean over objects of x_i * grad_i; `.dot` works for dense arrays
            # and scipy sparse matrices alike, so no scipy import is needed
            mean_grad = np.asarray(X.T.dot(grad_score)).reshape(-1) / n_samples

            # The penalty gradient is evaluated at the weights before the step
            w = np.copy(self.coef_)
            w_0 = self.intercept_
            if self.penalty == 'l2':
                pen_w, pen_w_0 = 2 * w, 2 * w_0
            elif self.penalty == 'l1':
                pen_w, pen_w_0 = np.sign(w), np.sign(w_0)
            else:
                raise NotImplementedError("unsupported penalty: %r" % (self.penalty,))

            self.coef_ = w - eta * mean_grad - reg_step * pen_w
            if self.fit_intercept:
                self.intercept_ = w_0 - eta * np.mean(grad_score) - reg_step * pen_w_0

            qual[iteration] = self.loss(X, y)

        return qual

In [None]:
model = SimpleLogisticRegression(C=1.0, 
                           fit_intercept=True, 
                           penalty='l2')
model.fit(X, y)

In [None]:
print('w_0 = %f' % model.intercept_)
print('w_1, w_2 = ', model.coef_)

In [None]:
y_hat = model.predict(X)
y_hat

In [None]:
y_hat_proba = model.predict_proba(X)
y_hat_proba[:20, :]

In [None]:
dec_func = model.decision_function(X)
dec_func

In [None]:
fig, ax = plt.subplots(figsize=(7, 7))
x0, x1 = np.meshgrid(np.arange(-3, 3, 0.1),
                       np.arange(-3, 3, 0.1))
xx0, xx1 = x0.ravel(), x1.ravel()

X_grid = np.c_[xx0, xx1, ]

y_hat = model.decision_function(X_grid)
y_hat = y_hat.reshape(x0.shape)

plt.contour(x0, x1, y_hat, levels=[0])


ax.scatter(X[:, 0],
           X[:, 1],
           c=y,
           cmap=plt.cm.Paired);

In [None]:
# Quality vs. Loss
qual = model.qual_
x = np.arange(0, len(qual))
plt.title('quality')
plt.plot(x, qual)
plt.xlabel('iterations')
plt.ylabel('log-loss with l2');