In [1]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns

%matplotlib inline

# Ex. 10

In [2]:
# Read the Weekly dataset from disk and preview its first five rows.
dataset_file = '../datasets/Weekly.csv'
data = pd.read_csv(filepath_or_buffer=dataset_file)
data.head(n=5)

Unnamed: 0,Year,Lag1,Lag2,Lag3,Lag4,Lag5,Volume,Today,Direction
0,1990,0.816,1.572,-3.936,-0.229,-3.484,0.154976,-0.27,Down
1,1990,-0.27,0.816,1.572,-3.936,-0.229,0.148574,-2.576,Down
2,1990,-2.576,-0.27,0.816,1.572,-3.936,0.159837,3.514,Up
3,1990,3.514,-2.576,-0.27,0.816,1.572,0.16163,0.712,Up
4,1990,0.712,3.514,-2.576,-0.27,0.816,0.153728,1.178,Up


In [3]:
# Quick sanity checks: years covered, column dtypes, rows containing any
# missing value, and the distinct labels of the response column.
rows_with_nan = data.loc[data.isna().any(axis=1)]
for report in (data.Year.unique(), data.dtypes, rows_with_nan, data.Direction.unique()):
    print(report)

[1990 1991 1992 1993 1994 1995 1996 1997 1998 1999 2000 2001 2002 2003
 2004 2005 2006 2007 2008 2009 2010]
Year           int64
Lag1         float64
Lag2         float64
Lag3         float64
Lag4         float64
Lag5         float64
Volume       float64
Today        float64
Direction     object
dtype: object
Empty DataFrame
Columns: [Year, Lag1, Lag2, Lag3, Lag4, Lag5, Volume, Today, Direction]
Index: []
['Down' 'Up']


In [4]:
# Encode the response as 1 for 'Up' and 0 for anything else.
# An already-encoded 1 is also accepted so that re-running this cell is
# idempotent; comparing against 'Up' alone would turn every value into 0 on
# a second execution.
data.Direction = data.Direction.apply(lambda v: 1 if v in ('Up', 1) else 0)

In [5]:
# Summary statistics (count, mean, std, quartiles) for the numeric columns.
data.describe()

Unnamed: 0,Year,Lag1,Lag2,Lag3,Lag4,Lag5,Volume,Today,Direction
count,1089.0,1089.0,1089.0,1089.0,1089.0,1089.0,1089.0,1089.0,1089.0
mean,2000.048669,0.150585,0.151079,0.147205,0.145818,0.139893,1.574618,0.149899,0.555556
std,6.033182,2.357013,2.357254,2.360502,2.360279,2.361285,1.686636,2.356927,0.497132
min,1990.0,-18.195,-18.195,-18.195,-18.195,-18.195,0.087465,-18.195,0.0
25%,1995.0,-1.154,-1.154,-1.158,-1.158,-1.166,0.332022,-1.154,0.0
50%,2000.0,0.241,0.241,0.241,0.238,0.234,1.00268,0.241,1.0
75%,2005.0,1.405,1.409,1.409,1.409,1.405,2.053727,1.405,1.0
max,2010.0,12.026,12.026,12.026,12.026,12.026,9.328214,12.026,1.0


In [None]:
# Pairwise scatter plots of all columns, coloured by the Direction label.
sns.pairplot(data, hue='Direction')

<seaborn.axisgrid.PairGrid at 0x7eff040a8dc0>

In [None]:
# Annotated correlation matrix of all columns, drawn on an explicit Axes.
fig, ax = plt.subplots(figsize=(10, 8))
sns.heatmap(data.corr(), annot=True, ax=ax)

There are no patterns in the data except for the obvious relationship between `Year` and `Volume`.

In [None]:
from sklearn.linear_model import LogisticRegression
from sklearn.discriminant_analysis import LinearDiscriminantAnalysis, QuadraticDiscriminantAnalysis
from sklearn.neighbors import KNeighborsClassifier
from sklearn.metrics import confusion_matrix, classification_report
import statsmodels.api as sm

In [None]:
# Fit a statsmodels logistic regression of Direction on the five lags and
# Volume; an intercept column is added explicitly via add_constant.
predictor_cols = ['Lag1', 'Lag2', 'Lag3', 'Lag4', 'Lag5', 'Volume']
X = data.loc[:, predictor_cols]
y = data.Direction
design = sm.add_constant(X)
model = sm.Logit(y, design).fit()
model.summary()

Only the `Lag2` variable appears to be statistically significant.

In [None]:
from collections import Counter

def get_col_mx(model, test_X, test_y, thr = .5):
    """Count confusion-matrix cells for a fitted binary classifier.

    Parameters
    ----------
    model : fitted estimator
        Any object exposing ``predict_proba`` (the positive-class probability
        is read from column 1) or, failing that, ``predict`` (its output is
        treated as a score, e.g. probabilities or 0/1 labels).
    test_X : array-like
        Features, passed unchanged to the model.
    test_y : iterable of {0, 1}
        True labels, in the same order as the rows of ``test_X``.
    thr : float, default 0.5
        A score strictly greater than ``thr`` is predicted as class 1.

    Returns
    -------
    collections.Counter
        Counts under the keys ``'TP'``, ``'FP'``, ``'FN'`` and ``'TN'``.
    """
    # Duck-type on predict_proba instead of a hard-coded list of estimator
    # classes, so that `thr` is honoured by every probabilistic classifier
    # rather than only by the classes that happened to be listed.
    if hasattr(model, 'predict_proba'):
        scores = model.predict_proba(test_X)[:, 1]
    else:
        scores = model.predict(test_X)

    col_mx = Counter({'TP': 0, 'FP': 0, 'FN': 0, 'TN': 0})
    for score, true in zip(scores, test_y):
        pred = 1 if score > thr else 0
        if pred:
            col_mx['TP' if pred == true else 'FP'] += 1
        else:
            col_mx['TN' if pred == true else 'FN'] += 1

    return col_mx

def get_error_rate(col_mx):
    """Return the misclassified share, (FP + FN) divided by all counted cases."""
    misclassified = col_mx['FP'] + col_mx['FN']
    total = sum(col_mx.values())
    return misclassified / total

In [None]:
# Evaluate the full-data logistic regression on the data it was fitted to.
mx = get_col_mx(model, sm.add_constant(X), y)
error_rate = get_error_rate(mx)
print(mx)
print(1 - error_rate)
# TP / (TP + FP) is the share of "Up" predictions that were right (precision),
# and TN / (TN + FN) is the share of "Down" predictions that were right.
# Neither is a true positive/negative rate, which would divide by the number
# of actual positives/negatives instead.
print('Precision (positive predictive value):', mx['TP'] / (mx['TP'] + mx['FP']))
print('Negative predictive value:', mx['TN'] / (mx['TN'] + mx['FN']))


The confusion matrix shows that the model is slightly more often correct when it predicts a positive market direction than when it predicts a negative one. However, the error rate shows that the model is not much better than random guessing.

In [None]:
# Hold out 2009-2010 as the test period and keep only Lag2 as the predictor.
# np.flatnonzero yields integer *positions*, which is what `.iloc` and NumPy
# indexing expect; passing index labels there is only correct while the frame
# happens to carry the default 0..n-1 index.
is_train = (data.Year < 2009).to_numpy()
train, test = np.flatnonzero(is_train), np.flatnonzero(~is_train)
train_X, test_X = X.iloc[train].Lag2, X.iloc[test].Lag2
# scikit-learn expects a 2-D feature matrix, hence the single-column reshape.
train_X, test_X = train_X.values.reshape(-1, 1), test_X.values.reshape(-1, 1)
train_y, test_y = y.iloc[train], y.iloc[test]

In [None]:
# Confirm the sizes of the training and test feature matrices.
train_X.shape, test_X.shape

In [None]:
# Logistic regression on Lag2 only, scored on the held-out period.
model1 = LogisticRegression()
model1.fit(train_X, train_y)
mx = get_col_mx(model1, test_X, test_y)
error_rate = get_error_rate(mx)
accuracy = 1 - error_rate
print(mx)
print(accuracy)

In [None]:
# Linear discriminant analysis on Lag2 only, scored on the held-out period.
model2 = LinearDiscriminantAnalysis()
model2.fit(train_X, train_y)
mx = get_col_mx(model2, test_X, test_y)
error_rate = get_error_rate(mx)
accuracy = 1 - error_rate
print(mx)
print(accuracy)

In [None]:
# Quadratic discriminant analysis on Lag2 only, scored on the held-out period.
model3 = QuadraticDiscriminantAnalysis()
model3.fit(train_X, train_y)
mx = get_col_mx(model3, test_X, test_y)
error_rate = get_error_rate(mx)
accuracy = 1 - error_rate
print(mx)
print(accuracy)

In [None]:
# 1-nearest-neighbour classifier on Lag2 only, scored on the held-out period.
model4 = KNeighborsClassifier(n_neighbors=1)
model4.fit(train_X, train_y)
mx = get_col_mx(model4, test_X, test_y)
error_rate = get_error_rate(mx)
accuracy = 1 - error_rate
print(mx)
print(accuracy)

The linear methods appear to fit the data better. KNN performs worst of all.

In [None]:
cols = ['Lag1', 'Lag2', 'Lag3', 'Lag4', 'Lag5', 'Volume']

# For every unordered pair (a, b) of predictors, stack a, b and a*b as rows
# of one array; the key has the form '<a>x<b>'.
inter_terms = {
    a + 'x' + b: np.array([data[a], data[b], data[a] * data[b]])
    for pos, a in enumerate(cols[:-1])
    for b in cols[pos + 1:]
}

# For every predictor, stack the variable and its square; the key is '<col>^2'.
sq_terms = {
    name + '^2': np.array([data[name], data[name] ** 2])
    for name in cols
}

In [None]:
print('Logistic Regression with Interterms')
results = pd.DataFrame(index=list(inter_terms.keys()), columns=['TP', 'TN', 'FN', 'FP', 'error'])
# The response split is the same for every candidate term, so compute it once.
train_y, test_y = y.iloc[train], y.iloc[test]
for key, terms in inter_terms.items():
    # Each entry stores one variable per row; transpose to (n_samples, n_features).
    train_X, test_X = terms.T[train], terms.T[test]

    model = LogisticRegression().fit(train_X, train_y)
    mx = get_col_mx(model, test_X, test_y)
    error_rate = get_error_rate(mx)

    # Assign the whole row through a single .loc call. The chained form
    # `results.loc[key]['FP'] = ...` writes into a temporary Series and leaves
    # `results` untouched under pandas Copy-on-Write.
    results.loc[key] = [mx['TP'], mx['TN'], mx['FN'], mx['FP'], error_rate]
results.sort_values(by='error')

For the logistic regression model, the `Lag2` and `Lag3` interaction terms show the best results.

In [None]:
print('Logistic Regression with square terms')
results = pd.DataFrame(index=list(sq_terms.keys()), columns=['TP', 'TN', 'FN', 'FP', 'error'])
# The response split is the same for every candidate term, so compute it once.
train_y, test_y = y.iloc[train], y.iloc[test]
for key, terms in sq_terms.items():
    # Each entry stores one variable per row; transpose to (n_samples, n_features).
    train_X, test_X = terms.T[train], terms.T[test]

    model = LogisticRegression().fit(train_X, train_y)
    mx = get_col_mx(model, test_X, test_y)
    error_rate = get_error_rate(mx)

    # Assign the whole row through a single .loc call. The chained form
    # `results.loc[key]['FP'] = ...` writes into a temporary Series and leaves
    # `results` untouched under pandas Copy-on-Write.
    results.loc[key] = [mx['TP'], mx['TN'], mx['FN'], mx['FP'], error_rate]

results.sort_values(by='error')

And again `Lag2` is the winner. However, the squared term doesn't give any improvement over the base model.

In [None]:
# Evaluate LDA on every interaction-term set, then on every square-term set.
# The response split is the same for every candidate, so compute it once.
train_y, test_y = y.iloc[train], y.iloc[test]
for title, term_sets in (('LDA with Interterms\n', inter_terms),
                         ('\nLDA with square terms\n', sq_terms)):
    print(title)
    results = pd.DataFrame(index=list(term_sets.keys()), columns=['TP', 'TN', 'FN', 'FP', 'error'])
    for key, terms in term_sets.items():
        # Each entry stores one variable per row; transpose to (n_samples, n_features).
        train_X, test_X = terms.T[train], terms.T[test]

        model = LinearDiscriminantAnalysis().fit(train_X, train_y)
        mx = get_col_mx(model, test_X, test_y)
        error_rate = get_error_rate(mx)

        # Single .loc row assignment; the chained `results.loc[key]['FP'] = ...`
        # form writes into a temporary under pandas Copy-on-Write.
        results.loc[key] = [mx['TP'], mx['TN'], mx['FN'], mx['FP'], error_rate]

    print(results.sort_values(by='error'))

The results for LDA are pretty much the same as for logistic regression.

In [None]:
# Evaluate QDA on every interaction-term set, then on every square-term set.
# The response split is the same for every candidate, so compute it once.
train_y, test_y = y.iloc[train], y.iloc[test]
for title, term_sets in (('QDA with Interterms\n', inter_terms),
                         ('\nQDA with square terms\n', sq_terms)):
    print(title)
    results = pd.DataFrame(index=list(term_sets.keys()), columns=['TP', 'TN', 'FN', 'FP', 'error'])
    for key, terms in term_sets.items():
        # Each entry stores one variable per row; transpose to (n_samples, n_features).
        train_X, test_X = terms.T[train], terms.T[test]

        model = QuadraticDiscriminantAnalysis().fit(train_X, train_y)
        mx = get_col_mx(model, test_X, test_y)
        error_rate = get_error_rate(mx)

        # Single .loc row assignment; the chained `results.loc[key]['FP'] = ...`
        # form writes into a temporary under pandas Copy-on-Write.
        results.loc[key] = [mx['TP'], mx['TN'], mx['FN'], mx['FP'], error_rate]

    print(results.sort_values(by='error'))

QDA performs worse, except for the case with squared `Lag2`.

In [None]:
# Evaluate KNN for several neighbourhood sizes, first on the Lag2 x Lag3
# interaction features and then on Lag2 with its square.
k_array = [1, 3, 5, 10, 50, 100]
train_y, test_y = y.iloc[train], y.iloc[test]
for title, terms in (('KNN with Interterm\n', inter_terms['Lag2xLag3']),
                     ('\nKNN with square term\n', sq_terms['Lag2^2'])):
    print(title)
    # The feature matrices do not depend on k, so build them once per term set.
    train_X, test_X = terms.T[train], terms.T[test]
    results = pd.DataFrame(index=k_array, columns=['TP', 'TN', 'FN', 'FP', 'error'])
    for k in k_array:
        model = KNeighborsClassifier(n_neighbors=k).fit(train_X, train_y)
        mx = get_col_mx(model, test_X, test_y)
        error_rate = get_error_rate(mx)

        # Single .loc row assignment; the chained `results.loc[k]['FP'] = ...`
        # form writes into a temporary under pandas Copy-on-Write.
        results.loc[k] = [mx['TP'], mx['TN'], mx['FN'], mx['FP'], error_rate]

    print(results.sort_values(by='error'))

KNN also doesn't perform very well.

# Ex. 11

In [None]:
# Load the Auto dataset, drop rows whose horsepower is the '?' placeholder,
# and derive the binary target mpg01 (1 when mpg is above the median).
dataset_file = '../datasets/Auto.csv'
data = pd.read_csv(dataset_file, index_col='name')
# .copy() makes the filtered frame independent of the original, so the column
# assignments below cannot trigger SettingWithCopyWarning.
data = data.loc[data.horsepower != '?'].copy()
data['horsepower'] = data.horsepower.apply(int)
# Compute the median once; evaluating it inside a per-row lambda recomputes it
# for every row.
mpg_median = data.mpg.median()
data['mpg01'] = (data.mpg > mpg_median).astype(int)
data.head()

In [None]:
# Pairwise plots coloured by mpg01; raw mpg is dropped because mpg01 is
# derived directly from it.
sns.pairplot(data.drop(['mpg'], axis=1), hue='mpg01')

In [None]:
# One box plot per candidate predictor, split by the binary mpg01 label.
_, axes = plt.subplots(2, 3, figsize=(16,12))
for i, col in enumerate(['displacement', 'horsepower', 'weight', 'acceleration', 'year']):
    # Pass x/y by keyword: recent seaborn releases reject positional data
    # vectors for the plotting functions.
    sns.boxplot(data=data, x='mpg01', y=col, ax=axes[i//3, i%3])

`displacement` and `weight` seem to be very helpful in predicting `mpg01`.

In [None]:
# Hold out the last 100 rows (in file order) as the test set.
n_test = 100
train_data, test_data = data.iloc[:-n_test], data.iloc[-n_test:]

In [None]:
# LDA on displacement, horsepower and weight, scored on the held-out rows.
feature_cols = ['displacement', 'horsepower', 'weight']
model = LinearDiscriminantAnalysis()
model.fit(train_data[feature_cols], train_data['mpg01'])
mx = get_col_mx(model, test_data[feature_cols], test_data['mpg01'])
error_rate = get_error_rate(mx)
accuracy = 1 - error_rate
print(mx)
print(accuracy)

In [None]:
# QDA on displacement, horsepower and weight, scored on the held-out rows.
feature_cols = ['displacement', 'horsepower', 'weight']
model = QuadraticDiscriminantAnalysis()
model.fit(train_data[feature_cols], train_data['mpg01'])
mx = get_col_mx(model, test_data[feature_cols], test_data['mpg01'])
error_rate = get_error_rate(mx)
accuracy = 1 - error_rate
print(mx)
print(accuracy)

In [None]:
# Logistic regression on displacement, horsepower and weight, scored on the
# held-out rows.
feature_cols = ['displacement', 'horsepower', 'weight']
model = LogisticRegression()
model.fit(train_data[feature_cols], train_data['mpg01'])
mx = get_col_mx(model, test_data[feature_cols], test_data['mpg01'])
error_rate = get_error_rate(mx)
accuracy = 1 - error_rate
print(mx)
print(accuracy)

In [None]:
# KNN on displacement, horsepower and weight for several neighbourhood sizes,
# each scored on the held-out rows.
feature_cols = ['displacement', 'horsepower', 'weight']
for k in (1, 3, 5, 10):
    model = KNeighborsClassifier(n_neighbors=k).fit(train_data[feature_cols], train_data['mpg01'])
    mx = get_col_mx(model, test_data[feature_cols], test_data['mpg01'])
    error_rate = get_error_rate(mx)
    print('K =', k)
    print(mx)
    print(1 - error_rate)

# Ex. 13

In [None]:
# Load the Boston dataset and derive the binary target crim_bin
# (1 when the crime rate is above the median).
dataset_file = '../datasets/Boston.csv'
data = pd.read_csv(dataset_file, index_col=0)
# Compute the median once; evaluating it inside a per-row lambda recomputes it
# for every row.
crim_median = data.crim.median()
data['crim_bin'] = (data.crim > crim_median).astype(int)
data.head()

In [None]:
# Correlation of every column with the binary crime indicator.
data.corr().crim_bin

The most strongly related variables are `indus`, `nox`, `age`, `dis` and `tax`.

In [None]:
# Pairwise plots of the selected predictors, coloured by crim_bin.
sns.pairplot(data[['indus', 'nox', 'age', 'dis', 'tax', 'crim_bin']], hue='crim_bin')

In [None]:
# Predictors used by the models below; the last 100 rows (in file order) are
# held out as the test set.
important_cols = ['indus', 'nox', 'age', 'dis', 'tax']
n_test = 100
train_data, test_data = data.iloc[:-n_test], data.iloc[-n_test:]

In [None]:
# LDA on the selected predictors, scored on the held-out rows.
model = LinearDiscriminantAnalysis()
model.fit(train_data[important_cols], train_data['crim_bin'])
mx = get_col_mx(model, test_data[important_cols], test_data['crim_bin'])
error_rate = get_error_rate(mx)
accuracy = 1 - error_rate
print(mx)
print(accuracy)

In [None]:
# QDA on the selected predictors, scored on the held-out rows.
model = QuadraticDiscriminantAnalysis()
model.fit(train_data[important_cols], train_data['crim_bin'])
mx = get_col_mx(model, test_data[important_cols], test_data['crim_bin'])
error_rate = get_error_rate(mx)
accuracy = 1 - error_rate
print(mx)
print(accuracy)

In [None]:
# Logistic regression on the selected predictors, scored on the held-out rows.
model = LogisticRegression()
model.fit(train_data[important_cols], train_data['crim_bin'])
mx = get_col_mx(model, test_data[important_cols], test_data['crim_bin'])
error_rate = get_error_rate(mx)
accuracy = 1 - error_rate
print(mx)
print(accuracy)

In [None]:
# KNN on the selected predictors for several neighbourhood sizes, each scored
# on the held-out rows.
for k in (1, 3, 5, 10):
    model = KNeighborsClassifier(n_neighbors=k).fit(train_data[important_cols], train_data['crim_bin'])
    mx = get_col_mx(model, test_data[important_cols], test_data['crim_bin'])
    error_rate = get_error_rate(mx)
    print('K =', k)
    print(mx)
    print(1 - error_rate)

KNN shows the best performance among all models.