In [1]:
import pandas as pd
import numpy as np
from sklearn.linear_model import LogisticRegression
from sklearn.neighbors import KNeighborsClassifier
from sklearn.svm import SVC
from sklearn.metrics import confusion_matrix

## create lag data

In [2]:
df = pd.read_csv('data/stock_price.csv')

# create lag data
vclose = df['close']
df['Lag1'] = vclose / vclose.shift(1) - 1
df['Lag2'] = vclose / vclose.shift(2) - 1
df['Lag3'] = vclose / vclose.shift(3) - 1
df['Lag4'] = vclose / vclose.shift(4) - 1
df['Lag5'] = vclose / vclose.shift(5) - 1

df = df.dropna()

# 주가 상승하락 기준1
# df['direction'] = np.sign(df['Lag1'])
# df.loc[df.direction == 0, 'direction'] = 1.0

# 주가 상승하락 기준2
df['TLag'] = df['Lag1']+df['Lag2']+df['Lag3']+df['Lag4']+df['Lag5']
df['direction'] = np.sign(df['TLag'])

print(df.shape)
print(df.head())

(4795, 15)
   change  close   code        date   high    low   open  volume      Lag1  \
5    1150  40000  30200  1999-02-01  41500  39000  39000  534900  0.029601   
6     550  39450  30200  1999-02-02  40000  39000  40000  244160 -0.013750   
7     550  38900  30200  1999-02-03  39200  38500  39200  219700 -0.013942   
8     200  38700  30200  1999-02-04  38750  37950  37950  132180 -0.005141   
9      50  38750  30200  1999-02-05  39400  38200  38700  128550  0.001292   

       Lag2      Lag3      Lag4      Lag5      TLag  direction  
5  0.028278  0.061008  0.111111  0.081081  0.311079        1.0  
6  0.015444  0.014139  0.046419  0.095833  0.158085        1.0  
7 -0.027500  0.001287  0.000000  0.031830 -0.008324       -1.0  
8 -0.019011 -0.032500 -0.003861 -0.005141 -0.065655       -1.0  
9 -0.003856 -0.017744 -0.031250 -0.002574 -0.054132       -1.0  


## split train / test

In [3]:
X = df[['Lag1','Lag2','Lag3','Lag4','Lag5']]
y = df['direction']

idx = 4495
X_train = X[:idx]
y_train = y[:idx]
X_test = X[idx:]
y_test = y[idx:]

print(X_train.shape)
print(X_test.shape)

(4495, 5)
(300, 5)


## Logistic Regression

In [4]:
model = LogisticRegression()
model.fit(X_train, y_train)

y_pred = model.predict(X_test)

print(confusion_matrix(y_test, y_pred))
print('')
print(model.score(X_test, y_test))

[[162   1]
 [  9 128]]

0.966666666667


## kNN

In [5]:
model_knn = KNeighborsClassifier(100)
model_knn.fit(X_train, y_train)

y_pred = model_knn.predict(X_test)

print(confusion_matrix(y_test, y_pred))
print('')
print(model_knn.score(X_test, y_test))

[[162   1]
 [  0 137]]

0.996666666667


## SVM

In [6]:
model_svc = SVC(kernel='linear')
model_svc.fit(X_train, y_train)

y_pred = model_svc.predict(X_test)

print(confusion_matrix(y_test, y_pred))
print('')
print(model_knn.score(X_test, y_test))

[[162   1]
 [  5 132]]

0.996666666667
