# 주식 가격 예측


In [7]:
import sys
import random
import pickle

import pandas as pd
import numpy as np
import json

import math
import datetime

from sklearn import preprocessing, cross_validation, svm
from sklearn.metrics import mean_squared_error
from sklearn.linear_model import LinearRegression
from sklearn.svm import SVR
from sklearn.svm import LinearSVC

from sklearn.externals import joblib
from sklearn.pipeline import Pipeline

from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier
from sklearn.linear_model import SGDClassifier
import xgboost

from sklearn.metrics import classification_report
from sklearn.metrics import confusion_matrix
from sklearn.metrics import accuracy_score


import matplotlib.pyplot as plt


### Reading Dataset

In [2]:
# read train data
with open('./data/train_data.json') as fp:
    json_str = fp.read()
    json_data = json.loads(json_str)

In [3]:
train_df = pd.DataFrame(json_data, columns=['date','name','open','close','low','high','vol'])

In [4]:
with open('./data/grading.input.txt') as fp:
    sbm_in = fp.read()
    sbm_in = sbm_in.splitlines()

In [72]:
# teller_df = train_df[train_df['name']=='teller']
pred_lst = []
total = 0
true_t = 0
for idx, name in enumerate(sbm_in[:]):

    teller_df = train_df[train_df['name']==name][:]
    teller_df = teller_df.set_index('date')

    teller_df['hl_pct'] = (teller_df['high'] - teller_df['low'])/(teller_df['low']*100)
    teller_df['pct_chng'] = (teller_df['close'] - teller_df['open'])/(teller_df['open']*100)



    # 예측 컬럼
    forecast_col = 'close'
    # 예측 기간 하루
    forecast_out = int(1)
    # forecast_out에 맞춰 label을 변경
    teller_df['label'] = teller_df[forecast_col].shift(-forecast_out)
    # 방향 예측을 위한 전처리
    teller_df['direction'] = teller_df['label'] >= teller_df['close']

    # X, y
    X = teller_df[['close','hl_pct','pct_chng']]
    X = preprocessing.scale(X)
    X_forecast_out = X[-forecast_out:]
    X = X[:-forecast_out]

    y = np.array(teller_df['label'])
    y = y[:-forecast_out]
    y_d = np.array(teller_df['direction'])
    y_d = y_d[:-forecast_out]

    if idx % 50 == 0:
        print(idx,'counts name: ',name, '. Length of y: ', len(y))
    
    model = SVR(kernel='linear',C=0.01).fit(X,y)
    #model = LinearRegression().fit(X,y)
    model_d = LogisticRegression(C=0.1).fit(X,y_d)
    #model_d = xgboost.XGBClassifier(n_estimators=100,max_depth=7).fit(X,y_d)
    y_pred = model.predict(X_forecast_out)
    y_d_pred = model_d.predict(X_forecast_out)

    if y_d_pred:
        val = '+'
    else:
        val = '-'
   # val = '+'
    val += ' ' + str(float(y_pred))

    pred_lst.append(str(val))

0 counts name:  teller . Length of y:  249
50 counts name:  speeders . Length of y:  1760
100 counts name:  hairs . Length of y:  1760
150 counts name:  thyristors . Length of y:  1760
200 counts name:  jurisdictions . Length of y:  1760
250 counts name:  anchors . Length of y:  1760
300 counts name:  shovels . Length of y:  1760
350 counts name:  tab . Length of y:  1760
400 counts name:  tires . Length of y:  1760
450 counts name:  similarity . Length of y:  1760
500 counts name:  leaf . Length of y:  124


In [65]:
# valid 채점용 데이터
with open('price_rst.txt') as fp:
    price_rst = fp.read()
    price_rst = price_rst.splitlines()

In [66]:
model_d.feature_importances_

array([ 0.35031846,  0.30391264,  0.34576887], dtype=float32)

In [67]:
for i in range(len(pred_lst)):
    pred_lst[i] = pred_lst[i].split(' ')
    
for i in range(len(price_rst)):
    price_rst[i] = price_rst[i].split(' ')

valid_df = pd.concat([pd.DataFrame(pred_lst), pd.DataFrame(price_rst)], axis=1)
valid_df.columns =['pred_0','pred_1','true_0','true_1']

valid_df['s2_1'] = 5-(valid_df['pred_1'].map(float) - valid_df['true_1'].map(float))**2
valid_df['s2'] = valid_df['s2_1'].map(lambda x: 0 if x <= 0 else x)

valid_df['s1'] = valid_df['pred_0'] == valid_df['true_0']
valid_df['s1'] = valid_df['s1'].map(lambda x: 5 if x == True else 0)

valid_df['score'] = valid_df['s1']+valid_df['s2']

10*valid_df['score'].sum()/len(valid_df)

72.6726136633497

73.27141605855927

#### submission

In [56]:
!pwd

/Users/yoo/Desktop/elice/round_4


In [73]:
# submission
# 73.2714
output_file = 'submission.txt'

with open(output_file, 'w') as fp:
    for _test_pred in pred_lst:
        fp.write(_test_pred)
        fp.write('\n')

### Regression

In [539]:
y_pred = model.predict(X_test)

In [540]:
print('rmse: {}'.format(mean_squared_error(y_test, y_pred)))

rmse: 1343.0829117246076


In [532]:
for i in range(20):
    print(y_test[i], y_pred[i])

49.912 54.4608268379
115.115 54.4014640812
70.356 49.654247328
22.48 53.6749650498
156.512 58.9931905539
37.36 51.7156741119
48.873 52.7771904789
62.408 53.7650887306
61.211 50.6574986059
138.516 54.4643328079
126.36 55.0027553166
31.89 53.7229694809
54.82 51.7443669807
47.695 50.2150099343
69.639 54.7606501126
70.935 52.2044161391
38.447 50.0328951249
31.18 55.7271108202
59.315 50.4827361197
55.674 51.5835367604


###  randomforest?

In [541]:
#model_2 = RandomForestClassifier(n_estimators=200, max_depth=6).fit(X_train, y_d_train)
model_2 = LogisticRegression(C=0.1).fit(X_train, y_d_train) # 0.62
#model_2 = LinearSVC(C=0.02).fit(X_train,y_d_train) # 0.01 -> 0.62

In [542]:
y_d_pred = model_2.predict(X_test)

In [543]:
accuracy_score(y_d_test, y_d_pred)

0.51136363636363635

In [544]:
confusion_matrix(y_d_test,y_d_pred)

array([[116,  57],
       [115,  64]], dtype=int64)

In [545]:
t_x = train_df[['date']][train_df['name']==sbm_in[-1]]
t_x = t_x['date'].map(lambda x: ''.join(x.split('-')))
t_y = train_df[['close']][train_df['name']==sbm_in[-1]]

In [351]:
plt.plot(t_x, t_y)
plt.show()

In [320]:
price_rst =[]
for i in range(501)[:]:
    before_price = float(train_df[['close']][train_df['name'] ==sbm_in[i]].iloc[-2])
    true_price = float(train_df[['close']][train_df['name'] ==sbm_in[i]].iloc[-1])
    
    if true_price > before_price:
        val = '+'
    else:
        val = '-'
        
    val += ' '+ str(true_price)
    price_rst.append(val)

In [322]:
output_file = 'price_rst.txt'

with open(output_file, 'w') as fp:
    for _test_pred in price_rst:
        fp.write(_test_pred)
        fp.write('\n')

In [313]:
pd.DataFrame(price_rst).iloc[100]

name      hairs
close    24.701
Name: 850362, dtype: object

In [280]:
price_lst = [] 
for idx, name in enumerate(sbm_in):
    price_lst.append(tr)
    

'teller'

In [None]:
lst = []
rst = []
for idx, name in enumerate(sbm_in):
    dates = train_df[train_df['name'] == name]['date']
    dates = dates.map(lambda x : int(''.join(x.split('-'))))
    prices = train_df[train_df['name'] == name]['close']

    prices = np.reshape(prices,[-1,1])
    dates = np.reshape(dates,[-1,1])

    lst.append(SVR(kernel='rbf',C=1e3).fit(dates,prices))
    
    
    
    
    pred = float(lst[idx].predict(20171230))
    
    if pred > prices[-1]:
        row = '+'
    else: 
        row = '-'
    
    row += ' ' + str(pred)
    rst.append(row)

  return getattr(obj, method)(*args, **kwds)
  y = column_or_1d(y, warn=True)


In [None]:
rst

In [259]:
# submission

output_file = 'submission.txt'

with open(output_file, 'w') as fp:
    for _test_pred in rst:
        fp.write(_test_pred)
        fp.write('\n')

In [264]:
with open('/elice/data/grading.input.txt') as fp:
    sbm_in = fp.read()
    sbm_in = sbm_in.splitlines()


In [267]:
train_df['name'].unique()[:10]

array(['teller', 'winch', 'bay', 'admissions', 'mules', 'horizons',
       'gross', 'residues', 'null', 'flake'], dtype=object)

In [268]:
sbm_in[:10]

['teller',
 'winch',
 'bay',
 'admissions',
 'mules',
 'firearms',
 'horizons',
 'gross',
 'residues',
 'null']

In [179]:
for i in lst:
    

Unnamed: 0,date,name,open,close,high,low,vol
0,2016-01-05,teller,123.677,125.591,122.459,126.181,2163918


In [184]:
prices[-2]

array([ 122.641])

In [198]:
import matplotlib.pyplot as plt

In [187]:
svr_t.predict(20161228)

array([ 122.7410345])

In [31]:
train_df[:3]

Unnamed: 0,date,name,start,end,min,max,vol
0,2016-01-05,teller,123.677,125.591,122.459,126.181,2163918
1,2016-01-06,teller,125.134,120.089,120.067,125.768,2382872
2,2016-01-07,teller,116.413,114.925,114.67,119.469,2488061


In [3]:
# read train data
with open('./data/train_data.json') as fp:
    json_str = fp.read()
    json_data = json.loads(json_str)
    
# convert to dataframe
train_df = pd.DataFrame(json_data)

# train data preprocessing
train_df['rating_cat'] = train_df['rating'].apply(lambda x: 
                    'NEG' if 1<= x <=3 
                     else 
                      ('NEU' if 4<=x<=7 
                     else 'POS'))

# read test data in/out
with open('./data/test.input') as fp:
    test_in = fp.read()
    test_in = test_in.splitlines()

with open('./data/test.output') as fp:
    test_out = fp.read()
    test_out = test_out.splitlines()


In [None]:
train_df.head()

In [None]:
test_in.head()

###  Converting text to vector with Tfid 

In [4]:
# vectorize train using Tfid(bag of words)

twitter = Twitter()

def tokenize_pos(doc):
    return ['/'.join(t) for t in twitter.pos(doc, norm=True, stem=True)]

vectorizer =  TfidfVectorizer(tokenizer=tokenize_pos, ngram_range=(1,2), use_idf=False, smooth_idf=False)


y = train_df.rating_cat
X = vectorizer.fit_transform(train_df.review)

### Model fitting & validtaion check

In [8]:
# fitting train data with classifier

model = SGDClassifier(alpha=1.9e-6, n_iter=19).fit(X, y)



# predict test data
feature_list = vectorizer.get_feature_names()

test_vectorizer =  TfidfVectorizer(tokenizer=tokenize_pos, ngram_range=(1,2), vocabulary = feature_list)
X_test = test_vectorizer.fit_transform(test_in[:8400])


test_pred = model.predict(X_test)

# accuracy
print(accuracy_score(test_out[:8400], test_pred))

0.776547619048


In [10]:
# optimizing alpha 

alpha = np.arange(2.5e-6,3.5e-6,1e-7)

ite = 5 #  반복
score = np.zeros([len(alpha), ite])

for i, val in enumerate(alpha):
    for k in range(ite): 
        model = SGDClassifier(alpha=val).fit(X, y)
        test_pred = model.predict(X_test)

        score[i][k] = accuracy_score(test_out[:8400], test_pred)
        
print(score)
print(score.mean(axis=1))

[[ 0.77178571  0.77535714  0.775       0.7747619   0.77309524]
 [ 0.77535714  0.77488095  0.77428571  0.775       0.77452381]
 [ 0.77333333  0.77321429  0.77464286  0.7747619   0.77547619]
 [ 0.77607143  0.77202381  0.77404762  0.77345238  0.77357143]
 [ 0.77285714  0.77345238  0.77440476  0.77404762  0.77511905]
 [ 0.77607143  0.77452381  0.77369048  0.77464286  0.77392857]
 [ 0.77559524  0.77535714  0.77416667  0.77416667  0.77452381]
 [ 0.77333333  0.77369048  0.77488095  0.77238095  0.77559524]
 [ 0.77488095  0.77285714  0.77404762  0.77380952  0.77630952]
 [ 0.775       0.77380952  0.77607143  0.77404762  0.7752381 ]]
[ 0.774       0.77480952  0.77428571  0.77383333  0.77397619  0.77457143
  0.7747619   0.77397619  0.77438095  0.77483333]


In [11]:
# optimizing n-iter 

n = np.arange(15,25)


ite = 5 #  반복
score = np.zeros([len(n), ite])


for i, val in enumerate(n):
    for k in range(ite): 
        
        model = SGDClassifier(alpha=2.9e-6, n_iter=val).fit(X, y)
        test_pred = model.predict(X_test)

        score[i][k] = accuracy_score(test_out[:8400], test_pred)
        
print(score)
print(score.mean(axis=1))

[[ 0.77559524  0.7752381   0.77428571  0.77428571  0.77380952]
 [ 0.77583333  0.775       0.77452381  0.77464286  0.775     ]
 [ 0.77416667  0.77559524  0.7752381   0.77452381  0.77464286]
 [ 0.77440476  0.77571429  0.77309524  0.77404762  0.77559524]
 [ 0.77392857  0.77547619  0.77464286  0.77452381  0.77452381]
 [ 0.77440476  0.7747619   0.7747619   0.77452381  0.77392857]
 [ 0.7747619   0.7752381   0.7747619   0.77452381  0.77595238]
 [ 0.77571429  0.7752381   0.77464286  0.77404762  0.77488095]
 [ 0.77535714  0.77511905  0.77428571  0.77511905  0.77511905]
 [ 0.77547619  0.77428571  0.77511905  0.775       0.77392857]]
[ 0.77464286  0.775       0.77483333  0.77457143  0.77461905  0.77447619
  0.77504762  0.77490476  0.775       0.7747619 ]


### Final model

In [12]:
model = SGDClassifier(alpha=2.9e-6, n_iter=21).fit(X, y)

### Predict submission data

In [13]:
# submission

with open('./data/grading.input') as fp:
    sbm_in = fp.read()
    sbm_in = sbm_in.splitlines()

feature_list = vectorizer.get_feature_names()

test_vectorizer =  TfidfVectorizer(tokenizer=tokenize_pos, ngram_range=(1,2), vocabulary = feature_list)
sbm_test = test_vectorizer.fit_transform(sbm_in[:8400])

sbm_pred = model.predict(sbm_test)

output_file = 'submission.txt'


with open(output_file, 'w') as fp:
    for _test_pred in sbm_pred:
        fp.write(_test_pred)
        fp.write('\n')