In [2]:
from newstock.news import Newstock

In [4]:
samsung_x, samsung_y = Newstock(name='삼성전자').get_data_by_name()

In [9]:
import pandas as pd
import datetime
from datetime import datetime as dt
from dateutil.relativedelta import *

class TimeBasedCV(object):
    '''
    Parameters 
    ----------
    train_period: int
        number of time units to include in each train set
        default is 30
    test_period: int
        number of time units to include in each test set
        default is 7
    freq: string
        frequency of input parameters. possible values are: days, months, years, weeks, hours, minutes, seconds
        possible values designed to be used by dateutil.relativedelta class
        deafault is days
    '''
    
    
    def __init__(self, train_period=30, test_period=7, freq='days'):
        self.train_period = train_period
        self.test_period = test_period
        self.freq = freq

        
        
    def split(self, data, validation_split_date=None, date_column='record_date', gap=0):
        '''
        Generate indices to split data into training and test set
        
        Parameters 
        ----------
        data: pandas DataFrame
            your data, contain one column for the record date 
        validation_split_date: datetime.date()
            first date to perform the splitting on.
            if not provided will set to be the minimum date in the data after the first training set
        date_column: string, deafult='record_date'
            date of each record
        gap: int, default=0
            for cases the test set does not come right after the train set,
            *gap* days are left between train and test sets
        
        Returns 
        -------
        train_index ,test_index: 
            list of tuples (train index, test index) similar to sklearn model selection
        '''
        
        # check that date_column exist in the data:
        try:
            data[date_column]
        except:
            raise KeyError(date_column)
                    
        train_indices_list = []
        test_indices_list = []

        if validation_split_date==None:
            validation_split_date = data[date_column].min().date() + eval('relativedelta('+self.freq+'=self.train_period)')
        
        start_train = validation_split_date - eval('relativedelta('+self.freq+'=self.train_period)')
        end_train = start_train + eval('relativedelta('+self.freq+'=self.train_period)')
        start_test = end_train + eval('relativedelta('+self.freq+'=gap)')
        end_test = start_test + eval('relativedelta('+self.freq+'=self.test_period)')

        while end_test <= data[date_column].max().date() + eval('relativedelta('+self.freq+'=1)'):
            # train indices:
            cur_train_indices = list(data[(data[date_column].dt.date>=start_train) & 
                                     (data[date_column].dt.date<end_train)].index)

            # test indices:
            cur_test_indices = list(data[(data[date_column].dt.date>=start_test) &
                                    (data[date_column].dt.date<end_test)].index)
            
            print("Train period:",start_train,"<= day <" , end_train, ", Test period", start_test, "<= day <", end_test,
                  "# train records", len(cur_train_indices), ", # test records", len(cur_test_indices))

            train_indices_list.append(cur_train_indices)
            test_indices_list.append(cur_test_indices)

            # update dates:
            start_train = start_train + eval('relativedelta('+self.freq+'=self.test_period)')
            end_train = start_train + eval('relativedelta('+self.freq+'=self.train_period)')
            start_test = end_train + eval('relativedelta('+self.freq+'=gap)')
            end_test = start_test + eval('relativedelta('+self.freq+'=self.test_period)')

        # mimic sklearn output  
        index_output = [(train,test) for train,test in zip(train_indices_list,test_indices_list)]

        self.n_splits = len(index_output)
        
        return index_output
    
    
    def get_n_splits(self):
        """Returns the number of splitting iterations in the cross-validator
        Returns
        -------
        n_splits : int
            Returns the number of splitting iterations in the cross-validator.
        """
        return self.n_splits 

In [5]:
from sklearn.base import BaseEstimator, TransformerMixin

class TextSelector(BaseEstimator, TransformerMixin):
    """
    Transformer to select a single column from the data frame to perform additional transformations on
    Use on text columns in the data
    """
    def __init__(self, text='text'):
        self.text = text

    def fit(self, X, y=None):
        return self

    def transform(self, X, y=None):
        return X[self.text]

In [11]:
from sklearn.base import BaseEstimator, TransformerMixin
import numpy as np

class Buy_Sell_transformer(BaseEstimator, TransformerMixin):
    """
    Transformer to select a single column from the data frame to perform additional transformations on
    Use on text columns in the data
    """
    def __init__(self, cut_percent=0.01):
        self.cut_percent = cut_percent

    def fit(self, y):
        return self

    def transform(self, y):
        y = pd.cut(y, bins=[-np.inf, self.cut_percent, np.inf], labels=['nothing','buy'])
        return y

In [12]:
from sklearn.naive_bayes import MultinomialNB
from sklearn.pipeline import Pipeline
from sklearn.feature_extraction.text import TfidfVectorizer

y_process = Pipeline([
    ('Buy_Sell_transformer', Buy_Sell_transformer()),
])

x_process = Pipeline([
    ('selector', TextSelector()),
    ('td_idf', TfidfVectorizer()),
])

pipeline = Pipeline([
                ('process', x_process),
                ('clf', MultinomialNB())
               ])

In [18]:
TfidfVectorizer(samsung_x['text'])

TfidfVectorizer(analyzer='word', binary=False, decode_error='strict',
                dtype=<class 'numpy.float64'>, encoding='utf-8',
                input=141      [머니투데이 반준환 기자] [[특징주]]삼성전자가 1% 안팎의 반등세를 보이고 있다...
142      81년을 이어온 삼성의 무(無)노조 경영이 깨지고 있다. 삼성전자(지난해 11월),...
143      [서울경제] 코스피가 2일 나흘 만에 상승으로 출발했다. 이날 오전 9시 3분 코스...
144      코스피 지수가 개인과 기관 매수세에 1990대 강세로 출발했다.2일 코스피 지수는 ...
145      신종 코로나바이러스 감염증(코로나19) 공포에 지난 주 8.1% 폭락한 코스피지수가...
                               ...                        
17437    [이데일리 김윤지...
17440    [서울경제] PC용 D램 현물가격이 최근 두달간 가파르게 하락하면서 ‘반도체 다운사...
17441    (서울=뉴스1) 주성호 기자 = 서울 강남역에서 300일 넘게 고공농성을 벌였던 삼...
Name: text, Length: 7793, dtype: object,
                lowercase=True, max_df=1.0, max_features=None, min_df=1,
                ngram_range=(1, 1), norm='l2', preprocessor=None,
                smooth_idf=True, stop_words=None, strip_accents=None,
                sublinear_tf=False, token_pattern='(?u)\\b\\w\\w+\\b',
                tokenizer=No

In [15]:
tscv = TimeBasedCV(train_period=40, test_period=49, freq='days')
cv_sets = tscv.split(samsung_x.reset_index(drop=True), validation_split_date=None, date_column='timestamps', gap=0)

Train period: 2020-03-02 <= day < 2020-04-11 , Test period 2020-04-11 <= day < 2020-05-30 # train records 4072 , # test records 3721


In [16]:
pipeline.get_params()

{'memory': None,
 'steps': [('process', Pipeline(memory=None,
            steps=[('selector', TextSelector(text='text')),
                   ('td_idf',
                    TfidfVectorizer(analyzer='word', binary=False,
                                    decode_error='strict',
                                    dtype=<class 'numpy.float64'>,
                                    encoding='utf-8', input='content',
                                    lowercase=True, max_df=1.0, max_features=None,
                                    min_df=1, ngram_range=(1, 1), norm='l2',
                                    preprocessor=None, smooth_idf=True,
                                    stop_words=None, strip_accents=None,
                                    sublinear_tf=False,
                                    token_pattern='(?u)\\b\\w\\w+\\b',
                                    tokenizer=None, use_idf=True,
                                    vocabulary=None))],
            verbose=False)),
 

In [None]:
from sklearn.model_selection import GridSearchCV

parameters = [
    {
        'process__vectorizer__vectorizer': [TfidfVectorizer(), CountVectorizer()]
    }
]

gscv = GridSearchCV(pipeline, parameters, cv=cv_sets, scoring=scoring, refit='my_precision', n_jobs=4, return_train_score=False, verbose=3)
gscv.fit(samsung_x.reset_index(drop=True), y_process.transform(samsung_y).reset_index(drop=True))