In [1]:
import numpy as np
import pandas as pd
import tensorflow as tf
import os, json, heapq, operator
import matplotlib.pyplot as plt
from datetime import datetime, timedelta
from bisect import bisect, bisect_left, bisect_right
from sklearn.metrics import precision_score, recall_score
from sklearn.preprocessing import PolynomialFeatures as PF
%matplotlib inline

In [2]:
base = os.path.dirname(os.getcwd())

price_path = os.path.join(base, 'prices')

In [20]:
class InputManager:
    def __init__(self, reach, shift, length):
        self.R = reach
        self.S = shift
        self.L = length
        self.file_list = {f: pd.read_csv(os.path.join(price_path, f)).set_index('<DTYYYYMMDD>') for f in os.listdir(price_path) if not f.startswith('WIG')}
        self.samples = []
        self.volume_samples = []
        self.wig_samples = []
        self.wig20_samples = []
        self.labels = []
        self.indicators = []
        self.encoding = 'R_' + str(self.R) + 'S_' + str(self.S)
        self.fatalities = 0
        
    def extract_data(self, start_date='19900101', end_date='20140101'):
        wig = pd.read_csv(os.path.join(price_path, 'WIG.csv')).set_index('<DTYYYYMMDD>')
        wig20 = pd.read_csv(os.path.join(price_path, 'WIG20.csv')).set_index('<DTYYYYMMDD>')
        wig_dates = list(map(str, wig.index))
            
        wig_start = bisect(wig_dates, start_date)
        wig_end = bisect(wig_dates, end_date)

        wig20_start = bisect(list(map(str, wig.index)), start_date)
        
        wig_levels = wig['<CLOSE>'].as_matrix()[wig_start:wig_end]
        wig20_levels = wig20['<CLOSE>'].as_matrix()[wig20_start:wig_end]
        N = len(wig_dates[wig_start:wig_end])
        M = len(wig_dates[wig20_start:wig_end])
        wig_dates_dict = dict(zip(wig_dates[wig_start:wig_end], list(range(N))))
        wig20_dates_dict = dict(zip(wig_dates[wig20_start:wig_end], list(range(M))))
        
        for name, file in self.file_list.items():
            print(name)
            dates = list(map(str, file.index))
            start_ind = bisect(dates, start_date)
            end_ind = bisect(dates, end_date)
            final = bisect(dates, (datetime.strptime(end_date, '%Y%m%d')+timedelta(days=self.R+3)).strftime('%Y%m%d'))
            if end_ind == final:
                end_ind -= self.R
            dates = dates[start_ind:final]
            if len(dates) < self.L:
                continue
            dates_ind_dict = dict(zip(dates, list(range(len(dates)))))
            if len(dates) == 0 or end_ind < 0:
                continue
            
            prices = file['<CLOSE>'].as_matrix()[start_ind:final]
            volumes = file['<VOL>'].as_matrix()[start_ind:final]
            
            
            
            self.comp_sample = []
            self.comp_labels = []
            end_date_dt = datetime.strptime(dates[end_ind-start_ind], '%Y%m%d')
            
            
            for run in range(self.S):
                
                things = prices[run:]
                other_things = volumes[run:]

                current_date = datetime.strptime(dates[run], '%Y%m%d')
                slices = []
                vol_slices = []
                wig_slices = []
                wig20_slices = []
                respective_dates = []
                while current_date < end_date_dt:
                    c = current_date.strftime('%Y%m%d')
                    while current_date < end_date_dt and c not in dates_ind_dict:
                        current_date += timedelta(days=1)
                        c = current_date.strftime('%Y%m%d')

                    if current_date == end_date_dt:
                        break
                    slices.append(prices[dates_ind_dict[c]])
                    vol_slices.append(volumes[dates_ind_dict[c]])
                    wig_slices.append(wig_levels[wig_dates_dict[c]])
                    wig20_slices.append(wig20_levels[wig20_dates_dict[c]])

                    respective_dates.append(dates[dates_ind_dict[c]])
                    current_date += timedelta(days=self.S)

                for nr in range(len(slices)-self.L-1):

                    last_date = respective_dates[nr+self.L+1]
                    check_date = datetime.strptime(last_date, '%Y%m%d')+timedelta(days=self.R)
                    while check_date < end_date_dt + timedelta(days=self.R+3) and check_date.strftime('%Y%m%d') not in dates_ind_dict:
                        check_date += timedelta(days=1)

                    if check_date.strftime('%Y%m%d') not in dates_ind_dict:
                        continue
                    self.samples.append(slices[nr:nr+self.L+1])
                    self.volume_samples.append(vol_slices[nr:nr+self.L+1])

                    self.wig_samples.append(wig_slices[nr:nr+self.L+1])
                    self.wig20_samples.append(wig20_slices[nr:nr+self.L+1])

                    result = ((prices[dates_ind_dict[check_date.strftime('%Y%m%d')]] - prices[dates_ind_dict[last_date]]) > 0)/prices[dates_ind_dict[last_date]]
                    if result == 1:
                        self.labels.append([0,1])
                    elif result == 0:
                        self.labels.append([1,0])

                        
#                 except:
#                     print('something bad happend at', name, ' while run number', run)
#                     continue
        
        self.samples = np.array(self.samples)
        self.volume_samples = np.array(self.volume_samples)
        
    def scale(self):
        if len(self.samples)==0:
            print('no samples to preprocess')
        sample_mean = np.mean(self.samples)
        sample_std = np.std(self.samples)
        self.samples = (self.samples-sample_mean)/sample_std
        

    def polynom(self, inplace=True, degree=2, what='both'):
        if len(self.samples)==0:
            print('no samples to preprocess')
            
        pf = PF(degree=degree, interaction_only=True, include_bias=False)
        if inplace:
            if what == 'prices':
                self.samples = pf.fit_transform(self.samples)
            elif what == 'volumes':
                self.volume_samples = pf.fit_transform(self.volume_samples)
            elif what == 'both':
                self.samples = pf.fit_transform(self.samples)
                self.volume_samples = pf.fit_transform(self.volume_samples)
            return
        return pf.fit_transform(self.samples)
        
    
    
    def rel(self, inplace=False):
        if len(self.samples) == 0:
            print('no samples to preprocess')
        
        D = (self.volume_samples[:, -1][None].T - self.volume_samples[:, :-1])/self.volume_samples[:, :-1]
        C = (self.samples[:, -1][None].T - self.samples[:, :-1])/self.samples[:, :-1]
        if inplace:
            self.samples = C
            self.volume_samples = D
            return
        return C, D
    
    def join_prices_volumes(self):
        return np.hstack([self.samples, self.volume_samples, self.wig_samples])
    
    def discretize(self):
        self.labels[self.labels > 0] = [0,1]
        self.labels[self.labels < 0] = [1,0]
        
        
    def get_data(self):
        return self.samples, self.labels
                
                    
        
                
            
        


In [22]:
news = os.path.join(base, 'news')

In [24]:
companies = os.listdir(news)

In [25]:
from bs4 import BeautifulSoup as BS

In [65]:

for comp in companies:
    print(comp)
    l = os.path.join(news, comp)
    a = []
    for file in os.listdir(l):
        if file == 'changes.txt':
            continue
        o = open(os.path.join(l,file), 'r').read()
        try:
            soup = BS(o, 'html.parser').title.string.lower()
            if 'termin' in soup:
                a.append( (file.split(' ')[0], soup) )
                
        except:
            print(file)
            continue
        
        
    a = sorted(a, key=operator.itemgetter(0))
    
    diffs = np.diff([int(x[0].split('-')[0]) for x in a])
    years = set([int(x[0].split('-')[0]) for x in a])
    if 2 in diffs:
        mi, ma = min(years), max(years)
        missed = []
        for y in range(len(years)):
            if mi + y not in years:
                missed.append(mi+y)
        print(missed)

FORTE
WIELTON
LENA LIGHTING SA
ROPCZYCE
COAL ENERGY
LENTEX
LABO PRINT
BIOTON SA
DOM DEVELOPMENT
PA NOVA
I2 DEVELOPMENT
ECHO INVESTMENT
[2003]
ACTION
ES-SYSTEM
PROCAD
ERG
POLSKA GRUPA ODLEWNICZA
B3SYSTEM
ESOTIQ & HENDERSON
MLP GROUP
DECORA
GROCLIN
INTEGER.PL
[2013]
SANWIL
COLIAN HOLDING
SARE SPOLKA AKCYJNA
WIRTUALNA POLSKA HOLDING
STAPORKOW
[2008]
ELKOP
MEGARON
MDI ENERGIA
AUTO PARTNER
AMICA
[2009]
CIECH S.A.
[2014]
ADIUVO INVESTMENTS
FAMUR
[2013]
GTC
ENEA
[2010]
PRAIRIE MINING LIMITED
ORION
PRIMA MODA
ARCUS
AB
NEUCA
MANGATA HOLDING SA
RESBUD
AGROTON PUBLIC LIMITED
ALCHEMIA
COMPERIA.PL SPOLKA AKCYJNA
PAK
NORTH COAST
BUDIMEX
CERAMIKA NOWA GALA
[2013]
MIRACULUM
DEKPOL
MW TRADE
TARCZYNSKI
NOVITA
WISTIL
[2007, 2009]
SUNEX
INVESTMENT FRIENDS
IZOSTAL SA
AGORA
[2008]
PKP CARGO
IQ PARTNERS
PROCHEM
MO-BRUK SA
ABM SOLID
EUROHOLD BULGARIA AD
MUZA
GRAAL
ZPUE
TIM
[2002]
COMP
[2007, 2009]
WASKO
ORANGE POLSKA
REDWOOD HOLDING
ALUMETAL
ORBIS
PCC INTERMODAL
SOPHARMA AD
MOL NYRT.
[2007]
ATENDE
GLOBAL COSM