In [1]:
from tqdm import tqdm
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
import numpy as np
from pathlib import Path
import jpx_tokyo_market_prediction

from lightgbm import LGBMRegressor
from sklearn.model_selection import train_test_split

In [2]:
# Load the train, supplemental and example-test `stock_prices.csv` files.
# `parse_dates=True` only asks pandas to try parsing the *index*, which leaves the
# `Date` column as plain strings; naming the column makes the parsing really happen.
INPUT_DIR = '../input/jpx-tokyo-stock-exchange-prediction'

train_df = pd.read_csv(f'{INPUT_DIR}/train_files/stock_prices.csv', parse_dates=['Date'])
valid_df = pd.read_csv(f'{INPUT_DIR}/supplemental_files/stock_prices.csv', parse_dates=['Date'])
test_df = pd.read_csv(f'{INPUT_DIR}/example_test_files/stock_prices.csv', parse_dates=['Date'])

In [3]:
# Quick sanity check: (rows, columns) of both frames ...
print(train_df.shape)
print(valid_df.shape)

# ... followed by the number of missing values per column.
train_null_counts = train_df.isnull().sum()
valid_null_counts = valid_df.isnull().sum()

print("train_df Null Check : {}".format(train_null_counts))
print('----------------------------------------------------------')
print("valid_df Null Check : {}".format(valid_null_counts))

In [4]:
# From here on only the supplemental frame is used, under the name `prices`.
prices = valid_df

# Drop the original names so they are no longer reachable from the notebook
# namespace (`prices` still references the supplemental frame).
del train_df
del valid_df

In [5]:
# ExpectedDividend is not used as a feature, so remove the column.
prices = prices.drop(columns=["ExpectedDividend"])

## Helper functions

In [6]:
# Mean Target per security, kept as a one-column frame ("Target")
# indexed by SecuritiesCode.
average = prices.groupby("SecuritiesCode")["Target"].mean().to_frame()


def get_avg(_id_):
    """Look up the mean Target of one security.

    Parameters
    ----------
    _id_ : label present in the index of ``average`` (a SecuritiesCode).

    Returns
    -------
    The matching row of ``average``, i.e. a Series whose only entry is
    ``Target``.  A label missing from ``average`` raises ``KeyError``.
    """
    row = average.loc[_id_]
    return row



def getadvance(x):
    """Flag a strictly positive value.

    Returns 1 when ``x > 0`` and 0 otherwise; anything for which the
    comparison is false (zero, negatives, NaN) maps to 0.
    """
    return 1 if x > 0 else 0

def get_month(dt):
    """Return the month of ``dt`` as a zero-padded two-character string.

    ``dt`` must provide ``strftime`` (e.g. ``datetime`` or a pandas Timestamp).
    """
    return dt.strftime("%m")


def cat_col(data):
    """Cast the discrete feature columns of ``data`` to pandas ``category``.

    The columns are converted in place, one after another, and the same
    frame object is returned so the call can be used in an assignment.
    A missing column raises ``KeyError``.
    """
    categorical_columns = (
        'SecuritiesCode',
        'SupervisionFlag',
        'advance',
        'AdjustmentFactor',
        'Month',
    )
    for column in categorical_columns:
        data[column] = data[column].astype('category')
    return data



In [7]:
# Mean Target of each row's security.
prices["Avg"] = prices["SecuritiesCode"].apply(get_avg)

# `shift(-1)` pulls the Close of the following row up onto the current row.
# NOTE(review): the shift is not grouped by SecuritiesCode, so the "next row"
# may belong to a different security — confirm this is intended.
next_row_close = prices['Close'].shift(-1)
prices['pClose'] = next_row_close
prices['delta'] = prices['Close'] - next_row_close
prices['advance'] = [getadvance(change) for change in prices['delta']]

# Calendar features.
prices['Date'] = pd.to_datetime(prices['Date'], format="%Y-%m-%d")
prices['Month'] = [get_month(day) for day in prices['Date']]

# Order by date; `reset_index()` (without drop) keeps the old index as an
# extra `index` column and gives the frame a fresh 0..n-1 index.
prices = prices.sort_values(by="Date").reset_index()

# Exponential moving averages of Close over the whole date-sorted frame.
df_s = prices[['Date', 'Close']].reset_index(drop=True)
ema_spans = {'20D-EMA': 20, '50D-EMA': 50, '100D-EMA': 100}
for ema_name, span in ema_spans.items():
    df_s[ema_name] = df_s['Close'].ewm(span=span, adjust=False).mean()
prices = pd.concat([prices, df_s[list(ema_spans)]], axis=1)

# Discrete columns become pandas categoricals.
prices = cat_col(prices)

In [8]:
# Missing values per column after feature engineering.
prices.isna().sum()

In [9]:
# Display the column labels of the engineered frame.
prices.columns

In [10]:
# Model inputs, in the order the regressor is trained on.
features = [
    # raw price columns
    'Date', 'SecuritiesCode',
    'Open', 'High', 'Low', 'Close', 'Volume',
    'AdjustmentFactor', 'SupervisionFlag',
    # engineered columns
    'Avg', 'pClose', 'delta', 'advance', 'Month',
    '20D-EMA', '50D-EMA', '100D-EMA',
]

In [11]:
# Missing values per column, checked again right before building X / y.
prices.isna().sum()

In [12]:
# data segmentation
# Encode every date as the integer YYYYMMDD.  This needs `Date` to still be a
# datetime column, so the cell cannot simply be run a second time.
date_text = prices['Date'].dt.strftime("%Y%m%d")
prices['Date'] = date_text.astype(int)

# Feature matrix, target vector and the distinct security codes.
X = prices.loc[:, features]
y = prices.loc[:, 'Target']
codes = X['SecuritiesCode'].unique()

In [13]:
# prices.Date = pd.to_datetime(prices.Date)
# prices['Date'] = prices['Date'].dt.strftime("%Y%m%d").astype(int)
# X=prices[["Date","SecuritiesCode","Avg"]]
# y=prices[["Target"]]
# codes = X.SecuritiesCode.unique()

In [14]:
# Hyper-parameters passed to LGBMRegressor.
# NOTE(review): learning_rate=0.99 together with 8000 estimators is unusually
# aggressive — confirm these values are intended.
params = dict(
    num_leaves=500,
    learning_rate=0.99,
    n_estimators=8000,
)
# params = {'num_leaves' : 500,
#            'learning_rate' : 0.05,
#            'n_estimators': 100
#            }

In [15]:
# Build the (not yet fitted) LightGBM regressor from the settings in `params`.
model = LGBMRegressor(**params)

In [16]:
# Fit on the full feature matrix, then display the score on that same data.
# NOTE(review): this is an in-sample score (no hold-out set is used here), so it
# says nothing about generalisation — confirm whether a validation split was intended.
model.fit(X,y)
model.score(X,y)

In [17]:
# env = jpx_tokyo_market_prediction.make_env()
# iter_test = env.iter_test()

# for (prices, options, financials, trades, secondary_prices, sample_prediction) in iter_test:
#     ds=[prices, options, financials, trades, secondary_prices, sample_prediction]
#     sample_prediction["Avg"] = sample_prediction["SecuritiesCode"].apply(get_avg)
#     df = sample_prediction[["Date","SecuritiesCode","Avg"]]
#     df.Date = pd.to_datetime(df.Date)
#     df['Date'] = df['Date'].dt.strftime("%Y%m%d").astype(int)
    
#     sample_prediction["Prediction"] = model.predict(df)
#     sample_prediction = sample_prediction.sort_values(by = "Prediction", ascending=False)
#     sample_prediction.Rank = np.arange(0,2000)
#     sample_prediction = sample_prediction.sort_values(by = "SecuritiesCode", ascending=True)
#     sample_prediction.drop(["Prediction"],axis=1)
#     submission = sample_prediction[["Date","SecuritiesCode","Rank"]]
#     print('-------------------------------sample_prediction------------------------------')
#     print(submission)
#     print('------------------------------------------------------------------------------')
#     env.predict(submission)

In [18]:
# Submission loop: the competition API hands over one batch at a time; each
# batch gets the same engineered columns as the training frame, is scored by
# `model`, and the securities are ranked by predicted value.
env = jpx_tokyo_market_prediction.make_env()
iter_test = env.iter_test()

for (prices, options, financials, trades, secondary_prices, sample_prediction) in iter_test:
    # Per-security mean Target, looked up from the price rows themselves so the
    # values are tied to the right row.  Codes that were not seen when `average`
    # was built become NaN instead of aborting the submission with a KeyError.
    prices["Avg"] = prices["SecuritiesCode"].map(average["Target"])

    # Same row-shift based features as in the training cell.
    prices['pClose'] = prices['Close'].shift(-1)
    prices['delta'] = prices['Close'] - prices['pClose']
    prices['advance'] = list(map(getadvance, prices['delta']))
    prices['Date'] = pd.to_datetime(prices['Date'], format="%Y-%m-%d")
    prices['Month'] = list(map(get_month, prices['Date']))

    # Exponential moving averages of Close, written straight onto `prices` so the
    # result cannot be misaligned if the batch arrives with a non-default index.
    for ema_name, span in (('20D-EMA', 20), ('50D-EMA', 50), ('100D-EMA', 100)):
        prices[ema_name] = prices['Close'].ewm(span=span, adjust=False).mean()

    prices = cat_col(prices)
    prices['Date'] = prices['Date'].dt.strftime("%Y%m%d").astype(int)
    print('-------------------------------prices------------------------------')
    print(prices)
    print('------------------------------------------------------------------------------')

    # Feed exactly the training columns, in training order, rather than relying
    # on whatever columns happen to be left after dropping a fixed list.
    sample_prediction["Prediction"] = model.predict(prices[features])
    print('-------sample_prediction--------')
    print(sample_prediction)

    # Rank 0 goes to the highest prediction.  The rank range follows the actual
    # batch size instead of assuming exactly 2000 securities.
    sample_prediction = sample_prediction.sort_values(by="Prediction", ascending=False)
    sample_prediction["Rank"] = np.arange(len(sample_prediction))
    sample_prediction = sample_prediction.sort_values(by="SecuritiesCode", ascending=True)

    # Only these three columns are submitted, so the helper `Prediction`
    # column is left behind here.
    submission = sample_prediction[["Date", "SecuritiesCode", "Rank"]]
    print('-------------------------------submission------------------------------')
    print(submission)
    print('------------------------------------------------------------------------------')
    env.predict(submission)