In [1]:
import pandas as pd
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import train_test_split
from sklearn.metrics import f1_score
import matplotlib.pyplot as plt
import numpy as np
import lightgbm as lgb

In [2]:
ls data

000858.SZ.csv  603259.SS.csv


In [3]:
data = pd.read_csv(f'data/603259.SS.csv')
data.head()

Unnamed: 0,Date,Open,High,Low,Close,Adj Close,Volume
0,2018-05-08,11.020408,11.020408,11.020408,13.222789,13.007851,69946
1,2018-05-09,14.545068,14.545068,14.545068,14.545068,14.308635,48594
2,2018-05-10,15.999149,15.999149,15.999149,15.999149,15.739079,19992
3,2018-05-11,17.59779,17.59779,17.59779,17.59779,17.311735,129818
4,2018-05-14,19.357992,19.357992,19.357992,19.357992,19.043324,173372


In [4]:
data.shape

(939, 7)

In [5]:
data["labels"] = ((data['Adj Close'].shift(-1) > data['Open'].shift(-1)) & (data["Open"].shift(-2) >= data["Adj Close"].shift(-1))).astype(int)

In [6]:
data["labels"].value_counts()

0    661
1    278
Name: labels, dtype: int64

In [7]:
data["target"] =  (data['Open'].shift(-2) - data['Open'].shift(-1)).astype(float)

In [8]:
data.tail()

Unnamed: 0,Date,Open,High,Low,Close,Adj Close,Volume,labels,target
934,2022-03-14,97.699997,100.5,95.800003,96.629997,96.153839,23854095,0,2.019997
935,2022-03-15,93.0,98.160004,91.010002,93.169998,92.710892,31989210,1,10.970001
936,2022-03-16,95.019997,102.400002,94.110001,101.57,101.069496,40315476,0,0.389999
937,2022-03-17,105.989998,109.199997,104.0,106.910004,106.383194,45039883,0,
938,2022-03-18,106.379997,106.379997,103.309998,103.620003,103.109398,29234823,0,


In [9]:
data.drop([937, 938], axis=0, inplace=True)

In [10]:
data.query("labels==1").sort_values(by="target")

Unnamed: 0,Date,Open,High,Low,Close,Adj Close,Volume,labels,target
406,2020-01-06,52.416664,52.619045,51.154762,51.547619,51.034634,33691269,1,0.244049
515,2020-06-19,71.458336,75.099998,71.191666,74.316666,73.797806,29203285,1,0.308334
322,2019-08-30,49.404762,52.440475,49.404762,51.607143,51.093563,17024327,1,0.333332
142,2018-12-03,34.863945,35.918365,34.778912,35.799320,35.217400,15667344,1,0.335884
243,2019-05-10,32.312923,33.375851,31.972790,33.282310,32.741299,14339812,1,0.403912
...,...,...,...,...,...,...,...,...,...
719,2021-04-22,118.991669,119.166664,116.758331,117.158333,116.340363,16743538,1,10.841667
935,2022-03-15,93.000000,98.160004,91.010002,93.169998,92.710892,31989210,1,10.970001
783,2021-07-27,143.899994,152.000000,143.130005,144.600006,143.887466,33480115,1,12.119995
660,2021-01-21,130.750000,134.241669,130.000000,133.333328,132.402420,20579223,1,12.416672


In [11]:
data.set_index("Date", inplace=True)
data.drop("target", axis=1, inplace=True)
data.head()

Unnamed: 0_level_0,Open,High,Low,Close,Adj Close,Volume,labels
Date,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1
2018-05-08,11.020408,11.020408,11.020408,13.222789,13.007851,69946,0
2018-05-09,14.545068,14.545068,14.545068,14.545068,14.308635,48594,0
2018-05-10,15.999149,15.999149,15.999149,15.999149,15.739079,19992,0
2018-05-11,17.59779,17.59779,17.59779,17.59779,17.311735,129818,0
2018-05-14,19.357992,19.357992,19.357992,19.357992,19.043324,173372,0


In [None]:
# def focal_loss(y_true, y_pred):
#     gamma = 2.0
#     alpha = 0.25
#     eps = 1e-8
    
#     y_pred = 1.0 / (1.0 + np.exp(-y_pred))
#     y_pred = np.clip(y_pred, eps, 1.0 - eps)
    
#     pt = np.where(y_true == 1, y_pred, 1 - y_pred)
#     loss = -alpha * np.power(1 - pt, gamma) * np.log(pt)
    
#     return "focal_loss", np.mean(loss), False

# def focal_loss_gradient(y_true, y_pred):
#     gamma = 2.0
#     alpha = 0.25
#     eps = 1e-8
    
#     y_pred = 1.0 / (1.0 + np.exp(-y_pred))
#     y_pred = np.clip(y_pred, eps, 1.0 - eps)
    
#     pt = np.where(y_true == 1, y_pred, 1 - y_pred)
    # grad = (pt - y_true) * alpha * gamma * np.power(1 - pt, gamma - 1) * (np.log(pt) - np.log(1 - pt))
    # hess = pt * (1 - pt) * alpha * gamma * np.power(1 - pt, gamma - 1) * (gamma - 2 * pt * gamma + pt + np.log(pt) - np.log(1 - pt))
    
    # return grad, hess

def model_train(train_x, train_y, val_x, val_y):
    params = {
        'boosting_type': 'gbdt',
        'objective': 'binary',
        'metric': 'f1_score',
        'num_leaves': 31,
        'learning_rate': 0.05,
        'feature_fraction': 0.9,
        'verbosity': -1,
        'is_unbalance': True
    }
    
    weight = 1 / train_y.value_counts()
    train_weight = train_y.map(weight)
    # print(train_weight)
    # train_x, train_y = SMOTE().fit_resample(train_x, train_y)
    
    train = lgb.Dataset(train_x, label=train_y, weight=train_weight)
    val = lgb.Dataset(val_x, label=val_y)
    model = lgb.train(params, train, num_boost_round=100, valid_sets=[val])
    
    pred_df = pd.DataFrame(model.predict(val_x), columns=["pred"])
    pred_df["binary_pred"] = pred_df["pred"].apply(lambda x: 1 if x>0.5 else 0)
    f1 = f1_score(val_y, pred_df["binary_pred"])
    return model, f1

In [None]:
selected_feas = data.drop("labels", axis=1).columns.tolist()

In [12]:
Exp = True
if Exp:
    exp_times = 20
    cv_score_list = []
    oof_list = []
    for exp in range(exp_times):
        print("\n--------- exp {} start --------".format(exp+1))
        start_time = time.time()
        split = StratifiedKFold(n_splits=5, shuffle=True, random_state=exp)
        oof = np.zeros(len(train_data))
        for i, (train_index, val_index) in enumerate(split.split(data, data["labels"])):
            print(f"\nfold {i}: ")
            train = data.iloc[train_index]
            val = data.iloc[val_index] 
            train_x = train.drop("labels", axis=1)
            train_y = train["labels"]
            val_x = val.drop("labels", axis=1)
            val_y = val["labels"]
            
            model, f1 = model_train(train_x[selected_feas], train_y, val_x[selected_feas], val_y)
            # joblib.dump(model, f'./lgb_models/sub7/{exp}_{i}.pkl')
            # model = joblib.load(f'./lgb_models/sub7/{exp}_{i}.pkl')
            oof[val_index] = model.predict(val_x[selected_feas])

        end_time = time.time()
        print("\ncost time(min): {}".format(round(end_time-start_time)/60))
        oof_df = pd.DataFrame(oof, columns=["pred"])
        oof_df["binary_pred"] = oof_df["pred"].apply(lambda x: 1 if x>0.5 else 0)
        cv_score = f1_score(data["labels"], oof_df["binary_pred"])
        oof_list.append(oof)
        print(f"cv score: {cv_score}")

        cv_score_list.append(cv_score)
        print("------------- exp {} end ----------".format(exp+1))

    print(f"\ncv score ave: {np.mean(cv_score_list)}, std: {np.std(cv_score_list)}")
    oof_ensem_df = pd.DataFrame(sum(oof_list)/exp_times, columns=["pred"], index=train_data.index)
    oof_ensem_df["binary_pred"] = oof_ensem_df["pred"].apply(lambda x: 1 if x>0.5 else 0)
    ensemble_oof_score = f1_score(data["labels"], oof_ensem_df["binary_pred"])
    print(f"ensemble oof score: {ensemble_oof_score}")

KeyError: "['target'] not found in axis"