In [None]:
class Dataset_Custom_Stock(Dataset):
    def __init__(self, args, root_path, flag='train', size=None,
                 features='S', data_path='/home/liyuante/llm4ts/us_stock.csv',
                 target='OT', scale=True, timeenc=0, freq='h', seasonal_patterns=None):
        self.args = args
        if size == None:
            self.seq_len = 24 * 4 * 4
            self.label_len = 24 * 4
            self.pred_len = 24 * 4
        else:
            self.seq_len = size[0]
            self.label_len = size[1]
            self.pred_len = size[2]
        assert flag in ['train', 'test', 'val']
        type_map = {'train': 0, 'val': 1, 'test': 2}
        self.set_type = type_map[flag]

        self.features = features
        self.target = target
        self.scale = scale
        self.timeenc = timeenc
        self.freq = freq

        self.root_path = root_path
        self.data_path = data_path
        self.__read_data__()

    def __read_data__(self):
        self.scaler = StandardScaler()
        df_raw = pd.read_csv(os.path.join(self.root_path, self.data_path))

        cols = list(df_raw.columns)
        cols.remove(self.target)
        cols.remove('date')
        cols.remove('PERMNO')
        df_raw = df_raw[['PERMNO', 'date'] + cols + [self.target]]

        # 分组处理每个股票的数据
        grouped = df_raw.groupby('PERMNO')
        data_list = []
        stamp_list = []

        for permno, group in grouped:
            group = group.sort_values('date')
            num_train = int(len(group) * 0.7)
            num_test = int(len(group) * 0.2)
            num_vali = len(group) - num_train - num_test
            border1s = [0, num_train - self.seq_len, len(group) - num_test - self.seq_len]
            border2s = [num_train, num_train + num_vali, len(group)]
            border1 = border1s[self.set_type]
            border2 = border2s[self.set_type]

            if self.features == 'M' or self.features == 'MS':
                cols_data = group.columns[2:]  # 去掉 PERMNO 和 date 列
                df_data = group[cols_data]
            elif self.features == 'S':
                df_data = group[[self.target]]

            if self.scale:
                train_data = df_data[border1s[0]:border2s[0]]
                self.scaler.fit(train_data.values)
                data = self.scaler.transform(df_data.values)
            else:
                data = df_data.values

            df_stamp = group[['date']][border1:border2]
            df_stamp['date'] = pd.to_datetime(df_stamp.date)
            if self.timeenc == 0:
                df_stamp['month'] = df_stamp.date.apply(lambda row: row.month, 1)
                df_stamp['day'] = df_stamp.date.apply(lambda row: row.day, 1)
                df_stamp['weekday'] = df_stamp.date.apply(lambda row: row.weekday(), 1)
                df_stamp['hour'] = df_stamp.date.apply(lambda row: row.hour, 1)
                data_stamp = df_stamp.drop(['date'], 1).values
            elif self.timeenc == 1:
                data_stamp = time_features(pd.to_datetime(df_stamp['date'].values), freq=self.freq)
                data_stamp = data_stamp.transpose(1, 0)

            data_list.append(data[border1:border2])
            stamp_list.append(data_stamp)

        self.data_x = np.concatenate(data_list, axis=0)
        self.data_y = self.data_x
        self.data_stamp = np.concatenate(stamp_list, axis=0)

        if self.set_type == 0 and self.args.augmentation_ratio > 0:
            self.data_x, self.data_y, augmentation_tags = run_augmentation_single(self.data_x, self.data_y, self.args)

    def __getitem__(self, index):
        s_begin = index
        s_end = s_begin + self.seq_len
        r_begin = s_end - self.label_len
        r_end = r_begin + self.label_len + self.pred_len

        seq_x = self.data_x[s_begin:s_end]
        seq_y = self.data_y[r_begin:r_end]
        seq_x_mark = self.data_stamp[s_begin:s_end]
        seq_y_mark = self.data_stamp[r_begin:r_end]

        return seq_x, seq_y, seq_x_mark, seq_y_mark

    def __len__(self):
        return len(self.data_x) - self.seq_len - self.pred_len + 1

    def inverse_transform(self, data):
        return self.scaler.inverse_transform(data)


In [10]:
import pandas as pd
import numpy as np
from sklearn.preprocessing import MinMaxScaler

df = pd.read_csv('/home/liyuante/llm4ts/us_stock.csv')
df['date'] = pd.to_datetime(df['date'])

print(df.head())

target = 'RETX'
cols = list(df.columns)
cols.remove(target)
cols.remove('date')
cols.remove('PERMNO')
cols.remove('SPREAD')
cols.remove('RET')
cols.remove('ALTPRCDT')
cols.remove('BID')
cols.remove('ASK')

df = df[['PERMNO', 'date'] + cols + [target]]

# 按照股票代码划分数据
grouped = df.groupby('PERMNO')

# 定义一个函数来处理每个股票组的数据
def process_group(group):
    # 将字符 'B', 'C' 转化为 NaN
    group = group.replace({'B': np.nan, 'C': np.nan})
    
    # 将 BIDLO, ASKHI, PRC, VOL, ALTPRC 转化为比值
    group[['BIDLO', 'ASKHI', 'PRC', 'VOL', 'ALTPRC']] = group[['BIDLO', 'ASKHI', 'PRC', 'VOL', 'ALTPRC']].apply(pd.to_numeric, errors='coerce')
    group[['BIDLO', 'ASKHI', 'PRC', 'VOL', 'ALTPRC']] = group[['BIDLO', 'ASKHI', 'PRC', 'VOL', 'ALTPRC']].div(group[['BIDLO', 'ASKHI', 'PRC', 'VOL', 'ALTPRC']].shift(1))
    
    return group

# 对每个组应用该函数
processed = grouped.apply(process_group)

# 重置索引
processed.reset_index(drop=True, inplace=True)

processed.dropna(inplace=True)

processed.to_csv('/home/liyuante/llm4ts/us_stock_processed.csv', index=False)



   PERMNO       date  BIDLO  ASKHI    PRC      VOL        RET    BID    ASK  \
0   10107 1986-02-28    NaN    NaN    NaN      NaN        NaN    NaN    NaN   
1   10107 1986-03-31  26.00  29.50  27.50  64786.0          C  27.25  27.50   
2   10107 1986-04-30  27.25  34.00  32.25  19056.0   0.172727  31.75  32.25   
3   10107 1986-05-30  31.00  34.75  34.75   9810.0   0.077519  34.75  35.00   
4   10107 1986-06-30  29.75  34.25  30.75  10238.0  -0.115108  30.50  30.75   

   ALTPRC  SPREAD    ALTPRCDT       RETX  
0   28.00     NaN  1986-03-13        NaN  
1   27.50     NaN  1986-03-31          C  
2   32.25     NaN  1986-04-30   0.172727  
3   34.75     NaN  1986-05-30   0.077519  
4   30.75     NaN  1986-06-30  -0.115108  


To preserve the previous behavior, use

	>>> .groupby(..., group_keys=False)


	>>> .groupby(..., group_keys=True)
  processed = grouped.apply(process_group)


In [11]:
import pandas as pd
import numpy as np
from sklearn.preprocessing import MinMaxScaler

df = pd.read_csv('/home/liyuante/llm4ts/us_stock_processed.csv')

df

Unnamed: 0,PERMNO,date,BIDLO,ASKHI,PRC,VOL,ALTPRC,RETX
0,10107,1986-04-30,1.048077,1.152542,1.172727,0.294138,1.172727,0.172727
1,10107,1986-05-30,1.137615,1.022059,1.077519,0.514798,1.077519,0.077519
2,10107,1986-06-30,0.959677,0.985612,0.884892,1.043629,0.884892,-0.115108
3,10107,1986-07-31,0.915966,0.912409,0.926829,1.568373,0.926829,-0.073171
4,10107,1986-08-29,1.009174,0.980000,1.000000,0.575388,1.000000,0.000000
...,...,...,...,...,...,...,...,...
12337,92655,2023-08-31,1.064389,1.000000,0.941169,0.585500,0.941169,-0.058831
12338,92655,2023-09-29,0.999287,0.998728,1.057934,1.143003,1.057934,0.057934
12339,92655,2023-10-31,1.069776,1.057067,1.062219,1.037789,1.062219,0.062219
12340,92655,2023-11-30,1.042063,1.025157,1.032508,0.896984,1.032508,0.032508


In [1]:
import pandas as pd
from sklearn.preprocessing import MinMaxScaler

df = pd.read_csv('/home/liyuante/llm4ts/us_stock.csv')
df['date'] = pd.to_datetime(df['date'])

# df.tail()

target = 'RET'
cols = list(df.columns)
cols.remove(target)
cols.remove('date')
cols.remove('PERMNO')
cols.remove('SPREAD')
cols.remove('RETX')
cols.remove('ALTPRCDT')
cols.remove('BID')
cols.remove('ASK')

df.dropna(inplace=True)
# print(df)
df = df[cols].apply(pd.to_numeric, errors='coerce')
df = df.pct_change()
df.dropna(inplace=True)
print(df)
scaler = MinMaxScaler()
df_normalized = pd.DataFrame(scaler.fit_transform(df), columns=cols)
print(df_normalized)
df.dropna(inplace=True)
df_raw = df_normalized[['PERMNO', 'date'] + cols + [target]]
df_raw.dropna(inplace=True)
grouped = df_raw.groupby('PERMNO')

for permno, group in grouped:
    print(group)
    break


          BIDLO     ASKHI       PRC         VOL    ALTPRC
489   -1.885714 -0.071429 -0.115385    0.759519 -0.115385
509    0.483871 -2.346154  0.405797   -0.779043  0.405797
530    1.956522 -3.742857  1.927835    0.418814  1.927835
551   -0.250000 -0.152778 -0.204225    0.635786 -0.204225
5835  -1.600000 -0.420334 -0.405310   -0.967240 -0.405310
11221 -3.303922  1.545278  1.142857  106.983051  1.142857
      BIDLO     ASKHI       PRC       VOL    ALTPRC
0  0.269598  0.694277  0.124264  0.015996  0.124264
1  0.720052  0.264120  0.347645  0.001743  0.347645
2  1.000000  0.000000  1.000000  0.012840  1.000000
3  0.580545  0.678893  0.086186  0.014850  0.086186
4  0.323912  0.628298  0.000000  0.000000  0.000000
5  0.000000  1.000000  0.663554  1.000000  0.663554


KeyError: "['PERMNO', 'date', 'RET'] not in index"

In [26]:
import pandas as pd

# 读取CSV文件
df = pd.read_csv('/home/liyuante/llm4ts/us_stock.csv')
df['date'] = pd.to_datetime(df['date'])

# 定义目标列和需要保留的列
target = 'RET'
cols = list(df.columns)
cols.remove(target)
cols.remove('date')
cols.remove('PERMNO')
cols.remove('SPREAD')
cols.remove('RETX')
cols.remove('ALTPRCDT')
cols.remove('BID')
cols.remove('ASK')

# 计算需要百分比变化的列
df_numeric = df[cols].apply(pd.to_numeric, errors='coerce')
df_pct_change = df_numeric.pct_change() / df_numeric.shift(1)

# 保留'PERMNO'和'date'列
df_final = pd.concat([df[['PERMNO', 'date']], df_pct_change], axis=1)

# 删除缺失值
df_final.dropna(inplace=True)

# 打印结果
print(df_final)

# # 保留原始数据中的'PERMNO', 'date'和需要的列
# df_raw = pd.concat([df[['PERMNO', 'date']], df[cols], df[[target]]], axis=1)
# df_raw.dropna(inplace=True)

# 按'PERMNO'分组
grouped = df_final.groupby('PERMNO')

# 打印第一个分组的数据
for permno, group in grouped:
    print(group)
    break


       PERMNO       date     BIDLO     ASKHI       PRC           VOL    ALTPRC
2       10107 1986-04-30  0.001849  0.005171  0.006281 -1.089529e-05  0.006281
3       10107 1986-05-30  0.005050  0.000649  0.002404 -2.546188e-05  0.002404
4       10107 1986-06-30 -0.001301 -0.000414 -0.003312  4.447396e-06 -0.003312
5       10107 1986-07-31 -0.002825 -0.002557 -0.002380  5.551599e-05 -0.002380
6       10107 1986-08-29  0.000337 -0.000640  0.000000 -2.644406e-05  0.000000
...       ...        ...       ...       ...       ...           ...       ...
12589   92655 2023-08-31  0.000144  0.000000 -0.000116 -4.620912e-07 -0.000116
12590   92655 2023-09-29 -0.000001 -0.000002  0.000122  2.722833e-07  0.000122
12591   92655 2023-10-31  0.000147  0.000112  0.000123  6.295008e-08  0.000123
12592   92655 2023-11-30  0.000083  0.000047  0.000061 -1.653581e-07  0.000061
12593   92655 2023-12-29 -0.000053 -0.000009 -0.000087  1.829156e-07 -0.000087

[12374 rows x 7 columns]
     PERMNO       date    

In [4]:
import pandas as pd

df = pd.read_csv('/home/liyuante/llm4ts/us_stock_all.csv')
df.head()
df.tail()

  df = pd.read_csv('/home/liyuante/llm4ts/us_stock_all.csv')


Unnamed: 0,PERMNO,date,TICKER,BIDLO,ASKHI,PRC,VOL,RET,BID,ASK,OPENPRC,NUMTRD,RETX
263616,92655,2023-12-22,UNH,518.02002,523.01001,520.31,1759571.0,0.000827,520.47998,520.48999,519.88,,0.000827
263617,92655,2023-12-26,UNH,517.96997,521.47998,520.03003,1390912.0,-0.000538,520.10999,520.12,519.88,,-0.000538
263618,92655,2023-12-27,UNH,519.35999,523.15997,522.78998,1851840.0,0.005307,522.95001,523.09003,519.75,,0.005307
263619,92655,2023-12-28,UNH,522.94,527.87,524.90002,2001208.0,0.004036,525.07001,525.21002,523.46997,,0.004036
263620,92655,2023-12-29,UNH,523.91998,528.23999,526.46997,2080197.0,0.002991,526.53998,526.94,525.97998,,0.002991


In [18]:
import pandas as pd

# 读取CSV文件
df = pd.read_csv('/home/liyuante/llm4ts/us_stock_all.csv')
df['date'] = pd.to_datetime(df['date'])

# 定义目标列和需要保留的列
target = 'RET'
cols = list(df.columns)
cols.remove(target)
cols.remove('date')
cols.remove('PERMNO')
cols.remove('TICKER')
cols.remove('NUMTRD')
cols.remove('RETX')
cols.remove('BID')
cols.remove('ASK')

# 计算需要百分比变化的列
df_numeric = df[cols].apply(pd.to_numeric, errors='coerce')
df_pct_change = df_numeric.pct_change() / df_numeric.shift(1)

# 保留'PERMNO'和'date'列
df_final = pd.concat([df[['TICKER', 'date']], df_pct_change, df[['RETX']]], axis=1)

# 删除缺失值
df_final.dropna(inplace=True)

# 打印结果
print(df_final)

# 按'PERMNO'分组
grouped = df_final.groupby('TICKER')

# 打印第一个分组的数据
for permno, group in grouped:
    print(group)
    break


  df = pd.read_csv('/home/liyuante/llm4ts/us_stock_all.csv')


       TICKER       date         BIDLO     ASKHI       PRC           VOL  \
1585     MSFT 1992-06-16 -4.444444e-04 -0.000247 -0.000523  3.165859e-08   
1586     MSFT 1992-06-17 -1.902497e-04 -0.000555 -0.000071  1.223073e-07   
1587     MSFT 1992-06-18 -9.780429e-05  0.000280 -0.000024 -1.015274e-07   
1588     MSFT 1992-06-19  3.471533e-04 -0.000089  0.000239 -1.221880e-07   
1589     MSFT 1992-06-22 -2.361805e-04  0.000045  0.000139  1.966328e-07   
...       ...        ...           ...       ...       ...           ...   
263616    UNH 2023-12-22  5.697367e-06  0.000008  0.000002 -1.179126e-07   
263617    UNH 2023-12-26 -1.865138e-07 -0.000006 -0.000001 -1.190724e-07   
263618    UNH 2023-12-27  5.180980e-06  0.000006  0.000010  2.382505e-07   
263619    UNH 2023-12-28  1.327233e-05  0.000017  0.000008  4.355627e-08   
263620    UNH 2023-12-29  3.583550e-06  0.000001  0.000006  1.972342e-08   

             OPENPRC       RETX  
1585    8.888889e-05  -0.039604  
1586   -5.262927e-0

In [16]:
import pandas as pd

df['date'] = pd.to_datetime(df_final['date'])
df = df[df['date'] >= '2000-01-01']

# 计算每个 Ticker 的时间跨度区间
time_span = df.groupby('TICKER')['date'].agg(['min', 'max']).reset_index()

# 计算每个 Ticker 的时间跨度区间
time_span['time_span'] = time_span['max'] - time_span['min']

# 显示结果
print(time_span)


   TICKER        min        max time_span
0    AAPL 2000-01-03 2023-12-29 8761 days
1    AMGN 2000-01-03 2023-12-29 8761 days
2    AMZN 2000-01-03 2023-12-29 8761 days
3     AXP 2000-01-03 2023-12-29 8761 days
4     BAC 2000-01-03 2023-12-29 8761 days
5     BEL 2000-01-03 2000-06-30  179 days
6     CAT 2000-01-03 2023-12-29 8761 days
7     CMB 2000-01-03 2000-12-29  361 days
8     CRM 2004-06-24 2023-12-29 7127 days
9    CSCO 2000-01-03 2023-12-29 8761 days
10   CYSP 2000-01-03 2000-04-05   93 days
11    DIS 2000-01-03 2023-12-29 8761 days
12    DOW 2000-01-03 2017-09-01 6451 days
13   FOTO 2000-01-03 2001-08-22  597 days
14   GSVI 2000-04-06 2001-03-14  342 days
15    ICG 2023-03-17 2023-12-29  287 days
16    JNJ 2000-01-03 2023-12-29 8761 days
17    JPM 2001-01-02 2023-12-29 8396 days
18    KOG 2006-06-21 2014-12-08 3092 days
19    MCD 2000-01-03 2023-12-29 8761 days
20    MMM 2000-01-03 2023-12-29 8761 days
21   MSFT 2000-01-03 2023-12-29 8761 days
22    NKE 2000-01-03 2023-12-29 87

In [23]:
import pandas as pd

# 假设您的数据已经加载到 DataFrame 中
df['date'] = pd.to_datetime(df['date'])

# 筛选出2000年之后的数据
df_filtered = df[df['date'] >= '2000-01-01']

# 计算每个 TICKER 的时间跨度区间
time_span = df_filtered.groupby('TICKER')['date'].agg(['min', 'max']).reset_index()

# 计算每个 TICKER 的时间跨度天数
time_span['time_span_days'] = (time_span['max'] - time_span['min']).dt.days

# 筛选出时间跨度为8761天的记录
time_span_filtered = time_span[time_span['time_span_days'] == 8761]

# 显示结果
print(time_span_filtered)


   TICKER        min        max  time_span_days
0    AAPL 2000-01-03 2023-12-29            8761
1    AMGN 2000-01-03 2023-12-29            8761
2    AMZN 2000-01-03 2023-12-29            8761
3     AXP 2000-01-03 2023-12-29            8761
4     BAC 2000-01-03 2023-12-29            8761
6     CAT 2000-01-03 2023-12-29            8761
9    CSCO 2000-01-03 2023-12-29            8761
11    DIS 2000-01-03 2023-12-29            8761
16    JNJ 2000-01-03 2023-12-29            8761
19    MCD 2000-01-03 2023-12-29            8761
20    MMM 2000-01-03 2023-12-29            8761
21   MSFT 2000-01-03 2023-12-29            8761
22    NKE 2000-01-03 2023-12-29            8761
23     PG 2000-01-03 2023-12-29            8761
24    UNH 2000-01-03 2023-12-29            8761
27    WMT 2000-01-03 2023-12-29            8761


In [19]:
import pandas as pd

# 假设您的数据已经加载到 DataFrame 中
df = df_final
df['date'] = pd.to_datetime(df['date'])

# 筛选出2000年之后的数据
df_filtered = df[df['date'] >= '2000-01-01']

# 计算每个 TICKER 的时间跨度区间
time_span = df_filtered.groupby('TICKER')['date'].agg(['min', 'max']).reset_index()

# 计算每个 TICKER 的时间跨度天数
time_span['time_span_days'] = (time_span['max'] - time_span['min']).dt.days

# 筛选出时间跨度为8761天的 TICKER
tickers_with_8761_days = time_span[time_span['time_span_days'] == 8761]['TICKER']

# 让原始 df 仅保留这些 TICKER 的记录
df_final = df_filtered[df_filtered['TICKER'].isin(tickers_with_8761_days)]

# 显示结果
print(df_final)


       TICKER       date         BIDLO     ASKHI       PRC           VOL  \
3492     MSFT 2000-01-03 -3.144872e-04  0.000063 -0.000014  4.867886e-07   
3493     MSFT 2000-01-04  1.992985e-05 -0.000107 -0.000290  8.186480e-10   
3494     MSFT 2000-01-05 -2.281735e-04 -0.000055  0.000094  6.830411e-09   
3495     MSFT 2000-01-06 -8.359184e-05 -0.000185 -0.000294 -4.403477e-09   
3496     MSFT 2000-01-07 -9.046294e-05 -0.000125  0.000119  4.360924e-09   
...       ...        ...           ...       ...       ...           ...   
263616    UNH 2023-12-22  5.697367e-06  0.000008  0.000002 -1.179126e-07   
263617    UNH 2023-12-26 -1.865138e-07 -0.000006 -0.000001 -1.190724e-07   
263618    UNH 2023-12-27  5.180980e-06  0.000006  0.000010  2.382505e-07   
263619    UNH 2023-12-28  1.327233e-05  0.000017  0.000008  4.355627e-08   
263620    UNH 2023-12-29  3.583550e-06  0.000001  0.000006  1.972342e-08   

             OPENPRC       RETX  
3492   -9.053871e-06  -0.001606  
3493   -2.767315e-0

In [21]:
import os

# 创建一个输出文件夹来存储 CSV 文件
output_folder = 'ticker_csv_files'
os.makedirs(output_folder, exist_ok=True)

# 对每个 TICKER 进行分组并保存到单独的 CSV 文件
for ticker, group in df_final.groupby('TICKER'):
    # 创建文件名，以 TICKER 名字命名
    filename = os.path.join(output_folder, f'{ticker}.csv')
    # 保存分组数据到 CSV 文件
    group.to_csv(filename, index=False)

print("所有文件已保存到 'ticker_csv_files' 文件夹中。")

所有文件已保存到 'ticker_csv_files' 文件夹中。


In [24]:
import pandas as pd

df = pd.read_csv('/home/liyuante/llm4ts/ticker_csv_files/AAPL.csv')
print(df)


     TICKER        date     BIDLO     ASKHI       PRC           VOL   OPENPRC  \
0      AAPL  2000-01-03  0.000221  0.000909  0.000863  1.518989e-06  0.000386   
1      AAPL  2000-01-04 -0.000048 -0.000148 -0.000753 -8.029787e-09  0.000307   
2      AAPL  2000-01-05  0.000177 -0.000005  0.000143  1.118394e-07 -0.000384   
3      AAPL  2000-01-06 -0.000754 -0.000291 -0.000832 -2.858331e-09  0.000221   
4      AAPL  2000-01-07  0.000055 -0.000524  0.000499 -5.719446e-08 -0.000855   
...     ...         ...       ...       ...       ...           ...       ...   
6029   AAPL  2023-12-22 -0.000014 -0.000043 -0.000028 -4.418144e-09 -0.000024   
6030   AAPL  2023-12-26 -0.000004 -0.000040 -0.000015 -6.058560e-09 -0.000041   
6031   AAPL  2023-12-27 -0.000047 -0.000010  0.000003  2.332135e-08 -0.000030   
6032   AAPL  2023-12-28  0.000057  0.000031  0.000012 -6.127177e-09  0.000045   
6033   AAPL  2023-12-29 -0.000039 -0.000007 -0.000028  7.425538e-09 -0.000006   

          RETX  
0     0.08

In [26]:
import pandas as pd

# Assuming your dataframe is named 'df' and the date column is named 'date'
df['date'] = pd.to_datetime(df['date'])

# Filter the dataframe for the year 2016
df_2016 = df[df['date'].dt.year == 2016]

# Find the index of the last day of 2016
last_day_index = df_2016[df_2016['date'] == df_2016['date'].max()].index[0]

print("Index of the last day of 2016:", last_day_index)


Index of the last day of 2016: 4273


In [27]:
import pandas as pd

# Assuming your dataframe is named 'df' and the date column is named 'date'
df['date'] = pd.to_datetime(df['date'])

# Filter the dataframe for the year 2020
df_2020 = df[df['date'].dt.year == 2020]

# Find the index of the last day of 2020
last_day_index = df_2020[df_2020['date'] == df_2020['date'].max()].index[0]

print("Index of the last day of 2020:", last_day_index)


Index of the last day of 2020: 5280


In [28]:
import pandas as pd

start_date = '2021-01-01'
end_date = '2023-12-31'

date_range = pd.date_range(start=start_date, end=end_date)
num_days = len(date_range)

print("Number of days from 2021-01-01 to 2023-12-31:", num_days)


Number of days from 2021-01-01 to 2023-12-31: 1095


In [33]:
import pandas as pd

start_date2 = '2000-01-01'
end_date2 = '2016-12-31'

df_filtered = df[(df['date'] >= start_date2) & (df['date'] <= end_date2)]

# 获取数据中独特的日期数
unique_dates_in_data = df_filtered['date'].nunique()

print("Number of unique days in data from 2000-01-01 to 2016-12-31:", unique_dates_in_data)


Number of unique days in data from 2000-01-01 to 2016-12-31: 4274


In [25]:
num_train = int(len(df) * 0.7)
print(num_train)
num_test = int(len(df) * 0.2)
print(num_test)

4223
1206


In [4]:
import numpy as np

file_path = '/home/liyuante/llm4ts/Time-Series-Library/results/long_term_forecast_usa_1_Transformer_MY_ftM_sl96_ll48_pl96_dm16_nh8_el2_dl1_df2048_expand2_dc4_fc3_ebtimeF_dtTrue_Exp_0/true.npy'

data = np.load(file_path)
print(data)


[[[-0.10933667 -0.24987295 -0.08952767 -0.06186125 -0.9672878 ]
  [-0.16738214  0.08650918 -0.13024166 -0.32311437  0.41691202]
  [-0.07888711 -0.33419135 -0.08027037 -0.10675785 -1.3011038 ]
  ...
  [-0.01974932 -0.13382104 -0.12877095 -0.04473356 -0.46438015]
  [-0.19979034 -0.03977328 -0.07261777 -0.26888725 -0.0924532 ]
  [ 0.21964464  0.17360942 -0.12788124  0.12973325  0.73971653]]

 [[-0.16738214  0.08650918 -0.13024166 -0.32311437  0.41691202]
  [-0.07888711 -0.33419135 -0.08027037 -0.10675785 -1.3011038 ]
  [ 0.02111994  0.29418325 -0.12822914  0.01859067  1.2291118 ]
  ...
  [-0.19979034 -0.03977328 -0.07261777 -0.26888725 -0.0924532 ]
  [ 0.21964464  0.17360942 -0.12788124  0.12973325  0.73971653]
  [-0.00147149 -0.16596603 -0.11425286  0.16251981 -0.59580874]]

 [[-0.07888711 -0.33419135 -0.08027037 -0.10675785 -1.3011038 ]
  [ 0.02111994  0.29418325 -0.12822914  0.01859067  1.2291118 ]
  [ 0.05367218  0.05116602 -0.11902136  0.25609985  0.27757004]
  ...
  [ 0.21964464  0.

In [3]:
import numpy as np

file_path = '/home/liyuante/llm4ts/Time-Series-Library/results/long_term_forecast_usa_1_Transformer_MY_ftM_sl96_ll48_pl96_dm16_nh8_el2_dl1_df2048_expand2_dc4_fc3_ebtimeF_dtTrue_Exp_0/pred.npy'

data = np.load(file_path)
print(data)


[[[-0.00778739]
  [ 0.00222146]
  [-0.00411569]
  ...
  [-0.00748175]
  [-0.01391876]
  [-0.00872239]]

 [[-0.03435422]
  [-0.00043218]
  [-0.00787693]
  ...
  [-0.01039387]
  [-0.01660348]
  [-0.01929486]]

 [[ 0.00880417]
  [-0.00367717]
  [-0.01195544]
  ...
  [-0.01331899]
  [-0.01964212]
  [-0.00225623]]

 ...

 [[-0.02791192]
  [ 0.00908184]
  [ 0.00504025]
  ...
  [-0.03542716]
  [-0.01896821]
  [-0.02391225]]

 [[-0.00118321]
  [ 0.00275208]
  [-0.00258481]
  ...
  [-0.01431011]
  [-0.02496526]
  [-0.02835831]]

 [[ 0.00956569]
  [-0.00433493]
  [-0.01119736]
  ...
  [-0.02091309]
  [-0.03120854]
  [-0.02647299]]]


In [3]:
from data_provider.data_loader import Dataset_ETT_hour, Dataset_ETT_minute, Dataset_Custom, Dataset_M4, PSMSegLoader, \
    MSLSegLoader, SMAPSegLoader, SMDSegLoader, SWATSegLoader, UEAloader, Dataset_Custom_Stock
from data_provider.uea import collate_fn
from torch.utils.data import DataLoader

data_dict = {
    'ETTh1': Dataset_ETT_hour,
    'ETTh2': Dataset_ETT_hour,
    'ETTm1': Dataset_ETT_minute,
    'ETTm2': Dataset_ETT_minute,
    'custom': Dataset_Custom,
    'm4': Dataset_M4,
    'PSM': PSMSegLoader,
    'MSL': MSLSegLoader,
    'SMAP': SMAPSegLoader,
    'SMD': SMDSegLoader,
    'SWAT': SWATSegLoader,
    'UEA': UEAloader,
    'MY': Dataset_Custom_Stock
}


def data_provider(args, flag):
    Data = data_dict[args.data]
    timeenc = 0 if args.embed != 'timeF' else 1

    shuffle_flag = False if flag == 'test' else True
    drop_last = False
    batch_size = args.batch_size
    freq = args.freq

    if args.task_name == 'anomaly_detection':
        drop_last = False
        data_set = Data(
            args = args,
            root_path=args.root_path,
            win_size=args.seq_len,
            flag=flag,
        )
        print(flag, len(data_set))
        data_loader = DataLoader(
            data_set,
            batch_size=batch_size,
            shuffle=shuffle_flag,
            num_workers=args.num_workers,
            drop_last=drop_last)
        return data_set, data_loader
    elif args.task_name == 'classification':
        drop_last = False
        data_set = Data(
            args = args,
            root_path=args.root_path,
            flag=flag,
        )

        data_loader = DataLoader(
            data_set,
            batch_size=batch_size,
            shuffle=shuffle_flag,
            num_workers=args.num_workers,
            drop_last=drop_last,
            collate_fn=lambda x: collate_fn(x, max_len=args.seq_len)
        )
        return data_set, data_loader
    else:
        if args.data == 'm4':
            drop_last = False
        data_set = Data(
            args = args,
            root_path=args.root_path,
            data_path=args.data_path,
            flag=flag,
            size=[args.seq_len, args.label_len, args.pred_len],
            features=args.features,
            target=args.target,
            timeenc=timeenc,
            freq=freq,
            seasonal_patterns=args.seasonal_patterns
        )
        print(flag, len(data_set))
        data_loader = DataLoader(
            data_set,
            batch_size=batch_size,
            shuffle=shuffle_flag,
            num_workers=args.num_workers,
            drop_last=drop_last)
        return data_set, data_loader


  from .autonotebook import tqdm as notebook_tqdm


NameError: name 'Dataset_ETT_hour' is not defined

In [2]:
import pandas as pd

# Load the pickle file
df = pd.read_pickle('/home/liyuante/llm4ts/Time-Series-Library/test_data.pkl')

# View the contents of the DataFrame
print(df)


ModuleNotFoundError: No module named 'data_provider'