In [142]:
import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)
from pathlib import Path


class GetData:
    '''
    ----------------------------------------------------------------------
      
    provided methods:
        get_ticker_list()
        get_price_data()
        
    ----------------------------------------------------------------------
    '''
    
    def __init__(self, path):
        self.path = path

    def get_ticker_list(self):
        path_price = self.path/Path('FS_sp500_Value.csv')
        temp_df = pd.read_csv(path_price)
        list_ticker = sorted(list(set(temp_df['Ticker'].to_list())))

        return list_ticker
    
    def get_price_data(self, price_type, ticker_list=[], ignore_index=False, only_recent=False, recent_len=1):
        '''
        option:
            price_type: choose:"High", "Low", "Open", "Close", "Adj Close"
            ticker_list: input a list of ticker of stocks
            ignore_index: True => ignore original index
            only_recent: True => get newest price
            recent_len: number：　the number of newest price
        '''
        self.ticker_list = ticker_list
        path_price = self.path/Path('FS_sp500_Value.csv')
        combined_price = pd.read_csv(path_price)
        df_price = pd.DataFrame({'Ticker':[] ,f'{price_type}': []})

        for symbol in self.ticker_list:
            temp_df = combined_price[combined_price.Ticker.str.fullmatch(symbol)].copy()
            if only_recent == True:
                df_price = pd.concat([df_price,temp_df.loc[temp_df.index[len(temp_df)-recent_len:], ['Ticker', f'{price_type}']]], ignore_index=ignore_index)
            else:
                df_price = pd.concat([df_price,temp_df.loc[:,['Ticker', f'{price_type}']]], ignore_index=ignore_index)
        return df_price
    
    @staticmethod
    def test_null(f_name):
        path = dataset_path/Path(f_name)
        df = pd.read_csv(path)
        print(f'Now check if  {f_name} has null:')
        for c in df.columns:
            print(f"{f_name}'s column——{c} :  {df.loc[:,c].isnull().all()}")

time: 3.22 ms (started: 2021-12-25 10:34:52 +00:00)


In [143]:
dataset_path = Path('./dataset')
get_data = GetData(dataset_path)
print(help(GetData))

Help on class GetData in module __main__:

class GetData(builtins.object)
 |  GetData(path)
 |  
 |  ----------------------------------------------------------------------
 |    
 |  provided methods:
 |      get_ticker_list()
 |      get_price_data()
 |      
 |  ----------------------------------------------------------------------
 |  
 |  Methods defined here:
 |  
 |  __init__(self, path)
 |      Initialize self.  See help(type(self)) for accurate signature.
 |  
 |  get_price_data(self, price_type, ticker_list=[], ignore_index=False, only_recent=False, recent_len=1)
 |      option:
 |          price_type: choose:"High", "Low", "Open", "Close", "Adj Close"
 |          ticker_list: input a list of ticker of stocks
 |          ignore_index: True => ignore original index
 |          only_recent: True => get newest price
 |          recent_len: number：　the number of newest price
 |  
 |  get_ticker_list(self)
 |  
 |  -------------------------------------------------------------------

In [137]:
# e.g.
t_list = getdata.get_ticker_list()
print(len(t_list), t_list)

505 ['A', 'AAL', 'AAP', 'AAPL', 'ABBV', 'ABC', 'ABMD', 'ABT', 'ACN', 'ADBE', 'ADI', 'ADM', 'ADP', 'ADSK', 'AEE', 'AEP', 'AES', 'AFL', 'AIG', 'AIZ', 'AJG', 'AKAM', 'ALB', 'ALGN', 'ALK', 'ALL', 'ALLE', 'ALXN', 'AMAT', 'AMCR', 'AMD', 'AME', 'AMGN', 'AMP', 'AMT', 'AMZN', 'ANET', 'ANSS', 'ANTM', 'AON', 'AOS', 'APA', 'APD', 'APH', 'APTV', 'ARE', 'ATO', 'ATVI', 'AVB', 'AVGO', 'AVY', 'AWK', 'AXP', 'AZO', 'BA', 'BAC', 'BAX', 'BBY', 'BDX', 'BEN', 'BF-B', 'BIIB', 'BIO', 'BK', 'BKNG', 'BKR', 'BLK', 'BLL', 'BMY', 'BR', 'BRK-B', 'BSX', 'BWA', 'BXP', 'C', 'CAG', 'CAH', 'CARR', 'CAT', 'CB', 'CBOE', 'CBRE', 'CCI', 'CCL', 'CDNS', 'CDW', 'CE', 'CERN', 'CF', 'CFG', 'CHD', 'CHRW', 'CHTR', 'CI', 'CINF', 'CL', 'CLX', 'CMA', 'CMCSA', 'CME', 'CMG', 'CMI', 'CMS', 'CNC', 'CNP', 'COF', 'COG', 'COO', 'COP', 'COST', 'CPB', 'CPRT', 'CRL', 'CRM', 'CSCO', 'CSX', 'CTAS', 'CTLT', 'CTSH', 'CTVA', 'CTXS', 'CVS', 'CVX', 'CZR', 'D', 'DAL', 'DD', 'DE', 'DFS', 'DG', 'DGX', 'DHI', 'DHR', 'DIS', 'DISCA', 'DISCK', 'DISH', 'DLR',

In [138]:
print(help(getdata.get_price_data))

# e.g.
test_get_price = getdata.get_price_data(price_type='High', ticker_list=['A','AAL', 'CCI'])
test_get_price_recent = getdata.get_price_data(price_type='High', ticker_list=['A','AAL', 'CCI'], only_recent=True, recent_len=10)
test_get_price_ignore_ind = getdata.get_price_data(price_type='High', ticker_list=['A','AAL', 'CCI'], ignore_index=True)

display(test_get_price, test_get_price_recent, test_get_price_ignore_ind)

Help on method get_price_data in module __main__:

get_price_data(price_type, ticker_list=[], ignore_index=False, only_recent=False, recent_len=1) method of __main__.GetData instance
    option:
        price_type: choose:"High", "Low", "Open", "Close", "Adj Close"
        ticker_list: input a list of ticker of stocks
        ignore_index: True => ignore original index
        only_recent: True => get newest price
        recent_len: number：　the number of newest price

None


Unnamed: 0,Ticker,High
0,A,22.725323
1,A,22.625179
2,A,22.331903
3,A,22.174536
4,A,22.045780
...,...,...
232600,CCI,199.000000
232601,CCI,198.070007
232602,CCI,200.470001
232603,CCI,198.919998


Unnamed: 0,Ticker,High
2872,A,140.320007
2873,A,138.220001
2874,A,136.910004
2875,A,138.410004
2876,A,138.580002
2877,A,138.919998
2878,A,140.369995
2879,A,143.350006
2880,A,143.600006
2881,A,143.850006


Unnamed: 0,Ticker,High
0,A,22.725323
1,A,22.625179
2,A,22.331903
3,A,22.174536
4,A,22.045780
...,...,...
8641,CCI,199.000000
8642,CCI,198.070007
8643,CCI,200.470001
8644,CCI,198.919998


time: 8.27 s (started: 2021-12-25 10:33:44 +00:00)


In [144]:
# test if NaN exist
get_data.test_null('FS_sp500_Value.csv')

Now check if  FS_sp500_Value.csv has null:
FS_sp500_Value.csv's col-Unnamed: 0 :  False
FS_sp500_Value.csv's col-Ticker :  False
FS_sp500_Value.csv's col-Date :  False
FS_sp500_Value.csv's col-High :  False
FS_sp500_Value.csv's col-Low :  False
FS_sp500_Value.csv's col-Open :  False
FS_sp500_Value.csv's col-Close :  False
FS_sp500_Value.csv's col-Volume :  False
FS_sp500_Value.csv's col-Adj Close :  False
time: 1.37 s (started: 2021-12-25 10:34:59 +00:00)


In [None]:
# energy sector
energy_list = ['APA', 'COG', 'COP', 'CVX', 'DVN', 'EOG', 'FANG', 'HAL', 'HES', 'KMI', 'MPC', 'MRO', 'NOV', 'OKE', 'OXY', 'PSX', 'PXD', 'SLB', 'VLO', 'WMB', 'XOM']