In [1]:
import yfinance as yf
import pandas as pd
from datetime import datetime, timedelta
import pytz
from dateutil.relativedelta import relativedelta

# New list of potential dividend aristocrats
potential_dividend_aristocrats = ['ABT', 'ANF', 'ACN', 'ADBE', 'AES', 'AET', 'AFL', 'A', 'APD', 'AA', 'ATI', 'ALL', 'MO', 'AEE', 'AEP', 'AXP', 'AIG', 'AMT', 'AMP', 'AMGN', 'APH', 'ADI', 'AON', 'APA', 'AIV', 'AAPL', 'AMAT',
                                  'AIZ', 'ADSK', 'ADP', 'AVB', 'AVY', 'BAC', 'BK', 'BAX', 'BDX', 'BMS', 'BBY', 'BIG', 'BLK', 'HRB', 'BA', 'BWA', 'BXP', 'BMY', 'CHRW', 'CA', 'CPB', 'COF', 'CCL', 'CAT', 'CNP', 'CF', 'SCHW',
                                  'CHK', 'CB', 'CI', 'CLF', 'CLX', 'CME', 'CMS', 'CTSH', 'CMCSA', 'CMA', 'CAG', 'COP', 'CNX', 'ED', 'STZ', 'GLW', 'COST', 'CCI', 'CSX', 'CMI', 'CVS', 'DHI', 'DHR', 'DRI', 'DE', 'DELL', 'XRAY',
                                  'DVN', 'DFS', 'D', 'DOW', 'DTE', 'DD', 'DUK', 'DNB', 'EMN', 'ETN', 'EBAY', 'EIX', 'EA', 'EMC', 'ETR', 'EOG', 'EQT', 'EFX', 'EQR', 'EL', 'EXC', 'EXPE', 'EXPD', 'FDX', 'FIS', 'FHN', 'FE', 'FLS',
                                  'FLR', 'FMC', 'FTI', 'F', 'FCX', 'GME', 'GCI', 'GPS', 'GE', 'GIS', 'GNW', 'GILD', 'GS', 'GT', 'GOOG', 'GWW', 'HAL', 'HOG', 'HIG', 'HAS', 'HP', 'HES', 'HPQ', 'HD', 'HON', 'HST', 'HUM', 'HBAN',
                                  'IR', 'INTC', 'ICE', 'IBM', 'IFF', 'IGT', 'IP', 'IPG', 'INTU', 'IVZ', 'IRM', 'JBL', 'JCI', 'JPM', 'JNPR', 'K', 'KEY', 'KMB', 'KIM', 'KMI', 'KLAC', 'KSS', 'KR', 'LH', 'LRCX', 'LEG', 'LEN',
                                  'LLY', 'LNC', 'LMT', 'L', 'MTB', 'MRO', 'MPC', 'MAR', 'MMC', 'MAS', 'MA', 'MAT', 'MKC', 'MCK', 'MRK', 'MET', 'MCHP', 'MU', 'MSFT', 'TAP', 'MCO', 'MS', 'MOS', 'MSI', 'MUR', 'NBR', 'NDAQ', 'NOV',
                                  'NTAP', 'NWL', 'NEM', 'NWSA', 'NEE', 'NKE', 'NI', 'NE', 'JWN', 'NSC', 'NTRS', 'NOC', 'NRG', 'NUE', 'NVDA', 'OXY', 'OMC', 'OKE', 'ORCL', 'OI', 'PCAR', 'PH', 'PDCO', 'PAYX', 'BTU', 'PRGO', 'PFE',
                                  'PCG', 'PM', 'PSX', 'PNW', 'PBI', 'PNC', 'RL', 'PPG', 'PPL', 'PX', 'PFG', 'PGR', 'PLD', 'PRU', 'PEG', 'PSA', 'PHM', 'PWR', 'QCOM', 'DGX', 'RRC', 'RF', 'RSG', 'RHI', 'ROK', 'ROP', 'ROST', 'R',
                                  'CRM', 'SCG', 'SLB', 'SNI', 'STX', 'SEE', 'SHLD', 'SRE', 'SHW', 'SPG', 'SLM', 'SJM', 'SNA', 'SO', 'LUV', 'SBUX', 'HOT', 'STT', 'SYK', 'SUN', 'TROW', 'TEL', 'TER', 'TXN', 'TXT', 'HSY', 'TRV',
                                  'TMO', 'TWX', 'TJX', 'TRIP', 'TSN', 'USB', 'UNP', 'UNH', 'UPS', 'X', 'UNM', 'VFC', 'VLO', 'VTR', 'VRSN', 'VZ', 'V', 'VNO', 'VMC', 'DIS', 'WFC', 'WDC', 'WU', 'WY', 'WHR', 'WMB', 'WEC', 'WYNN',
                                  'XEL', 'XRX', 'XYL', 'YUM', 'ZION']

def get_potential_data(symbols, start_date, end_date):
    all_data = []

    for symbol in symbols:
        try:
            stock = yf.Ticker(symbol)
            hist_data = stock.history(start=start_date, end=end_date, interval="1wk")  # Weekly interval

            if hist_data.empty:
                print(f"No historical data for {symbol}.")
                continue

            # Calculate metrics for each row of hist_data
            for date, row in hist_data.iterrows():
                dividends = stock.dividends[:date]
                current_price = row['Close'] if not hist_data.empty else None
                dividend_sum = dividends.sum() if not dividends.empty else None
                dividend_yield = (dividend_sum / current_price) if current_price and dividend_sum else None

                # Handle stock.info fields with proper default values
                payout_ratio = stock.info.get('payoutRatio', None)
                eps = stock.info.get('epsTrailingTwelveMonths', None)
                pe_ratio = stock.info.get('trailingPE', None)
                roe = stock.info.get('returnOnEquity', None)
                free_cash_flow = stock.info.get('freeCashflow', None)
                gross_margin = stock.info.get('grossMargins', None)
                operating_margin = stock.info.get('operatingMargins', None)
                net_profit_margin = stock.info.get('profitMargins', None)
                price_to_book_ratio = stock.info.get('priceToBook', None)
                price_to_sales_ratio = stock.info.get('priceToSalesTrailing12Months', None)
                ev_ebitda = stock.info.get('enterpriseToEbitda', None)
                revenue_growth_rate = stock.info.get('revenueGrowth', None) * 100 if stock.info.get('revenueGrowth') else None
                earnings_growth_rate = stock.info.get('earningsQuarterlyGrowth', None) * 100 if stock.info.get('earningsQuarterlyGrowth') else None
                asset_turnover_ratio = stock.info.get('assetTurnover', None)
                inventory_turnover_ratio = stock.info.get('inventoryTurnover', None)
                price_to_cash_flow_ratio = stock.info.get('priceToCashflow', None)
                market_cap = stock.info.get('marketCap', None)
                interest_expense = stock.info.get('interestExpense', None)
                earnings_before_interest_and_taxes = stock.info.get('earningsBeforeInterestAndTaxes', None)
                interest_coverage_ratio = (earnings_before_interest_and_taxes / interest_expense) if interest_expense else None
                current_ratio = stock.info.get('currentRatio', None)
                quick_ratio = stock.info.get('quickRatio', None)
                total_assets = stock.info.get('totalAssets', None)
                employee_count = stock.info.get('fullTimeEmployees', None)
                price_volatility = hist_data['Close'].std() if not hist_data.empty else None
                avg_volume = hist_data['Volume'].mean() if not hist_data.empty else None

                all_data.append({
                    'symbol': symbol,
                    'date': date,
                    'close_price': current_price,
                    'dividend_yield': dividend_yield,
                    'payout_ratio': payout_ratio,
                    'eps': eps,
                    'pe_ratio': pe_ratio,
                    'roe': roe,
                    'free_cash_flow': free_cash_flow,
                    'gross_margin': gross_margin,
                    'operating_margin': operating_margin,
                    'net_profit_margin': net_profit_margin,
                    'price_to_book_ratio': price_to_book_ratio,
                    'price_to_sales_ratio': price_to_sales_ratio,
                    'ev_ebitda': ev_ebitda,
                    'revenue_growth_rate': revenue_growth_rate,
                    'earnings_growth_rate': earnings_growth_rate,
                    'asset_turnover_ratio': asset_turnover_ratio,
                    'inventory_turnover_ratio': inventory_turnover_ratio,
                    'price_to_cash_flow_ratio': price_to_cash_flow_ratio,
                    'market_cap': market_cap,
                    'interest_expense': interest_expense,
                    'interest_coverage_ratio': interest_coverage_ratio,
                    'current_ratio': current_ratio,
                    'quick_ratio': quick_ratio,
                    'total_assets': total_assets,
                    'employee_count': employee_count,
                    'price_volatility': price_volatility,
                    'avg_volume': avg_volume
                })

        except Exception as e:
            print(f"Error processing {symbol}: {e}")

    df = pd.DataFrame(all_data)
    return df

end_date = datetime(2023, 12, 31, tzinfo=pytz.utc)
start_date = end_date - relativedelta(years=20)

# Get the data
potential_data = get_potential_data(potential_dividend_aristocrats, start_date, end_date)
print(potential_data.head())

ERROR:yfinance:$SHLD: possibly delisted; no price data found  (1wk 2003-12-31 00:00:00+00:00 -> 2023-12-31 00:00:00+00:00) (Yahoo error = "Data doesn't exist for startDate = 1072828800, endDate = 1703980800")


No historical data for SHLD.
  symbol                      date  close_price  dividend_yield  payout_ratio  \
0    ABT 2003-12-29 00:00:00-05:00    12.045953             NaN        0.6688   
1    ABT 2004-01-05 00:00:00-05:00    11.577005             NaN        0.6688   
2    ABT 2004-01-12 00:00:00-05:00    11.144120        0.009869        0.6688   
3    ABT 2004-01-19 00:00:00-05:00    11.133348        0.009879        0.6688   
4    ABT 2004-01-26 00:00:00-05:00    11.161848        0.009853        0.6688   

    eps   pe_ratio      roe  free_cash_flow  gross_margin  ...  \
0  None  37.264984  0.14441    5.387750e+09       0.55475  ...   
1  None  37.264984  0.14441    5.387750e+09       0.55475  ...   
2  None  37.264984  0.14441    5.387750e+09       0.55475  ...   
3  None  37.264984  0.14441    5.387750e+09       0.55475  ...   
4  None  37.264984  0.14441    5.387750e+09       0.55475  ...   

   price_to_cash_flow_ratio    market_cap  interest_expense  \
0                      N

In [2]:
print(potential_data.info())

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 302860 entries, 0 to 302859
Data columns (total 29 columns):
 #   Column                    Non-Null Count   Dtype                           
---  ------                    --------------   -----                           
 0   symbol                    302860 non-null  object                          
 1   date                      302860 non-null  datetime64[ns, America/New_York]
 2   close_price               302860 non-null  float64                         
 3   dividend_yield            274741 non-null  float64                         
 4   payout_ratio              277575 non-null  float64                         
 5   eps                       0 non-null       object                          
 6   pe_ratio                  272883 non-null  float64                         
 7   roe                       286990 non-null  float64                         
 8   free_cash_flow            275851 non-null  float64                        

In [3]:
potential_df_cleaned = potential_data.dropna(axis=1, how='all')
potential_df_cleaned = potential_df_cleaned.drop(columns = ['total_assets'])
potential_df_cleaned = potential_df_cleaned.dropna(axis=0, how='any')
print(potential_df_cleaned.info())
print(potential_data.info())

<class 'pandas.core.frame.DataFrame'>
Index: 177918 entries, 2 to 300771
Data columns (total 22 columns):
 #   Column                Non-Null Count   Dtype                           
---  ------                --------------   -----                           
 0   symbol                177918 non-null  object                          
 1   date                  177918 non-null  datetime64[ns, America/New_York]
 2   close_price           177918 non-null  float64                         
 3   dividend_yield        177918 non-null  float64                         
 4   payout_ratio          177918 non-null  float64                         
 5   pe_ratio              177918 non-null  float64                         
 6   roe                   177918 non-null  float64                         
 7   free_cash_flow        177918 non-null  float64                         
 8   gross_margin          177918 non-null  float64                         
 9   operating_margin      177918 non-null  flo

In [4]:
potential_aristocrats_20_clean = potential_df_cleaned

In [6]:
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


In [7]:
colab_notebooks_path = '/content/drive/My Drive/Colab Notebooks/'
potential_file_path = colab_notebooks_path + 'potential_aristocrats_20_clean.csv'

In [8]:
potential_aristocrats_20_clean.to_csv(potential_file_path, index=False)