In [32]:
import yfinance as yf
import pandas as pd
import pandas_datareader.data as web
from datetime import datetime
from pandas_datareader.oecd import OECDReader
from yahooquery import Ticker

start_date = datetime(2020, 1, 1)
end_date = datetime(2023, 12, 31)

companies = ["IBM","AAPL","JPM"]

In [33]:
all_data = pd.DataFrame()

for company in companies:
    tick = yf.Ticker(company)
    data = tick.history(start=start_date, end=end_date)
    data['Company'] = company
    all_data = pd.concat([all_data, data])
    all_data = all_data.sort_values(by='Date', ascending=True)

In [34]:
all_data

Unnamed: 0_level_0,Open,High,Low,Close,Volume,Dividends,Stock Splits,Company
Date,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1
2020-01-02 00:00:00-05:00,103.987339,104.695998,103.810180,104.310860,3293436,0.0,0.0,IBM
2020-01-02 00:00:00-05:00,71.962075,73.021202,71.707013,72.960472,135480400,0.0,0.0,AAPL
2020-01-02 00:00:00-05:00,121.720893,122.861575,121.259402,122.852859,10803700,0.0,0.0,JPM
2020-01-03 00:00:00-05:00,102.885847,103.879509,102.878145,103.478966,2482890,0.0,0.0,IBM
2020-01-03 00:00:00-05:00,72.183120,73.016327,72.025225,72.251137,146322800,0.0,0.0,AAPL
...,...,...,...,...,...,...,...,...
2023-12-28 00:00:00-05:00,166.413947,167.701232,166.070009,167.347473,6320100,0.0,0.0,JPM
2023-12-28 00:00:00-05:00,193.629953,194.148591,192.662500,193.071426,34049900,0.0,0.0,AAPL
2023-12-29 00:00:00-05:00,160.675597,161.097516,159.772872,160.479355,2525600,0.0,0.0,IBM
2023-12-29 00:00:00-05:00,193.390581,193.889267,191.226284,192.024185,42628800,0.0,0.0,AAPL


In [35]:
print('Shape of balance sheet for companies')
for company in companies:
    ticker = Ticker(company)
    
    balance_sheet = ticker.balance_sheet(frequency='annual')
    balance_sheet['asOfDate'] = pd.to_datetime(balance_sheet['asOfDate'])
    balance_sheet = balance_sheet[(balance_sheet['asOfDate'] >= '2020-01-01') & (balance_sheet['asOfDate'] <= '2023-12-31')]
    balance_sheet = balance_sheet.sort_values(by='asOfDate')
    balance_sheet = balance_sheet.dropna(axis=1)
    
    print(company, balance_sheet.shape)
    print(balance_sheet.columns.tolist())
    
print('Shape of cashflow for companies')
for company in companies:
    ticker = Ticker(company)
    
    cashflow = ticker.cash_flow(frequency='annual')
    cashflow['asOfDate'] = pd.to_datetime(cashflow['asOfDate'])
    cashflow = cashflow[(cashflow['asOfDate'] >= '2020-01-01') & (cashflow['asOfDate'] <= '2023-12-31')]
    cashflow = cashflow.sort_values(by='asOfDate')
    cashflow = cashflow.dropna(axis=1)

    print(company, cashflow.shape)
    print(cashflow.columns.tolist())

print('Shape of income statement for companies')
for company in companies:
    ticker = Ticker(company)
    
    income_statement = ticker.income_statement(frequency='annual')
    income_statement['asOfDate'] = pd.to_datetime(income_statement['asOfDate'])
    income_statement = income_statement[(income_statement['asOfDate'] >= '2020-01-01') & (income_statement['asOfDate'] <= '2023-12-31')]
    income_statement = income_statement.sort_values(by='asOfDate')
    income_statement = income_statement.dropna(axis=1)
    
    print(company, income_statement.shape)
    print(income_statement.columns.tolist())

Shape of balance sheet for companies
IBM (4, 87)
['asOfDate', 'periodType', 'currencyCode', 'AccountsPayable', 'AccountsReceivable', 'AccumulatedDepreciation', 'AllowanceForDoubtfulAccountsReceivable', 'AssetsHeldForSaleCurrent', 'BuildingsAndImprovements', 'CapitalLeaseObligations', 'CapitalStock', 'CashAndCashEquivalents', 'CashCashEquivalentsAndShortTermInvestments', 'CommonStock', 'CommonStockEquity', 'CurrentAccruedExpenses', 'CurrentAssets', 'CurrentCapitalLeaseObligation', 'CurrentDebt', 'CurrentDebtAndCapitalLeaseObligation', 'CurrentDeferredAssets', 'CurrentDeferredLiabilities', 'CurrentDeferredRevenue', 'CurrentLiabilities', 'DefinedPensionBenefit', 'DerivativeProductLiabilities', 'EmployeeBenefits', 'FinishedGoods', 'GainsLossesNotAffectingRetainedEarnings', 'Goodwill', 'GoodwillAndOtherIntangibleAssets', 'GrossAccountsReceivable', 'GrossPPE', 'Inventory', 'InvestedCapital', 'LandAndImprovements', 'LongTermCapitalLeaseObligation', 'LongTermDebt', 'LongTermDebtAndCapitalLease

In [36]:
balance_sheet

Unnamed: 0_level_0,asOfDate,periodType,currencyCode,AccountsPayable,AccountsReceivable,AdditionalPaidInCapital,CapitalStock,CashAndCashEquivalents,CashFinancial,CommonStock,...,StockholdersEquity,TangibleBookValue,TotalAssets,TotalCapitalization,TotalDebt,TotalEquityGrossMinorityInterest,TotalLiabilitiesNetMinorityInterest,TradingSecurities,TreasurySharesNumber,TreasuryStock
symbol,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
JPM,2020-12-31,12M,USD,140291000000.0,90503000000.0,88394000000.0,34168000000.0,527609000000.0,24874000000.0,4105000000.0,...,279354000000.0,195863000000.0,3384757000000.0,561039000000.0,326893000000.0,279354000000.0,3105403000000.0,372481000000.0,1055499000.0,88184000000.0
JPM,2021-12-31,12M,USD,169172000000.0,102570000000.0,88415000000.0,38943000000.0,740834000000.0,26438000000.0,4105000000.0,...,294127000000.0,202598000000.0,3743567000000.0,595132000000.0,354599000000.0,294127000000.0,3449440000000.0,330865000000.0,1160785000.0,105415000000.0
JPM,2022-12-31,12M,USD,188692000000.0,125189000000.0,89044000000.0,31509000000.0,567234000000.0,27697000000.0,4105000000.0,...,292332000000.0,204069000000.0,3665743000000.0,588197000000.0,339892000000.0,292332000000.0,3373411000000.0,360112000000.0,1170676000.0,107336000000.0
JPM,2023-12-31,12M,USD,161960000000.0,107363000000.0,90128000000.0,31509000000.0,624151000000.0,29066000000.0,4105000000.0,...,327878000000.0,236093000000.0,3875393000000.0,719703000000.0,436537000000.0,327878000000.0,3547515000000.0,411613000000.0,1228275000.0,116217000000.0


In [37]:
cashflow

Unnamed: 0_level_0,asOfDate,periodType,currencyCode,BeginningCashPosition,CashDividendsPaid,CashFlowFromContinuingFinancingActivities,CashFlowFromContinuingInvestingActivities,CashFlowFromContinuingOperatingActivities,ChangeInAccountPayable,ChangeInCashSupplementalAsReported,...,NetPreferredStockIssuance,NetShortTermDebtIssuance,OperatingCashFlow,OtherNonCashItems,PreferredStockIssuance,PreferredStockPayments,PurchaseOfInvestment,RepaymentOfDebt,RepurchaseOfCapitalStock,SaleOfInvestment
symbol,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
JPM,2020-12-31,12M,USD,263631000000.0,-12690000000.0,596645000000.0,-261912000000.0,-79910000000.0,7415000000.0,263978000000.0,...,3070000000.0,4438000000.0,-79910000000.0,13750000000.0,4500000000.0,-1430000000.0,-409545000000.0,-105055000000.0,-7947000000.0,228793000000.0
JPM,2021-12-31,12M,USD,527609000000.0,-12858000000.0,275993000000.0,-129344000000.0,78084000000.0,43162000000.0,213225000000.0,...,4775000000.0,7773000000.0,78084000000.0,-8575000000.0,7350000000.0,-2575000000.0,-360541000000.0,-54932000000.0,-20983000000.0,263720000000.0
JPM,2022-12-31,12M,USD,740834000000.0,-13562000000.0,-126257000000.0,-137819000000.0,107119000000.0,58614000000.0,-173600000000.0,...,-7434000000.0,-8984000000.0,107119000000.0,26055000000.0,0.0,-7434000000.0,-159934000000.0,-45556000000.0,-10596000000.0,172401000000.0
JPM,2023-12-31,12M,USD,567234000000.0,-13463000000.0,-25571000000.0,67643000000.0,12974000000.0,-25388000000.0,56917000000.0,...,0.0,-1934000000.0,12974000000.0,7292000000.0,0.0,0.0,-119640000000.0,-64880000000.0,-9824000000.0,215234000000.0


In [38]:
income_statement

Unnamed: 0_level_0,asOfDate,periodType,currencyCode,BasicAverageShares,BasicEPS,DilutedAverageShares,DilutedEPS
symbol,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1
JPM,2020-12-31,12M,USD,3082400000.0,8.89,3087400000.0,8.88
JPM,2021-12-31,12M,USD,3021500000.0,15.39,3026600000.0,15.36
JPM,2022-12-31,12M,USD,2965800000.0,12.1,2970000000.0,12.09
JPM,2023-09-30,TTM,USD,2950650000.0,16.77,2955075000.0,16.76
JPM,2023-12-31,12M,USD,2938600000.0,16.25,2943100000.0,16.23
JPM,2023-12-31,TTM,USD,2938600000.0,16.25,2943100000.0,16.23


In [39]:
from yahooquery import Ticker

aapl = Ticker("AAPL")

aapl_info = aapl.asset_profile

for key, value in aapl_info['AAPL'].items():
    print(f"{key}: {value}")

address1: One Apple Park Way
city: Cupertino
state: CA
zip: 95014
country: United States
phone: 408 996 1010
website: https://www.apple.com
industry: Consumer Electronics
industryKey: consumer-electronics
industryDisp: Consumer Electronics
sector: Technology
sectorKey: technology
sectorDisp: Technology
longBusinessSummary: Apple Inc. designs, manufactures, and markets smartphones, personal computers, tablets, wearables, and accessories worldwide. The company offers iPhone, a line of smartphones; Mac, a line of personal computers; iPad, a line of multi-purpose tablets; and wearables, home, and accessories comprising AirPods, Apple TV, Apple Watch, Beats products, and HomePod. It also provides AppleCare support and cloud services; and operates various platforms, including the App Store that allow customers to discover and download applications and digital content, such as books, music, video, games, and podcasts. In addition, the company offers various services, such as Apple Arcade, a g

In [40]:
import yfinance as yf
import pandas as pd
from yahooquery import Ticker

# List of companies you are interested in
companies = ["IBM", "AAPL", "JPM"]

# Initialize an empty DataFrame to hold all company data
df = pd.DataFrame()

# Iterate over each company
for company in companies:
    ticker = Ticker(company)
    company_info = ticker.asset_profile
    if company in company_info:
        info = company_info[company]
        data = {
            'name': company,
            'industry': info.get('industry', 'N/A'),
            'industryKey': info.get('industryKey', 'N/A'),
            'industryDisp': info.get('industryDisp', 'N/A'),
            'sector': info.get('sector', 'N/A'),
            'sectorKey': info.get('sectorKey', 'N/A'),
            'sectorDisp': info.get('sectorDisp', 'N/A'),
            'longBusinessSummary': info.get('longBusinessSummary', 'N/A'),
            'fullTimeEmployees': info.get('fullTimeEmployees', 'N/A')
        }
        df_temp = pd.DataFrame([data])
        df = pd.concat([df, df_temp], ignore_index=True)

In [41]:
df

Unnamed: 0,name,industry,industryKey,industryDisp,sector,sectorKey,sectorDisp,longBusinessSummary,fullTimeEmployees
0,IBM,Information Technology Services,information-technology-services,Information Technology Services,Technology,technology,Technology,"International Business Machines Corporation, t...",282200
1,AAPL,Consumer Electronics,consumer-electronics,Consumer Electronics,Technology,technology,Technology,"Apple Inc. designs, manufactures, and markets ...",150000
2,JPM,Banks - Diversified,banks-diversified,Banks - Diversified,Financial Services,financial-services,Financial Services,JPMorgan Chase & Co. operates as a financial s...,313206


In [42]:
df['longBusinessSummary'][2]

'JPMorgan Chase & Co. operates as a financial services company worldwide. It operates through four segments: Consumer & Community Banking (CCB), Corporate & Investment Bank (CIB), Commercial Banking (CB), and Asset & Wealth Management (AWM). The CCB segment offers deposit, investment and lending products, cash management, and payments and services; mortgage origination and servicing activities; residential mortgages and home equity loans; and credit cards, auto loans, leases, and travel services to consumers and small businesses through bank branches, ATMs, and digital and telephone banking. The CIB segment provides investment banking products and services, including corporate strategy and structure advisory, and equity and debt market capital-raising services, as well as loan origination and syndication; payments; and cash and derivative instruments, risk management solutions, prime brokerage, and research. This segment also offers securities services, including custody, fund accounti

In [43]:
import os
import warnings
import logging
from transformers import BertTokenizer, BertModel, logging as transformers_logging
from sklearn.neural_network import BernoulliRBM
import torch
import numpy as np

os.environ['TF_CPP_MIN_LOG_LEVEL'] = '3'  
os.environ['PYTHONWARNINGS'] = 'ignore'  

logging.getLogger('transformers').setLevel(logging.ERROR)
logging.getLogger('torch').setLevel(logging.ERROR)

transformers_logging.set_verbosity_error()

tokenizer = BertTokenizer.from_pretrained('bert-base-uncased')
model = BertModel.from_pretrained('bert-base-uncased')

text = df['longBusinessSummary'][1]

inputs = tokenizer(text, return_tensors='pt', max_length=512, truncation=True, padding='max_length')

with torch.no_grad():
    outputs = model(**inputs)

sentence_embedding = outputs.last_hidden_state[:, 0, :].squeeze().numpy()

print("size of word embedding：", sentence_embedding.shape)

rbm = BernoulliRBM(n_components=100, learning_rate=0.01, n_iter=10, random_state=42)

sentence_embedding = (sentence_embedding - sentence_embedding.min()) / (sentence_embedding.max() - sentence_embedding.min())

sentence_embedding = sentence_embedding.reshape(1, -1)

transformed_embedding = rbm.fit_transform(sentence_embedding)

print("Shape after dimension reduction：", transformed_embedding.shape)
print("Vector ：", transformed_embedding)


size of word embedding： (768,)
Shape after dimension reduction： (1, 100)
Vector ： [[2.78800107e-05 5.17582839e-06 1.63133263e-05 1.23069740e-05
  2.24590276e-05 4.56476155e-05 1.69823943e-05 2.94510173e-05
  6.55808544e-05 3.41049599e-05 1.22913152e-05 2.47535518e-05
  3.49889378e-05 3.73652983e-05 2.77613908e-05 1.88640643e-05
  4.42496603e-05 7.30344573e-06 1.33656040e-05 3.18344719e-05
  1.66940481e-05 3.63946929e-05 1.29088248e-05 2.47327862e-05
  2.59913140e-05 1.68999504e-05 2.11448423e-05 1.99441911e-05
  3.14571225e-05 4.74153458e-05 2.25306994e-05 3.24929642e-05
  2.86341401e-05 8.02502473e-06 1.55015150e-05 3.49891016e-05
  2.45198498e-05 5.34464052e-05 1.86632933e-05 1.43555772e-05
  2.15972959e-05 9.72416237e-05 1.85011831e-05 1.22451474e-05
  4.32099914e-05 8.56511833e-05 3.15313664e-05 1.81377036e-05
  3.96536379e-05 5.35719846e-05 7.59837121e-06 1.32740688e-05
  1.22744714e-05 2.71749868e-05 1.43094467e-05 1.65126730e-05
  1.77908660e-05 1.24507478e-05 2.11923234e-05 2.2

In [44]:
import os
import warnings
import logging
from transformers import BertTokenizer, BertModel, logging as transformers_logging
from sklearn.neural_network import BernoulliRBM
import torch
import numpy as np
import pandas as pd

os.environ['TF_CPP_MIN_LOG_LEVEL'] = '3'  
os.environ['PYTHONWARNINGS'] = 'ignore'  

warnings.filterwarnings('ignore')

logging.getLogger('transformers').setLevel(logging.ERROR)
logging.getLogger('torch').setLevel(logging.ERROR)

transformers_logging.set_verbosity_error()

tokenizer = BertTokenizer.from_pretrained('bert-base-uncased')
model = BertModel.from_pretrained('bert-base-uncased')


def process_summary(text):
    inputs = tokenizer(text, return_tensors='pt', max_length=512, truncation=True, padding='max_length')
    with torch.no_grad():
        outputs = model(**inputs)
    sentence_embedding = outputs.last_hidden_state[:, 0, :].squeeze().numpy()
    sentence_embedding = (sentence_embedding - sentence_embedding.min()) / (sentence_embedding.max() - sentence_embedding.min())
    sentence_embedding = sentence_embedding.reshape(1, -1)
    rbm = BernoulliRBM(n_components=100, learning_rate=0.01, n_iter=10, random_state=42)
    transformed_embedding = rbm.fit_transform(sentence_embedding)
    return transformed_embedding.flatten()

embeddings = []
for i in range(len(df)):
    text = df['longBusinessSummary'][i]
    embedding = process_summary(text)
    embeddings.append(embedding)

result_df = pd.DataFrame(embeddings, columns=[f'feature_{i}' for i in range(100)])
result_df.insert(0, 'name', df['name'])

result_df

Unnamed: 0,name,feature_0,feature_1,feature_2,feature_3,feature_4,feature_5,feature_6,feature_7,feature_8,...,feature_90,feature_91,feature_92,feature_93,feature_94,feature_95,feature_96,feature_97,feature_98,feature_99
0,IBM,3.3e-05,6e-06,1.9e-05,1.5e-05,2.6e-05,5.4e-05,2e-05,3.5e-05,7.9e-05,...,3.4e-05,2.8e-05,4.1e-05,3.4e-05,2.3e-05,1.6e-05,3.2e-05,4.1e-05,1.9e-05,2.2e-05
1,AAPL,2.8e-05,5e-06,1.6e-05,1.2e-05,2.2e-05,4.6e-05,1.7e-05,2.9e-05,6.6e-05,...,3e-05,2.4e-05,3.5e-05,2.9e-05,1.9e-05,1.3e-05,2.7e-05,3.4e-05,1.6e-05,1.9e-05
2,JPM,1.8e-05,3e-06,1e-05,8e-06,1.4e-05,2.9e-05,1e-05,1.8e-05,4.3e-05,...,1.9e-05,1.5e-05,2.2e-05,1.8e-05,1.2e-05,8e-06,1.8e-05,2.1e-05,1e-05,1.2e-05


In [45]:
merged_df = pd.merge(all_data.reset_index(), result_df, left_on='Company', right_on='name')

# Set 'Date' as index again after merging
merged_df.set_index('Date', inplace=True)
merged_df = merged_df.sort_values(by='Date', ascending=True)
merged_df.drop(columns=['name'], inplace=True)
# Display the merged and ranked DataFrame
merged_df

Unnamed: 0_level_0,Open,High,Low,Close,Volume,Dividends,Stock Splits,Company,feature_0,feature_1,...,feature_90,feature_91,feature_92,feature_93,feature_94,feature_95,feature_96,feature_97,feature_98,feature_99
Date,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
2020-01-02 00:00:00-05:00,103.987339,104.695998,103.810180,104.310860,3293436,0.0,0.0,IBM,0.000033,0.000006,...,0.000034,0.000028,0.000041,0.000034,0.000023,0.000016,0.000032,0.000041,0.000019,0.000022
2020-01-02 00:00:00-05:00,71.962075,73.021202,71.707013,72.960472,135480400,0.0,0.0,AAPL,0.000028,0.000005,...,0.000030,0.000024,0.000035,0.000029,0.000019,0.000013,0.000027,0.000034,0.000016,0.000019
2020-01-02 00:00:00-05:00,121.720893,122.861575,121.259402,122.852859,10803700,0.0,0.0,JPM,0.000018,0.000003,...,0.000019,0.000015,0.000022,0.000018,0.000012,0.000008,0.000018,0.000021,0.000010,0.000012
2020-01-03 00:00:00-05:00,102.885847,103.879509,102.878145,103.478966,2482890,0.0,0.0,IBM,0.000033,0.000006,...,0.000034,0.000028,0.000041,0.000034,0.000023,0.000016,0.000032,0.000041,0.000019,0.000022
2020-01-03 00:00:00-05:00,72.183120,73.016327,72.025225,72.251137,146322800,0.0,0.0,AAPL,0.000028,0.000005,...,0.000030,0.000024,0.000035,0.000029,0.000019,0.000013,0.000027,0.000034,0.000016,0.000019
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
2023-12-28 00:00:00-05:00,160.881662,160.881662,160.332163,160.675598,2071300,0.0,0.0,IBM,0.000033,0.000006,...,0.000034,0.000028,0.000041,0.000034,0.000023,0.000016,0.000032,0.000041,0.000019,0.000022
2023-12-28 00:00:00-05:00,193.629953,194.148591,192.662500,193.071426,34049900,0.0,0.0,AAPL,0.000028,0.000005,...,0.000030,0.000024,0.000035,0.000029,0.000019,0.000013,0.000027,0.000034,0.000016,0.000019
2023-12-29 00:00:00-05:00,193.390581,193.889267,191.226284,192.024185,42628800,0.0,0.0,AAPL,0.000028,0.000005,...,0.000030,0.000024,0.000035,0.000029,0.000019,0.000013,0.000027,0.000034,0.000016,0.000019
2023-12-29 00:00:00-05:00,160.675597,161.097516,159.772872,160.479355,2525600,0.0,0.0,IBM,0.000033,0.000006,...,0.000034,0.000028,0.000041,0.000034,0.000023,0.000016,0.000032,0.000041,0.000019,0.000022


In [46]:
all_data

Unnamed: 0_level_0,Open,High,Low,Close,Volume,Dividends,Stock Splits,Company
Date,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1
2020-01-02 00:00:00-05:00,103.987339,104.695998,103.810180,104.310860,3293436,0.0,0.0,IBM
2020-01-02 00:00:00-05:00,71.962075,73.021202,71.707013,72.960472,135480400,0.0,0.0,AAPL
2020-01-02 00:00:00-05:00,121.720893,122.861575,121.259402,122.852859,10803700,0.0,0.0,JPM
2020-01-03 00:00:00-05:00,102.885847,103.879509,102.878145,103.478966,2482890,0.0,0.0,IBM
2020-01-03 00:00:00-05:00,72.183120,73.016327,72.025225,72.251137,146322800,0.0,0.0,AAPL
...,...,...,...,...,...,...,...,...
2023-12-28 00:00:00-05:00,166.413947,167.701232,166.070009,167.347473,6320100,0.0,0.0,JPM
2023-12-28 00:00:00-05:00,193.629953,194.148591,192.662500,193.071426,34049900,0.0,0.0,AAPL
2023-12-29 00:00:00-05:00,160.675597,161.097516,159.772872,160.479355,2525600,0.0,0.0,IBM
2023-12-29 00:00:00-05:00,193.390581,193.889267,191.226284,192.024185,42628800,0.0,0.0,AAPL


In [47]:
from transformers import BertTokenizer

tokenizer = BertTokenizer.from_pretrained('bert-base-uncased')

text = (
    "Apple Inc. designs, manufactures, and markets smartphones, personal computers, tablets, wearables, and accessories "
    "worldwide. The company offers iPhone, a line of smartphones; Mac, a line of personal computers; iPad, a line of multi-purpose "
    "tablets; and wearables, home, and accessories comprising AirPods, Apple TV, Apple Watch, Beats products, and HomePod. It also "
    "provides AppleCare support and cloud services; and operates various platforms, including the App Store that allow customers to "
    "discover and download applications and digital content, such as books, music, video, games, and podcasts. In addition, the company "
    "offers various services, such as Apple Arcade, a game subscription service; Apple Fitness+, a personalized fitness service; Apple Music, "
    "which offers users a curated listening experience with on-demand radio stations; Apple News+, a subscription news and magazine service; "
    "Apple TV+, which offers exclusive original content; Apple Card, a co-branded credit card; and Apple Pay, a cashless payment service, as well "
    "as licenses its intellectual property. The company serves consumers, and small and mid-sized businesses; and the education, enterprise, and "
    "government markets. It distributes third-party applications for its products through the App Store. The company also sells its products through "
    "its retail and online stores, and direct sales force; and third-party cellular network carriers, wholesalers, retailers, and resellers. Apple Inc. "
    "was founded in 1976 and is headquartered in Cupertino, California."
)

tokens = tokenizer.tokenize(text)

num_tokens = len(tokens)

print("Number of tokens:", num_tokens)

Number of tokens: 311


In [48]:
from transformers import BertTokenizer

tokenizer = BertTokenizer.from_pretrained('bert-base-uncased')

text = (
    'JPMorgan Chase & Co. operates as a financial services company worldwide. It operates through four segments: Consumer & Community Banking (CCB), Corporate & Investment Bank (CIB), Commercial Banking (CB), and Asset & Wealth Management (AWM). The CCB segment offers deposit, investment and lending products, cash management, and payments and services; mortgage origination and servicing activities; residential mortgages and home equity loans; and credit cards, auto loans, leases, and travel services to consumers and small businesses through bank branches, ATMs, and digital and telephone banking. The CIB segment provides investment banking products and services, including corporate strategy and structure advisory, and equity and debt market capital-raising services, as well as loan origination and syndication; payments; and cash and derivative instruments, risk management solutions, prime brokerage, and research. This segment also offers securities services, including custody, fund accounting and administration, and securities lending products for asset managers, insurance companies, and public and private investment funds. The CB segment provides financial solutions, including lending, payments, investment banking, and asset management to small and midsized companies, local governments, nonprofit clients, and large corporations, as well as investors, developers, and owners of multifamily, office, retail, industrial, and affordable housing properties. The AWM segment offers multi-asset investment management solutions in equities, fixed income, alternatives, and money market funds to institutional clients and retail investors; and retirement products and services, brokerage, custody, estate planning, lending, deposits, and investment management products to high net worth clients. JPMorgan Chase & Co. was founded in 1799 and is headquartered in New York, New York.'
)

tokens = tokenizer.tokenize(text)

num_tokens = len(tokens)

print("Number of tokens:", num_tokens)

Number of tokens: 352
