In [1]:
import pandas as pd

In [2]:
tickers = pd.read_csv('ticker_list.csv')
tickernames = tickers['name']
biotech_tickers = tickernames.to_list()

In [3]:
import random
import string

def random_string(length):
    x = ''.join(random.choice(string.ascii_lowercase + string.digits) for _ in range(length))
    return x

random_string(13)

'nuf68vv282mkh'

In [5]:
import requests

In [6]:
for ticker in biotech_tickers:
    query = {'s': random_string(13), 't': ticker}
    res = requests.get('https://www.macrotrends.net/assets/php/stock_data_download.php', params=query)
    decoded = res.content.decode('utf-8')
    with open(f'tickers/{ticker}.csv', 'w') as f:
        f.write(decoded)
        f.close()

## Getting data for the same time period

At this point, we have CSV files for each stock saved as ticker.csv in the `tickers` folder in this directory.

### Remove the download message

The real data starts at the CSV column headings, with `date`, so we can remove everything before that

In [7]:
with open('tickers/ABCM.csv', 'r+') as f:
    content = f.read()
    date_index = content.index('date')
    cropped = content[date_index:]
    f.seek(0)
    f.write(cropped)
    f.close()

In [8]:
for ticker in biotech_tickers:
    file_path = f'tickers/{ticker}.csv'
    with open(file_path, 'r+') as f:
        content = f.read()
        date_index = content.index('date')
        cropped = content[date_index:]
        f.seek(0)
        f.write(cropped)
        f.close()

## Reduce data to just the opening prices

In [9]:
ticker = 'ABCL'
file_path = f'tickers/{ticker}.csv'
df = pd.read_csv(file_path)
df = df.drop(['high', 'low', 'close', 'volume'], axis=1)
df.to_csv(file_path)

In [10]:
for ticker in biotech_tickers:
    file_path = f'tickers/{ticker}.csv'
    df = pd.read_csv(file_path)
    try:
        df = df.drop(['high', 'low', 'close', 'volume'], axis=1)
        df.to_csv(file_path)
    except Exception as e:
        print(f'Error for {ticker}: {e}')

Error for ABCL: "['high', 'low', 'close', 'volume'] not found in axis"


## Reducing data to a timeframe

In [11]:
for year in range(2013, 2023, 1):
    count = 0
    for ticker in biotech_tickers:
        file_path = f'tickers/{ticker}.csv'
        df = pd.read_csv(file_path)
        earliest_date = df['date'][0]
        earliest_date_year = int(earliest_date[:4])
        if earliest_date_year <= year:
            count += 1

    print(f'{count} stocks have data starting in a year before {year}')

34 stocks have data starting in a year before 2013
37 stocks have data starting in a year before 2014
50 stocks have data starting in a year before 2015
55 stocks have data starting in a year before 2016
65 stocks have data starting in a year before 2017
78 stocks have data starting in a year before 2018
97 stocks have data starting in a year before 2019
126 stocks have data starting in a year before 2020
158 stocks have data starting in a year before 2021
163 stocks have data starting in a year before 2022


This year, we're going with 2015 onwards

In [12]:
year = 2015

In [13]:
tickers_2015 = []

for ticker in biotech_tickers:
    df = pd.read_csv(f'tickers/{ticker}.csv')
    earliest_date = df['date'][0]
    earliest_date_year = int(earliest_date[:4])
    if earliest_date_year <= year:
        df.to_csv(f'tickers_2015/{ticker}.csv')
        tickers_2015.append(ticker)

Now, we need to remove values before 1 Jan 2015

In [14]:
def date_str_valid(date_str):
    try:
        year = int(str(date_str)[:4])
    except:
        return False
    if year >= 2015:
        return True
    return False    

In [15]:
df = df[df.apply(lambda x: date_str_valid(x['date']), axis = 1)]
df

Unnamed: 0.1,Unnamed: 0,date,open
0,0,2020-01-29,6.6300
1,1,2020-01-30,9.0000
2,2,2020-01-31,6.9800
3,3,2020-02-03,7.1500
4,4,2020-02-04,7.1800
...,...,...,...
669,669,2022-08-29,11.0100
670,670,2022-08-30,11.6000
671,671,2022-08-31,11.0400
672,672,2022-09-01,11.3147


In [16]:
for ticker in tickers_2015:
    original_file_path = f'tickers_2015/{ticker}.csv'
    df = pd.read_csv(original_file_path)
    df = df.dropna()
    df = df[df.apply(lambda x: date_str_valid(x['date']), axis = 1)]
    df.to_csv(original_file_path)

Now we need to remove data from 2020 or later

In [17]:
def date_str_before_2020(date_str):
    try:
        year = int(str(date_str)[:4])
    except:
        return False
    if year < 2020:
        return True
    return False    

In [18]:
for ticker in tickers_2015:
    original_file_path = f'tickers_2015/{ticker}.csv'
    df = pd.read_csv(original_file_path)
    df = df.dropna()
    df = df[df.apply(lambda x: date_str_before_2020(x['date']), axis = 1)]
    df.to_csv(original_file_path)

We now have a bunch of junk which we should remove from each of the files

In [19]:
for ticker in tickers_2015:
    file_path = f'tickers_2015/{ticker}.csv'
    df = pd.read_csv(file_path)
    df = df[['date', 'open']]
    df.to_csv(file_path)

## Preparing Data

Now we need to merge these all into one dataframe / CSV file. This should have the date as the column and opening price for that stock as the row

First, we need to change the files so that each have them have the date as the column and opening price as the value

In [20]:
for ticker in tickers_2015:
    path = f'tickers_2015/{ticker}.csv'
    df = pd.read_csv(path)
    df = df[['date', 'open']]
    df = df.transpose()
    df.columns = df.iloc[0]
    df = df.drop(['date'])
    df.insert(loc=0, column='', value=ticker)
    df.to_csv(path, index=False)

Now, we need to merge all the dataframes into one mega DF

In [21]:
dfs = []

for ticker in tickers_2015:
    path = f'tickers_2015/{ticker}.csv'
    df = pd.read_csv(path)
    dfs.append(df)

big_boi = pd.concat(dfs, axis=0)
big_boi

Unnamed: 0.1,Unnamed: 0,2015-01-02,2015-01-05,2015-01-06,2015-01-07,2015-01-08,2015-01-09,2015-01-12,2015-01-13,2015-01-14,...,2019-12-17,2019-12-18,2019-12-19,2019-12-20,2019-12-23,2019-12-24,2019-12-26,2019-12-27,2019-12-30,2019-12-31
0,NVO,36.1597,36.2022,35.9048,35.8878,36.2022,36.6102,37.0691,37.4345,37.511,...,54.9628,54.4045,54.1207,54.8965,54.8965,54.6221,54.8397,55.4642,55.275,54.5748
0,VRTX,119.97,122.27,121.31,118.82,122.0,123.75,123.01,120.4,118.42,...,219.72,217.83,220.14,219.62,220.95,220.46,220.6,220.59,220.38,218.6
0,REGN,413.9,414.26,414.39,402.79,411.57,409.78,414.02,417.73,403.12,...,375.73,370.3,371.95,374.94,377.2,374.75,376.76,375.24,373.74,373.64
0,RPRX,10.02,9.57,9.61,9.09,8.97,8.9,9.35,9.14,8.66,...,,,,,,,,,,
0,SGEN,32.31,32.38,33.04,31.84,32.49,31.74,31.88,32.2,31.02,...,115.2,113.96,118.5,115.0,118.32,115.21,115.17,113.73,113.69,113.9
0,ALNY,97.97,96.44,95.77,94.85,97.35,102.14,103.22,105.57,102.41,...,123.7,117.27,116.1,117.88,115.74,117.08,118.0,118.3,116.83,114.0
0,BMRN,91.24,90.89,93.11,91.72,95.66,94.44,95.31,97.29,95.44,...,83.36,83.13,84.61,85.45,85.3,86.11,86.43,86.16,85.55,84.15
0,INCY,73.65,73.82,73.23,72.48,75.0,73.01,73.07,73.92,75.28,...,93.43,90.84,90.9,92.01,90.68,90.47,91.2,89.17,88.27,87.7
0,TECH,87.3801,85.4824,85.2475,84.8624,85.9991,85.783,86.4783,86.7225,86.1964,...,217.1782,215.2583,213.8926,216.2084,219.9591,220.5529,219.019,216.4459,215.8224,215.1791
0,UTHR,129.5,126.84,127.11,125.53,126.87,128.88,129.78,133.92,133.92,...,91.26,89.58,89.82,89.49,90.76,90.89,91.27,89.64,88.65,86.91


In [22]:
big_boi.reset_index()
big_boi

Unnamed: 0.1,Unnamed: 0,2015-01-02,2015-01-05,2015-01-06,2015-01-07,2015-01-08,2015-01-09,2015-01-12,2015-01-13,2015-01-14,...,2019-12-17,2019-12-18,2019-12-19,2019-12-20,2019-12-23,2019-12-24,2019-12-26,2019-12-27,2019-12-30,2019-12-31
0,NVO,36.1597,36.2022,35.9048,35.8878,36.2022,36.6102,37.0691,37.4345,37.511,...,54.9628,54.4045,54.1207,54.8965,54.8965,54.6221,54.8397,55.4642,55.275,54.5748
0,VRTX,119.97,122.27,121.31,118.82,122.0,123.75,123.01,120.4,118.42,...,219.72,217.83,220.14,219.62,220.95,220.46,220.6,220.59,220.38,218.6
0,REGN,413.9,414.26,414.39,402.79,411.57,409.78,414.02,417.73,403.12,...,375.73,370.3,371.95,374.94,377.2,374.75,376.76,375.24,373.74,373.64
0,RPRX,10.02,9.57,9.61,9.09,8.97,8.9,9.35,9.14,8.66,...,,,,,,,,,,
0,SGEN,32.31,32.38,33.04,31.84,32.49,31.74,31.88,32.2,31.02,...,115.2,113.96,118.5,115.0,118.32,115.21,115.17,113.73,113.69,113.9
0,ALNY,97.97,96.44,95.77,94.85,97.35,102.14,103.22,105.57,102.41,...,123.7,117.27,116.1,117.88,115.74,117.08,118.0,118.3,116.83,114.0
0,BMRN,91.24,90.89,93.11,91.72,95.66,94.44,95.31,97.29,95.44,...,83.36,83.13,84.61,85.45,85.3,86.11,86.43,86.16,85.55,84.15
0,INCY,73.65,73.82,73.23,72.48,75.0,73.01,73.07,73.92,75.28,...,93.43,90.84,90.9,92.01,90.68,90.47,91.2,89.17,88.27,87.7
0,TECH,87.3801,85.4824,85.2475,84.8624,85.9991,85.783,86.4783,86.7225,86.1964,...,217.1782,215.2583,213.8926,216.2084,219.9591,220.5529,219.019,216.4459,215.8224,215.1791
0,UTHR,129.5,126.84,127.11,125.53,126.87,128.88,129.78,133.92,133.92,...,91.26,89.58,89.82,89.49,90.76,90.89,91.27,89.64,88.65,86.91


Seems like some rows are missing in `big_boi`

In [23]:
import numpy as np
columns_all = np.array([column for column in [columns for columns in [df.columns for df in dfs]]])
first_dates = [i[1] for i in columns_all]
(first_dates.sort())
first_dates[-1]

  columns_all = np.array([column for column in [columns for columns in [df.columns for df in dfs]]])


'2015-12-22'

The latest first date is  '2015-12-22'. This means we will have to reduce all of our data so that it starts on the 22nd of December 2015.

In [24]:
dates_to_remove = []
for i in range(1, 12, 1):
    month_code = ''
    if i <= 9:
        month_code = f'0{i}'
    else:
        month_code = str(i)
    for k in range(1, 32, 1):
        if k <= 9:
            date = f'2015-{month_code}-0{k}'
        else:
            date = f'2015-{month_code}-{k}'
        dates_to_remove.append(date)

for j in range(1, 22, 1):
    if j <= 9:
        date = f'2015-12-0{j}'
    else:
        date = f'2015-12-{j}'
    dates_to_remove.append(date)

In [25]:
'2015-10-06' in dates_to_remove

True

In [26]:
dfs = []

for ticker in tickers_2015:
    path = f'tickers_2015/{ticker}.csv'
    df = pd.read_csv(path)
    df = df.drop(dates_to_remove, axis=1, errors='ignore')
    dfs.append(df)

big_boi = pd.concat(dfs, axis=0)
big_boi

Unnamed: 0.1,Unnamed: 0,2015-12-22,2015-12-23,2015-12-24,2015-12-28,2015-12-29,2015-12-30,2015-12-31,2016-01-04,2016-01-05,...,2019-12-17,2019-12-18,2019-12-19,2019-12-20,2019-12-23,2019-12-24,2019-12-26,2019-12-27,2019-12-30,2019-12-31
0,NVO,49.0179,49.5701,49.7771,50.2947,50.4069,50.6312,50.2947,49.8548,50.01,...,54.9628,54.4045,54.1207,54.8965,54.8965,54.6221,54.8397,55.4642,55.275,54.5748
0,VRTX,121.09,123.92,124.38,123.99,124.97,127.68,125.55,123.03,123.89,...,219.72,217.83,220.14,219.62,220.95,220.46,220.6,220.59,220.38,218.6
0,REGN,535.59,531.06,535.56,536.45,542.28,550.54,542.62,531.76,519.38,...,375.73,370.3,371.95,374.94,377.2,374.75,376.76,375.24,373.74,373.64
0,RPRX,1.25,1.27,1.34,1.26,1.24,1.13,1.16,1.19,1.29,...,,,,,,,,,,
0,SGEN,42.31,42.73,43.71,43.7,44.51,45.3,44.82,43.88,42.07,...,115.2,113.96,118.5,115.0,118.32,115.21,115.17,113.73,113.69,113.9
0,ALNY,88.27,87.24,91.29,90.16,92.49,93.48,94.18,92.18,93.49,...,123.7,117.27,116.1,117.88,115.74,117.08,118.0,118.3,116.83,114.0
0,BMRN,104.36,104.4,105.83,105.91,106.02,106.3,105.44,102.8,105.02,...,83.36,83.13,84.61,85.45,85.3,86.11,86.43,86.16,85.55,84.15
0,INCY,113.78,109.21,110.26,108.98,109.98,110.27,108.24,106.5,102.14,...,93.43,90.84,90.9,92.01,90.68,90.47,91.2,89.17,88.27,87.7
0,TECH,83.8159,85.2154,85.3011,86.6815,87.7763,87.4241,86.4816,84.7965,84.0635,...,217.1782,215.2583,213.8926,216.2084,219.9591,220.5529,219.019,216.4459,215.8224,215.1791
0,UTHR,158.42,159.28,159.15,159.99,161.87,159.26,157.78,154.23,156.38,...,91.26,89.58,89.82,89.49,90.76,90.89,91.27,89.64,88.65,86.91


Now for some cleaning

In [27]:
big_boi = big_boi.reset_index(drop=True)

In [28]:
big_boi.rename(columns={'Unnamed: 0': 'ticker'}, inplace=True)
big_boi.head()

Unnamed: 0,ticker,2015-12-22,2015-12-23,2015-12-24,2015-12-28,2015-12-29,2015-12-30,2015-12-31,2016-01-04,2016-01-05,...,2019-12-17,2019-12-18,2019-12-19,2019-12-20,2019-12-23,2019-12-24,2019-12-26,2019-12-27,2019-12-30,2019-12-31
0,NVO,49.0179,49.5701,49.7771,50.2947,50.4069,50.6312,50.2947,49.8548,50.01,...,54.9628,54.4045,54.1207,54.8965,54.8965,54.6221,54.8397,55.4642,55.275,54.5748
1,VRTX,121.09,123.92,124.38,123.99,124.97,127.68,125.55,123.03,123.89,...,219.72,217.83,220.14,219.62,220.95,220.46,220.6,220.59,220.38,218.6
2,REGN,535.59,531.06,535.56,536.45,542.28,550.54,542.62,531.76,519.38,...,375.73,370.3,371.95,374.94,377.2,374.75,376.76,375.24,373.74,373.64
3,RPRX,1.25,1.27,1.34,1.26,1.24,1.13,1.16,1.19,1.29,...,,,,,,,,,,
4,SGEN,42.31,42.73,43.71,43.7,44.51,45.3,44.82,43.88,42.07,...,115.2,113.96,118.5,115.0,118.32,115.21,115.17,113.73,113.69,113.9


Seems like some values are still missing... because the stock market be like that. Let's nuke them

In [29]:
big_boi = big_boi.dropna()
big_boi

Unnamed: 0,ticker,2015-12-22,2015-12-23,2015-12-24,2015-12-28,2015-12-29,2015-12-30,2015-12-31,2016-01-04,2016-01-05,...,2019-12-17,2019-12-18,2019-12-19,2019-12-20,2019-12-23,2019-12-24,2019-12-26,2019-12-27,2019-12-30,2019-12-31
0,NVO,49.0179,49.5701,49.7771,50.2947,50.4069,50.6312,50.2947,49.8548,50.01,...,54.9628,54.4045,54.1207,54.8965,54.8965,54.6221,54.8397,55.4642,55.275,54.5748
1,VRTX,121.09,123.92,124.38,123.99,124.97,127.68,125.55,123.03,123.89,...,219.72,217.83,220.14,219.62,220.95,220.46,220.6,220.59,220.38,218.6
2,REGN,535.59,531.06,535.56,536.45,542.28,550.54,542.62,531.76,519.38,...,375.73,370.3,371.95,374.94,377.2,374.75,376.76,375.24,373.74,373.64
4,SGEN,42.31,42.73,43.71,43.7,44.51,45.3,44.82,43.88,42.07,...,115.2,113.96,118.5,115.0,118.32,115.21,115.17,113.73,113.69,113.9
5,ALNY,88.27,87.24,91.29,90.16,92.49,93.48,94.18,92.18,93.49,...,123.7,117.27,116.1,117.88,115.74,117.08,118.0,118.3,116.83,114.0
6,BMRN,104.36,104.4,105.83,105.91,106.02,106.3,105.44,102.8,105.02,...,83.36,83.13,84.61,85.45,85.3,86.11,86.43,86.16,85.55,84.15
7,INCY,113.78,109.21,110.26,108.98,109.98,110.27,108.24,106.5,102.14,...,93.43,90.84,90.9,92.01,90.68,90.47,91.2,89.17,88.27,87.7
8,TECH,83.8159,85.2154,85.3011,86.6815,87.7763,87.4241,86.4816,84.7965,84.0635,...,217.1782,215.2583,213.8926,216.2084,219.9591,220.5529,219.019,216.4459,215.8224,215.1791
9,UTHR,158.42,159.28,159.15,159.99,161.87,159.26,157.78,154.23,156.38,...,91.26,89.58,89.82,89.49,90.76,90.89,91.27,89.64,88.65,86.91
10,JAZZ,139.99,144.15,144.24,141.78,142.65,142.56,142.55,137.83,138.35,...,149.22,149.23,149.62,153.0,153.95,153.57,153.62,152.45,151.32,148.09


In [30]:
big_boi.to_csv('opening_prices_biotech_original.csv')

Now all that's left is we need to remove the explicit date headings and instead label the dates as relative to the first date. 

In [31]:
num_days = len(big_boi.iloc[0]) - 1
column_names = ['ticker']
for i in range(num_days):
    day_id = f'day_{i}'
    column_names.append(day_id)
column_names

['ticker',
 'day_0',
 'day_1',
 'day_2',
 'day_3',
 'day_4',
 'day_5',
 'day_6',
 'day_7',
 'day_8',
 'day_9',
 'day_10',
 'day_11',
 'day_12',
 'day_13',
 'day_14',
 'day_15',
 'day_16',
 'day_17',
 'day_18',
 'day_19',
 'day_20',
 'day_21',
 'day_22',
 'day_23',
 'day_24',
 'day_25',
 'day_26',
 'day_27',
 'day_28',
 'day_29',
 'day_30',
 'day_31',
 'day_32',
 'day_33',
 'day_34',
 'day_35',
 'day_36',
 'day_37',
 'day_38',
 'day_39',
 'day_40',
 'day_41',
 'day_42',
 'day_43',
 'day_44',
 'day_45',
 'day_46',
 'day_47',
 'day_48',
 'day_49',
 'day_50',
 'day_51',
 'day_52',
 'day_53',
 'day_54',
 'day_55',
 'day_56',
 'day_57',
 'day_58',
 'day_59',
 'day_60',
 'day_61',
 'day_62',
 'day_63',
 'day_64',
 'day_65',
 'day_66',
 'day_67',
 'day_68',
 'day_69',
 'day_70',
 'day_71',
 'day_72',
 'day_73',
 'day_74',
 'day_75',
 'day_76',
 'day_77',
 'day_78',
 'day_79',
 'day_80',
 'day_81',
 'day_82',
 'day_83',
 'day_84',
 'day_85',
 'day_86',
 'day_87',
 'day_88',
 'day_89',
 'day_90'

In [32]:
big_boi.columns = column_names
big_boi

Unnamed: 0,ticker,day_0,day_1,day_2,day_3,day_4,day_5,day_6,day_7,day_8,...,day_1003,day_1004,day_1005,day_1006,day_1007,day_1008,day_1009,day_1010,day_1011,day_1012
0,NVO,49.0179,49.5701,49.7771,50.2947,50.4069,50.6312,50.2947,49.8548,50.01,...,54.9628,54.4045,54.1207,54.8965,54.8965,54.6221,54.8397,55.4642,55.275,54.5748
1,VRTX,121.09,123.92,124.38,123.99,124.97,127.68,125.55,123.03,123.89,...,219.72,217.83,220.14,219.62,220.95,220.46,220.6,220.59,220.38,218.6
2,REGN,535.59,531.06,535.56,536.45,542.28,550.54,542.62,531.76,519.38,...,375.73,370.3,371.95,374.94,377.2,374.75,376.76,375.24,373.74,373.64
4,SGEN,42.31,42.73,43.71,43.7,44.51,45.3,44.82,43.88,42.07,...,115.2,113.96,118.5,115.0,118.32,115.21,115.17,113.73,113.69,113.9
5,ALNY,88.27,87.24,91.29,90.16,92.49,93.48,94.18,92.18,93.49,...,123.7,117.27,116.1,117.88,115.74,117.08,118.0,118.3,116.83,114.0
6,BMRN,104.36,104.4,105.83,105.91,106.02,106.3,105.44,102.8,105.02,...,83.36,83.13,84.61,85.45,85.3,86.11,86.43,86.16,85.55,84.15
7,INCY,113.78,109.21,110.26,108.98,109.98,110.27,108.24,106.5,102.14,...,93.43,90.84,90.9,92.01,90.68,90.47,91.2,89.17,88.27,87.7
8,TECH,83.8159,85.2154,85.3011,86.6815,87.7763,87.4241,86.4816,84.7965,84.0635,...,217.1782,215.2583,213.8926,216.2084,219.9591,220.5529,219.019,216.4459,215.8224,215.1791
9,UTHR,158.42,159.28,159.15,159.99,161.87,159.26,157.78,154.23,156.38,...,91.26,89.58,89.82,89.49,90.76,90.89,91.27,89.64,88.65,86.91
10,JAZZ,139.99,144.15,144.24,141.78,142.65,142.56,142.55,137.83,138.35,...,149.22,149.23,149.62,153.0,153.95,153.57,153.62,152.45,151.32,148.09


In [33]:
big_boi.to_csv('biotech_50.csv')

# Splitting data
We  need to save about 60 days worth of data for ourselves

In [49]:
last_day_id = 1012
col_names = ['ticker']
for i in range(last_day_id-61, last_day_id, 1):
    col_name = f'day_{i}'
    col_names.append(col_name)

In [51]:
testing_set = big_boi[col_names]
testing_set.to_csv('testing_set.csv')

In [52]:
last_day_id = 1012 - 61
col_names = ['ticker']
for i in range(0, last_day_id, 1):
    col_name = f'day_{i}'
    col_names.append(col_name)

In [55]:
training_set = big_boi[col_names]
training_set.to_csv('opening_prices_biotech.csv')