In [1]:
from bs4 import BeautifulSoup
import pandas as pd 
import requests 
import time

### Mission Objective 
> * establish a connection with the target url
> * extract the content of the target url (extract the html of that page we made a request to)
> * parse that html content using the BeautifulSoup class that we imported

In [26]:
def scrape_yahoo(url: str):
    resp = requests.get(url)
    bsoup_object = BeautifulSoup(resp.text, "lxml")
    
    pre_frame = {
        "ticker": [][:30], # make sure to shrink the column to having exactly 30 pieces of data so that no size/shape error occurs
        "name": [][:30],
        "last_price": [][:30],
        "market_time": [][:30],
        "change": [][:30],
        "change_percent": [][:30],
        "volume": [][:30],
        "market_cap": [][:30]
    }
    
    for row in bsoup_object.select('tr'):
        try:
            pre_frame["ticker"].append(row.select('td')[0].text)
            pre_frame["name"].append(row.select('td')[1].text)
            pre_frame["last_price"].append(row.select('td')[2].text)
            pre_frame["market_time"].append(row.select('td')[3].text)
            pre_frame["change"].append(row.select('td')[4].text)
            pre_frame["change_percent"].append(row.select('td')[5].text)
            pre_frame["volume"].append(row.select('td')[6].text)
            pre_frame["market_cap"].append(row.select('td')[7].text)
            
        except:
            continue 
            
    df = pd.DataFrame(pre_frame) # index=False we don't want an additional useless column made which has index numbers
    df.to_csv("finance.csv", index=False)
    
    return f"Exported {len(df)} rows of data"

In [27]:
# if __name__ == "__main__":
#     while True:
#         scrape_yahoo("https://finance.yahoo.com/trending-tickers")
#         time.sleep(10)

In [28]:
scrape_yahoo("https://finance.yahoo.com/trending-tickers")

'Exported 30 rows of data'

# Data Cleansing 

In [2]:
import pandas as pd

In [3]:
data_frame = pd.read_csv("finance.csv")
data_frame.head()

Unnamed: 0,ticker,name,last_price,market_time,change,change_percent,volume,market_cap
0,GME,GameStop Corp.,214.77,10:49AM EDT,14.4,+7.19%,3.975M,14.901B
1,PLUG,Plug Power Inc.,31.88,10:49AM EDT,2.03,+6.80%,26.167M,18.118B
2,OCGN,"Ocugen, Inc.",6.99,10:49AM EDT,-0.41,-5.54%,15.175M,1.386B
3,SPLK,Splunk Inc.,140.47,10:49AM EDT,14.98,+11.94%,4.522M,23.025B
4,BTC-USD,Bitcoin USD,29982.4,3:44PM BST,-3037.6,-9.20%,51.247B,561.888B


In [4]:
data_frame["change"] = data_frame["change"].str.replace("+", "")

In [5]:
data_frame["change"] = data_frame["change"].str.replace(",", "")

In [6]:
data_frame["change"] = data_frame["change"].str.replace("-", "")

In [8]:
data_frame.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 30 entries, 0 to 29
Data columns (total 8 columns):
 #   Column          Non-Null Count  Dtype 
---  ------          --------------  ----- 
 0   ticker          30 non-null     object
 1   name            30 non-null     object
 2   last_price      30 non-null     object
 3   market_time     30 non-null     object
 4   change          30 non-null     object
 5   change_percent  30 non-null     object
 6   volume          30 non-null     object
 7   market_cap      30 non-null     object
dtypes: object(8)
memory usage: 2.0+ KB


In [10]:
data_frame["change"] = data_frame["change"].astype(float)
data_frame["last_price"] = data_frame["last_price"].str.replace(",", "").astype(float)
data_frame.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 30 entries, 0 to 29
Data columns (total 8 columns):
 #   Column          Non-Null Count  Dtype  
---  ------          --------------  -----  
 0   ticker          30 non-null     object 
 1   name            30 non-null     object 
 2   last_price      30 non-null     float64
 3   market_time     30 non-null     object 
 4   change          30 non-null     float64
 5   change_percent  30 non-null     object 
 6   volume          30 non-null     object 
 7   market_cap      30 non-null     object 
dtypes: float64(2), object(6)
memory usage: 2.0+ KB


In [11]:
data_frame.drop("change_percent", axis=1, inplace=True)

In [13]:
data_frame.head()

Unnamed: 0,ticker,name,last_price,market_time,change,volume,market_cap
0,GME,GameStop Corp.,214.77,10:49AM EDT,14.4,3.975M,14.901B
1,PLUG,Plug Power Inc.,31.88,10:49AM EDT,2.03,26.167M,18.118B
2,OCGN,"Ocugen, Inc.",6.99,10:49AM EDT,0.41,15.175M,1.386B
3,SPLK,Splunk Inc.,140.47,10:49AM EDT,14.98,4.522M,23.025B
4,BTC-USD,Bitcoin USD,29982.4,3:44PM BST,3037.6,51.247B,561.888B


In [19]:
data_frame["volume"] = data_frame["volume"].str.replace("M", "")
data_frame["volume"] = data_frame["volume"].str.replace("B", "")
data_frame["volume"] = data_frame["volume"].str.replace(",", "").astype(float)

In [20]:
data_frame.head()

Unnamed: 0,ticker,name,last_price,market_time,change,volume,market_cap
0,GME,GameStop Corp.,214.77,10:49AM EDT,14.4,3.975,14.901B
1,PLUG,Plug Power Inc.,31.88,10:49AM EDT,2.03,26.167,18.118B
2,OCGN,"Ocugen, Inc.",6.99,10:49AM EDT,0.41,15.175,1.386B
3,SPLK,Splunk Inc.,140.47,10:49AM EDT,14.98,4.522,23.025B
4,BTC-USD,Bitcoin USD,29982.4,3:44PM BST,3037.6,51.247,561.888B


In [21]:
data_frame["market_cap"] = data_frame["market_cap"].str.replace("M", "")
data_frame["market_cap"] = data_frame["market_cap"].str.replace("B", "")
data_frame["market_cap"] = data_frame["market_cap"].str.replace(",", "").astype(float)

In [22]:
data_frame.head()

Unnamed: 0,ticker,name,last_price,market_time,change,volume,market_cap
0,GME,GameStop Corp.,214.77,10:49AM EDT,14.4,3.975,14.901
1,PLUG,Plug Power Inc.,31.88,10:49AM EDT,2.03,26.167,18.118
2,OCGN,"Ocugen, Inc.",6.99,10:49AM EDT,0.41,15.175,1.386
3,SPLK,Splunk Inc.,140.47,10:49AM EDT,14.98,4.522,23.025
4,BTC-USD,Bitcoin USD,29982.4,3:44PM BST,3037.6,51.247,561.888


In [23]:
data_frame.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 30 entries, 0 to 29
Data columns (total 7 columns):
 #   Column       Non-Null Count  Dtype  
---  ------       --------------  -----  
 0   ticker       30 non-null     object 
 1   name         30 non-null     object 
 2   last_price   30 non-null     float64
 3   market_time  30 non-null     object 
 4   change       30 non-null     float64
 5   volume       30 non-null     float64
 6   market_cap   30 non-null     float64
dtypes: float64(4), object(3)
memory usage: 1.8+ KB
