<a href="https://colab.research.google.com/github/tehsinbhati/Hello-Friends/blob/master/FinanceWebsiteCashflowForecasting.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [5]:
import time
import csv
import os.path
import numpy as np 
import pandas as pd 
import requests 
from bs4 import BeautifulSoup 

In [6]:
def request_with_check(url):
    
    headers = {'User-Agent': 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_11_5) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/50.0.2661.102 Safari/537.36 , For: a Tutorial kernel By: elamraoui sohayb'}
    page_response = requests.get(url, headers=headers, timeout=60)
    if page_response.status_code>299:
        raise AssertionError("page content not found, status: %s"%page_response.status_code)
    
    return page_response   

In [7]:
page_test = request_with_check('https://www.investing.com/news/commodities-news')
# Cheking the first 5000 charchters of the HTML code
page_test.text[:5000]

' <!DOCTYPE HTML>\n<html dir="ltr" xmlns="http://www.w3.org/1999/xhtml" xml:lang="en" xmlns:schema="http://schema.org/" class="com" lang="en" geo="US">\n<head>\n<script type="text/javascript">\n    function OptanonWrapper() { }\n\n    // temp for checks\n    </script>\n<style>\n\t.ot-floating-button {\n\t\tz-index:12 !important;\n\t}\n</style>\n<script type="text/javascript">\n        window.helpers = {\n            \'getCookie\': function (k){var v="; "+document.cookie;var p=v.split("; "+k+"=");if(p.length===2)return p.pop().split(";").shift();else return null},\n            \'setCookie\': function (k,v,d){var e=\'\';if(typeof d!==\'undefined\'){var n=new Date;n.setTime(n.getTime()+864e5*d);e+=\';expires="\';e+=n.toGMTString();e+=\'"\';}document.cookie=k+\'=\'+v+\';path=/\' + e},\n            \'delCookie\': function (k){document.cookie=k+\'=;path=/;expires=-1\'},\n            \'isNumber\': function (n){return !isNaN(n)&&typeof n!==undefined&&n!==null&&n!==\'\'},\n            \'removeC

In [8]:
def get_details(single_article):
    
    # A title is in <a></a> with the 'class' attribute set to: title
    title = single_article.find('a',{'class':'title'})

    # A safeguard against some empty articles in the deeper pages of the site
    if title == None:
        #print('Empty Article')
        return None
    
    # the link to an article is the Href attribute
    link = title['href']
    
    # A safeguarde against embedded Advertisment articles
    if (('/news/'and category_name) not in link):
        #print('Ad Article found')
        return None       
        
    title = title.text
    
    # The first Paragraph is in <p></p>
    first_p = single_article.find('p').text
    
    # the Source is in <span></span>, with Class == articleDetails
    source_tag = single_article.find_all('span',{'class':'articleDetails'})
    source = str(source_tag[0].span.text)
    
    #date is also in <span></span> withe the Class == date
    date = single_article.find('span',{'class':'date'}).text
    
    return title, link, first_p, source, date

In [9]:
def single_page(Url_page,page_id = 1):

    news_list = []

    #Making the Http request
    page = request_with_check(Url_page)
    
    #Calling the Html.parser to start extracting our data
    html_soup = BeautifulSoup(page.text, 'html.parser')
    
    # The Articles Class
    articles = html_soup.find('div',{'class':'largeTitle'})
    
    # The single Articles List
    articleItems = articles.find_all('article' ,{'class':'articleItem'})

    # Looping, for each single Article
    for article in articleItems:
        if get_details(article) == None:
            continue
        
        title, link, first_p, source_tag, date = get_details(article)
        news_list.append({'id_page':page_id,
                          'title':title,   
                          'date':date,
                          'link': link,
                          'source':source_tag,
                          'first_p':first_p})

    return news_list

In [10]:
def dict_to_csv (filename,news_dict):
    
    #Setting the Dataframe headers
    fields = news_dict[0]
    fields = list(fields.keys())
    
    #Checking if the file already exists, if Exists we woulb pe appending, if Not we creat it
    has_header = False
    if os.path.isfile(filename):
        with open(filename, 'r') as csvfile:
            sniffer = csv.Sniffer()
            has_header = sniffer.has_header(csvfile.read(2048))
    
    with open(filename, 'a',errors = 'ignore', encoding= 'utf-8') as csvfile:
        writer = csv.DictWriter(csvfile, fieldnames=fields)
        if(has_header == False):
            writer.writeheader()  
        for row in range(len(news_dict)):
            item = news_dict[row]
            writer.writerow(item)

In [11]:
def parsing_category_pages(category_name,base_url,number_pages):
    start_time = time.time()
    
    #getting the start page
    page = request_with_check(base_url)

    #Calling the Html Parser
    html_soup = BeautifulSoup(page.text, 'html.parser')
    
    #Finding the Laste page
    last_page = int(html_soup.findAll(class_='pagination')[-1].text)

    if number_pages > last_page:
        number_pages = last_page

    #Looping over the specified nupber of Pages:
    for p in range(1,number_pages,1):
        category_page = base_url+'/'+str(p)
        print('Parsing: ',category_page)
        page_news = single_page(category_page,p)
        
        #Saving to a CSV
        dict_to_csv(category_name+'.csv',page_news)
        
        #Time sleep
        time.sleep(10)
    
    print("--- %s seconds ---" % (time.time() - start_time))
    return True

In [12]:
URL = 'https://www.investing.com/news/'
category_name = 'commodities-news'
base_url = URL+category_name
parsing_category_pages ('commodities-news',base_url,number_pages=5)

Parsing:  https://www.investing.com/news/commodities-news/1
Parsing:  https://www.investing.com/news/commodities-news/2
Parsing:  https://www.investing.com/news/commodities-news/3
Parsing:  https://www.investing.com/news/commodities-news/4
--- 48.878769874572754 seconds ---


True

In [14]:
data = pd.read_csv('commodities-news.csv')
data.head(100)

Unnamed: 0,id_page,title,date,link,source,first_p
0,1,Oil extends losses as U.S. mulls strategic res...,- 11 minutes ago,/news/commodities-news/oil-drops-for-2nd-sessi...,By Reuters,By Ahmad Ghaddar LONDON (Reuters) - Oil price...
1,1,South Africa should not rush move away from co...,- 1 hour ago,/news/commodities-news/south-africa-should-not...,By Reuters,By Helen Reid and Alexander Winning JOHANNESB...
2,1,Exclusive-Lawyers warn EU against labelling ga...,- 3 hours ago,/news/commodities-news/exclusivelawyers-warn-e...,By Reuters,By Kate Abnett and Simon Jessop BRUSSELS (Reu...
3,1,OPEC+ caution and money behind reluctance to p...,- 4 hours ago,/news/commodities-news/opec-caution-and-money-...,By Reuters,"By Alex Lawler, Ahmad Ghaddar and Olesya Asta..."
4,1,Australia resources minister floats A$250 bill...,- 5 hours ago,/news/commodities-news/australia-resources-min...,By Reuters,MELBOURNE (Reuters) - Australia's resources m...
...,...,...,...,...,...,...
64,4,UK could issue more temporary visas to solve l...,"- Oct 02, 2021",/news/commodities-news/uk-could-issue-more-tem...,By Reuters,LONDON (Reuters) - British Prime Minister Bor...
65,4,Fuel shortages remain in southeast England but...,"- Oct 02, 2021",/news/commodities-news/fuel-shortages-remain-i...,By Reuters,LONDON (Reuters) - Gas stations in London and...
66,4,British military to help with fuel deliveries ...,"- Oct 01, 2021",/news/commodities-news/british-military-to-hel...,By Reuters,LONDON (Reuters) - Britain will deploy almost...
67,4,"Exclusive-White House, top Democrats agree to ...","- Oct 01, 2021",/news/commodities-news/exclusivewhite-house-to...,By Reuters,By Jarrett Renshaw and Timothy Gardner WASHIN...
