# In [None]:
import pandas as pd
import feedparser
from bs4 import BeautifulSoup
from bs4.element import Comment
from urllib.request import Request, urlopen
from datetime import datetime
from time import mktime


# Google News RSS search feed for the exact phrase "bitcoin cash", limited by
# the `when:1d` search operator (daily news) and requested for the en-US locale.
url = (
    'https://news.google.com/rss/search'
    '?q="bitcoin%20cash"%20when%3A1d'
    '&hl=en-US&gl=US&ceid=US%3Aen'
)


class ParseFeed():
    '''
    Download an RSS feed and collect the visible text of every linked article.

    After parse() has run, the instance exposes three lists that are aligned
    by position (item i of each list describes the same article):

    pubdate_list -- the entry's "published_parsed" value (None if the entry has none)
    link_list    -- the article URL taken from the entry's "link" field
    body_list    -- the visible text extracted from the downloaded page
    '''

    # Text nodes whose parent is one of these tags are not article content.
    _HIDDEN_PARENTS = frozenset(
        ['style', 'script', 'head', 'title', 'meta', '[document]'])

    def __init__(self, url):
        '''
        Store the feed address. Nothing is downloaded until parse() is called.
        '''
        self.feed_url = url

    @staticmethod
    def _tag_visible(element):
        '''
        Return True when a text node should be kept: its parent tag is not in
        _HIDDEN_PARENTS and the node is not an HTML comment.
        '''
        if element.parent.name in ParseFeed._HIDDEN_PARENTS:
            return False
        return not isinstance(element, Comment)

    @classmethod
    def _text_from_html(cls, body):
        '''
        Return the visible text nodes of an HTML document, stripped and joined
        with single spaces.
        '''
        soup = BeautifulSoup(body, "html.parser")
        texts = soup.find_all(string=True)
        return u" ".join(t.strip() for t in texts if cls._tag_visible(t))

    def parse(self, timeout=30):
        '''
        Parse the feed URL and download every article it links to.

        Fills self.pubdate_list, self.link_list and self.body_list. An entry
        is recorded in all three lists or in none of them, so the lists always
        have equal length and matching positions.

        timeout -- seconds to wait on each article download before giving up
                   on that article.
        '''
        feeds = feedparser.parse(self.feed_url).entries
        self.pubdate_list = []  # create callable lists
        self.link_list = []
        self.body_list = []

        for entry in feeds:
            link = entry.get("link")
            if not link:
                # nothing to download for this entry
                continue

            # some websites really don't like spiders and other bots; while we
            # could, we won't try to bypass them -- such articles are skipped.
            try:
                req = Request(link, headers={'user-agent': 'Mozilla/5.0 (Windows NT 6.3; rv:36.0) ..'})
                with urlopen(req, timeout=timeout) as response:
                    html = response.read()
                body_part = self._text_from_html(html)
            except Exception:
                # best effort: any download or parsing failure drops the article,
                # but Ctrl-C / SystemExit still propagate.
                continue

            # Record the date only together with the link and body; appending it
            # before the download would shift dates onto the wrong articles
            # whenever a fetch fails.
            self.pubdate_list.append(entry.get("published_parsed"))
            self.link_list.append(link)
            self.body_list.append(body_part)

# Run the parser against the configured feed.
feed = ParseFeed(url)
feed.parse()


# Convert each struct_time into a "YYYY-MM-DD HH:MM:SS" string.
# feedparser documents published_parsed as a UTC time tuple, so the datetime is
# built straight from its year..second fields. Going through time.mktime()
# would reinterpret the tuple as local time and can shift the result by an
# hour while the local zone observes DST.
# Entries without a publication date stay None instead of raising TypeError.
pub_date_end = [
    str(datetime(*pdate[:6])) if pdate is not None else None
    for pdate in feed.pubdate_list
]

# zip() pairs the three lists by position and stops at the shortest one.
data_load = pd.DataFrame(
    list(zip(feed.link_list, pub_date_end, feed.body_list)),
    columns=['link', 'date', 'body'])


# One averaged VADER compound score per article body, in DataFrame row order.
sent_score_list = []

import nltk
nltk.download('vader_lexicon')
from nltk.sentiment.vader import SentimentIntensityAnalyzer
from nltk import tokenize
nltk.download('punkt')

analyzer = SentimentIntensityAnalyzer()
for paragraph in data_load['body']:
    sentence_list = tokenize.sent_tokenize(paragraph)

    if not sentence_list:
        # No sentences to score (e.g. an empty body): record a neutral score
        # rather than dividing by zero. 0.0 keeps every list entry a float.
        sent_score_list.append(0.0)
        continue

    # Sum the per-sentence compound scores, then average over the sentences.
    compound_total = sum(
        analyzer.polarity_scores(sentence)["compound"]
        for sentence in sentence_list
    )
    sent_score_list.append(round(compound_total / len(sentence_list), 3))


data_load['sentiment score'] = sent_score_list


import azure.cosmos.cosmos_client as cosmos_client
import azure.cosmos.exceptions as exceptions
from azure.cosmos.partition_key import PartitionKey
import json


# Set up the connection (information like this should be stored in a secure place like KeyVault).
client = cosmos_client.CosmosClient(
    "https://trade-parser.documents.azure.com:443/",
    "{API key}",
)
db = client.create_database_if_not_exists(id="BCHSentimentDatabase")

# Items are hash-partitioned on their 'date' field.
date_partition = PartitionKey(path='/date', kind='Hash')
container = db.create_container_if_not_exists(
    id="RSSparsedBCHnews",
    partition_key=date_partition,
)


# Load the data into the NoSQL container, one document per DataFrame row.
# NOTE(review): the rows carry only link/date/body/sentiment score and no 'id'
# field -- confirm how upsert_item identifies existing items in that case.
row_total = data_load.shape[0]
for row_number in range(row_total):
    # Mapping of column name -> value for the current row.
    row_values = dict(data_load.iloc[row_number, :])
    # Round-trip through JSON text; default=str stringifies any value that
    # json cannot serialise natively, leaving a plain dict to upload.
    row_json = json.dumps(row_values, default=str)
    updated_item = container.upsert_item(json.loads(row_json))