In [4]:
from pandas_datareader import data
import pandas as pd
import csv
import string
import datetime
from collections import defaultdict
from pandas.tseries.offsets import BDay
from bs4 import BeautifulSoup
import re
import requests

## Functions to retrieve stock data

In [3]:
def stockData(startDate, endDate, ticker):
    # Define which online source one should use
    data_source = 'google'

    # User pandas_reader.data.DataReader to load the desired data.
    panel_data = data.DataReader(ticker, data_source, startDate, endDate)

    close = panel_data.ix['Close']
    volume = panel_data.ix['Volume']
    op = panel_data.ix['Open']
    high = panel_data.ix['High']
    low = panel_data.ix['Low']

    # Getting all weekdays between 01/01/2017 and 12/31/2017
    all_weekdays = pd.date_range(start=startDate, end=endDate, freq='B')

    # Align new set of dates
    close = close.reindex(all_weekdays)
    volume = volume.reindex(all_weekdays)
    op = op.reindex(all_weekdays)
    high = high.reindex(all_weekdays)
    low = low.reindex(all_weekdays)

    result = pd.concat([close, volume, op, high, low], axis=1, join='inner')
    result.columns = ['close', 'volume', 'open', 'high', 'low']
    return result


def findHigh(startDate, ticker):
    # Get date and five days after
    temp_date = datetime.datetime.strptime(startDate, "%Y-%m-%d")
    endDate = temp_date + BDay(5)

    result = stockData(startDate, endDate, ticker)
    tempHigh = result.nlargest(1, 'high')
    high = tempHigh.iloc[0]['high']
    return high


def openPrice(startDate, ticker):
    temp_date = datetime.datetime.strptime(startDate, "%Y-%m-%d")
    endDate = temp_date + BDay(1)

    result = stockData(startDate, endDate, ticker)
    open = result.iloc[0]['open']
    return open

## Search Google for news on stocks of choice

In [47]:
tickers = ['GOOGL','MSFT','AAPL']

for stock in tickers:
    url = 'https://www.google.com/search?q=' + stock + '&source=lnms&tbm=nws'
    response = requests.get(url)

    soup = BeautifulSoup(response.content, "html.parser")

    links = []
    for tag in soup.findAll('a', href=True):
        link = re.findall('http*.*://*.*">', str(tag))
        if len(link) > 0 and re.search('google', str(link)) is None and re.search('youtube', str(link)) is None:
            links.append(link[0][:-2])
            
    
    print(links)

['https://seekingalpha.com/article/4201510-apple-lowering-expectations&amp;sa=U&amp;ved=0ahUKEwizvYvAwoTdAhUOuVkKHa80CAwQqQIIFCgAMAA&amp;usg=AOvVaw2bkqhAYwNvOjLH9uxI36jB', 'https://seekingalpha.com/article/4201510-apple-lowering-expectations&amp;sa=U&amp;ved=0ahUKEwizvYvAwoTdAhUOuVkKHa80CAwQpwIIFTAA&amp;usg=AOvVaw3Q6DCMT5o4fblirZoaay6I', 'https://www.zacks.com/stock/news/319317/is-apple-aapl-outperforming-other-computer-and-technology-stocks-this-year&amp;sa=U&amp;ved=0ahUKEwizvYvAwoTdAhUOuVkKHa80CAwQqQIIFygAMAE&amp;usg=AOvVaw392rUoF9eYR50-7Qpv2ByZ', 'https://www.zacks.com/stock/news/319317/is-apple-aapl-outperforming-other-computer-and-technology-stocks-this-year&amp;sa=U&amp;ved=0ahUKEwizvYvAwoTdAhUOuVkKHa80CAwQpwIIGDAB&amp;usg=AOvVaw3RviJqtZOYiqUFfVSh15ww', 'https://seekingalpha.com/article/4201376-time-take-profits-apple&amp;sa=U&amp;ved=0ahUKEwizvYvAwoTdAhUOuVkKHa80CAwQqQIIGigAMAI&amp;usg=AOvVaw3BOaDGj4UtGaw5yMvvsNpt', 'https://www.nasdaq.com/article/apple-aapl-gains-as-market-dip

## Scrape websites for text

The two methods below can be used to  pull text from the links collected above. Currently only using demo links. Some websites block requests.get so I used Selenium to pull the whole page source from one such site. Selenium is slow and might not be the greatest option.

In [64]:
url = 'https://www.nasdaq.com/article/technology-sector-update-for-08232018-feyegooggooglfbsmcicmtlatvi-cm1012272'

response = requests.get(url)

soup = BeautifulSoup(response.content, "html.parser")

text = soup.get_text()

text = re.sub(r'.*resultsVerbatimAbout', ' ', text)

text = re.sub(r'NextAdvanced searchSearch.*', ' ', text)

#text = re.sub(r'http.*.', ' ', text)

print(text)






(function(w,d,s,l,i){w[l]=w[l]||[];w[l].push({'gtm.start':
new Date().getTime(),event:'gtm.js'});var f=d.getElementsByTagName(s)[0],
j=d.createElement(s),dl=l!='dataLayer'?'&l='+l:'';j.async=true;j.src=
'https://www.googletagmanager.com/gtm.js?id='+i+dl;f.parentNode.insertBefore(j,f);
})(window,document,'script','dataLayer','GTM-K2BQVP7');


(function(){
var is_chrome;
if(typeof navigator.vendor!="undefined")
	is_chrome = ((navigator.userAgent.toLowerCase().indexOf('chrome') > -1) &&(navigator.vendor.toLowerCase().indexOf("google") > -1));
else
	is_chrome = false;
if(is_chrome)
{
//instart
//Copyright Instart Logic Tue Jul 17 2018 - All rights reserved - version: 10.2.8
!function(t){if(function(){var n=!0,e=!1,i="";try{var r="object"==typeof t.IXC_198_8424151822013404&&t.IXC_198_8424151822013404;if(n=!r||void 0===r.CanRun||"_198_8424151822013404"!==r._198_8424151822013404||r.CanRun("abd"))r=t.IXC_198_8424151822013404=t.IXC_198_8424151822013404||{},r.InitStartTime=(t

#### Selenium example to get text from Seeking Alpha

In [58]:
testLinks=['https://seekingalpha.com/article/4201510-apple-lowering-expectations&amp;sa=U&amp;ved=0ahUKEwjrk_6QvYTdAhXStlkKHY7_BwsQqQIIFCgAMAA&amp;usg=AOvVaw2gjPdaazl71IUdWouOJyrn']

from selenium import webdriver


for url in testLinks:
    driver = webdriver.Chrome(r'C:\Users\tiffany.fabianac\PycharmProjects\WINWIN\venv\Lib\site-packages\selenium\webdriver\chrome\chromedriver.exe')
    driver.get(url)

    soup = BeautifulSoup(driver.page_source,'lxml')
    driver.quit()

    text = soup.get_text()
    text = re.sub(r'{*.*}', ' ', text)
    text = re.sub(r'{*.*;', ' ', text)
    
    print(text)

 
 
 
 PROMarketplaceSeeking Alpha PortfolioPeopleNews

Most Used
Market News (by date)
Top News
Wall Street Breakfast
Dividend News
Earnings Calendar
See All Sections


Recommended
Energy News
Healthcare News
Tech News
On The Move
Latest StockTalks


Analysis

Most Used
Dividends
Dividend Ideas
Market Outlook
Stock Ideas
REITs
See All Sections


Recommended
Short Ideas
Closed End Funds
Today's Market
ETF Screener
Gold & Precious Metals


Sign in / Join Now(function(w) {
  var getParam = function (name) {
 
 #]*)"),
 
 
 ,
  rc = function(n) {
 
 
 ,
  paginate = function () {
    if (!w.aConf.pagination.singlePage) {
 
 
 ,
  unblock_authors_pick = "false",
  noRBCheck = function(){
    return !!(log || fromInStream || !fromInsight || unblock_authors_pick === "true")
 ,
  noRbStrategy = function () {
 
    w.aConf.pagination = {
      limit: pLimit,
      pages: pages,
      page: page,
      singlePage: noRBandPagination,
      singlePageBtn: false
 

    w.aConf.roadblock = {
      

## Extract Features

This method is using CountVectorizer to get unique words and eliminate stop words & symbols. 

In [62]:
from sklearn.feature_extraction.text import CountVectorizer

vect = CountVectorizer(analyzer="word", lowercase=True,
                                max_features=5000, stop_words='english', token_pattern=r'\b[a-zA-Z]{2,}\b')
text=[text]
bag_of_words = vect.fit_transform(text)

print("The Features: {0}".format(vect.get_feature_names()))

The Features: ['aapl', 'able', 'accepted', 'aconf', 'active', 'added', 'additional', 'adoption', 'advantage', 'aggressively', 'alert', 'alerts', 'alibaba', 'alipay', 'allow', 'allows', 'alpha', 'alternative', 'amazon', 'ambitions', 'americas', 'amzn', 'analysis', 'analyst', 'android', 'announced', 'app', 'apple', 'article', 'askola', 'aug', 'authorized', 'baba', 'banking', 'banks', 'base', 'basis', 'battle', 'behemoths', 'benefit', 'best', 'better', 'big', 'bigger', 'biggest', 'billion', 'breakfast', 'breaking', 'build', 'building', 'business', 'buy', 'calendar', 'cap', 'capital', 'cards', 'case', 'catalysts', 'catch', 'caveats', 'ceo', 'challenges', 'chase', 'chhatwal', 'chhatwalrohit', 'china', 'choose', 'close', 'closed', 'codes', 'comes', 'companies', 'company', 'compared', 'compensation', 'compete', 'competitor', 'completely', 'considered', 'contactless', 'contributed', 'contribution', 'control', 'corner', 'cornered', 'corners', 'cost', 'costco', 'countries', 'credit', 'customer',