In [2]:
from pandas_datareader import data
import pandas as pd
import csv
import string
import datetime
from collections import defaultdict
from pandas.tseries.offsets import BDay
from bs4 import BeautifulSoup
import re
import requests

## Functions to retrieve stock data

In [3]:
def stockData(startDate, endDate, ticker):
    # Define which online source one should use
    data_source = 'google'

    # User pandas_reader.data.DataReader to load the desired data.
    panel_data = data.DataReader(ticker, data_source, startDate, endDate)

    close = panel_data.ix['Close']
    volume = panel_data.ix['Volume']
    op = panel_data.ix['Open']
    high = panel_data.ix['High']
    low = panel_data.ix['Low']

    # Getting all weekdays between 01/01/2017 and 12/31/2017
    all_weekdays = pd.date_range(start=startDate, end=endDate, freq='B')

    # Align new set of dates
    close = close.reindex(all_weekdays)
    volume = volume.reindex(all_weekdays)
    op = op.reindex(all_weekdays)
    high = high.reindex(all_weekdays)
    low = low.reindex(all_weekdays)

    result = pd.concat([close, volume, op, high, low], axis=1, join='inner')
    result.columns = ['close', 'volume', 'open', 'high', 'low']
    return result


def findHigh(startDate, ticker):
    # Get date and five days after
    temp_date = datetime.datetime.strptime(startDate, "%Y-%m-%d")
    endDate = temp_date + BDay(5)

    result = stockData(startDate, endDate, ticker)
    tempHigh = result.nlargest(1, 'high')
    high = tempHigh.iloc[0]['high']
    return high


def openPrice(startDate, ticker):
    temp_date = datetime.datetime.strptime(startDate, "%Y-%m-%d")
    endDate = temp_date + BDay(1)

    result = stockData(startDate, endDate, ticker)
    open = result.iloc[0]['open']
    return open

## Search Google for news on stocks of choice

In [6]:
tickers = ['GOOGL','MSFT','AAPL']

for stock in tickers:
    url = 'https://www.google.com/search?q=' + stock + '&source=lnms&tbm=nws'
    response = requests.get(url)

    soup = BeautifulSoup(response.content, "html.parser")

    links = []
    for tag in soup.findAll('a', href=True):
        link = re.findall('http*.*://*.*">', str(tag))
        if len(link) > 0 and re.search('google', str(link)) is None and re.search('youtube', str(link)) is None:
            links.append(link[0][:-2])
            
    
print(links[0])

https://seekingalpha.com/article/4201510-apple-lowering-expectations&amp;sa=U&amp;ved=0ahUKEwjfkIvauoXdAhXQz1MKHe7KB1QQqQIIFCgAMAA&amp;usg=AOvVaw0SpZxkYoVE-N8sOLhgDlTt


## Scrape websites for text

The two methods below can be used to  pull text from the links collected above. Currently only using demo links. Some websites block requests.get so I used Selenium to pull the whole page source from one such site. Selenium is slow and might not be the greatest option.

In [16]:
url = 'https://www.nasdaq.com/article/technology-sector-update-for-08232018-feyegooggooglfbsmcicmtlatvi-cm1012272'

response = requests.get(url)

soup = BeautifulSoup(response.content, "html.parser")

text = soup.get_text()

text = str(text)

text = text[text.find('Shutterstock'):]

text = text[:text.find('Copyright')]


print(text)

Shutterstock photo


Top Tech Stocks MSFT +0.67% AAPL +0.45%makeArticleAd(); IBM -0.09% CSCO -0.03% GOOG +0.07% Technology stocks have turned higher Thursday afternoon, with the shares of tech stocks in the S&P 500 were adding over 0.2% in value while the Philadelphia Semiconductor Index was hanging on to a more than 0.1% gain. Among technology stocks moving on news: + FireEye ( FEYE  ) advanced Thursday, topping out with a 10% gain, following reports the cybersecurity company helped sector titans Facebook ( FB  ) and Alphabet's (GOOG,GOOGL) Google unit to identify Iranian influence campaigns on their respective websites. The New York Times reported FireEye executives tipped Facebook off to an Iranian disinformation campaign, which led to further discoveries of other campaigns from both Iran and Russia. FireEye also worked with Google's internal threat analysis divisions to help spot Iranian influence operations in its YouTube and email products, Google said today in a statement. In ot

#### Selenium example to get text from Seeking Alpha

In [17]:
testLinks=['https://seekingalpha.com/article/4201510-apple-lowering-expectations&amp;sa=U&amp;ved=0ahUKEwjrk_6QvYTdAhXStlkKHY7_BwsQqQIIFCgAMAA&amp;usg=AOvVaw2gjPdaazl71IUdWouOJyrn']

from selenium import webdriver


for url in testLinks:
    driver = webdriver.Chrome(r'C:\Users\tiffany.fabianac\PycharmProjects\WINWIN\venv\Lib\site-packages\selenium\webdriver\chrome\chromedriver.exe')
    driver.get(url)

    soup = BeautifulSoup(driver.page_source,'lxml')
    driver.quit()

    text = soup.get_text()
    text = re.sub(r'{*.*}', ' ', text)
    text = re.sub(r'{*.*;', ' ', text)
    
    text = text[text.find('Summary'):]

    text = text[:text.find('Next')]
    
    print(text)

SummaryIn recent earnings, Apple has touted the growth of Apple Pay and its future growth potential.A recent report from Juniper Research predicts that Apple will corner 50% market share of mobile payments among OEMs by 2020.Important international markets for Apple have different payment ecosystem which can hinder the growth of Apple Pay.Over the long run, retailers like Amazon, Walmart, and Alibaba have a much better ecosystem for building a strong payments business instead of phone companies like Apple.Any long-term growth expectation of Apple Pay would need to be moderated.Apple (AAPL) has repeatedly mentioned the progress it is making in the payments segments. A recent report from Juniper Research strengthens their point by showing massive growth in mobile wallets of OEM or original equipment manufacturer. The report says that OEM mobile pay users will increase to 450 million by 2020 and total transactions would increase to $300 billion. It also mentions that Apple will corner 50%

## Extract Features

This method is using CountVectorizer to get unique words and eliminate stop words & symbols. 

In [18]:
from sklearn.feature_extraction.text import CountVectorizer

vect = CountVectorizer(analyzer="word", lowercase=True,
                                max_features=5000, stop_words='english', token_pattern=r'\b[a-zA-Z]{2,}\b')
text=[text]
bag_of_words = vect.fit_transform(text)

print("The Features: {0}".format(vect.get_feature_names()))

The Features: ['aapl', 'able', 'accepted', 'added', 'additional', 'adoption', 'advantage', 'aggressively', 'alibaba', 'alipay', 'allow', 'allows', 'alpha', 'alternative', 'amazon', 'ambitions', 'americas', 'amzn', 'android', 'announced', 'app', 'apple', 'article', 'authorized', 'baba', 'banking', 'banks', 'base', 'basis', 'battle', 'behemoths', 'benefit', 'best', 'better', 'big', 'bigger', 'biggest', 'billion', 'build', 'building', 'business', 'capital', 'cards', 'catalysts', 'caveats', 'ceo', 'challenges', 'chase', 'china', 'choose', 'close', 'codes', 'comes', 'companies', 'company', 'compared', 'compensation', 'compete', 'competitor', 'completely', 'considered', 'contactless', 'contributed', 'contribution', 'corner', 'cornered', 'corners', 'cost', 'costco', 'countries', 'credit', 'customer', 'customers', 'daily', 'debit', 'decade', 'dedicated', 'depend', 'device', 'different', 'difficult', 'digits', 'dimon', 'direct', 'disadvantages', 'disclosure', 'does', 'domestic', 'dominated', 'd