# Data Collection
This notebook is responsible for autonomously collecting Twitch follower, concurrent viewer, and total view data.

## Imports

In [2]:
#Connection failure exiting
import sys
#Getting current time (after data is scraped) for stream table
from time import gmtime, localtime, strftime, time, sleep
#Scraping
import urllib2

#SQL connection and queries
import MySQLdb as mdb
#Scraping
from bs4 import BeautifulSoup as bs
#Web automation
from selenium import webdriver
from selenium.webdriver.common.by import By
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC

## File data
Below the database credentials are read in. These are used for logging into the database, creating the necessary tables (if not already created), and storing data.

In [3]:
#Host of the MySQL server; the other credentials are read from a local file
db_host = "localhost"
#Placeholders until the credentials file has been read
db_user = db_pass = db_name = ""
#The file holds one value per line, in order: user, password, database name
with open("database_credentials.txt") as credentials_file:
    db_user, db_pass, db_name = (
        credentials_file.readline().strip() for _line in range(3)
    )

## Database
### Versioning
Below, the database is accessed using the username, password, and database name specified above. Note that if the code below errors out, the rest of the database operations in this file will not produce the expected results (most likely they will not work at all).

In [4]:
#Connect
def mdb_get_version():
    """
    Connects to the database using the credentials read above and prints the
    server version.

    Acts as a connectivity check for the rest of the notebook. On a MySQLdb
    error the error is printed and sys.exit(1) is called (raises SystemExit).
    The connection is closed in every case.
    """
    con = None
    try:
        con = mdb.connect(host=db_host, user=db_user, passwd=db_pass, db=db_name)
        cur = con.cursor()
        cur.execute("SELECT VERSION()")
        data = cur.fetchone()
        print("Database version: {}".format(data))
    except mdb.Error as e:
        #Not every MySQLdb error carries a (code, message) pair; indexing
        #args[1] blindly would hide the real error behind an IndexError
        if len(e.args) >= 2:
            print("Error {}: {}".format(e.args[0], e.args[1]))
        else:
            print("Error: {}".format(e))
        sys.exit(1)
    finally:
        if con:
            con.close()
mdb_get_version()

Database version: ('5.7.21-log',)


### Table Creation
The function below creates the necessary tables for data storage if they do not already exist.

In [5]:
def create_tables():
    """
    Creates the necessary tables (if not already created) for Twitch data storage.

    Three tables are created, in dependency order: categories, streamers and
    streams. Every row of streams references one streamer and one category
    through foreign keys with ON DELETE CASCADE.

    The connection is opened with the credentials read above and is always
    closed before returning. MySQLdb errors propagate to the caller.
    """
    #Twitch game/creative/irl categories
    categories_sql = """
        CREATE TABLE IF NOT EXISTS categories (
        id INT PRIMARY KEY AUTO_INCREMENT,
        category VARCHAR(255) NOT NULL UNIQUE
        )
        """

    #Twitch streamers
    streamers_sql = """
        CREATE TABLE IF NOT EXISTS streamers (
        id INT PRIMARY KEY AUTO_INCREMENT,
        streamer VARCHAR(255) NOT NULL UNIQUE
        )
        """

    #Individual stream
    #It's important to note here that any game, like Overwatch, will be streamed by multiple streamers.
    #Similarly, a streamer may stream multiple games, or even stream Overwatch at different times.
    #The goal of the learning agents will be to see how streaming variables, like stream time, affect
    #concurrent viewership, if at all.
    streams_sql = """
        CREATE TABLE IF NOT EXISTS streams (
        id INT PRIMARY KEY AUTO_INCREMENT,
        current_viewers INT NOT NULL DEFAULT -1,
        created_at DATETIME NOT NULL DEFAULT current_timestamp,
        followers INT NOT NULL DEFAULT -1,
        total_views INT NOT NULL DEFAULT -1,
        streamer_id INT NOT NULL DEFAULT -1,
        FOREIGN KEY (streamer_id)
            REFERENCES streamers(id)
            ON DELETE CASCADE,
        category_id INT NOT NULL DEFAULT -1,
        FOREIGN KEY (category_id)
            REFERENCES categories(id)
            ON DELETE CASCADE
        )
        """

    con = mdb.connect(host=db_host, user=db_user, passwd=db_pass, db=db_name)
    #Close explicitly: what `with con:` does differs between MySQLdb releases,
    #and older ones only commit/rollback and leave the connection open
    try:
        cur = con.cursor()
        #streams must come last because it references the other two tables
        for sql in (categories_sql, streamers_sql, streams_sql):
            cur.execute(sql)
        con.commit()
    finally:
        con.close()
create_tables()



## Twitch parameters
Below, the variables for scraping Twitch are set up. Currently this includes the following:
* url strings
* categories
    * these were chosen from the top 10 streaming categories on 2/2/2018 at 2:40pm

### URL strings

In [6]:
#String for viewing all current streams for a specified category
#({} is filled in with the URL-quoted category name)
category_string = "https://www.twitch.tv/directory/game/{}"

#String for navigating to a specific user's stream
#({} is a placeholder, presumably for the streamer's channel name; it is not
#filled in anywhere in the cells shown here)
stream_string = "https://www.twitch.tv/{}"

### Category strings

In [7]:
#Below is a list of the 10 most popular streaming categories on twitch as of
#2/2/2018 @ 2:40pm
#Characters left unescaped when a category name is URL-quoted
safe_chars = "'"
#To scrape fewer categories while testing, slice this list rather than
#redefining it, so the documented list stays the single source of truth
categories = [
    "fortnite",
    "league of legends",
    "playerunknown's battlegrounds",
    "grand theft auto v",
    "hearthstone",
    "counter-strike: global offensive",
    "dota 2",
    "irl",
    "world of warcraft",
    "overwatch"
]
#Category name -> URL of that category's browse page
category_map = {}

print("Categories:")
for i, c in enumerate(categories):
    category_map[c] = category_string.format(urllib2.quote(c, safe=safe_chars))
    print("\t{}. {}: {}".format(i, c, category_map[c]))

Categories:
	0. fortnite: https://www.twitch.tv/directory/game/fortnite
	1. league of legends: https://www.twitch.tv/directory/game/league%20of%20legends
	2. playerunknown's battlegrounds: https://www.twitch.tv/directory/game/playerunknown's%20battlegrounds
	3. grand theft auto v: https://www.twitch.tv/directory/game/grand%20theft%20auto%20v
	4. hearthstone: https://www.twitch.tv/directory/game/hearthstone
	5. counter-strike: global offensive: https://www.twitch.tv/directory/game/counter-strike%3A%20global%20offensive
	6. dota 2: https://www.twitch.tv/directory/game/dota%202
	7. irl: https://www.twitch.tv/directory/game/irl
	8. world of warcraft: https://www.twitch.tv/directory/game/world%20of%20warcraft
	9. overwatch: https://www.twitch.tv/directory/game/overwatch


## Data collection

### Scraping
The scraping module below is responsible for collecting data about all live streams currently streaming under 1 of the categories defined above. The driver does the following steps:
* Navigate to category *C*'s browse page
    * Collect the available streams (these should be the streams with the most concurrent viewers)
    * Scroll to the bottom of the page in an attempt to load more streams
    * Repeat this process until no new streams are found
* Store category *C* streams

In [None]:
#Stream map
#Filled by the scraping cell: each key is the text scraped from one stream
#card and every value is 1, so the dict effectively acts as a set of streams
#seen across all categories. This cell must be run before the scraping cell.
stream_map = {}

In [20]:
#Scrapes every category's browse page: scrolls until no new streams load and
#records the text of each stream card as a key of stream_map.

#Spin up browser
driver = webdriver.Chrome()

#Number of times to scroll down a page
scrolls = 50
#Time to sleep between scrolls
sleeptime = 0.5
#Seconds to wait for the first stream thumbnail to appear on a category page
first_stream_timeout = 1

#Assign global scrollable element for access through other scripts
locate_scrollable_script = """
window.s = document.getElementsByClassName("root-scrollable__wrapper")[0].parentElement.parentElement;
"""
#Scroll
scroll_script = """
window.s.scrollTo(0, window.s.scrollHeight);
"""

try:
    #Iterate over each category
    for c, category_url in category_map.items():
        print("---{}---".format(c.title()))
        start = time()
        #Load up page
        driver.get(category_url)

        #Streams for storage
        streams = []

        try:
            #Wait for page to load at least 1 stream
            WebDriverWait(driver, first_stream_timeout).until(
                    EC.presence_of_element_located((By.CLASS_NAME, "stream-thumbnail")))

            #Loop to get more streams
            prev_stream_count = -1
            while True:
                #Re-locate the scrollable element on every pass
                driver.execute_script(locate_scrollable_script)

                print("Scrolling for {} seconds ... ".format(scrolls * sleeptime))
                for attempt in range(scrolls):
                    driver.execute_script(scroll_script)
                    sleep(sleeptime)

                #Find streams
                cards = driver.find_elements(By.CLASS_NAME, "live-channel-card__videos")
                #Find text in parent containing concurrent viewers
                #(a real list, so len() below works on Python 2 and 3 alike)
                streams = [card.find_element(By.XPATH, "..").text for card in cards]

                #Display last scraped data
                print("Streams: {}".format(len(streams)))

                #Stop once a full round of scrolling loaded no additional streams
                if prev_stream_count == len(streams):
                    break
                prev_stream_count = len(streams)
        finally:
            print("Time for retrieval and unwrapping of streams: {}s\n\n".format(time() - start))

        #Record every scraped stream; duplicate card texts collapse into one key
        for stream_text in streams:
            stream_map[stream_text] = 1
finally:
    #Always release the browser, even when scraping aborts part-way through
    try:
        driver.quit()
    except Exception as e:
        #The browser may already be gone (e.g. closed by hand); report and move on
        print("Could not shut down the browser cleanly: {}".format(e))

---Overwatch---
Streams: 30
Last stream: 29 viewers on SpaceLion
Scrolling for 50.0 seconds ... 
Streams: 1447
Last stream: 0 viewers on anikindahedgehog
Scrolling for 50.0 seconds ... 
Streams: 1447
Last stream: 0 viewers on anikindahedgehog
Time for retrieval and unwrapping of streams: 251.406999826s


---Dota 2---
Streams: 30
Last stream: 65 viewers on mamakclub
Scrolling for 50.0 seconds ... 
Time for retrieval and unwrapping of streams: 61.3960001469s




WebDriverException: Message: chrome not reachable
  (Session info: chrome=64.0.3282.167)
  (Driver info: chromedriver=2.35.528161 (5b82f2d2aae0ca24b877009200ced9065a772e73),platform=Windows NT 10.0.16299 x86_64)


In [9]:
#Preview of the local-time timestamp format. strftime and localtime come from
#the imports cell at the top of the notebook.
#NOTE(review): the imports cell mentions gmtime for the stream table while
#this uses localtime - confirm which timezone stored timestamps should use.
strftime("%Y-%m-%d %H:%M:%S", localtime())

'2018-02-24 15:23:28'