# Data Collection
This notebook is responsible for autonomously collecting Twitch follower, concurrent viewer, and total view data.

## Imports

In [3]:
#SQL connection and queries
import MySQLdb as mdb
#Connection failure exiting
import sys
#Getting current time (after data is scraped) for stream table
from time import gmtime, strftime, time
#Scraping
import urllib2
from bs4 import BeautifulSoup as bs
#Web automation
from selenium import webdriver
from selenium.webdriver.support.ui import WebDriverWait

## File data
Below the database credentials are read in. These are used for logging into the database, creating the necessary tables (if not already created), and storing data.

In [4]:
#Connection settings; user, password, and database name stay empty until the
#credentials file has been read.
db_host = "localhost"
db_user = db_pass = db_name = ""
#The credentials file holds one value per line: user, password, database name.
with open("database_credentials.txt") as f:
    db_user, db_pass, db_name = (
        f.readline().strip(),
        f.readline().strip(),
        f.readline().strip(),
    )

## Database
### Versioning
Below, the database is accessed using the username, password, and database name specified above. Note that if the code below errors out, the rest of the database operations in this notebook will not produce the expected results (most likely they will not work at all).

In [5]:
#Connect
def mdb_get_version():
    """
    Connects to the database using the credentials read above and prints the
    row returned by ``SELECT VERSION()``.

    On a MySQLdb error, the error code and message are printed and the process
    exits with status 1. Any connection that was opened is closed before the
    function returns or exits.
    """
    con = None
    try:
        con = mdb.connect(host=db_host, user=db_user, passwd=db_pass, db=db_name)
        cur = con.cursor()
        cur.execute("SELECT VERSION()")
        data = cur.fetchone()
        print("Database version: {}".format(data))
    except mdb.Error as e:
        #"except ... as e" works on Python 2.6+ and Python 3; the older
        #"except mdb.Error, e" form is a syntax error on Python 3.
        print("Error {}: {}".format(e.args[0], e.args[1]))
        sys.exit(1)
    finally:
        #con is still None if connect() itself failed
        if con:
            con.close()
mdb_get_version()

Database version: ('5.7.21-log',)


### Table Creation
The function below creates the necessary tables for data storage if they do not already exist.

In [6]:
def create_tables():
    """
    Makes sure the categories, streamers, and streams tables exist, issuing one
    CREATE TABLE IF NOT EXISTS statement per table.
    """
    #Twitch game/creative/irl categories
    categories_sql = """
        CREATE TABLE IF NOT EXISTS categories (
        id INT PRIMARY KEY AUTO_INCREMENT,
        category VARCHAR(255) NOT NULL UNIQUE
        )
        """

    #Twitch streamers
    streamers_sql = """
        CREATE TABLE IF NOT EXISTS streamers (
        id INT PRIMARY KEY AUTO_INCREMENT,
        streamer VARCHAR(255) NOT NULL UNIQUE
        )
        """

    #Individual stream
    #It's important to note here that any game, like Overwatch, will be streamed
    #by multiple streamers. Similarly, a streamer may stream multiple games, or
    #even stream Overwatch at different times. The goal of the learning agents
    #will be to see how streaming variables, like stream time, affect concurrent
    #viewership, if at all.
    streams_sql = """
        CREATE TABLE IF NOT EXISTS streams (
        id INT PRIMARY KEY AUTO_INCREMENT,
        current_viewers INT NOT NULL DEFAULT -1,
        created_at DATETIME NOT NULL DEFAULT current_timestamp,
        followers INT NOT NULL DEFAULT -1,
        total_views INT NOT NULL DEFAULT -1,
        streamer_id INT NOT NULL DEFAULT -1,
        FOREIGN KEY (streamer_id)
            REFERENCES streamers(id)
            ON DELETE CASCADE,
        category_id INT NOT NULL DEFAULT -1,
        FOREIGN KEY (category_id)
            REFERENCES categories(id)
            ON DELETE CASCADE
        )
        """

    con = mdb.connect(host=db_host, user=db_user, passwd=db_pass, db=db_name)
    with con:
        cur = con.cursor()
        #streams goes last because its foreign keys reference the other two tables
        for statement in (categories_sql, streamers_sql, streams_sql):
            cur.execute(statement)
create_tables()



## Twitch parameters
Below, the variables for scraping Twitch are set up. Currently this includes the following:
* url strings
* categories
    * these were chosen from the top 10 streaming categories on 2/2/2018 at 2:40pm

### URL strings

In [7]:
#Template for viewing all current streams in a specified category;
#the {} placeholder is filled with the URL-quoted category name.
category_string = "https://www.twitch.tv/directory/game/{}"

#Template for navigating to a specific user's stream
#(presumably {} takes the streamer's channel name - it is not formatted anywhere in this notebook yet).
stream_string = "https://www.twitch.tv/{}"

### Category strings

In [8]:
#Below is a list of the 10 most popular streaming categories on twitch as of
#2/2/2018 @ 2:40pm
#Characters that urllib2.quote must leave unescaped (the apostrophe in
#"playerunknown's battlegrounds").
safe_chars = "'"
categories = [
    "fortnite",
    "league of legends",
    "playerunknown's battlegrounds",
    "grand theft auto v",
    "hearthstone",
    "counter-strike: global offensive",
    "dota 2",
    "irl",
    "world of warcraft",
    "overwatch",
]
#URL-quoted form of every category, in the same order as `categories`.
#Built from `categories` instead of a second copy of the literal list so the
#two can never drift apart.
category_url_strin = [urllib2.quote(c, safe=safe_chars) for c in categories]
#Maps each category name to the URL of its Twitch directory page
category_map = {}
print("Categories:")
for i, (c, quoted) in enumerate(zip(categories, category_url_strin)):
    category_map[c] = category_string.format(quoted)
    print("\t{}. {}: {}".format(i, c, category_map[c]))

Categories:
	0. fortnite: https://www.twitch.tv/directory/game/fortnite
	1. league of legends: https://www.twitch.tv/directory/game/league%20of%20legends
	2. playerunknown's battlegrounds: https://www.twitch.tv/directory/game/playerunknown's%20battlegrounds
	3. grand theft auto v: https://www.twitch.tv/directory/game/grand%20theft%20auto%20v
	4. hearthstone: https://www.twitch.tv/directory/game/hearthstone
	5. counter-strike: global offensive: https://www.twitch.tv/directory/game/counter-strike%3A%20global%20offensive
	6. dota 2: https://www.twitch.tv/directory/game/dota%202
	7. irl: https://www.twitch.tv/directory/game/irl
	8. world of warcraft: https://www.twitch.tv/directory/game/world%20of%20warcraft
	9. overwatch: https://www.twitch.tv/directory/game/overwatch


## Data collection

## Stream collector
The code below implements a function that collects the users currently streaming the category the soup parameter references, prints them, and returns how many were found.

In [21]:
def stream_collector(soup):
    """
    Counts the stream thumbnails found on a category directory page.

    soup: the parsed page; it must provide BeautifulSoup's find() / find_all()
    interface.

    Prints the text of every matching thumbnail anchor and returns how many
    were found. Returns 0 (and prints nothing) when the stream grid container
    is not present on the page.
    """
    container = soup.find("div", {"class": "tw-tower tw-tower--gutter-sm tw-tower--240 tw-flex-wrap"})
    if container is None:
        #find() yields None when nothing matches; without this guard the
        #find_all() call below would raise AttributeError.
        return 0
    streams = container.find_all("a", {"class": "stream-thumbnail"})
    #A list comprehension instead of map(): on Python 3 map() is lazy and has no len()
    streams2 = [e.getText() for e in streams]
    print(streams2)
    return len(streams2)

In [20]:
#Scratch check: parse a small nested-heading snippet with BeautifulSoup.
#NOTE(review): no parser is named in this call - confirm which one should be used.
s = bs("<h1><h2>hey</h2></h1>")

u'hey'

### Scraping

In [27]:
from selenium import webdriver
from selenium.webdriver.common.by import By
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC

In [26]:
#Spin up browser
driver = webdriver.Firefox()
#NOTE(review): nothing waits for the page's content before the lookups below;
#that may be why every category reports 0 streams - confirm, and consider an
#implicit or explicit wait.
#driver.implicitly_wait(10)

try:
    #Iterate over each category
    for c in category_map:
        print("{}".format(c.title()))
        start = time()
        driver.get(category_map[c])
        #Only the driver.get() call above is timed
        print("Time for retrieval and unwrapping of streams: {}s".format(time() - start))

        #Count the stream cards inside the first matching container element
        container = driver.find_element_by_class_name("tw-flex-shrink-0")
        streams = container.find_elements_by_class_name("live-channel-card__videos")
        print(len(streams))
finally:
    #Always shut the browser down, even when a lookup raises; otherwise every
    #failed run leaves a Firefox process behind.
    driver.quit()

Overwatch
Time for retrieval and unwrapping of streams: 2.20899987221s
0
Dota 2
Time for retrieval and unwrapping of streams: 1.4240000248s
0
Playerunknown'S Battlegrounds
Time for retrieval and unwrapping of streams: 2.43799996376s
0
League Of Legends
Time for retrieval and unwrapping of streams: 2.0529999733s


WebDriverException: Message: Failed to interpret value as array


In [None]:
from time import localtime, strftime
#Format the current local time as "YYYY-MM-DD HH:MM:SS"
#(presumably the format intended for the streams.created_at DATETIME column - confirm).
strftime("%Y-%m-%d %H:%M:%S", localtime())