# Data collection: Twitch Tracker 
There are several Twitch analytics websites and projects that store historical Twitch data. These include:
- TwitchTracker
- SullyGnome
- Social Blade
- Twitchmetrics
- Twitch Stats
- StreamElements Chat Stats
- GitHub repos (e.g. https://github.com/sid42/twitch-chat-analysis)

Here we'll use TwitchTracker to identify the top 2k streamers, ranked by the total number of hours users watched them over the last month. For each streamer we'll collect their recent and career streaming metrics, along with the games they tend to play.

We use Selenium to collect the data. We've broken this process up into several functions.

In [None]:
# data processing
import json
import pandas as pd 

# Requests & web scraping
from selenium import webdriver      
from selenium.common.exceptions import NoSuchElementException
from selenium.webdriver.common.keys import Keys
import urllib.request
from urllib.error import HTTPError

# miscellaneous
import os
from glob import glob
import time
import numpy as np
from collections import Counter
from datetime import datetime 

# settings
import config

## Utilities

In [3]:
def rand_sleep(mean=4, std=1, lower=2.75, upper=5.25):
    """Sleep for a random interval.

    A delay is drawn from a Gaussian with the given ``mean`` and ``std`` and
    then clamped to ``[lower, upper]``: draws outside the range are moved to
    the nearest bound rather than redrawn.
    """
    drawn = np.random.normal(loc=mean, scale=std)
    capped = min(drawn, upper)
    time.sleep(max(capped, lower))
    
def escape_ad(driver):
    """Send a single ESC key press through ``driver`` (intended to dismiss ad overlays)."""
    actions = webdriver.ActionChains(driver)
    actions.send_keys(Keys.ESCAPE)
    actions.perform()

## Collect data on streamer's main page

In [5]:
# optional: get image
def get_img(driver, item_name):
    """Download the streamer's logo to ``data/streamer_logos/<item_name>.png``.

    Nothing happens when the file already exists. Otherwise the ``src`` of the
    logo image on the current page is fetched with ``urllib``; an ``HTTPError``
    is reported and skipped, and a random pause follows a successful download.
    """
    logo_path = "data/streamer_logos/%s.png" % item_name
    if os.path.exists(logo_path):
        return

    logo_url = driver.find_element_by_xpath('.//div[@id="app-logo"]/img').get_attribute("src")
    try:
        urllib.request.urlretrieve(logo_url, logo_path)
    except HTTPError:
        print("Logo not available for", item_name)
        return
    rand_sleep()

In [6]:
# click on "3 month" view of performance data
def get_performance_data(driver):
    """Switch the performance panel to its "3 months" view and read every metric.

    Returns a list of single-key dicts, one per metric block, mapping the
    block's label to ``[actual_value, contrast_value]`` (both stripped strings).
    """
    driver.find_element_by_xpath('.//span[@data-key="3 months"]').click()
    # short pause after the click before reading the panel
    rand_sleep(mean=0.5, std=0.1, lower=0.4, upper=0.8)

    panel = driver.find_element_by_xpath('.//div[@id="performance-panel"]')
    metrics = []
    for block in panel.find_elements_by_xpath('.//div[@class="g-x-s-block"]'):
        value_box = block.find_element_by_xpath('.//div[contains(@class,"g-x-s-value")]')
        inner_divs = value_box.find_elements_by_xpath('.//div')
        # first inner div: contrast value; second: the actual value inside a span
        contrast = inner_divs[0].text.strip()
        actual = inner_divs[1].find_elements_by_xpath('.//span')[0].text.strip()
        label = block.find_element_by_xpath('.//div[contains(@class,"g-x-s-label")]').text
        metrics.append({label: [actual, contrast]})
    return metrics

In [7]:
def get_summary_data(driver):
    """Read the "Summary" panel of the current streamer page.

    Returns a list of single-key dicts mapping each block's label to the text
    of the last ``<div>`` inside that block's value container.
    """
    panel = driver.find_element_by_xpath('.//span[contains(text(),"Summary")]/ancestor::div[@class="container"]')
    collected = []
    for block in panel.find_elements_by_xpath('.//div[@class="g-x-s-block"]'):
        value_box = block.find_element_by_xpath('.//div[contains(@class,"g-x-s-value")]')
        latest = value_box.find_elements_by_xpath('.//div')[-1].text
        label = block.find_element_by_xpath('.//div[contains(@class,"g-x-s-label")]').text
        collected.append({label: latest})
    return collected

In [56]:
def get_subscriber_data(driver):
    """Read the "Subscribers" panel, if the page has one.

    Some channels have no subscribe button, and subscriber counts are not
    reported for most channels; in that case a message is printed and an empty
    list is returned. Otherwise the result is a list of single-key dicts
    mapping each block's label to its value text.
    """
    try:
        panel = driver.find_element_by_xpath('.//span[contains(text(),"Subscribers")]/ancestor::div[@class="container"]')
    except NoSuchElementException:
        print("Channel does not have a subscriber button or is missing subscriber info")
        return []

    entries = []
    for block in panel.find_elements_by_xpath('.//div[@class="g-x-s-block"]'):
        value = block.find_element_by_xpath('.//div[@class="g-x-s-value"]').text
        label = block.find_element_by_xpath('.//div[contains(@class,"g-x-s-label")]').text
        entries.append({label: value})
    return entries

In [9]:
def check_or_x(class_text):
    """Translate an icon's CSS class string into "Yes", "No" or "".

    "fa-check" maps to "Yes" and "fa-times" to "No"; the check mark wins when
    both are present, and anything else yields an empty string.
    """
    for marker, answer in (("fa-check", "Yes"), ("fa-times", "No")):
        if marker in class_text:
            return answer
    return ""

def get_channel_desc(profile):
    """Return the channel description text from the profile list, or "" if absent.

    Only the last two ``<li>`` children of ``profile`` are inspected. The first
    one that contains the description ``<div>`` (matched by its inline style)
    supplies the text.
    """
    for li in profile.find_elements_by_xpath('./li')[-2:]:
        try:
            desc = li.find_element_by_xpath('.//div[@style="word-wrap:break-word;font-size:12px;"]')
        except NoSuchElementException:
            # This <li> holds no description; a missing element is the only
            # expected failure, so nothing broader (e.g. KeyboardInterrupt)
            # is swallowed here.
            continue
        return desc.text
    return ""
    
def get_profile_data(driver):
    """Read the "Streamer Profile" list of the current page.

    Returns a list of single-key dicts, in this order: name, 1-month ranking,
    language, creation date, partnered ("Yes"/"No"/""), mature content
    ("Yes"/"No"/"") and channel description.
    """
    card = driver.find_element_by_xpath('.//div[contains(text(),"Streamer Profile")]/ancestor::ul[contains(@class,"list-group")]')
    lookup = card.find_element_by_xpath

    # The list entries are evaluated top to bottom, so the page is queried in
    # the same order as the fields appear in the result.
    return [
        {"Name": lookup('.//div[@id="mini-profile"]/h4').text},
        {"Ranking 1-month": lookup('.//div[contains(@title,"rank")]/b').text},
        {"Language": lookup('.//div[contains(text(),"Language")]/following-sibling::div/span').text},
        {"Creation date": lookup('.//div[contains(text(),"Created")]/following-sibling::div/span').text},
        # partner / mature flags are rendered as check or cross icons
        {"Partnered": check_or_x(lookup('.//span[contains(text(),"Partner")]/child::i').get_attribute('class'))},
        {"Mature content": check_or_x(lookup('.//span[contains(text(),"Mature")]/child::i').get_attribute('class'))},
        {"Channel description": get_channel_desc(card)},
    ]

In [10]:
def process_table(tr_list):
    """Turn table rows into a list of ``{first_cell_text: second_cell_text}`` dicts."""
    cells_per_row = (tr.find_elements_by_xpath('./td') for tr in tr_list)
    return [{cells[0].text: cells[1].text} for cells in cells_per_row]

def get_other_data(driver):
    """Read the "Average Stream" and "Various Metrics" tables.

    Returns a 2-tuple ``(average_stream, various_metrics)``; each element is
    the list of label/value dicts produced by ``process_table``.
    """
    row_xpaths = (
        './/div[contains(text(),"Average Stream")]/parent::div/following-sibling::table/tbody/tr',
        './/div[contains(text(),"Various Metrics")]/parent::div/following-sibling::table/tbody/tr',
    )
    parsed = [process_table(driver.find_elements_by_xpath(xp)) for xp in row_xpaths]
    return parsed[0], parsed[1]

In [11]:
# save data
def save_streamer_data(item_name, performance_data, summary_data, subscriber_data, profile_data, table1, table2):
    """Write all scraped sections for one streamer to ``data/streamer_info/<item_name>.json``."""
    sections = (
        ("Performance", performance_data),
        ("Summary", summary_data),
        ("Subscribers", subscriber_data),
        ("Profile", profile_data),
        ("Average stream", table1),
        ("Various metrics", table2),
    )
    out_path = 'data/streamer_info/%s.json' % item_name
    with open(out_path, 'w') as f:
        json.dump(dict(sections), f)

In [12]:
def get_streamer_data(driver, item_name):
    """Scrape the streamer's main page and save the result as JSON.

    Dismisses any overlay, optionally downloads the logo, then collects the
    performance, summary, subscriber and profile sections plus the two extra
    tables, and hands everything to ``save_streamer_data``.
    """
    time.sleep(0.2)
    escape_ad(driver)
    time.sleep(0.2)

    get_img(driver, item_name)
    performance = get_performance_data(driver)
    summary = get_summary_data(driver)
    subscribers = get_subscriber_data(driver)
    profile = get_profile_data(driver)
    tables = get_other_data(driver)  # (average stream, various metrics)
    save_streamer_data(item_name, performance, summary, subscribers, profile, *tables)

## Collect data on streamer's "game" page

In [13]:
def get_game_data(driver, item_name):
    """Open the streamer's "Games" tab and save its table as ``data/games/<item_name>.csv``.

    Every ``<tbody>`` row present on the page after loading is written. No
    explicit row limit is applied here (an earlier note said only the top 10
    games are extracted; presumably that is how many rows the page renders).
    """
    driver.find_element_by_xpath('.//div[@class="header-scope"]//a[contains(text(),"Games")]').click()
    escape_ad(driver)
    rand_sleep()

    headers = [th.text.strip() for th in driver.find_elements_by_xpath('.//table/thead/tr/th')]
    rows = [
        [td.text.strip() for td in tr.find_elements_by_xpath('./td')]
        for tr in driver.find_elements_by_xpath('.//table/tbody/tr')
    ]

    games = pd.DataFrame(data=rows, columns=headers)
    games.to_csv("data/games/%s.csv" % item_name, mode="w", index=False)

In [14]:
def back_to_starting_page(driver):
    """Go back two browser history entries, dismissing overlays after each step.

    A one-second pause follows the first step and a random pause the second.
    """
    for final_step in (False, True):
        driver.back()
        time.sleep(0.1)
        escape_ad(driver)
        if final_step:
            rand_sleep()
        else:
            time.sleep(1)

## Get one full page of streamers

In [15]:
def get_one_page(driver, completed):
    """Scrape every streamer on the current ranking page that is not in ``completed``.

    Each entry is printed as ``<position>: <name>``. Entries whose name is in
    ``completed`` are skipped; for the rest the streamer page and its games
    page are scraped before returning to the ranking page. ``completed`` is
    only read, never updated.
    """
    ranked_xpath = '//div[@class="ranked-item"]'

    time.sleep(0.1)
    escape_ad(driver)
    time.sleep(0.1)

    entries = driver.find_elements_by_xpath(ranked_xpath)
    position = 0
    while position < len(entries):
        anchor = entries[position].find_element_by_xpath('.//div[@class="ri-name"]/a')
        position += 1  # doubles as the 1-based number printed below
        streamer = anchor.text
        print("%i: %s" % (position, streamer))

        if streamer not in completed:
            # open the streamer page
            driver.execute_script("arguments[0].scrollIntoView()", anchor)
            anchor.click()
            escape_ad(driver)
            rand_sleep()

            # collect main-page and games-page data
            get_streamer_data(driver, streamer)
            get_game_data(driver, streamer)

            # return to the ranking page; the old element handles are stale
            # after navigating away, so the list is looked up again
            back_to_starting_page(driver)
            escape_ad(driver)
            entries = driver.find_elements_by_xpath(ranked_xpath)

## Get multiple pages

In [16]:
def get_multiple_pages(driver, starting_page, num_pages, completed):
    """Scrape ``num_pages`` consecutive ranking pages starting at ``starting_page``.

    The English hours-watched ranking is loaded at ``starting_page``. After
    each page is processed, the link in the second pagination item is clicked
    (presumably "next page") and a long random pause follows.
    """
    # all-language alternative: "https://twitchtracker.com/channels/hours-watched?page=%i"
    listing_url = "https://twitchtracker.com/channels/hours-watched/english?page=%i" % starting_page
    driver.get(listing_url)
    rand_sleep()

    pages_done = 0
    while pages_done < num_pages:
        print("\n")
        print("Beginning page:", starting_page + pages_done)
        print("-"*20)
        get_one_page(driver, completed)

        pager_link = driver.find_element_by_xpath('.//ul[contains(@class,"pagination")]/li[2]/a')
        driver.execute_script("arguments[0].scrollIntoView()", pager_link)
        time.sleep(0.1)
        pager_link.click()
        rand_sleep(mean=20, std=7, lower=15, upper=30)
        pages_done += 1

## Running the script...

In [17]:
# initialize driver (expects a chromedriver binary next to the notebook)
driver = webdriver.Chrome('./chromedriver.exe')

# initialize headers for urllib.request: install a global opener so that
# urllib.request.urlretrieve (used for logo downloads) sends config.user_agent
opener = urllib.request.build_opener()
opener.addheaders = [('User-agent', config.user_agent)]
urllib.request.install_opener(opener)

# completed = [] # starting new
# resuming: every streamer that already has a JSON file in data/streamer_info is skipped
completed = [x.split(".json")[0] for x in next(os.walk("data/streamer_info"))[2]] # resuming
# scrape ranking pages 208 up to (but not including) 250
get_multiple_pages(driver, starting_page=208, num_pages=250-208, completed=completed)



Beginning page: 113
--------------------
1: Jynx
Channel does not have a subscriber button or is missing subscriber info
2: Leffen
Channel does not have a subscriber button or is missing subscriber info
3: x5_PiG
Channel does not have a subscriber button or is missing subscriber info
4: benice92
Channel does not have a subscriber button or is missing subscriber info
5: HekTic_JukeZ
6: Sajam
Channel does not have a subscriber button or is missing subscriber info
7: imfrosk
Channel does not have a subscriber button or is missing subscriber info
8: eskay
Channel does not have a subscriber button or is missing subscriber info
9: Geef
Channel does not have a subscriber button or is missing subscriber info
10: Slade


Beginning page: 114
--------------------
1: LiquidWiFi
2: DSKoopa
Channel does not have a subscriber button or is missing subscriber info
3: Richard_Hammer
Channel does not have a subscriber button or is missing subscriber info
4: Chanimaly
Channel does not have a subscriber 

3: Zmok
Channel does not have a subscriber button or is missing subscriber info
4: PizzaHS
Channel does not have a subscriber button or is missing subscriber info
5: rabbittvlittlegirl
Channel does not have a subscriber button or is missing subscriber info
6: UnRooolie
Channel does not have a subscriber button or is missing subscriber info
7: Mushu
8: PaladinsGame
Channel does not have a subscriber button or is missing subscriber info
9: GTAWiseGuy
10: d0cc_tv
Channel does not have a subscriber button or is missing subscriber info


Beginning page: 124
--------------------
1: americandadtv36
Channel does not have a subscriber button or is missing subscriber info
2: OVOPhantuums
Channel does not have a subscriber button or is missing subscriber info
3: Warcraft
Channel does not have a subscriber button or is missing subscriber info
4: chell
Channel does not have a subscriber button or is missing subscriber info
5: n3on
Channel does not have a subscriber button or is missing subscriber i

Channel does not have a subscriber button or is missing subscriber info
4: Lt_Custard
Channel does not have a subscriber button or is missing subscriber info
5: PlayStation
Channel does not have a subscriber button or is missing subscriber info
6: AnnieBot
7: ytzaxy
Channel does not have a subscriber button or is missing subscriber info
8: Mendo
9: Surefour
10: SMii7Y


Beginning page: 134
--------------------
1: Polecat324
Channel does not have a subscriber button or is missing subscriber info
2: Suspect
Channel does not have a subscriber button or is missing subscriber info
3: Goofy757
Channel does not have a subscriber button or is missing subscriber info
4: NorthTaisheen
Channel does not have a subscriber button or is missing subscriber info
5: Apocalypto_12
Channel does not have a subscriber button or is missing subscriber info
6: OfficialTaco
Channel does not have a subscriber button or is missing subscriber info
7: BenFruit
Channel does not have a subscriber button or is missing

Channel does not have a subscriber button or is missing subscriber info
8: UnrealYuki
Channel does not have a subscriber button or is missing subscriber info
9: HatFilms
Channel does not have a subscriber button or is missing subscriber info
10: fineokay
Channel does not have a subscriber button or is missing subscriber info


Beginning page: 144
--------------------
1: kkonvy
Channel does not have a subscriber button or is missing subscriber info
2: Xisuma
Channel does not have a subscriber button or is missing subscriber info
3: Orzanel
Channel does not have a subscriber button or is missing subscriber info
4: DreamHackFN
Channel does not have a subscriber button or is missing subscriber info
5: KayPea
6: FilthyRobot
7: OaksLab
Channel does not have a subscriber button or is missing subscriber info
8: gigi
Channel does not have a subscriber button or is missing subscriber info
9: propunker
Channel does not have a subscriber button or is missing subscriber info
10: GiantBomb
Channel d



Beginning page: 154
--------------------
1: lethamyr_rl
Channel does not have a subscriber button or is missing subscriber info
2: justfoxii
Channel does not have a subscriber button or is missing subscriber info
3: EnragedCinema
Channel does not have a subscriber button or is missing subscriber info
4: AtomicTwins
Channel does not have a subscriber button or is missing subscriber info
5: ybnmikey
Channel does not have a subscriber button or is missing subscriber info
6: DauT
Channel does not have a subscriber button or is missing subscriber info
7: Razer
Channel does not have a subscriber button or is missing subscriber info
8: AutolykusLoL
Channel does not have a subscriber button or is missing subscriber info
9: Tolomeo
Channel does not have a subscriber button or is missing subscriber info
10: Capp
Channel does not have a subscriber button or is missing subscriber info


Beginning page: 155
--------------------
1: ARUUU
Channel does not have a subscriber button or is missing subs

Channel does not have a subscriber button or is missing subscriber info
5: propunker
Channel does not have a subscriber button or is missing subscriber info
6: BonsaiBroz
Channel does not have a subscriber button or is missing subscriber info
7: janix
8: slamjam_
Channel does not have a subscriber button or is missing subscriber info
9: Bean
Channel does not have a subscriber button or is missing subscriber info
10: Raelilblack
Channel does not have a subscriber button or is missing subscriber info


Beginning page: 165
--------------------
1: CitizenS9
2: Rated
Channel does not have a subscriber button or is missing subscriber info
3: Redshell
Channel does not have a subscriber button or is missing subscriber info
4: Zayt
Channel does not have a subscriber button or is missing subscriber info
5: JaredFPS
6: Bptz
Channel does not have a subscriber button or is missing subscriber info
7: WorldofTanks
Channel does not have a subscriber button or is missing subscriber info
8: FailArmy
Cha

9: JayDuhbb
Channel does not have a subscriber button or is missing subscriber info
10: iamExpel


Beginning page: 175
--------------------
1: AbdulHD
Channel does not have a subscriber button or is missing subscriber info
2: DaisyGray
Channel does not have a subscriber button or is missing subscriber info
3: PhuzzyBond
Channel does not have a subscriber button or is missing subscriber info
4: SynnfulJoestarGia
Channel does not have a subscriber button or is missing subscriber info
5: sunglitters
Channel does not have a subscriber button or is missing subscriber info
6: SideArms4Reason
Channel does not have a subscriber button or is missing subscriber info
7: jonsmiff
8: Lexiav
Channel does not have a subscriber button or is missing subscriber info
9: KatGunn
Channel does not have a subscriber button or is missing subscriber info
10: officialwkuk
Channel does not have a subscriber button or is missing subscriber info


Beginning page: 176
--------------------
1: HyperRPG
Channel does n

3: NobletOfU
Channel does not have a subscriber button or is missing subscriber info
4: kanister_mtg
5: Angel
Channel does not have a subscriber button or is missing subscriber info
6: Dangers
Channel does not have a subscriber button or is missing subscriber info
7: Nindiddeh
8: Euriece
Channel does not have a subscriber button or is missing subscriber info
9: LivPosting
10: LuckyChamu
Channel does not have a subscriber button or is missing subscriber info


Beginning page: 186
--------------------
1: Taylor_Jevaux
Channel does not have a subscriber button or is missing subscriber info
2: ZeroxMercy
Channel does not have a subscriber button or is missing subscriber info
3: justketh
Channel does not have a subscriber button or is missing subscriber info
4: duoking1
Channel does not have a subscriber button or is missing subscriber info
5: Natarsha
Channel does not have a subscriber button or is missing subscriber info
6: TSMSword
Channel does not have a subscriber button or is missing 

4: MrSheepLive
Channel does not have a subscriber button or is missing subscriber info
5: Afro
6: PandaDota
Channel does not have a subscriber button or is missing subscriber info
7: mitsuki_tv
Channel does not have a subscriber button or is missing subscriber info
8: Ubisoft
Channel does not have a subscriber button or is missing subscriber info
9: FarmerJohn
Channel does not have a subscriber button or is missing subscriber info
10: DougDougW
Channel does not have a subscriber button or is missing subscriber info


Beginning page: 196
--------------------
1: TweaK
Channel does not have a subscriber button or is missing subscriber info
2: Livibee
Channel does not have a subscriber button or is missing subscriber info
3: XenosysVex
Channel does not have a subscriber button or is missing subscriber info
4: GFuelEnergy
Channel does not have a subscriber button or is missing subscriber info
5: iRiskpvp
6: Covertgoblue
Channel does not have a subscriber button or is missing subscriber info

5: shockist
Channel does not have a subscriber button or is missing subscriber info
6: BHOFILMS
Channel does not have a subscriber button or is missing subscriber info
7: maximarex03
Channel does not have a subscriber button or is missing subscriber info
8: Luminumn
Channel does not have a subscriber button or is missing subscriber info
9: derrekow
Channel does not have a subscriber button or is missing subscriber info
10: AvaGG
Channel does not have a subscriber button or is missing subscriber info


Beginning page: 206
--------------------
1: RNCesus
2: floofyeosang
Channel does not have a subscriber button or is missing subscriber info
3: StremZ
4: rioluTM
Channel does not have a subscriber button or is missing subscriber info
5: bully
6: Just_Relax_Kid
7: geo874
Channel does not have a subscriber button or is missing subscriber info
8: DSGamingGC
Channel does not have a subscriber button or is missing subscriber info
9: iHasCupquake
Channel does not have a subscriber button or is m

KeyboardInterrupt: 

## Counting languages

In [20]:
# Tally the "Language" profile field across every saved streamer JSON file.
datadir = "data/streamer_info"
langs = []
for fname in next(os.walk(datadir))[2]:  # files directly inside datadir only
    with open(datadir + "/" + fname, "r") as f:
        x = json.load(f)
        # "Profile" is a list of single-key dicts; pick the one keyed "Language"
        profile = x["Profile"]
        for d in profile:
            if "Language" in d:
                langs.append(d["Language"])

# report the totals and the per-language counts, most common first
ct = Counter(langs)
print("Number of streamers", len(langs))
print("Number of unique languages", len(ct))
print("\nLanguage counts\n" + "-"*50 + "\n", ct.most_common(), "\n", "-"*50)

Number of streamers 3154
Number of unique languages 28

Language counts
--------------------------------------------------
 [('English', 2030), ('Spanish', 214), ('Portuguese', 182), ('Russian', 133), ('French', 117), ('Korean', 117), ('German', 109), ('Chinese', 48), ('Italian', 46), ('Turkish', 37), ('Polish', 35), ('Japanese', 28), ('Thai', 12), ('Czech', 11), ('Arabic', 8), ('Finnish', 6), ('Hungarian', 4), ('Unknown', 3), ('Slovak', 2), ('Greek', 2), ('Swedish', 2), ('Danish', 2), ('Romanian', 1), ('Malay', 1), ('Indonesian', 1), ('Hindi', 1), ('Vietnamese', 1), ('Dutch', 1)] 
 --------------------------------------------------


## Recoding non-ASCII usernames (e.g. Chinese, Japanese or Korean characters)

In [3]:
# Archive the current usernames (JSON file names without the ".json" suffix),
# one per line, before any of them are recoded.
usernames = [x.split(".json")[0] for x in next(os.walk("data/streamer_info"))[2]]
with open("data/usernames_archive/top-3154-hoursWatched-allLanguages.txt", "w", encoding="utf-8") as f:
    f.write("\n".join(usernames))

In [29]:
# usernames = file names in data/streamer_info with the ".json" suffix removed
usernames = [x.split(".json")[0] for x in next(os.walk("data/streamer_info"))[2]]
def is_ascii(s):
    """Return True when every character of ``s`` has a code point below 128."""
    try:
        s.encode("ascii")
    except UnicodeEncodeError:
        return False
    return True

# Usernames that still contain a non-ASCII character once underscores and
# spaces have been removed.
non_ascii_names = [x for x in usernames if not is_ascii(x.replace("_","").replace(" ",""))]
non_ascii_names[:10]  # preview the first ten

['_연두부_',
 'ぐちつぼ',
 'しゃるる',
 'たぬき忍者',
 'むかい',
 'らいじん',
 'らっだぁ',
 'ウェザーニュース',
 'スタンミ',
 '世界の屁こき隊']

### Attempt 1: search TwitchTracker for username

In [12]:
# initialize driver (a fresh Chrome session for the username lookups)
driver = webdriver.Chrome('./chromedriver.exe')

def write_data(name, non_ascii):
    """Append one ``"<name> <non_ascii>"`` line to ``data/non_ascii_to_ascii.txt``.

    In the lookup loop of this notebook the first argument is the non-ASCII
    display name and the second is the page subtitle found for it, or "NA".
    """
    record = "%s %s\n" % (name, non_ascii)
    with open("data/non_ascii_to_ascii.txt", "a+", encoding="utf-8") as f:
        f.write(record)


# For every non-ASCII display name: search TwitchTracker, open the first
# channel hit and record the page subtitle as the name's replacement.
# "NA" is recorded whenever a step fails.
for i,name in enumerate(non_ascii_names):
    print("Completed:", i/len(non_ascii_names))  # fraction of names processed so far
    url = "https://twitchtracker.com/search?q=%s" % name
    driver.get(url)
    time.sleep(4)
    # "Visit" link in the first row of the channel results table, if any
    search_results = driver.find_elements_by_xpath('.//table[@id="channels-result"]/tbody/tr[1]//a[contains(text(),"Visit")]')
        
    if len(search_results) == 0:
        print("No search results:", name)
        write_data(name, "NA")
        continue  # still on the search page, so there is nothing to go back from

    search_results[0].click()
    time.sleep(4)
    
    try:
        # only accept the page if its <h4> title is exactly the name searched for
        title = driver.find_element_by_xpath('.//h4[text()="%s"]' % name).text
    except NoSuchElementException:
        write_data(name, "NA")
    else:
        if title != name:
            write_data(name, "NA")
        else:
            try:
                # subtitle next to the title; NOTE(review): presumably the ASCII login name - confirm
                subtitle = driver.find_element_by_xpath('.//div[@id="app-title"]/small').text
            except NoSuchElementException:
                print("No subtitle for:", name)
                write_data(name, "NA")
            else:
                write_data(name, subtitle)   
    finally:
        # always return to the previous page before the next lookup
        driver.back()
        time.sleep(3)

Completed: 0.0
Completed: 0.00625
Completed: 0.0125
Completed: 0.01875
Completed: 0.025
Completed: 0.03125
Completed: 0.0375
Completed: 0.04375
Completed: 0.05
Completed: 0.05625
Completed: 0.0625
Completed: 0.06875
Completed: 0.075
Completed: 0.08125
Completed: 0.0875
No search results: 大丸
Completed: 0.09375
No search results: 嬌兔
Completed: 0.1
Completed: 0.10625
No search results: 小舞
Completed: 0.1125
Completed: 0.11875
No search results: 懶貓
Completed: 0.125
No search results: 接接
Completed: 0.13125
Completed: 0.1375
Completed: 0.14375
No search results: 殺梗
Completed: 0.15
Completed: 0.15625
No search results: 湯米
Completed: 0.1625
Completed: 0.16875
Completed: 0.175
Completed: 0.18125
No search results: 球球
Completed: 0.1875
No search results: 紀囧
Completed: 0.19375
Completed: 0.2
No search results: 羅傑
Completed: 0.20625
No search results: 老婆
Completed: 0.2125
No search results: 老皮
Completed: 0.21875
No search results: 薛喜
Completed: 0.225
No search results: 蛇足
Completed: 0.23125
Complet

### Attempt 2: search Twitch.tv website for username

In [23]:
# initialize driver
driver = webdriver.Chrome('./chromedriver.exe')

# Load the name pairs written by the first attempt. pandas parses the "NA"
# placeholder as a missing value, which is what separates resolved names from
# unresolved ones below.
df = pd.read_csv("data/non_ascii_to_ascii.txt", delimiter=" ", header=None, names=["non_ascii","ascii"])
is_missing = df["ascii"].isna()
complete = df.loc[~is_missing]
# Work on an explicit copy and write results back by index label. Assigning
# into the rows yielded by iterrows() is not guaranteed to reach the frame,
# because those rows may be copies.
incomplete = df.loc[is_missing].copy()
# Ensure string results can be stored even if every entry was missing.
incomplete["ascii"] = incomplete["ascii"].astype(object)

for idx, name in incomplete["non_ascii"].items():
    url = "https://www.twitch.tv/search?term=%s" % name
    driver.get(url)
    time.sleep(6)

    # find_elements returns an empty list (it does not raise) when nothing
    # matches, so emptiness is checked explicitly.
    search_results = driver.find_elements_by_xpath('.//div[contains(@class,"search-result")]/a')
    if not search_results:
        print("error getting", name)
        continue

    # the last path segment of the first result's link is used as the ASCII name
    incomplete.at[idx, "ascii"] = search_results[0].get_attribute("href").split("/")[-1]

incomplete.head()

Unnamed: 0,non_ascii,ascii
14,大丸,wtf_winds123
15,嬌兔,zrush
17,小舞,nein1203
19,懶貓,failverde
20,接接,godjj


In [27]:
# Stack the names resolved in attempt 1 and attempt 2, save the mapping as
# CSV, and count the remaining missing values per column.
df_concat = pd.concat([complete, incomplete], axis=0)
df_concat.to_csv("data/non_ascii_to_ascii.csv", index=False)
df_concat.isna().sum()

non_ascii    0
ascii        0
dtype: int64

### Recoding file names in streamer_info/, streamer_logos/ and games/

In [51]:
# Build the lookup dict {non-ASCII name: ASCII replacement} from the saved mapping.
code_df = pd.read_csv("data/non_ascii_to_ascii.csv")
code = {x:y for x,y in zip(code_df["non_ascii"], code_df["ascii"])}

def rename(src, dst):
    """Rename ``src`` to ``dst``; do nothing when ``src`` does not exist."""
    if not os.path.exists(src):
        return
    os.rename(src, dst)
        
# Apply the ASCII replacement everywhere a non-ASCII name is stored.
for name in non_ascii_names:
    # 1) rewrite the "Name" entry inside the streamer's JSON file ...
    with open("data/streamer_info/%s.json" % name, "r", encoding="utf-8") as f:
        data = json.load(f)
        data["Profile"][0]["Name"] = code[name]
        
    with open("data/streamer_info/%s.json" % name, "w", encoding="utf-8") as f:
        json.dump(data, f)
    
    # 2) ... then rename the JSON, logo and games files (missing files are skipped)
    rename("data/streamer_info/%s.json" % name, "data/streamer_info/%s.json" % code[name])
    rename("data/streamer_logos/%s.png" % name, "data/streamer_logos/%s.png" % code[name])
    rename("data/games/%s.csv" % name, "data/games/%s.csv" % code[name])
    
# 3) recode the names in the combined CSV and re-save it sorted by name
df = pd.read_csv("data/streamer_info.csv")
df["name"] = df["name"].replace(code)
df.sort_values("name").to_csv("data/streamer_info.csv", index=False)
    
# 4) recode every line of the archived username lists in place
for fname in glob("data/usernames_archive/*"):
    with open(fname, "r", encoding="utf-8") as f:
        data = f.read().split("\n")
    with open(fname, "w", encoding="utf-8") as f:
        # lines without a mapping are written back unchanged
        f.write( "\n".join(list(map(lambda x: x if x not in code else code[x], data))) )


## Reformatting data & removing subscriber data

In [21]:
def reformat_performance_data(x,idx):
    """Convert the ``idx``-th "Performance" entry of ``x`` to a number.

    Each entry is a single-key dict whose value is a pair of strings; only the
    first element is used and the second (the contrast value) is ignored.
    Thousands separators are removed. Index 5 is returned as a float (it lines
    up with the followers-per-hour column in the column list used later in
    this notebook); every other index is rounded to the nearest int.
    """
    readings = list(x["Performance"][idx].values())[0]
    number = float(readings[0].replace(",",""))
    if idx != 5:
        number = int(round(number))
    return number

def reformat_summary_data(x, idx):
    """Convert the ``idx``-th "Summary" value of ``x`` to an int.

    Thousands separators are removed and any fractional part is truncated.
    """
    entry = x["Summary"][idx]
    raw = list(entry.values())[0]
    return int(float(raw.replace(",", "")))

def reformat_profile_data(x, idx):
    """Convert the ``idx``-th "Profile" value of ``x``.

    Index 1 (ranking) keeps only the digits and becomes an int; index 3
    (creation date, e.g. "Nov 3, 2012") becomes an ISO ``YYYY-MM-DD`` string;
    every other index is returned unchanged.
    """
    raw = list(x["Profile"][idx].values())[0]
    if idx == 1:
        return int("".join(filter(str.isdigit, raw)))
    if idx == 3:
        return datetime.strptime(raw, "%b %d, %Y").strftime("%Y-%m-%d")
    return raw

def reformat_stream_data(x, idx):
    """Convert the ``idx``-th "Average stream" value of ``x`` to a number.

    * index 0: parsed as a float.
    * index 1: an int; a trailing "K" or "M" scales the number by 10**3 or
      10**6, and thousands separators are removed.
    * index 2: an int with thousands separators removed.
    * any other index: a duration such as "8.1 hrs", returned as a float.
      If the text does not end in "hrs", the streamer's name is appended to
      ``errors.txt`` and ``None`` is returned, so the caller gets a missing
      value instead of a crash.
    """
    raw = list(x["Average stream"][idx].values())[0]
    if idx == 0:
        return float(raw)

    text = raw.replace(",", "")
    if idx == 1:
        multipliers = {"K": 10**3, "M": 10**6}
        suffix = text[-1:]
        if suffix in multipliers:
            # round rather than truncate: e.g. 1.005 * 1000 is 1004.9999999999999
            return int(round(float(text[:-1]) * multipliers[suffix]))
        return int(text)
    if idx == 2:
        return int(text)

    if text.endswith("hrs"):
        # The text is scraped from a web page, so it is parsed with float()
        # instead of being passed to eval().
        return float(text[:-3])

    # Unexpected format: log one line per offending streamer and carry on.
    with open("errors.txt", "a+", encoding="utf-8") as g:
        g.write("Error: " + list(x["Profile"][0].values())[0] + "\n")
    return None

def reformat_metrics_data(x, idx):
    """Convert the ``idx``-th "Various metrics" value of ``x``.

    * index 0: text such as "4.9 / 7"; the part before the slash is returned
      as a float.
    * index 1: an int (thousands separators are tolerated).
    * any other index: returned unchanged as a string.
    """
    raw = list(x["Various metrics"][idx].values())[0]
    if idx == 0:
        # The text is scraped from a web page, so it is parsed with float()
        # instead of being passed to eval().
        return float(raw.split("/")[0])
    if idx == 1:
        return int(raw.replace(",", ""))
    return raw

In [39]:
def reformat_user(x):
    """Flatten one streamer's JSON record ``x`` into a single list of values.

    The sections are processed in the order Performance, Summary, Profile,
    Average stream, Various metrics. Every entry of each section is converted
    by the matching ``reformat_*`` function. "Subscribers" is not included.
    """
    converters = (
        ("Performance", reformat_performance_data),
        ("Summary", reformat_summary_data),
        ("Profile", reformat_profile_data),
        ("Average stream", reformat_stream_data),
        ("Various metrics", reformat_metrics_data),
    )
    row = []
    for section, convert in converters:
        for position in range(len(x[section])):
            row.append(convert(x, position))
    return row

# Flatten every saved streamer JSON file into one row of values.
all_data = []
for fname in next(os.walk("data/streamer_info"))[2]:
    with open("data/streamer_info/" + fname, "r") as f:
        x = json.load(f)
#         print(x["Profile"][0]["Name"])
        all_data.append(reformat_user(x))
        

# Column names in the order reformat_user emits values. The "*_inc" names
# stand for the contrast values, which reformat_performance_data no longer
# returns, so they are filtered out right after the list.
cols = [
    'rec_hours_streamed',
    'rec_hours_streamed_inc',
    'rec_avg_viewers',
    'rec_avg_viewers_inc',
    'rec_peak_viewers',
    'rec_peak_viewers_inc',
    'rec_hours_watched',
    'rec_hours_watched_inc',
    'rec_followers_gained',
    'rec_followers_gained_inc',
    'rec_followers_per_hour',
    'rec_followers_per_hour_inc',
    'rec_views_gained',
    'rec_views_gained_inc',
    'rec_number_of_streams',
    'rec_number_of_streams_inc',
    'total_hours_streamed',
    'career_peak_viewers',
    'total_followers',
    'total_views',
    'name',
    'ranking',
    'language',
    'creation_date',
    'partnered',
    'mature_content',
    'channel_description',
    'avg_games_streamed',
    'avg_views_gained',
    'avg_followers_gained',
    'avg_stream_duration',
    'active_days_per_week',
    'unique_games_played',
    'stream_start',
    'total_activity']
cols = [c for c in cols if c[-4:] != "_inc"]

df = pd.DataFrame(data=all_data, columns=cols)
# set_index + reset_index moves "name" to the first column
df = df.set_index("name").reset_index()

In [40]:
# sanity check: (number of streamers, number of columns)
df.shape

(3154, 27)

In [41]:
# Spot-check a single streamer with pandas' row/column display limits lifted.
with pd.option_context('display.max_rows', None, 'display.max_columns', None):  # more options can be specified also
    display(df[df["name"]=="shroud"])

Unnamed: 0,name,rec_hours_streamed,rec_avg_viewers,rec_peak_viewers,rec_hours_watched,rec_followers_gained,rec_followers_per_hour,rec_views_gained,rec_number_of_streams,total_hours_streamed,career_peak_viewers,total_followers,total_views,ranking,language,creation_date,partnered,mature_content,channel_description,avg_games_streamed,avg_views_gained,avg_followers_gained,avg_stream_duration,active_days_per_week,unique_games_played,stream_start,total_activity
2334,shroud,565,30577,516289,17304543,1182816,2090.0,32210765,64,8071,516289,8395134,399605110,43,English,2012-11-03,Yes,No,I'm back baby,2.3,389000,8005,8.1,4.9,227,19:30,1000 of 1442 days


In [42]:
# write the combined table for all streamers (overwrites any existing file)
df.to_csv("data/streamer_info.csv", mode="w", index=False)

## English streams

In [51]:
# keep rows whose language is "English" and whose name is not in non_ascii_names
df_eng = df.loc[(df["language"]=="English") & (~df["name"].isin(non_ascii_names))]

In [44]:
# write the English-only subset (overwrites any existing file)
df_eng.to_csv("data/streamer_info_eng.csv", mode="w", index=False)