In [1]:
import json
import math
import os
import time
from datetime import datetime
from multiprocessing import Pool
from urllib.parse import quote_plus

import pandas as pd
import requests
from bs4 import BeautifulSoup
from sqlalchemy import *
from tqdm.auto import tqdm

#### Good to have

In [2]:
def split_list(l, n):
    """Yield consecutive slices of ``l`` that hold at most ``n`` items each.

    The last slice is shorter when ``len(l)`` is not a multiple of ``n``.
    """
    starts = range(0, len(l), n)
    for start in starts:
        stop = start + n
        yield l[start:stop]


In [3]:
# Minimal tqdm demo: wrapping an iterable in tqdm() shows a progress bar while looping.
for i in tqdm(range(10)):
    time.sleep(.3)  # pause on every iteration (presumably so the bar's progress is visible)




#### Get user inventory
##### Option A: For Loop

In [4]:
# Read every line of the user-id file. The file is opened in binary mode ('rb'),
# so each entry is a bytes object that still carries its trailing newline.
path_user_id = 'data/steam_user_id.txt'
with open(path_user_id, 'rb') as f:
    lst_user_id = f.readlines()

# Peek at the first five raw entries.
lst_user_id[:5]

[b'76561198158086086\n',
 b'76561198074188133\n',
 b'76561198058088990\n',
 b'76561198175177483\n',
 b'76561198042649112\n']

In [5]:
# strip() with no arguments removes leading and trailing whitespace (newlines, spaces, tabs).
'\n76561198158086086 \n\n\t'.strip()

'76561198158086086'

In [6]:
# Option A: fetch the owned-games list for the first five users, one request at a time.
# HTTPS keeps the API key (sent as a query parameter) out of clear text.
base_url = 'https://api.steampowered.com/IPlayerService/GetOwnedGames/v0001/'
# The key can be supplied through the STEAM_API_KEY environment variable.
# NOTE(review): the literal fallback keeps the notebook runnable as before, but a
# key committed to a notebook should be rotated and the fallback removed.
api_key = os.environ.get('STEAM_API_KEY', 'D0C62157A8941F12A687382B6D635449')

for user_id in lst_user_id[:5]:
    params = {
        'key' : api_key,
        'steamid' : user_id.strip(),
        'format' : 'json'
    }
    try:
        # A timeout prevents one stalled connection from hanging the whole loop.
        r = requests.get(base_url, params = params, headers = {}, timeout = 30)
        r.raise_for_status()
        # 'response' may be missing or empty; 'games' is absent when no list is returned.
        user_inventory = (r.json().get('response') or {}).get('games')
    except (requests.RequestException, ValueError) as e:
        # Network/HTTP errors and non-JSON bodies are reported; the loop moves on.
        print('request failed for', user_id, ':', e)
        user_inventory = None
    time.sleep(.5)
    print(user_id, '\n', user_inventory)

b'76561198158086086\n' 
 None
b'76561198074188133\n' 
 [{'appid': 4000, 'playtime_forever': 3415, 'playtime_windows_forever': 0, 'playtime_mac_forever': 0, 'playtime_linux_forever': 0}, {'appid': 34030, 'playtime_forever': 16526, 'playtime_windows_forever': 0, 'playtime_mac_forever': 0, 'playtime_linux_forever': 0}, {'appid': 42680, 'playtime_forever': 4631, 'playtime_windows_forever': 0, 'playtime_mac_forever': 0, 'playtime_linux_forever': 0}, {'appid': 42690, 'playtime_forever': 11055, 'playtime_windows_forever': 0, 'playtime_mac_forever': 0, 'playtime_linux_forever': 0}, {'appid': 207610, 'playtime_forever': 126, 'playtime_windows_forever': 0, 'playtime_mac_forever': 0, 'playtime_linux_forever': 0}, {'appid': 50300, 'playtime_forever': 625, 'playtime_windows_forever': 0, 'playtime_mac_forever': 0, 'playtime_linux_forever': 0}, {'appid': 104900, 'playtime_forever': 173, 'playtime_windows_forever': 0, 'playtime_mac_forever': 0, 'playtime_linux_forever': 0}, {'appid': 227300, 'playtime

##### Option B: Multiprocessing

In [None]:
# What is multiprocessing?
# Multiprocessing vs. threading; queues

In [7]:
# Re-read the user-id file, this time in text mode ('r'), and keep only the
# first 50 lines. Each entry is a str that still ends with its newline.
path_user_id = 'data/steam_user_id.txt'
with open(path_user_id, 'r') as f:
    lst_user_id = f.readlines()[:50]

In [8]:
def worker(lst_user_id_temp):
    """Fetch the owned-games list for every Steam user id in one batch.

    Args:
        lst_user_id_temp: iterable of user-id strings; surrounding whitespace
            (such as a trailing newline) is stripped before use.

    Returns:
        dict mapping each stripped user id to the ``games`` value of the API
        response. The value is None when the response carries no ``games``
        entry or when the request for that user failed; a failure is printed
        and does not abort the rest of the batch.
    """
    # HTTPS keeps the API key (sent as a query parameter) out of clear text.
    base_url = 'https://api.steampowered.com/IPlayerService/GetOwnedGames/v0001/'
    # NOTE(review): the literal fallback keeps existing runs working, but a key
    # committed to a notebook should be rotated and the fallback removed.
    api_key = os.environ.get('STEAM_API_KEY', 'D0C62157A8941F12A687382B6D635449')

    dic_temp = {}
    for user_id in tqdm(lst_user_id_temp, leave=False):
        steam_id = user_id.strip()
        params = {
            'key' : api_key,
            'steamid' : steam_id,
            'format' : 'json' }
        try:
            # A timeout keeps one stalled connection from blocking the worker forever.
            r = requests.get(base_url, params = params, timeout = 30)
            r.raise_for_status()
            user_inventory = (r.json().get('response') or {}).get('games')
        except (requests.RequestException, ValueError) as e:
            # Network/HTTP errors and non-JSON bodies: record None and continue.
            print('request failed for', steam_id, ':', e)
            user_inventory = None
        dic_temp[steam_id] = user_inventory
        time.sleep(.5)
    return dic_temp

In [9]:
# Crawl in chunks of 10 users. Each chunk is split into two lists of 5 that the
# two pool processes handle in parallel; the partial dicts are merged into
# dic_master. The context manager shuts the worker processes down once the
# crawl is finished (or if it raises) instead of leaving them running.
dic_master = {}
with Pool(2) as p:
    for i in tqdm(list(split_list(lst_user_id,10))):
        lst_temp_dic = p.map(worker, split_list(i,5))
        for j in lst_temp_dic:
            dic_master.update(j)

        # Pause between chunks to stay gentle on the API.
        time.sleep(5)




In [10]:
# Persist the crawl result: one JSON object {user_id: inventory} per line.
with open('data/crawled_user_inventory.txt', 'w') as f:
    for user_id, user_inventory in dic_master.items():
        line = json.dumps({str(user_id): user_inventory})
        f.write(line + '\n')

### Web Crawler II
#### 1) Rate limit
#### 2) Headers, cookies
#### 3) Multiprocessing / Threading
#### 4) Selenium

In [11]:
# Fetch the page with a plain HTTP request and look for the video-title element.
# The search returns an empty list here — presumably because the title is
# inserted by JavaScript, which a plain requests fetch never executes (TODO confirm).
r = requests.get('https://www.youtube.com/watch?v=h31myLyc_qk')
soup = BeautifulSoup(r.text, 'lxml')
soup.find_all('yt-formatted-string', {'class':'style-scope ytd-video-primary-info-renderer'})

[]

In [12]:
from selenium import webdriver

In [13]:
# https://chromedriver.chromium.org/
# Location of the chromedriver binary. Set the CHROMEDRIVER_PATH environment
# variable to use another location; the fallback is the path this notebook was
# originally written with.
chromedriver_path = os.environ.get('CHROMEDRIVER_PATH', '/Users/alanliu/chromedriver')
driver = webdriver.Chrome(chromedriver_path)

In [14]:
# Open the video page in the Selenium-controlled browser.
driver.get('https://www.youtube.com/watch?v=h31myLyc_qk')

In [15]:
# Parse the HTML as currently held by the browser and read the title text.
# Note: find() returns None when no element matches, in which case .string raises AttributeError.
soup = BeautifulSoup(driver.page_source, 'lxml')
soup.find('yt-formatted-string', {'class':'style-scope ytd-video-primary-info-renderer'}).string

'Civilization VI: Rise and Fall - New Features Explained (Full Details)'

#### Get app info

In [16]:
# get all available app id
url = 'https://api.steampowered.com/ISteamApps/GetAppList/v2/'
r = requests.get(url)
dic_app_list = r.json()
lst_app_id = [i.get('appid') for i in dic_app_list.get('applist').get('apps')]
len(lst_app_id)

90393

In [17]:
# Crawl the store details of a few sample apps and save one JSON document per line.
current_count = 0  # number of apps requested so far; drives the long rate-limit pause
path_app_detail_sample = 'app_detail_sample.txt'
with open(path_app_detail_sample, 'w') as f:
    for app_id in tqdm(lst_app_id[:5]):
        url_app_detail = ('http://store.steampowered.com/api/appdetails?appids=%s') % (app_id)

        # Try up to three times. `fetched` stays False when every attempt fails,
        # so a failed app is skipped instead of re-writing the previous app's data.
        fetched = False
        for attempt in range(3):
            try:
                # A timeout turns a stalled connection into a retry instead of a hang.
                r = requests.get(url_app_detail, timeout=30)
                result = r.json()
                fetched = True
                break
            except Exception as e:
                print(e)
                time.sleep(5)

        if fetched:
            f.write(json.dumps(result))
            f.write('\n')
        else:
            print('giving up on app', app_id)

        # Count every processed app so the long pause really happens on each 200th one.
        current_count += 1
        if current_count % 200 == 0:
            time.sleep(300)
        else:
            time.sleep(.5)




In [18]:
path_app_detail = 'data/app_detail.txt'

# Platform columns the result dict knows about; other platform keys are ignored.
_PLATFORM_COLUMNS = ('windows', 'mac', 'linux')


def _parse_release_date(release_info):
    """Return the release date as a datetime, or None if unreleased, empty or unparseable."""
    if release_info.get('coming_soon') is not False:
        return None
    raw_date = release_info.get('date')
    if not raw_date:
        return None
    # The dump contains both 'Jan 10, 2019' and '10 Jan, 2019' style dates.
    for date_format in ('%b %d, %Y', '%d %b, %Y'):
        try:
            return datetime.strptime(raw_date, date_format)
        except (ValueError, TypeError):
            continue
    return None


def _parse_app_record(raw_string):
    """Parse one line of the dump into ``(steam_id, {column: value})``.

    Returns None when the line reports ``success`` as false or carries no data.
    """
    app_data = list(json.loads(raw_string).values())[0]
    if not app_data.get('success'):
        return None
    app_data = app_data.get('data')
    if not app_data:
        return None

    steam_id = app_data.get('steam_appid')

    initial_price = (app_data.get('price_overview') or {}).get('initial')
    if app_data.get('is_free') is True:
        initial_price = 0

    fields = {
        'initial_price': initial_price,
        'name': app_data.get('name'),
        'score': (app_data.get('metacritic') or {}).get('score'),
        'type': app_data.get('type'),
        # Computed fresh for every record, so an unreleased app gets None rather
        # than whatever date an earlier record happened to leave behind.
        'release_date': _parse_release_date(app_data.get('release_date') or {}),
        'recommendation': (app_data.get('recommendations') or {}).get('total'),
        'header_image': app_data.get('header_image'),
    }
    for platform, is_supported in (app_data.get('platforms') or {}).items():
        if platform in _PLATFORM_COLUMNS:
            fields[platform] = 1 if is_supported is True else 0
    return steam_id, fields


# Column-oriented result: {column: {steam_id: value}}.
dic_steam_app = {
    'initial_price':{},
    'name':{},
    'score':{},
    'windows':{},
    'mac':{},
    'linux':{},
    'type':{},
    'release_date':{},
    'recommendation':{},
    'header_image':{}
}

with open(path_app_detail, 'r') as f:
    lst_raw_string = f.readlines()[:200]

for raw_string in tqdm(lst_raw_string):
    try:
        record = _parse_app_record(raw_string)
    except (ValueError, AttributeError, IndexError, TypeError):
        # Malformed line or unexpected payload shape: skip this record only.
        record = None
    if record is None:
        continue
    # All columns are written together, so a record is never stored half-way.
    steam_id, fields = record
    for column, value in fields.items():
        dic_steam_app[column].update({steam_id: value})





#### Work with MySQL in Python

In [19]:
# Build the table from the column-oriented dict; the app ids form the index,
# which is named 'app_id' and then moved into a regular column.
df_app_info = pd.DataFrame(dic_steam_app)
df_app_info.index.name = 'app_id'
df_app_info = df_app_info.reset_index()
df_app_info.head()

Unnamed: 0,app_id,header_image,initial_price,linux,mac,name,recommendation,release_date,score,type,windows
0,1005040,https://steamcdn-a.akamaihd.net/steam/apps/100...,299.0,0,0,Big Crown®: Showdown - OST,,2019-01-10,,dlc,1
1,1005080,https://steamcdn-a.akamaihd.net/steam/apps/100...,,0,0,Yukinas Diary,,2019-01-10,,game,1
2,1005090,https://steamcdn-a.akamaihd.net/steam/apps/100...,,0,0,传送到异世界开后宫,,2019-01-10,,game,1
3,1005160,https://steamcdn-a.akamaihd.net/steam/apps/100...,999.0,0,1,Fantasy Grounds - Meanders Map Pack: Spaceport...,,2019-01-15,,dlc,1
4,1005270,https://steamcdn-a.akamaihd.net/steam/apps/100...,0.0,0,0,Spectrum's Path Demo,,2019-01-15,,demo,1


In [20]:
# MySQL connection settings. Each value can be supplied through an environment
# variable so credentials need not be typed into the notebook; the defaults are
# the values this cell used before.
user = os.environ.get('STEAM_DB_USER', '')
password = os.environ.get('STEAM_DB_PASSWORD', '')
host = os.environ.get('STEAM_DB_HOST', '127.0.0.1')
db_name = os.environ.get('STEAM_DB_NAME', 'steam')
# User and password are URL-escaped so characters such as '@', ':' or '/'
# cannot corrupt the connection URL.
engine = create_engine(
    f'mysql+pymysql://{quote_plus(user)}:{quote_plus(password)}@{host}/{db_name}?charset=utf8mb4'
)

In [21]:
# Write the DataFrame to table 'tbl_app_info', replacing the table if it
# already exists; the DataFrame index is not stored (index=False).
df_app_info.to_sql('tbl_app_info', engine, if_exists='replace',index=False)

In [22]:
# Read a few rows back through SQLAlchemy itself. The query runs on an explicit
# connection and is wrapped in text(): Engine.execute() with a raw string is
# deprecated in SQLAlchemy 1.4 and removed in 2.0, while this form works on both.
with engine.connect() as conn:
    rows = conn.execute(text(
    '''
    select * from tbl_app_info limit 10
    ''')).fetchall()
rows

[(1005040, 'https://steamcdn-a.akamaihd.net/steam/apps/1005040/header.jpg?t=1547113290', 299.0, 0, 0, 'Big Crown®: Showdown - OST', None, datetime.datetime(2019, 1, 10, 0, 0), None, 'dlc', 1),
 (1005080, 'https://steamcdn-a.akamaihd.net/steam/apps/1005080/header.jpg?t=1547149885', None, 0, 0, 'Yukinas Diary', None, datetime.datetime(2019, 1, 10, 0, 0), None, 'game', 1),
 (1005090, 'https://steamcdn-a.akamaihd.net/steam/apps/1005090/header.jpg?t=1547545353', None, 0, 0, '传送到异世界开后宫', None, datetime.datetime(2019, 1, 10, 0, 0), None, 'game', 1),
 (1005160, 'https://steamcdn-a.akamaihd.net/steam/apps/1005160/header.jpg?t=1547584941', 999.0, 0, 1, 'Fantasy Grounds - Meanders Map Pack: Spaceport (Map Pack)', None, datetime.datetime(2019, 1, 15, 0, 0), None, 'dlc', 1),
 (1005270, 'https://steamcdn-a.akamaihd.net/steam/apps/1005270/header.jpg?t=1547345726', 0.0, 0, 0, "Spectrum's Path Demo", None, datetime.datetime(2019, 1, 15, 0, 0), None, 'demo', 1),
 (1005390, 'https://steamcdn-a.akamaihd.n

In [23]:
# Same query through pandas: the result comes back as a DataFrame.
pd.read_sql_query('''
    select * from tbl_app_info limit 10
''', engine)

Unnamed: 0,app_id,header_image,initial_price,linux,mac,name,recommendation,release_date,score,type,windows
0,1005040,https://steamcdn-a.akamaihd.net/steam/apps/100...,299.0,0,0,Big Crown®: Showdown - OST,,2019-01-10,,dlc,1
1,1005080,https://steamcdn-a.akamaihd.net/steam/apps/100...,,0,0,Yukinas Diary,,2019-01-10,,game,1
2,1005090,https://steamcdn-a.akamaihd.net/steam/apps/100...,,0,0,传送到异世界开后宫,,2019-01-10,,game,1
3,1005160,https://steamcdn-a.akamaihd.net/steam/apps/100...,999.0,0,1,Fantasy Grounds - Meanders Map Pack: Spaceport...,,2019-01-15,,dlc,1
4,1005270,https://steamcdn-a.akamaihd.net/steam/apps/100...,0.0,0,0,Spectrum's Path Demo,,2019-01-15,,demo,1
5,1005390,https://steamcdn-a.akamaihd.net/steam/apps/100...,,0,0,Anti-Grav Bamboo-copter,,2019-01-02,,game,1
6,1005750,https://steamcdn-a.akamaihd.net/steam/apps/100...,1699.0,1,1,Murderers and their Mothers,,2019-01-04,,series,1
7,1005770,https://steamcdn-a.akamaihd.net/steam/apps/100...,199.0,0,0,Murderers and their Mothers: Daniel Bartlam: T...,,2019-01-04,,episode,1
8,1005790,https://steamcdn-a.akamaihd.net/steam/apps/100...,699.0,0,1,The Pirate's Fate - Prisoner of Destiny Expansion,,2019-01-04,,dlc,1
9,1005810,https://steamcdn-a.akamaihd.net/steam/apps/100...,199.0,0,0,Murderers and their Mothers: Fred and Rose Wes...,,2019-01-04,,episode,1
