# PROJECT - API WEB SCRAPPING

## 1.- API 

## Retrieve set cards from API

In [1]:
#import libraries

import json
import requests
import pandas as pd
import numpy as np
from pandas.io.json import json_normalize
from bs4 import BeautifulSoup
import time

In [2]:
# extract sets from scryfall API

complete = []
url = 'https://api.scryfall.com/sets'
response = requests.get(url)
results = response.json()
x = json_normalize(results)
flattened_data1 = json_normalize(x.data[0])
complete.append(flattened_data1)
scryfall = pd.concat(complete, sort=False)

In [3]:
scryfall.head()

Unnamed: 0,object,id,code,name,uri,scryfall_uri,search_uri,released_at,set_type,card_count,parent_set_code,digital,foil_only,icon_svg_uri,tcgplayer_id,mtgo_code,arena_code,block_code,block
0,set,8fe3f935-7c8d-4a4e-a051-c0b0f251d262,tund,Unsanctioned Tokens,https://api.scryfall.com/sets/8fe3f935-7c8d-4a...,https://scryfall.com/sets/tund,https://api.scryfall.com/cards/search?order=se...,2020-02-29,token,6,und,False,False,https://img.scryfall.com/sets/default.svg?1581...,,,,,
1,set,fccfdf97-f5f2-43b4-9be9-9255414e6633,und,Unsanctioned,https://api.scryfall.com/sets/fccfdf97-f5f2-43...,https://scryfall.com/sets/und,https://api.scryfall.com/cards/search?order=se...,2020-02-29,funny,96,,False,False,https://img.scryfall.com/sets/default.svg?1581...,2598.0,,,,
2,set,66d787e4-101d-4f72-a4ed-7c38df9b99fe,pthb,Theros Beyond Death Promos,https://api.scryfall.com/sets/66d787e4-101d-4f...,https://scryfall.com/sets/pthb,https://api.scryfall.com/cards/search?order=se...,2020-01-24,promo,136,thb,False,False,https://img.scryfall.com/sets/thb.svg?1581915600,,,,,
3,set,200c397b-bf57-46a2-8ebf-592148fd49a4,tthb,Theros Beyond Death Tokens,https://api.scryfall.com/sets/200c397b-bf57-46...,https://scryfall.com/sets/tthb,https://api.scryfall.com/cards/search?order=se...,2020-01-24,token,14,thb,False,False,https://img.scryfall.com/sets/thb.svg?1581915600,,,,,
4,set,5f23a78d-cda1-462a-8be3-a62b40c34913,thb,Theros Beyond Death,https://api.scryfall.com/sets/5f23a78d-cda1-46...,https://scryfall.com/sets/thb,https://api.scryfall.com/cards/search?order=se...,2020-01-24,expansion,358,,False,False,https://img.scryfall.com/sets/thb.svg?1581915600,2568.0,thb,thb,,


In [4]:
#extract the legal core and expansion sets which are from Return to Ravnica set to actual
core = scryfall[scryfall['set_type'] == 'core']
expansion = scryfall[scryfall['set_type'] == 'expansion']
set_list = pd.concat([core, expansion])
set_list = set_list.filter(['code', 'name', 'released_at', 'block', 'card_count'])

# the initial sets include from 2012-10-05 (Return to Ravnica set) to actual set
pioneer_sets = set_list[set_list['released_at'] >= '2012-10-05']

#append 'Welcome Decks' which are also legal
pioneer_sets = pd.concat([pioneer_sets, set_list.filter(like='Welcome Deck', axis=0)])
pioneer_sets = pioneer_sets.sort_values(by=['released_at'])
pioneer_sets['code'] = pioneer_sets['code'].apply(lambda x: x.upper())
pioneer_sets = pioneer_sets.set_index('code',inplace=False).fillna('Unknown')
pioneer_sets.columns = ['Set Name', 'Release Date', 'Block', 'Total Cards']
pioneer_sets['Total Cards'].sum()

7795

## Retrieve the list of all cards from the legal sets

In [None]:
# Retrieve the number of pages needed for request
total_cards_set = pioneer_sets['Total Cards'].to_list()
page_numbers = [i // 100 + 1 for i in total_cards_set]

complete = []
for i in pioneer_sets.index:
    for j in range(1,page_numbers[0]+1):
        url='https://api.magicthegathering.io/v1/cards?page='+str(j)+'&set='+str(i)
        response = requests.get(url)
        time.sleep(0.25)
        results = response.json()
        x = json_normalize(results)
        flattened_data1 = json_normalize(x.cards[0])
        complete.append(flattened_data1)
        
data = pd.concat(complete, sort=False, ignore_index=True)
data

In [None]:
data.columns

In [None]:
# Select only columns with valuable info

cards = data[['name', 'manaCost', 'cmc', 'colors', 'type', 'rarity', 'set', 'text', 'power', 'toughness', 'loyalty']]

In [None]:
# Extract the list of banned cards

url = 'https://magic.wizards.com/es/game-info/gameplay/formats/pioneer'

def get_links(url):
    '''
    This function takes a url of a MTG format and
    scrappes the list of the cards that are banned
    '''
    html = requests.get(url).content
    soup = BeautifulSoup(html, features='html.parser')
    links = soup.find_all('a', {'class':'autocard-link'})
    
    # from each link take the 2 second last 
    # (which contains the name of the card)
    # and strip the last character that are related with
    
    reference = [str(i).split('>')[-2][:-3] for i in links]
    return reference

banned_cards = get_links(url)

In [None]:
print(banned_cards)

## Extract the matches results from the last competitive tournaments 

In [None]:
def get_matches(city):
    '''
    This function takes the name of a city an raises an error if the city is not valid.
    In case the city if one of the cities (currently brussels, nagoya and phoenix) that had a tournament, 
    creates a dataframe with all the different matches.
    '''
    tabs = []
    # check the city before passing to the url
    if city.lower() == 'brussels':
        range_max = 17
    elif (city.lower() == 'nagoya') | (city.lower() == 'phoenix'):
        range_max = 16
    else:
        raise ValueError('The city provided is not valid')
        
    for i in range(1,range_max):
        url = 'https://magic.gg/news/players-tour-'+str(city)+'-2020-round-'+str(i)+'-results'
        html = requests.get(url).content
        soup = BeautifulSoup(html, features='html.parser')
        table = soup.find_all('div', {'class': 'table-wrapper'})[0].find_all('tr')
    
        for tr in table:
            tabs.append({
                'Player': tr.find_all('td')[1].string.rstrip().title(),
                'Opponent': tr.find_all('td')[5].string.title(),
                'Match': tr.find_all('td')[3].string
                })
    return pd.DataFrame(tabs)

In [None]:
brussels_matches = get_matches('brussels')
brussels_matches.head()

In [None]:
# nagoya matches
nagoya_matches = get_matches('nagoya')
nagoya_matches.head()

In [None]:
# phoenix matches
phoenix_matches = get_matches('phoenix')
phoenix_matches.head()

## Extract the decklist the different player at each tournament

In [None]:
def get_decklist(city):
    '''
    This function takes the name of a city an raises an error if the city is not valid.
    In case the city if one of the cities (currently brussels, nagoya and phoenix) that had a tournament, 
    creates a dataframe with all the decklist played by the different players.
    
    After extracting data, it makes a transformation because card frequency and card list are extracted as lists. 
    Therefore, for each card, it creates a new column and stores its number of copies in each deck.
    '''
    # check if the city is correct
    
    if city.lower() == 'nagoya':
        range_max = 4
    elif (city.lower() == 'brussels') | (city.lower() == 'phoenix'):
        range_max = 6
    else:
        raise ValueError('The city provided is not valid')
        
    card_list = []
    card_count = []
    span_cards = []
    player_deck = []
    
    for s in range(1,range_max):
        # extract the data from web
        url = 'https://magic.wizards.com/en/events/coverage/players-tour-'+str(city)+'-2020-decklists-'+str(s)
        html = requests.get(url).content
        soup = BeautifulSoup(html, features='html.parser')
        player = [str(title.text) for title in soup.find_all('h4')]
        # append the results
        player_deck.append(player)
       
        for deck in soup.find_all('div', {'class':'deck-list-text'}):
            a_tag = deck.find_all('span', {'class':'card-name'})
            count_tag = deck.find_all('span', {'class':'card-count'})
            name = [str(tag).split('>')[-3][:-3] for tag in a_tag]
            count = [tag.text for tag in count_tag]
        
        # change non cards 'span class'
            span = [str(tag).split('>')[-2][:-6] for tag in a_tag]
    
        #append the results
            card_list.append(name)
            card_count.append(count)
            span_cards.append(span)

    for i, deck in enumerate(card_list):
        for j, card in enumerate(deck):
            if '<span class' in card:
                card_list[i][j] = span_cards[i][j]


    player_list = [(i.split(',')[0] + str(',') + i.split(',')[1].split('-')[0].rstrip()).title() 
                   for page in player_deck for i in page]
    deck_list = ['-'.join(i.split(',')[1].split('-')[1:]) for page in player_deck for i in page]


    decklist = pd.DataFrame([player_list, deck_list, card_count, card_list], index=['Player', 'Deck', 'Copies', 'Cards'])
    decklist = decklist.T
    
    ### Extract all the cards stored in card list as new columns in the dataframe and assign card count as values ###
    
    #Dict comprehension with key = card name, value = card count
    card_frequency = [dict(zip(card_list[i], card_count[i])) for i in range(len(card_list))]

    card_values =  pd.DataFrame([card_frequency], index=['cards'])

    # extract the dict values and create one column for each card
    card_values = card_values.T.cards.apply(pd.Series).fillna(0) 

    #reorder the columns alphabetically
    card_values = card_values.reindex(sorted(card_values.columns), axis=1)
    
    decklist = pd.concat([decklist.Player, decklist.Deck, card_values], axis=1, sort=False)
    
    return decklist

        

In [None]:
brussels_decklist = get_decklist('brussels')
brussels_decklist.head()

In [None]:
nagoya_decklist = get_decklist('nagoya')
nagoya_decklist.head()

In [None]:
phoenix_decklist = get_decklist('phoenix')
phoenix_decklist.head()

### Given the decklists dataframe, calculate the most played card (number of copies) for each tournament

In [None]:
def most_played_cards(decklist):
    '''
    This function takes a decklist type dataframe 
    and returns the most played card of this tournament
    '''
    # extract the name of all cards played
    card_columns = decklist.columns[2:]
    
    # data type
    decklist[card_columns] = decklist[card_columns].astype('int32')

    return decklist[card_columns].apply(np.sum, axis=0).sort_values(ascending=False)

In [None]:
brussels_played = most_played_cards(brussels_decklist)
brussels_played

In [None]:
nagoya_played = most_played_cards(nagoya_decklist)
nagoya_played

In [None]:
phoenix_played = most_played_cards(phoenix_decklist)
phoenix_played

### Append the results to the total Cards Legal retrieved from API

In [None]:
# retrieve the list of the most played cards

total_played = pd.concat([brussels_played, nagoya_played, phoenix_played], axis=1, sort=True)
total_played.columns = ['Brussels', 'Nagoya', 'Phoenix']
total_played = total_played.fillna(0)
total_played = total_played.astype('int32')
total_played['Total copies'] = total_played.sum(axis=1, skipna = True)
total_played.sort_values(by='Total copies', ascending=False)
total_played = total_played.reset_index()
total_played.columns = ['Name', 'Brussels', 'Nagoya', 'Phoenix', 'Total copies']
total_played

In [None]:
cards = pd.merge(cards, total_played, left_on='name', right_on='Name')
cards = cards.drop(['Brussels_x', 'Nagoya_x', 'Phoenix_x', 'Total copies_x', 'Name'], axis=1)
cards.columns = ['Card Name', 'Mana Cost', 'CMC', 'Colors', 'Type', 'Rarity', 'Set', 'Text', 'Power', 'Toughness', 'Loyalty', 
                 'Brussels copies', 'Nagoya copies', 'Phoenix copies', 'Total copies']

cards.head()

In [None]:
'''

#transform result table into classification
brussels_matches['Result'] = brussels_matches['Match'].str.split('\s').str.get(0)
brussels_matches['GW'] = brussels_matches['Match'].str.split('\s').str.get(1).str.split('-').str.get(0)
brussels_matches['GL'] = brussels_matches['Match'].str.split('\s').str.get(1).str.split('-').str.get(1)

#brussels_round['Player']

### Replace Player - Opponent with Deck from Brussel Matches

#Dict for mapping the player-deck
player_deck_dict = dict(zip(player_list, deck_list))

# I was lost without a map... 
brussels_matches['Opponent'] = brussels_matches['Opponent'].map(player_deck_dict)
brussels_matches['Player'] = brussels_matches['Player'].map(player_deck_dict)

brussels_matches['Player'].value_counts(dropna=False)

'''

In [None]:
######################################################

In [None]:
# transform type to integer
#nagoya_round = nagoya_round.fillna(0).astype({'GW':'int32', 'GL':'int32'})
#player_agg = nagoya_round.groupby('Player')['GW','GL'].agg('sum')
#opponent_agg = nagoya_round.groupby('Opponent')['GW','GL'].agg('sum')

#player_agg
#opponent_agg


## Perform stadistical analysis

In [None]:
## 1) Most played card
## 2) The last set was impactfull? (% of copies / total)


## 5) Most played decks
## 6) Matrix of win rate of each deck --> Boxplot 