In [None]:
import requests
from lxml import etree
import re
import csv
import datetime

In [None]:
"""
1. Sweep the text for each unique version of the game (excluding the world cup version)
2. Sweep the sitemaps for each version to get all unique players
3. Visit each player's main page, determine which updates they were part of (which update urls do not redirect)
4. Iterate through each update every player was included in, scrape player data
"""

In [None]:
def sweep_1_season_text(url):
    """Collect the distinct season slugs linked from the breadcrumb dropdown.

    Args:
        url: Page to fetch; the hrefs of its breadcrumb dropdown items are
            searched for a ``fifaNN`` slug.

    Returns:
        A sorted list of unique slugs such as ``'fifa19'``. Hrefs that carry
        no ``fifaNN`` slug are skipped.
    """
    response = requests.get(url)
    dom = etree.HTML(response.content)

    # retrieve all of the season and date slugs
    season_slugs = dom.xpath('//li[@class="breadcrumb-item dropdown"]/div[starts-with(@class,"dropdown-menu")]/a[@class="dropdown-item"]/@href')

    # A set de-duplicates as we go; several dated hrefs share one season slug.
    seasons = set()
    for href in season_slugs:
        match = re.search(r'fifa\d\d', href)
        # Skip non-matching hrefs explicitly instead of hiding every possible
        # error behind a bare ``except``.
        if match is not None:
            seasons.add(match.group(0))

    return sorted(seasons)

In [None]:
def sweep_player_sitemaps(season_text):
    """Return the unique player base URLs listed in one season's sitemap.

    Args:
        season_text: Season slug (e.g. ``'fifa19'``) substituted into the
            sitemap URL.

    Returns:
        A list of unique player URLs with any trailing ``fifaNN_NNN/`` update
        suffix removed. The order is unspecified. If a ``<loc>`` node has no
        text, a single ``'error - <sitemap url>'`` marker is included.
    """
    sitemap_url = 'https://www.fifaindex.com/sitemap-{}-players.xml'.format(season_text)

    response = requests.get(sitemap_url)
    dom = etree.HTML(response.content)

    # Query the document once and collect into a set, rather than re-running
    # the XPath and rebuilding list -> set -> list on every iteration.
    players = set()
    for node in dom.xpath('//loc'):
        player_season_url = node.text
        if player_season_url is None:
            # Same marker the previous catch-all handler produced.
            players.add('error - {}'.format(sitemap_url))
            continue
        # Strip the per-update suffix so every update maps to one player URL.
        players.add(re.sub(r'fifa\d\d_\d{0,3}/$', '', player_season_url))

    return list(players)

In [None]:
def create_unique_player_list(season):
    """Return the de-duplicated player URLs for a single season.

    Args:
        season: Season slug forwarded to ``sweep_player_sitemaps``.

    Returns:
        A list holding each URL from that season's sitemap sweep exactly once.
    """
    unique_urls = set(sweep_player_sitemaps(season))
    return list(unique_urls)

In [None]:
def sweep_2_players_from_sitemaps(season_list):
    """Gather the unique player URLs across every season in ``season_list``.

    Progress is printed before and after each season.

    Args:
        season_list: Iterable of season slugs, each passed to
            ``create_unique_player_list``.

    Returns:
        A list of unique player URLs across all seasons. The order is
        unspecified.
    """
    # One running set replaces the list -> set -> list rebuild per season.
    players = set()
    for season in season_list:
        # Single-argument print(...) produces the same output on Python 2 and 3.
        print('now processing {}'.format(season))
        players.update(create_unique_player_list(season))
        print('done processing {}'.format(season))
        print('player_list set is now {} records long'.format(len(players)))

    return list(players)

In [2]:
def sweep_3_season_date_scrape_urls(player_list):
    """Expand each player URL into the per-update URLs that should be scraped.

    For every player page, each "FIFA ..." dropdown link is requested. Only
    season pages that answer 200 without any redirect are kept, and every
    link in their date dropdown becomes one absolute scrape URL.

    Args:
        player_list: Iterable of player page URLs.

    Returns:
        A list of absolute URLs, in discovery order (duplicates are kept).
    """
    site = 'https://www.fifaindex.com{}'
    scrape_list = []

    for player_url in player_list:
        player_dom = etree.HTML(requests.get(player_url).content)
        season_hrefs = player_dom.xpath('//a[@class="dropdown-item" and starts-with(text(),"FIFA ")]/@href')

        for season_href in season_hrefs:
            season_response = requests.get(site.format(season_href))

            # A redirect means the player was not part of that season.
            if season_response.status_code != 200 or len(season_response.history) != 0:
                continue

            season_dom = etree.HTML(season_response.content)
            date_hrefs = season_dom.xpath('//div[@class="dropdown-menu fade-out"]/a/@href')
            scrape_list.extend(site.format(date_href) for date_href in date_hrefs)

    return scrape_list
    

In [3]:
def _first_or_none(dom, query):
    """Return the first result of ``dom.xpath(query)``, or None if there is none."""
    matches = dom.xpath(query)
    return matches[0] if matches else None


def scrape_1_player_data(scrape_url):
    """Scrape one player page for one game update.

    Args:
        scrape_url: URL of the player page to fetch. URLs containing a
            ``fifaNNwc`` slug are skipped without any request being made.

    Returns:
        A list with a single dict of scraped fields, or an empty list when
        the URL was skipped. Optional fields (value, wage and the club
        details) are None when the page does not provide them.
        ``player_nationality``, ``player_preferred_positions`` and
        ``club_team_position`` hold the full list of XPath matches.

    Raises:
        IndexError: If a required field (season, rating date, name, ratings,
            height, weight, preferred foot, birth date or age) is missing.
    """
    # Test the match object directly: comparing it with ``> 0`` only worked
    # through Python 2's cross-type ordering and is a TypeError on Python 3.
    if re.search(r'fifa\d\dwc', scrape_url):
        return []

    response = requests.get(scrape_url)
    dom = etree.HTML(response.content)

    # XPath building blocks shared by several fields.
    header = '//div[@class="align-self-center pl-3"]'
    units = '//p[starts-with(text(),"{}")]/span/span[@class="data-units data-units-{}"]/text()'
    detail = '//p[starts-with(text(),"{}")]/span/text()'
    currency = '//p[starts-with(text(),"{}") and @class="data-currency data-currency-dollar"]/span[@class="float-right"]/text()'
    club = '//button[@data-target="#transferModal"]/ancestor::div[@class="col-12 col-sm-6 col-lg-6 team"]/div'

    # Each of these queries feeds two fields, so evaluate it once.
    toggles = dom.xpath('//a[@class="dropdown-toggle"]/text()')
    ratings = dom.xpath('//span[starts-with(@class,"badge badge-dark rating")]/text()')

    # National-team fields are not collected: the original notes say the club
    # and national cards switch position on the page.
    payload = {
        # Required fields: plain indexing raises IndexError when missing.
        "season": toggles[0],
        "rating_date": toggles[1],
        "player_name": dom.xpath(header + '/h1/text()')[0],
        "player_url": scrape_url,
        "player_nationality": dom.xpath(header + '/h2/a/text()'),
        "player_rating_overall": ratings[0],
        "player_rating_potential": ratings[1],
        "player_height_metric": dom.xpath(units.format('Height', 'metric'))[0],
        "player_height_imperial": dom.xpath(units.format('Height', 'imperial'))[0],
        "player_weight_metric": dom.xpath(units.format('Weight', 'metric'))[0],
        "player_weight_imperial": dom.xpath(units.format('Weight', 'imperial'))[0],
        "player_preferred_positions": dom.xpath('//p[starts-with(text(),"Preferred Positions")]/span/a/@title'),
        "player_preferred_foot": dom.xpath(detail.format('Preferred Foot'))[0],
        "player_birth_date": dom.xpath(detail.format('Birth Date'))[0],
        "player_age": dom.xpath(detail.format('Age'))[0],
        # Optional fields: None when the page has no match.
        "player_value_dollars": _first_or_none(dom, currency.format('Value')),
        "player_wage_dollars": _first_or_none(dom, currency.format('Wage')),
        "club_team_name": _first_or_none(dom, club + '/h5/a/text()'),
        "club_team_url": _first_or_none(dom, club + '/h5/a/@href'),
        "club_team_position": dom.xpath(club + '/div/p[starts-with(text(),"Position")]/span/a/@title'),
        "club_team_on_loan_from": _first_or_none(dom, club + '/div/p[starts-with(text(),"On loan from")]/span/a/@title'),
        "club_team_join_date": _first_or_none(dom, club + '/div/p[starts-with(text(),"Joined Club")]/span/text()'),
        "club_team_contract_length": _first_or_none(dom, club + '/div/p[starts-with(text(),"Contract Length")]/span/text()'),
        "date_added": datetime.datetime.now().isoformat(),
    }

    # One formatted string keeps the progress line identical on Python 2 and 3.
    print(u'{} {} {} {} {} {}'.format(
        payload['player_url'],
        payload['season'],
        payload['rating_date'],
        payload['player_value_dollars'],
        payload['player_wage_dollars'],
        payload['club_team_name'],
    ))

    return [payload]
        
#         with open('fifaindex_messi_test_v1.csv', 'w') as output:
#             writer = csv.writer(output, lineterminator='\n')
#             for val in payload:
#                 writer.writerows([val])
    
    
    
    
    
    
        # dead code that would help us scrape both the national and club teams
        # problem is that the club and national teams switch position at times and this is position dependent
#         team_nodes = dom.xpath('//div[@class="col-12 col-sm-6 col-lg-6 team"]/div[@class="card mb-5"]/h5')
#         if len(team_nodes) == 1:
#             national_team_name = None
#             national_team_url = None
#             national_team_position = None
#         elif len(team_nodes) == 2: 
#             national_team_name = club_team_name = dom.xpath('//div[@class="col-12 col-sm-6 col-lg-6 team"]/div[@class="card mb-5"]/h5/a/text()')[0]
#             national_team_url = dom.xpath('//div[@class="col-12 col-sm-6 col-lg-6 team"]/div[@class="card mb-5"]/h5/a/@href')[1]
#             national_team_position = dom.xpath('//div[@class="col-12 col-sm-6 col-lg-6 team"]/div[@class="card mb-5"]/div/p[starts-with(text(),"Position")]/span/a/@title')[0]
#             club_team_name = dom.xpath('//button[@data-target="#transferModal"]/ancestor::div[@class="col-12 col-sm-6 col-lg-6 team"]/div/h5/a/text()')[1]
#             club_team_url = dom.xpath('//div[@class="col-12 col-sm-6 col-lg-6 team"]/div[@class="card mb-5"]/h5/a/@href')[3]
#             club_team_position = dom.xpath('//div[@class="col-12 col-sm-6 col-lg-6 team"]/div[@class="card mb-5"]/div/p[starts-with(text(),"Position")]/span/a/@title')[1]
#             try:
#                 club_team_join_date = dom.xpath('//div[@class="col-12 col-sm-6 col-lg-6 team"]/div[@class="card mb-5"]/div/p[starts-with(text(),"Joined Club")]/span/text()')[0]
#             except:
#                 club_team_join_date = None
#             try:
#                 club_team_contract_length = dom.xpath('//div[@class="col-12 col-sm-6 col-lg-6 team"]/div[@class="card mb-5"]/div/p[starts-with(text(),"Contract Length")]/span/text()')[0]
#             except:
#                 club_team_contract_length = None
#             try:
#                 club_team_on_loan_from = dom.xpath('//div[@class="col-12 col-sm-6 col-lg-6 team"]/div[@class="card mb-5"]/div/p[starts-with(text(),"On loan from")]/span/a/@title')[0]
#             except:
#                 club_team_on_loan_from = None

In [None]:
"""Running the script"""

In [None]:
# Full pipeline: seasons -> unique player URLs -> per-update URLs -> records.
url = 'https://www.fifaindex.com/'
season_list = sweep_1_season_text(url)
player_list = sweep_2_players_from_sitemaps(season_list)
scrape_list = sweep_3_season_date_scrape_urls(player_list)

# Keep what each scrape returns; previously the result was dropped, so the
# whole run left nothing behind except printed progress lines.
scraped_player_data = []
for scrape_url in scrape_list:
    # scrape_1_player_data returns a list (empty for skipped URLs).
    scraped_player_data.extend(scrape_1_player_data(scrape_url))

In [None]:
# `distinct_players` is not defined anywhere in this notebook, so this cell
# raised NameError on a fresh kernel. It is derived here from `player_list`.
# NOTE(review): assumes the intended content is the unique player URLs from
# sweep 2 - confirm this matches the earlier v1 export.
distinct_players = sorted(set(player_list))

# One player URL per row.
with open('fifaindex_player_sweep_unique_v2.csv', 'w') as output:
    writer = csv.writer(output, lineterminator='\n')
    for val in distinct_players:
        writer.writerow([val])

In [None]:
# test with one player
# Smoke test: run sweep 3 and the scraper against a single known player page.
# Note: this rebinds `player_list` and `scrape_list`, replacing whatever the
# full pipeline cell stored under those names.
player_list = ['https://www.fifaindex.com/player/158023/lionel-messi/']
# player_list = ['https://www.fifaindex.com/player/212419/tyrone-mings/']
scrape_list = sweep_3_season_date_scrape_urls(player_list)
player_data = []
for scrape_url in scrape_list:
    player_details = scrape_1_player_data(scrape_url)
    # scrape_1_player_data returns a list, so player_data becomes a list of
    # lists (one inner list per scraped URL), not a flat list of records.
    player_data.append(player_details)

https://www.fifaindex.com/player/158023/lionel-messi/ FIFA 19  Feb. 7, 2019  $128.000.000 $650.000 FC Barcelona
https://www.fifaindex.com/player/158023/lionel-messi/fifa19_312/ FIFA 19  Feb. 4, 2019  $128.000.000 $650.000 FC Barcelona
https://www.fifaindex.com/player/158023/lionel-messi/fifa19_311/ FIFA 19  Jan. 31, 2019  $128.000.000 $650.000 FC Barcelona
https://www.fifaindex.com/player/158023/lionel-messi/fifa19_310/ FIFA 19  Jan. 28, 2019  $128.000.000 $650.000 FC Barcelona
https://www.fifaindex.com/player/158023/lionel-messi/fifa19_309/ FIFA 19  Jan. 24, 2019  $128.000.000 $650.000 FC Barcelona
https://www.fifaindex.com/player/158023/lionel-messi/fifa19_308/ FIFA 19  Jan. 21, 2019  $128.000.000 $650.000 FC Barcelona
https://www.fifaindex.com/player/158023/lionel-messi/fifa19_307/ FIFA 19  Jan. 17, 2019  $128.000.000 $650.000 FC Barcelona
https://www.fifaindex.com/player/158023/lionel-messi/fifa19_306/ FIFA 19  Jan. 14, 2019  $128.000.000 $650.000 FC Barcelona
https://www.fifaindex

https://www.fifaindex.com/player/158023/lionel-messi/fifa18_222/ FIFA 18  March 19, 2018  $133.000.000 $620.000 FC Barcelona
https://www.fifaindex.com/player/158023/lionel-messi/fifa18_221/ FIFA 18  March 15, 2018  $133.000.000 $620.000 FC Barcelona
https://www.fifaindex.com/player/158023/lionel-messi/fifa18_220/ FIFA 18  March 12, 2018  $133.000.000 $620.000 FC Barcelona
https://www.fifaindex.com/player/158023/lionel-messi/fifa18_219/ FIFA 18  March 8, 2018  $133.000.000 $620.000 FC Barcelona
https://www.fifaindex.com/player/158023/lionel-messi/fifa18_218/ FIFA 18  March 5, 2018  $133.000.000 $620.000 FC Barcelona
https://www.fifaindex.com/player/158023/lionel-messi/fifa18_217/ FIFA 18  Feb. 26, 2018  $133.000.000 $620.000 FC Barcelona
https://www.fifaindex.com/player/158023/lionel-messi/fifa18_216/ FIFA 18  Feb. 22, 2018  $133.000.000 $620.000 FC Barcelona
https://www.fifaindex.com/player/158023/lionel-messi/fifa18_215/ FIFA 18  Feb. 19, 2018  $133.000.000 $620.000 FC Barcelona
https

https://www.fifaindex.com/player/158023/lionel-messi/fifa17_155/ FIFA 17  July 17, 2017  $97.500.000 $650.000 FC Barcelona
https://www.fifaindex.com/player/158023/lionel-messi/fifa17_154/ FIFA 17  July 13, 2017  $97.500.000 $650.000 FC Barcelona
https://www.fifaindex.com/player/158023/lionel-messi/fifa17_153/ FIFA 17  July 10, 2017  $97.500.000 $650.000 FC Barcelona
https://www.fifaindex.com/player/158023/lionel-messi/fifa17_152/ FIFA 17  July 6, 2017  $97.500.000 $650.000 FC Barcelona
https://www.fifaindex.com/player/158023/lionel-messi/fifa17_151/ FIFA 17  July 3, 2017  $97.500.000 $650.000 FC Barcelona
https://www.fifaindex.com/player/158023/lionel-messi/fifa17_150/ FIFA 17  June 29, 2017  $97.500.000 $650.000 FC Barcelona
https://www.fifaindex.com/player/158023/lionel-messi/fifa17_149/ FIFA 17  June 26, 2017  $97.500.000 $650.000 FC Barcelona
https://www.fifaindex.com/player/158023/lionel-messi/fifa17_148/ FIFA 17  June 22, 2017  $97.500.000 $650.000 FC Barcelona
https://www.fifain

In [None]:
# Display everything collected in `player_data` (one entry per scraped URL).
player_data

In [None]:
# Ad-hoc probe: check the rating-badge XPath against one live player page.
test_url = 'https://www.fifaindex.com/player/158023/lionel-messi/'

# Fetch the page and parse the raw bytes with lxml's HTML parser.
response = requests.get(test_url)
data = response.content
dom = etree.HTML(data)

# [0] takes the first matching badge text; scrape_1_player_data reads the
# overall rating from this same position. Raises IndexError if nothing matches.
player_rating_overall = dom.xpath('//span[starts-with(@class,"badge badge-dark rating")]/text()')[0]
player_rating_overall

In [None]:
def _csv_cell(value):
    """Turn one scraped value into text that the csv module can write."""
    if value is None:
        return ''
    if isinstance(value, (list, tuple)):
        # Multi-valued fields (e.g. positions) are joined into one cell.
        value = u'; '.join(u'{}'.format(item) for item in value)
    text = u'{}'.format(value)
    # Python 2's csv module writes byte strings; Python 3's writes text.
    return text.encode('utf-8') if str is bytes else text


# Accept both a flat list of record dicts and a list of per-URL result lists.
# Previously each result list was written as a dict repr in a single cell.
records = []
for entry in player_data:
    if isinstance(entry, dict):
        records.append(entry)
    else:
        records.extend(entry)

# Column order is the sorted union of all keys, so the header is stable.
fieldnames = sorted(set(key for record in records for key in record))

with open('fifaindex_messi_test_v1.csv', 'w') as output:
    writer = csv.DictWriter(output, fieldnames=fieldnames, lineterminator='\n')
    writer.writeheader()
    for record in records:
        writer.writerow(dict((key, _csv_cell(value)) for key, value in record.items()))
    