In [None]:
import os
import pandas as pd
import requests
from bs4 import BeautifulSoup

In [None]:
# Labels given, in order, to the numeric cells (divs whose first class is
# "inParameter") that are read for every player.
INDICES=['V', 'G', 'A', 'R', 'RS', 'AG', 'AM', 'ES', 'FV']

# page_votes = "https://www.fantagazzetta.com/voti-serie-a/2016-17/"
# page_votes = "http://www.gazzetta.it/calcio/fantanews/voti/serie-a-2016-17/giornata-"
# URL prefix of the grades pages for each season, keyed by the season's
# starting year; the matchday number is appended to the prefix.
pages_grades = {'2014':"http://www.gazzetta.it/calcio/fantanews/voti/serie-a-2014-15/giornata-",
                '2015':"http://www.gazzetta.it/calcio/fantanews/voti/serie-a-2015-16/giornata-",
                '2016':"http://www.gazzetta.it/calcio/fantanews/voti/serie-a-2016-17/giornata-",
               }
# Matchdays ("giornate") scraped per season: pages 1..n_giornate.
n_giornate = 38

# Output directory for every CSV written by this notebook. The original
# absolute path stays the default; set the FANTA_DATA_DIR environment variable
# to write somewhere else without editing the notebook.
folder = os.environ.get(
    "FANTA_DATA_DIR",
    "C:\\Users\\zuk-8\\Programs\\workspace\\jupiter_notebooks\\Andrea",
)
fn = "fanta_grades.csv"        # per-player grades
link_fn = "player_links.csv"   # player profile links
# Player attributes file. This name is used by the last scraping cell and was
# previously never defined, which made that cell fail with a NameError.
attr_fn = "player_attributes.csv"

In [None]:
def clean_table_cells(str):
    """Convert the text of a table cell to a float, or NaN when it is not numeric.

    Carriage returns, newlines, tabs and spaces are removed before parsing.

    Parameters
    ----------
    str : str or None
        Raw cell text. The parameter name shadows the builtin; it is kept so
        that existing keyword callers keep working. ``None`` is accepted
        because BeautifulSoup's ``Tag.string`` is ``None`` when a tag does not
        hold exactly one string.

    Returns
    -------
    float
        The parsed number, or NaN if the text is missing or cannot be parsed.
    """
    if str is None:
        # Nothing to parse: report a missing value instead of failing on .replace().
        return float('nan')
    for junk in ('\r', '\n', '\t', ' '):
        str = str.replace(junk, '')
    try:
        return float(str)
    except ValueError:
        # `pd.np` was removed in pandas 2.0 (and `np.NaN` in NumPy 2.0); a
        # plain float NaN is the same value and needs neither alias.
        return float('nan')

def get_team_soups(page_votes, giornata, timeout=30):
    """Download one grades page and return its 'singleRound' blocks.

    Parameters
    ----------
    page_votes : str
        URL prefix; ``str(giornata)`` is appended to build the page address.
    giornata : int or str
        Matchday identifier appended to ``page_votes``.
    timeout : float, optional
        Seconds passed to ``requests.get`` so that a stalled connection cannot
        block the notebook forever.

    Returns
    -------
    list
        The ``div`` elements whose class list is exactly ``['singleRound']``,
        found inside the page's 'magicDayList' container.

    Raises
    ------
    requests.exceptions.RequestException
        On a network failure, a timeout, or an HTTP error status.
    IndexError
        If one of the expected containers is not present in the page.
    """
    def first_with_class(parent, tag, classes):
        # First <tag> descendant of `parent` whose class list equals `classes` exactly.
        matches = [node for node in parent.find_all(tag) if node.get('class', None) == classes]
        if not matches:
            raise IndexError("no <{}> with class {} found in {}".format(tag, classes, page))
        return matches[0]

    # Get html data from page+giornata
    page = page_votes + str(giornata)
    r = requests.get(page, timeout=timeout)
    # Fail here on 4xx/5xx instead of later, while parsing an error page.
    r.raise_for_status()
    # Navigate inside data using BS, one nested container at a time
    soup = BeautifulSoup(r.text, "lxml")
    node = soup.html.body
    for tag, classes in (
            ('section', ['main-container']),
            ('section', ['section-standard-row']),
            ('div', ["MXXX-central-articles-main-column"]),
            ('div', ['magicDayList', 'listView', 'magicDayListChkDay']),
    ):
        node = first_with_class(node, tag, classes)
    return [sec for sec in node.find_all('div') if sec.get('class', None) == ['singleRound']]

def get_player_soup(team_soup):
    """Split one team block into the team name and its per-player list items.

    The block is searched for the first ``div`` whose class list is exactly
    ``['magicTeamListContainer']`` and, inside it, the first ``ul`` whose class
    list is exactly ``['magicTeamList']``.

    Returns
    -------
    tuple
        ``(team, player_soups)``: ``team`` is the first content node of the
        first 'teamNameIn' span in the list's first item; ``player_soups`` is
        every ``li`` of the list except the first one.

    Raises
    ------
    IndexError
        If any of the expected elements is missing.
    """
    containers = []
    for div in team_soup.find_all('div'):
        if div.get('class', None) == ["magicTeamListContainer"]:
            containers.append(div)
    container = containers[0]

    grade_lists = list(filter(lambda ul: ul.get('class') == ["magicTeamList"],
                              container.find_all('ul')))
    grades = grade_lists[0]

    # The first list item is the header row carrying the team name.
    header_spans = grades.li.div.find_all('span')
    team_names = [span.contents[0] for span in header_spans
                  if span.get('class', None) == ["teamNameIn"]]
    team = team_names[0]

    rows = grades.find_all('li')
    return team, rows[1:]

def get_player_name(player_soup):
    """Return the text of the first 'playerNameIn' span of a player item.

    The span is looked up inside the first ``div`` nested in the item's first
    ``div``; its class list must be exactly ``['playerNameIn']``.

    Raises
    ------
    IndexError
        If no such span exists.
    """
    name_cell = player_soup.div.div
    names = []
    for span in name_cell.find_all('span'):
        if span.get('class', None) == ["playerNameIn"]:
            names.append(span.string)
    return names[0]

def get_player_link(player_soup):
    """Return the ``href`` of the first anchor in the player item that has one.

    Anchors without an ``href`` attribute are skipped.

    Raises
    ------
    IndexError
        If no anchor in the item carries an ``href``.
    """
    hrefs = [anchor.get('href', None) for anchor in player_soup.find_all('a')]
    links = [href for href in hrefs if href is not None]
    return links[0]

def get_player_stats(player_soup, indices=INDICES):
    """Extract name, role and numeric statistics from one player item.

    Parameters
    ----------
    player_soup : bs4 Tag
        The player's list item.
    indices : list of str, optional
        Labels assigned, in document order, to the cells whose first class is
        "inParameter". pandas raises a ValueError if their count differs from
        the number of such cells.

    Returns
    -------
    tuple
        ``(player_name, role, stats)`` where ``stats`` is a ``pd.Series``
        indexed by ``indices`` followed by ``'In'`` and ``'Out'``.

    Raises
    ------
    IndexError
        If the name span or the role span is missing.
    """
    player_name = get_player_name(player_soup)
    role = [span.string for span in player_soup.div.find_all('span') if
            span.get('class', None) == ["playerRole", "show-for-small"]][0]

    # `or [None]` guards divs with no (or an empty) class attribute; indexing
    # the bare result of .get() raised TypeError on the first class-less div.
    values = [clean_table_cells(div.string)
              for div in player_soup.find_all('div')
              if (div.get('class') or [None])[0] == "inParameter"]
    stats = pd.Series(data=values, index=indices)

    span_classes = [sp.get('class', None) for sp in player_soup.find_all('span')]
    # NOTE(review): the 'down' icon sets 'In' and the 'up' icon sets 'Out',
    # exactly as before - confirm this mapping against the page.
    pi = 1 if ['playerStats', 'icon', 'down'] in span_classes else 0
    po = 1 if ['playerStats', 'icon', 'up'] in span_classes else 0

    # Series.append was removed in pandas 2.0; pd.concat gives the same result.
    stats = pd.concat([stats, pd.Series(index=['In', 'Out'], data=[pi, po])])
    return player_name, role, stats

In [None]:
# Scrape every (season, matchday, team, player) grade row and write them to
# <folder>/<fn>. Rows are collected in a plain list and turned into a frame
# once: growing DataFrames with pd.concat inside the nested loops copied all
# previous rows on every iteration and relied on concatenating empty frames.
records = []
for season, page_grades in pages_grades.items():
    for giornata in range(1, n_giornate + 1):
        for team_soup in get_team_soups(page_grades, giornata):
            team, player_soups = get_player_soup(team_soup)
            for player_soup in player_soups:
                player_name, role, stats = get_player_stats(player_soup, indices=INDICES)
                record = {'Player': player_name}
                record.update(stats.to_dict())
                record.update({'Role': role,
                               'Team': team,
                               'Giornata': str(giornata),
                               'Season': season})
                records.append(record)

# Explicit column order, matching the frame built by the previous
# implementation: Player, the statistics, In/Out, Role, then the context keys.
stat_columns = list(INDICES) + ['In', 'Out']
all_stats = pd.DataFrame(
    records,
    columns=['Player'] + stat_columns + ['Role', 'Team', 'Giornata', 'Season'])
all_stats = all_stats.set_index(['Season', 'Giornata', 'Team', 'Player'])
all_stats.to_csv(os.path.join(folder, fn))

In [None]:
# Collect one profile link per (player, season) and write them to
# <folder>/<link_fn>. When a player appears on several matchdays of the same
# season, the link seen last overwrites the earlier ones.
all_links = dict()
for season,page_grades in pages_grades.items():
    for giornata in range(1,n_giornate+1):
        for team_soup in get_team_soups(page_grades, giornata):
            team, player_soups = get_player_soup(team_soup)
            for player_soup in player_soups:
                all_links[(get_player_name(player_soup), season)] = get_player_link(player_soup)

# Build the frame in one call: DataFrame.append was removed in pandas 2.0 and
# appending row by row copied the whole frame on every iteration.
all_links_df = pd.DataFrame(
    [(player, link_season, link) for (player, link_season), link in all_links.items()],
    columns=['Player', 'Season', 'Link'])
all_links_df = all_links_df.set_index(['Season', 'Player']).sort_index().reset_index()
all_links_df.to_csv(os.path.join(folder, link_fn), index=False)

In [None]:
def get_attribute_from_text(str):
    """Normalise one profile line and split it on ':'.

    Carriage returns, newlines, tabs and double spaces are removed, remaining
    single spaces become underscores, the result goes through
    ``clean_attributes`` and is finally split on every ':'.

    Returns
    -------
    list of str
        The pieces of the cleaned text between colons.
    """
    text = str
    # Order matters: double spaces are dropped before single ones are replaced.
    for junk in ('\r', '\n', '\t', '  '):
        text = text.replace(junk, '')
    text = text.replace(' ', '_')
    return clean_attributes(text).split(':')

def clean_attributes(str):
    """Return the text with every '_cm' and '_kg' occurrence removed."""
    cleaned = str
    for unit_suffix in ('_cm', '_kg'):
        cleaned = cleaned.replace(unit_suffix, '')
    return cleaned

def get_attribute_from_player_soup(player_soup):
    """Read the profile paragraphs of a player page into a one-row DataFrame.

    Each ``p`` of the profile's "right" box is passed through
    ``get_attribute_from_text``; the first piece of the result becomes a
    column name and the second piece the value in that column.

    Raises
    ------
    IndexError
        If an expected container is missing, or if a paragraph yields fewer
        than two pieces.
    """
    def pick(parent, tag, css):
        # First <tag> descendant whose class list equals `css` exactly.
        return [node for node in parent.find_all(tag) if node.get('class', None) == css][0]

    # Navigate towards the place where information is stored
    trail = (
        ('section', ['main-container']),
        ('section', ['opener']),
        ('div', ['MXXX-section-opener-column']),
        ('section', ['profilo-giocatore']),
        ('div', ["first_half_profilo"]),
        ('div', ["right"]),
    )
    node = player_soup.body
    for tag, css in trail:
        node = pick(node, tag, css)

    # Get the data and store it in a df
    pairs = [get_attribute_from_text(paragraph.get_text()) for paragraph in node.find_all('p')]
    values = [pair[1] for pair in pairs]
    labels = [pair[0] for pair in pairs]
    return pd.DataFrame(data=[values], columns=labels)

In [None]:
# Download every player profile listed in <folder>/<link_fn> and write the
# scraped attributes to <folder>/<attr_fn>.
# attr_fn (the output file name) is expected to be defined together with the
# other file names in the configuration cell.
all_links_df = pd.read_csv(os.path.join(folder, link_fn))
columns = ['Player', 'Season', 'Link', 'Ruolo', 'Squadra', 'Data_di_nascita', 'Nazionalita', 'Altezza', 'Peso']

# One single-row frame per player, concatenated once at the end:
# DataFrame.append was removed in pandas 2.0 and copied the frame on every row.
player_frames = []
for player_name, player_season, player_link in zip(
        all_links_df['Player'], all_links_df['Season'], all_links_df['Link']):
    response = requests.get(player_link, timeout=30)
    # Stop on an HTTP error instead of trying to parse an error page.
    response.raise_for_status()
    player_soup = BeautifulSoup(response.text, "lxml")
    player_data = get_attribute_from_player_soup(player_soup)
    player_data['Player'] = player_name
    player_data['Season'] = player_season
    player_data['Link'] = player_link
    player_frames.append(player_data)

if player_frames:
    player_attributes = pd.concat(player_frames, ignore_index=True)
else:
    player_attributes = pd.DataFrame(columns=columns)
# reindex keeps exactly `columns`, in order, and creates any column that was
# never scraped as all-NaN, as the pre-seeded empty frame used to guarantee.
player_attributes.reindex(columns=columns).to_csv(os.path.join(folder, attr_fn), index=False)

In [None]:
# # Get page links

# page = "//www.fantagazzetta.com/squadre"
# player_teams = {}
# player_links = {}

# teams = ['Atalanta', 'Bologna', 'Cagliari', 'Chievo', 'Crotone', 'Empoli', 'Fiorentina',
#          'Genoa', 'Inter', 'Juventus', 'Lazio', 'Milan', 'Napoli', 'Palermo', 'Pescara',
#          'Roma', 'Sampdoria', 'Sassuolo', 'Torino', 'Udinese']

# for team in teams:
#     site = page+"/"+team
#     r  = requests.get("https:"+site)
#     data = r.text
#     soup = BeautifulSoup(data, "lxml")
#     for link in soup.find_all('a'):
#         l = link.get('href', '')
#         if l.startswith(site):
#             player_teams[link.contents[0]] = team
#             player_links[link.contents[0]] = l

# for player,link in list(player_links.items())[1:2]:
#     site = link + "/2/2016-17"
    
#     r  = requests.get("https:"+site)
#     data = r.text
#     soup = BeautifulSoup(data, "html.parser")