In [32]:
import pandas as pd
# %conda install requests
# %conda install lxml
import requests
from io import StringIO

headers = {"User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/91.0.4472.124 Safari/537.36"}
# Mozilla/5.0 (Windows NT 10.0; Win64; x64) 
# AppleWebKit/537.36 (KHTML, like Gecko) 
# Chrome/91.0.4472.124 Safari/537.36

url = "https://en.wikipedia.org/wiki/List_of_footballers_with_500_or_more_goals"
# https://en.wikipedia.org/
# wiki/List_of_footballers_with_500_or_more_goals
res = requests.get(url=url, headers=headers).text
res = StringIO(res)
df = pd.read_html(res)
df

[    0                                                  1
 0 NaN  This section needs additional citations for ve...,
    Rank              Player     Club                  Country and other  \
    Rank              Player   League  Cup Continental Country and other   
 0     1   Cristiano Ronaldo   590[b]   57         172               143   
 1     2       Lionel Messi*   553[c]   71         157               115   
 2     3               Pelé*   604[d]   49          26                83   
 3     4             Romário   545[e]   93          54                64   
 4     5       Ferenc Puskás   516[f]   69          56                84   
 5     6        Josef Bican*   515[g]  137          38                32   
 6     7  Robert Lewandowski   423[h]   62         117                88   
 7     8        Jimmy Jones*   330[i]  286          14                 9   
 8     9        Gerd Müller*   405[j]   92          69                68   
 9    10       Joe Bambrick*   347[k]  253     

In [33]:
df = df[3]

In [34]:
df

Unnamed: 0,Rank,Player,Goals,Matches,Ratio,Career span
0,1,Erwin Helmchen,989+,582,1.70,1924–1951
1,2,Cristiano Ronaldo,975,1337,0.73,2002–present
2,3,Josef Bican,950+,624,1.52,1930–1957
3,4,Ronnie Rooke,934+,1030,0.91,1929–1961
4,5,Lionel Messi,925,1194,0.77,2003–present
...,...,...,...,...,...,...
78,79,Roger Milla,504+,907,0.56,1968–1996
79,80,Steve Bloomer,504,755,0.67,1891–1920
80,81,Albert de Cleyn,503,588,0.86,1933–1955
81,82,Delio Onnis,502,747,0.67,1966–1986


In [35]:
for col in df.columns:
    print(f"{col}: {pd.api.types.is_numeric_dtype(col)}")

Rank: False
Player: False
Goals: False
Matches: False
Ratio: False
Career span: False


In [36]:
df["Career span"] = df["Career span"].str.split("–").str[0].astype(int)

In [37]:
df["Matches"] = [int(item.replace("+", "")) for item in df["Matches"]]

In [38]:
for col in df.columns:
    print(f"{col} is numeric? {pd.api.types.is_numeric_dtype(df[col])}")

Rank is numeric? True
Player is numeric? False
Goals is numeric? False
Matches is numeric? True
Ratio is numeric? True
Career span is numeric? True


In [39]:
df["Goals"] = df["Goals"].str.replace("+", "").astype(int)
df.head()

Unnamed: 0,Rank,Player,Goals,Matches,Ratio,Career span
0,1,Erwin Helmchen,989,582,1.7,1924
1,2,Cristiano Ronaldo,975,1337,0.73,2002
2,3,Josef Bican,950,624,1.52,1930
3,4,Ronnie Rooke,934,1030,0.91,1929
4,5,Lionel Messi,925,1194,0.77,2003


In [48]:
from datetime import datetime, timedelta
import random, math
data = []
columns = ["name", "date", "goals"]

for row in df.itertuples():
    name = row.Player
    start_date = f"{row._6}-08-01"
    start_date = datetime.strptime(start_date, "%Y-%m-%d")
    sd = random.random()
    for i in range(row.Matches):
        name = name
        date = timedelta(days=3 * i) + start_date
        goals = random.normalvariate(row.Ratio, sd)
        goals = round(goals) if goals > 0 else 0
        info = [name, date, goals]
        data.append(info)
        
games_df = pd.DataFrame(data=data, columns=columns)
games_df.shape

(60168, 3)

In [49]:
games_df.loc[games_df["goals"] < 0]

Unnamed: 0,name,date,goals


In [50]:
fake_df = games_df.groupby("name").agg(
    matches=("name", "count"),
    goals=("goals", "sum"),
    ratio=("goals", "mean"),
    career_start=("date", "min"),
).sort_values("goals", ascending=False).reset_index()

fake_df

Unnamed: 0,name,matches,goals,ratio,career_start
0,Erwin Helmchen,582,1163,1.998282,1924-08-01
1,Cristiano Ronaldo,1337,1000,0.747943,2002-08-01
2,Josef Bican,624,996,1.596154,1930-08-01
3,Ronnie Rooke,1030,977,0.948544,1929-08-01
4,Romário,1003,953,0.950150,1984-08-01
...,...,...,...,...,...
78,Alfredo Di Stéfano,720,512,0.711111,1945-08-01
79,George Brown,711,502,0.706048,1920-08-01
80,Des Dickson,715,496,0.693706,1964-08-01
81,Franz Binder,431,460,1.067285,1927-08-01


In [51]:
fake_df[0:10]

Unnamed: 0,name,matches,goals,ratio,career_start
0,Erwin Helmchen,582,1163,1.998282,1924-08-01
1,Cristiano Ronaldo,1337,1000,0.747943,2002-08-01
2,Josef Bican,624,996,1.596154,1930-08-01
3,Ronnie Rooke,1030,977,0.948544,1929-08-01
4,Romário,1003,953,0.95015,1984-08-01
5,Lionel Messi,1194,917,0.768007,2003-08-01
6,Abe Lenstra,850,807,0.949412,1936-08-01
7,Ferenc Puskás,792,805,1.016414,1943-08-01
8,Ferenc Deák,515,799,1.551456,1939-08-01
9,Robert Lewandowski,1079,798,0.739574,2004-08-01


In [52]:
df[0:10]

Unnamed: 0,Rank,Player,Goals,Matches,Ratio,Career span
0,1,Erwin Helmchen,989,582,1.7,1924
1,2,Cristiano Ronaldo,975,1337,0.73,2002
2,3,Josef Bican,950,624,1.52,1930
3,4,Ronnie Rooke,934,1030,0.91,1929
4,5,Lionel Messi,925,1194,0.77,2003
5,6,Jimmy Jones,840,760,1.11,1943
6,7,Ferenc Puskás,802,792,1.01,1943
7,8,Ferenc Deák,795,515,1.54,1939
8,9,Abe Lenstra,790,850,0.93,1936
9,10,Romário,785,1003,0.78,1984
