Load data and check shape

In [114]:
import pandas as pd

df = pd.read_html("https://en.wikipedia.org/wiki/NBA_All-Star_Game#All-Star_Game_results")

df = df[2]
print(df.shape)

df = df.dropna()
print(df.shape)

(74, 5)
(71, 5)


In [115]:
print(df)

       Year                              Result  \
0      1951                   East 111, West 94   
1      1952                   East 108, West 91   
2      1953                    West 79, East 75   
3      1954               East 98, West 93 (OT)   
4      1955                   East 100, West 91   
..      ...                                 ...   
66     2017                  West 192, East 182   
67  2018[5]   Team LeBron 148, Team Stephen 145   
68     2019   Team LeBron 178, Team Giannis 164   
69     2020  Team LeBron 157, Team Giannis 155‡   
70     2021    Team LeBron 170, Team Durant 150   

                            Host arena                      Host city  \
0                        Boston Garden          Boston, Massachusetts   
1                    Boston Garden (2)      Boston, Massachusetts (2)   
2   Allen County War Memorial Coliseum            Fort Wayne, Indiana   
3          Madison Square Garden III**        New York City, New York   
4      Madison Square 

Drop columns

In [116]:
df = df.drop(labels = ['Host arena', 'Game MVP'], axis = 1)

Extract City name

In [117]:
df['Host city'] = df['Host city'].str.split(', ').str[0]
df

Unnamed: 0,Year,Result,Host city
0,1951,"East 111, West 94",Boston
1,1952,"East 108, West 91",Boston
2,1953,"West 79, East 75",Fort Wayne
3,1954,"East 98, West 93 (OT)",New York City
4,1955,"East 100, West 91",New York City
...,...,...,...
66,2017,"West 192, East 182",New Orleans
67,2018[5],"Team LeBron 148, Team Stephen 145",Los Angeles
68,2019,"Team LeBron 178, Team Giannis 164",Charlotte
69,2020,"Team LeBron 157, Team Giannis 155‡",Chicago


Split Result into 2 columns, for East and West and their respective points.

In [118]:
df.Result = df.Result.str.split(', ').apply(lambda x: sorted(x))
df = df.reset_index(drop = True)
df


Unnamed: 0,Year,Result,Host city
0,1951,"[East 111, West 94]",Boston
1,1952,"[East 108, West 91]",Boston
2,1953,"[East 75, West 79]",Fort Wayne
3,1954,"[East 98, West 93 (OT)]",New York City
4,1955,"[East 100, West 91]",New York City
...,...,...,...
66,2017,"[East 182, West 192]",New Orleans
67,2018[5],"[Team LeBron 148, Team Stephen 145]",Los Angeles
68,2019,"[Team Giannis 164, Team LeBron 178]",Charlotte
69,2020,"[Team Giannis 155‡, Team LeBron 157]",Chicago


In [119]:
df['East'] = df['Result'].str[0]
df['West'] = df['Result'].str[1]
df

Unnamed: 0,Year,Result,Host city,East,West
0,1951,"[East 111, West 94]",Boston,East 111,West 94
1,1952,"[East 108, West 91]",Boston,East 108,West 91
2,1953,"[East 75, West 79]",Fort Wayne,East 75,West 79
3,1954,"[East 98, West 93 (OT)]",New York City,East 98,West 93 (OT)
4,1955,"[East 100, West 91]",New York City,East 100,West 91
...,...,...,...,...,...
66,2017,"[East 182, West 192]",New Orleans,East 182,West 192
67,2018[5],"[Team LeBron 148, Team Stephen 145]",Los Angeles,Team LeBron 148,Team Stephen 145
68,2019,"[Team Giannis 164, Team LeBron 178]",Charlotte,Team Giannis 164,Team LeBron 178
69,2020,"[Team Giannis 155‡, Team LeBron 157]",Chicago,Team Giannis 155‡,Team LeBron 157


In [120]:
df = df.drop(labels = 'Result', axis = 1)
df

Unnamed: 0,Year,Host city,East,West
0,1951,Boston,East 111,West 94
1,1952,Boston,East 108,West 91
2,1953,Fort Wayne,East 75,West 79
3,1954,New York City,East 98,West 93 (OT)
4,1955,New York City,East 100,West 91
...,...,...,...,...
66,2017,New Orleans,East 182,West 192
67,2018[5],Los Angeles,Team LeBron 148,Team Stephen 145
68,2019,Charlotte,Team Giannis 164,Team LeBron 178
69,2020,Chicago,Team Giannis 155‡,Team LeBron 157


In [121]:
df.East = df.East.str.extract('(\d+)')
df

Unnamed: 0,Year,Host city,East,West
0,1951,Boston,111,West 94
1,1952,Boston,108,West 91
2,1953,Fort Wayne,75,West 79
3,1954,New York City,98,West 93 (OT)
4,1955,New York City,100,West 91
...,...,...,...,...
66,2017,New Orleans,182,West 192
67,2018[5],Los Angeles,148,Team Stephen 145
68,2019,Charlotte,164,Team LeBron 178
69,2020,Chicago,155,Team LeBron 157


In [122]:
df.West = df.West.str.extract('(\d+)')
df

Unnamed: 0,Year,Host city,East,West
0,1951,Boston,111,94
1,1952,Boston,108,91
2,1953,Fort Wayne,75,79
3,1954,New York City,98,93
4,1955,New York City,100,91
...,...,...,...,...
66,2017,New Orleans,182,192
67,2018[5],Los Angeles,148,145
68,2019,Charlotte,164,178
69,2020,Chicago,155,157


Count differences in scores

In [123]:
# zliczenie roznic w wynikach
df = df.dropna()
df['Diff'] = df['East'].astype(int) - df['West'].astype(int)
df

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df['Diff'] = df['East'].astype(int) - df['West'].astype(int)


Unnamed: 0,Year,Host city,East,West,Diff
0,1951,Boston,111,94,17
1,1952,Boston,108,91,17
2,1953,Fort Wayne,75,79,-4
3,1954,New York City,98,93,5
4,1955,New York City,100,91,9
...,...,...,...,...,...
66,2017,New Orleans,182,192,-10
67,2018[5],Los Angeles,148,145,3
68,2019,Charlotte,164,178,-14
69,2020,Chicago,155,157,-2


In [124]:
# najwieksze roznice
df.Diff.value_counts().sort_values(ascending = False)

-5     5
 9     4
 2     4
 17    3
-11    3
 7     3
 8     3
 12    3
-2     2
 5     2
-15    2
 6     2
 10    2
 11    2
-4     2
-14    2
-20    2
 20    2
-27    2
-3     2
 3     2
-10    2
-1     2
 1     2
 43    1
-22    1
-23    1
 14    1
 21    1
-16    1
-9     1
-21    1
 4     1
-40    1
Name: Diff, dtype: int64

In [125]:
df.groupby('Diff').size()

Diff
-40    1
-27    2
-23    1
-22    1
-21    1
-20    2
-16    1
-15    2
-14    2
-11    3
-10    2
-9     1
-5     5
-4     2
-3     2
-2     2
-1     2
 1     2
 2     4
 3     2
 4     1
 5     2
 6     2
 7     3
 8     3
 9     4
 10    2
 11    2
 12    3
 14    1
 17    3
 20    2
 21    1
 43    1
dtype: int64