In [1]:
import pandas as pd
import numpy as np
import requests
from bs4 import BeautifulSoup
from sklearn.metrics import r2_score

# Scrape Data

The data is scraped from the team-ranking pages of Sina Sports' CBA database, one page per season.

In [2]:
# Base URL of Sina's CBA team-ranking page; a season is selected by
# appending a `qleagueid` query parameter to it.
sina_link = 'http://cba.sports.sina.com.cn/cba/stats/teamrank/'

In [3]:
# Season label -> id passed to Sina as the `qleagueid` query parameter.
season_dict = {
    '19-20': '205',
    '18-19': '198',
    '17-18': '189',
    '16-17': '180',
    '15-16': '171',
}

In [4]:
def fetch(url, timeout=30):
    """Download a page and parse it into a BeautifulSoup tree.

    Parameters
    ----------
    url : str
        Address of the page to download.
    timeout : float, optional
        Seconds to wait for the server before giving up (default 30).
        Without a timeout ``requests.get`` can block indefinitely.

    Returns
    -------
    bs4.BeautifulSoup
        The response body parsed with the built-in ``'html.parser'``.

    Raises
    ------
    requests.HTTPError
        If the server answers with a 4xx/5xx status code.
    requests.Timeout
        If the server does not respond within ``timeout`` seconds.
    """
    r = requests.get(url, timeout=timeout)
    # Fail loudly on an error response instead of parsing an error page
    # as if it were data.
    r.raise_for_status()
    # The body is decoded as GBK (presumably the site's encoding) before
    # being handed to the parser.
    r.encoding = 'gbk'
    return BeautifulSoup(r.text, 'html.parser')

In [5]:
def get_raw_data(html):
    """Collect the text of every table cell found under ``html``, row by row.

    Parameters
    ----------
    html : object
        Any node exposing ``find_all`` (e.g. a BeautifulSoup tag).

    Returns
    -------
    list of list of str
        One inner list per ``<tr>``. Within a row, the texts of all
        ``<th>`` cells come first, followed by the texts of all ``<td>``
        cells. Cell text is returned as-is, with no whitespace cleanup.
    """
    def cell_texts(row, tag):
        # Text of every `tag` cell inside a single row.
        return [cell.text for cell in row.find_all(tag)]

    return [cell_texts(row, 'th') + cell_texts(row, 'td')
            for row in html.find_all('tr')]

In [6]:
def get_league_df(url):
    """Scrape one season's standings table into a DataFrame.

    Parameters
    ----------
    url : str
        Address of a team-ranking page.

    Returns
    -------
    pandas.DataFrame
        An independent frame (safe to add columns to) with the columns
        '胜' (wins), '负' (losses), '每场得分' (points scored per game) and
        '每场丢分' (points conceded per game). Values are still the cleaned
        cell text (strings); no numeric conversion happens here.

    Raises
    ------
    IndexError
        If the page contains no ``<table>`` element.
    """
    soup = fetch(url)
    # Only the first table on the page is used.
    league_table = soup.find_all('table')[0]
    # Drop the first row; column names are supplied by league_cols below.
    league_data = get_raw_data(league_table)[1:]
    # Strip newlines, tabs and surrounding whitespace from every cell.
    league_data = [[col.replace('\n', '').replace('\t', '').strip()
                    for col in row]
                   for row in league_data]
    # rank, team, wins, losses, win rate, home record,
    # away record, points per game, points conceded per game, streak
    league_cols = ['排名', '球队', '胜', '负', '胜率', '主场战绩',
                   '客场战绩', '每场得分', '每场丢分', '连胜/连负']
    league_df = pd.DataFrame(league_data, columns=league_cols)
    # .copy() detaches the subset from league_df so that callers can add
    # columns to it without triggering SettingWithCopyWarning.
    return league_df[['胜', '负', '每场得分', '每场丢分']].copy()

In [7]:
# Download one standings table per season and tag each with its season.
dfs = []
for key, league_id in season_dict.items():
    url = f'{sina_link}?qleagueid={league_id}'
    # Label rows with the first half of the season key, e.g. '19-20' -> '19'.
    # assign() returns a new frame, so the frame returned by
    # get_league_df is never mutated (no SettingWithCopyWarning).
    temp = get_league_df(url).assign(**{'赛季': key.split('-')[0]})
    dfs.append(temp)

In [8]:
# Stack the per-season frames. ignore_index=True gives the result a fresh
# 0..n-1 index instead of repeating each season's own row labels.
df = pd.concat(dfs, ignore_index=True)

# Adjust Data

In [9]:
# Convert every column except the season label ('赛季') to float in a
# single astype call rather than element by element.
numeric_cols = [col for col in df.columns if col != '赛季']
df = df.astype({col: float for col in numeric_cols})
df.dtypes

胜       float64
负       float64
每场得分    float64
每场丢分    float64
赛季       object
dtype: object

In [10]:
# Games played = wins + losses; computed once and reused below.
games_played = df['胜'] + df['负']
df['场次'] = games_played
# Actual win rate.
df['胜率'] = df['胜'] / games_played
# Season totals rebuilt from the per-game averages.
df['得分'] = games_played * df['每场得分']
df['失分'] = games_played * df['每场丢分']
df.head(1)

Unnamed: 0,胜,负,每场得分,每场丢分,赛季,场次,胜率,得分,失分
0,44.0,2.0,120.96,100.7,19,46.0,0.956522,5564.16,4632.2


# Calculate the Exponent

In [16]:
def calc_exponent(df, last_n_yrs=3):
    """Grid-search the exponent that best predicts win rate from points.

    For every candidate exponent ``x`` in 0.01, 0.02, ..., 20.00 the
    predicted win rate is ``得分**x / (得分**x + 失分**x)`` (the
    Pythagorean-expectation form). The candidate whose predictions score
    the highest R-squared against the actual ``胜率`` column is reported.

    Parameters
    ----------
    df : pandas.DataFrame
        Must contain the columns '赛季' (season label), '胜率' (actual win
        rate), '得分' (points scored) and '失分' (points conceded).
    last_n_yrs : int, optional
        Number of seasons to keep, taken from the top of the distinct
        season labels sorted in descending order (default 3).

    Returns
    -------
    None
        The shape of the filtered frame, the best exponent and its
        R-squared are printed rather than returned.
    """
    seasons = sorted(set(df['赛季']), reverse=True)[:last_n_yrs]
    df_adj = df[df['赛季'].isin(seasons)]
    print(df_adj.shape)

    # Loop-invariant inputs: looked up once instead of copying the whole
    # frame for each of the 2000 candidates.
    actual = df_adj['胜率']
    scored = df_adj['得分']
    allowed = df_adj['失分']

    # Start from -inf so that the best candidate is still reported when
    # every R-squared is zero or negative.
    best_r2 = float('-inf')
    best_r2_exponent = None

    for i in range(1, 2001):
        # i / 100 keeps each candidate an exactly rounded two-decimal value.
        exponent = i / 100
        scored_pow = scored ** exponent
        predicted = scored_pow / (scored_pow + allowed ** exponent)
        r2 = r2_score(actual, predicted)
        # Strict '>' keeps the smallest exponent when candidates tie.
        if r2 > best_r2:
            best_r2 = r2
            best_r2_exponent = exponent

    print(f'{best_r2_exponent} is the best exponent')
    print(f'with {best_r2:.2%} being the highest R-squared.')

## Last 3 Years

In [17]:
# Uses the default last_n_yrs=3, i.e. the three most recent seasons in df.
calc_exponent(df)

(60, 9)
13.27 is the best exponent
with 96.42% being the highest R-squared.


## Last 5 Years

In [18]:
# Widen the window to the five most recent seasons in df.
calc_exponent(df, last_n_yrs=5)

(100, 9)
13.28 is the best exponent
with 96.24% being the highest R-squared.
