In [1]:
import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

import os
# Show which data files are available in the input directory.
print(os.listdir("../input"))


Import the necessary libraries and packages.

In [2]:
import matplotlib.pyplot as plt
%matplotlib inline
import random
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import train_test_split,cross_val_score

Load previous results from both the regular season and the NCAA tournament.

In [3]:
# Team lookup table plus the detailed box scores of tournament and regular-season games.
df_team = pd.read_csv("../input/Teams.csv")
dr_tour = pd.read_csv("../input/TourneyDetailedResults.csv")
dr_reg = pd.read_csv("../input/RegularSeasonDetailedResults.csv")

# Stack regular-season rows first, then tournament rows, under a fresh 0..n-1 index.
dr = pd.concat([dr_reg, dr_tour], ignore_index=True)

Add some engineered features:  field goal %, 3 pointer %, free throw % and assist/(assist+turnover) % for both winner and loser.

In [4]:
# Percentage features for the winner (W*) and the loser (L*) of every game:
# field goal %, 3-pointer %, free throw % and assist/(assist+turnover) %.
# All columns are built in one assign() call instead of eight successive copies.
dr = dr.assign(
    Wfgp=dr.Wfgm / dr.Wfga * 100,
    Wfgp3=dr.Wfgm3 / dr.Wfga3 * 100,
    Lfgp=dr.Lfgm / dr.Lfga * 100,
    Lfgp3=dr.Lfgm3 / dr.Lfga3 * 100,
    Lftp=dr.Lftm / dr.Lfta * 100,
    Wftp=dr.Wftm / dr.Wfta * 100,
    Latop=dr.Last / (dr.Last + dr.Lto) * 100,
    Watop=dr.Wast / (dr.Wast + dr.Wto) * 100,
)
# 0/0 (no attempts) produces NaN; count it as 0 %.
dr = dr.fillna(value=0.0)
# Sort with time and renumber the rows, so that label 0 is the chronologically
# first game (label-based lookups such as dr.at[0, ...] rely on this).
dr = dr.sort_values(by=['Season', 'Daynum']).reset_index(drop=True)

Look at the histograms of the above features for both the winner and the loser.

In [5]:
# Use one bin range that covers BOTH distributions; plt.hist silently drops
# values outside `range`, so taking only the loser min / winner max could lose games.
fg_range = (min(dr.Wfgp.min(), dr.Lfgp.min()), max(dr.Wfgp.max(), dr.Lfgp.max()))
# Keep the per-bin counts and the shared bin edges for the win-probability plot.
(hist_Wfg, bins_fg) = plt.hist(dr.Wfgp, bins=100, range=fg_range)[:-1]
hist_Lfg = plt.hist(dr.Lfgp, bins=100, range=fg_range)[0]
plt.xlabel('field goal %')

In [6]:
# Shared bin range over winners and losers, computed once with vectorised
# min/max instead of concatenating two Python lists four times.
fg3_range = (min(dr.Wfgp3.min(), dr.Lfgp3.min()), max(dr.Wfgp3.max(), dr.Lfgp3.max()))
# Keep the per-bin counts and the shared bin edges for the win-probability plot.
(hist_Wfg3, bins_fg3) = plt.hist(dr.Wfgp3, bins=100, range=fg3_range)[:-1]
hist_Lfg3 = plt.hist(dr.Lfgp3, bins=100, range=fg3_range)[0]
plt.xlabel('3 pointer %')

In [7]:
# Shared bin range over winners and losers, computed once with vectorised
# min/max instead of concatenating two Python lists four times.
ft_range = (min(dr.Wftp.min(), dr.Lftp.min()), max(dr.Wftp.max(), dr.Lftp.max()))
# Keep the per-bin counts and the shared bin edges for the win-probability plot.
(hist_Wft, bins_ft) = plt.hist(dr.Wftp, bins=100, range=ft_range)[:-1]
hist_Lft = plt.hist(dr.Lftp, bins=100, range=ft_range)[0]
plt.xlabel('free throw %')

In [8]:
# Shared bin range over winners and losers, computed once with vectorised
# min/max instead of concatenating two Python lists four times.
atop_range = (min(dr.Watop.min(), dr.Latop.min()), max(dr.Watop.max(), dr.Latop.max()))
# Keep the per-bin counts and the shared bin edges for the win-probability plot.
(hist_Watop, bins_atop) = plt.hist(dr.Watop, bins=100, range=atop_range)[:-1]
hist_Latop = plt.hist(dr.Latop, bins=100, range=atop_range)[0]
# Label the axis like the other three histogram cells do.
plt.xlabel('assist/(assist+turnover) %')

Then look at the winning percentage as a function of the above features.

In [9]:
# Per-bin share of winners: winner count / (winner count + loser count).
# Bins where neither histogram has an entry are 0/0; they are deliberately left
# as NaN, and the resulting RuntimeWarnings are silenced instead of being printed.
with np.errstate(divide='ignore', invalid='ignore'):
    prob_Wfg = hist_Wfg/(hist_Wfg+hist_Lfg)
    prob_Wfg3 = hist_Wfg3/(hist_Wfg3+hist_Lfg3)
    prob_Wft = hist_Wft/(hist_Wft+hist_Lft)
    prob_Wato = hist_Watop/(hist_Watop+hist_Latop)

In [10]:
# 2x2 grid: share of winners per bin, one panel per engineered feature.
plt.figure(1)

# (subplot code, bin edges, win share per bin, x label, draw the y label?)
panels = (
    (221, bins_fg, prob_Wfg, 'field goal %', True),
    (222, bins_fg3, prob_Wfg3, '3 point %', False),
    (223, bins_ft, prob_Wft, 'free throw %', True),
    (224, bins_atop, prob_Wato, 'ass/to %', False),
)
for position, edges, win_share, x_text, label_y in panels:
    plt.subplot(position)
    # There is one more edge than bins; plot each bin at its left edge.
    plt.plot(edges[:-1], win_share, '.')
    plt.xlabel(x_text)
    # Only the left-hand column carries the y-axis label.
    if label_y:
        plt.ylabel('winning %')

plt.tight_layout()

No surprise: field goal, 3-point and free throw percentages, as well as the assist/(assist+turnover) percentage, are all strongly related to winning.
Next we take a look at the best teams since 2003.

In [11]:
# Number of games each team won (one row per team that appears as a winner).
df_wins = (
    dr.groupby('Wteam')
    .size()
    .reset_index()
    .rename(index=str, columns={"Wteam": "Team_Id", 0: 'Wins'})
)

# Number of games each team lost (one row per team that appears as a loser).
df_loss = (
    dr.groupby('Lteam')
    .size()
    .reset_index()
    .rename(index=str, columns={"Lteam": "Team_Id", 0: 'Loss'})
)

In [12]:
# Pair wins and losses by Team_Id rather than by row position: df_wins and df_loss
# only line up row-for-row if exactly the same teams appear in both, otherwise a
# team's wins would be divided by another team's losses.
# The outer join keeps teams missing from one side; their missing count is 0.
df_percent = df_wins.merge(df_loss, on='Team_Id', how='outer').fillna({'Wins': 0, 'Loss': 0})
df_percent['Win %'] = df_percent['Wins']/(df_percent['Wins']+df_percent['Loss'])*100
# Keep only the two columns the later merge on Team_Id expects from this frame.
df_percent = df_percent[['Win %', 'Team_Id']]

In [13]:
# Attach wins, losses and win percentage to the team table, best record first.
df_win = (
    df_team
    .merge(df_wins, on='Team_Id')
    .merge(df_loss, on='Team_Id')
    .merge(df_percent, on='Team_Id')
    .sort_values(by='Win %', ascending=False)
)

In [14]:
# Ten teams with the highest win percentage.
df_win.head(10)

Kansas, Gonzaga and Duke are the top 3 in winning percentage.

Now we are going to use a new feature called the Elo rating. It was invented by Arpad Elo and is used to reflect the strength of teams. Reference: https://en.wikipedia.org/wiki/Elo_rating_system
A base value of 1500 and a width of 400 are used.

In [15]:
# Rating every team starts from, and the value ratings are pulled back toward between seasons.
base_elo = 1500
# Scale of the rating difference in the expected-score formula 1/(1+10**(diff/elo_width)).
elo_width = 400

In [16]:
# Define functions to calculate elo rating and the mean season stats. 
def cal_elo(win_elo_before, lose_elo_before, width=None, k_low=32, k_high=24, k_cutoff=2400):
    """Return the Elo ratings of both teams after one game.

    Parameters
    ----------
    win_elo_before, lose_elo_before : number
        Ratings of the winner and the loser before the game.
    width : number, optional
        Scale of the rating difference in the expected-score formula.
        Defaults to the notebook-level ``elo_width``.
    k_low, k_high : number, optional
        K-factor applied when the winner's rating is below / at or above ``k_cutoff``.
    k_cutoff : number, optional
        Winner rating from which the smaller K-factor ``k_high`` is used.

    Returns
    -------
    (win_elo_after, lose_elo_after)
        The winner's new rating is rounded to the nearest integer; the loser
        gives up exactly the points the winner gained, so the sum is unchanged.
    """
    if width is None:
        # Backward-compatible default: fall back to the notebook-level constant.
        width = elo_width
    # Expected score of the winner given the rating gap before the game.
    expect_a = 1.0/(1+10**((lose_elo_before - win_elo_before)/float(width)))
    # Change k factor for different rating range so that high ratings will not be too sensitive to recent results.
    k_factor = k_low if win_elo_before < k_cutoff else k_high
    win_elo_after = round(win_elo_before + (k_factor * (1 - expect_a)))
    diff_elo = win_elo_after-win_elo_before
    lose_elo_after = lose_elo_before-diff_elo
    return win_elo_after, lose_elo_after

def update_stats(season,team,stat,value):
    """Append `value` to the list kept for `stat` of `team` in `season`.

    Writes into the notebook-level ``season_stat`` dict, laid out as
    ``season_stat[season][team][stat] -> list of values``. The team and stat
    entries are created on first use; the season entry must already exist.
    """
    team_stats = season_stat[season].setdefault(team, {})
    team_stats.setdefault(stat, []).append(value)

In [17]:
# Training samples (feature differences) and their 0/1 labels, filled by the game loop.
x=[]
y=[]
# season_stat[season][team][stat] -> list of per-game values seen so far.
season_stat = {}
# Every team starts from the base rating.
df_team['elo'] = base_elo
# Season of the first row in (time-sorted) order. A positional lookup is used
# because after sorting, index label 0 is not guaranteed to be the first row.
current_season = dr['Season'].iloc[0]
season_stat[current_season] = {}
features = ['fgp','fga','fgp3','fga3','ftp','fta','atop','or','dr','stl','blk']
#Use 11 features plus elo rating to train

In [18]:
# Private, seeded RNG so the random winner/loser label flips (and therefore
# x and y) come out the same every time this cell is run.
label_rng = random.Random(42)

for index, row in dr.iterrows():
    if row['Season'] != current_season:
        # After each season, regress one third of the way toward the base rating
        # to make ratings closer at the start of each season.
        df_team['elo'] -= (df_team['elo']-base_elo)/3
        current_season = row['Season']
        season_stat[current_season] = {}
    # Calculate and update elo for Wteam and Lteam
    win_team = row['Wteam']
    lose_team = row['Lteam']
    # NOTE(review): assumes row i of df_team holds Team_Id 1101+i -- confirm against Teams.csv.
    win_elo_before = df_team.at[win_team-1101,'elo']
    lose_elo_before = df_team.at[lose_team-1101,'elo']
    win_elo,lose_elo = cal_elo(win_elo_before, lose_elo_before)
    df_team.at[win_team-1101,'elo'] = win_elo
    df_team.at[lose_team-1101,'elo'] = lose_elo
    # Give extra 100 elo rating to the home team.
    # This only affects the training feature; the stored ratings were updated above.
    if row.Wloc == 'H':
        win_elo_before += 100
    elif row.Wloc == 'A':
        lose_elo_before += 100
    # Collect training data as the diff of features from the winner and loser.
    x_temp = [win_elo_before-lose_elo_before]
    skip = False
    for feature in features:
        try:
            # Averages over the games played so far this season (this game excluded).
            W_value = np.mean(season_stat[current_season][win_team][feature])
            L_value = np.mean(season_stat[current_season][lose_team][feature])
            x_temp.append(W_value-L_value)
        except KeyError:
            # One of the teams has no recorded game yet this season, so there is
            # no average to compare. Only the missing-key case is caught, so any
            # other error still surfaces.
            skip = True
        # Update stats for this current season
        update_stats(current_season,win_team,feature,row['W'+feature])
        update_stats(current_season,lose_team,feature,row['L'+feature])
    # If the two teams have played at least one game in the season, we collect their average stats
    if not skip:
        # Randomly present the game as winner-minus-loser (label 1) or
        # loser-minus-winner (label 0) so both classes appear in the data.
        if label_rng.random() > 0.5:
            x.append(x_temp)
            y.append(1)
        else:
            x.append(list(-np.array(x_temp)))
            y.append(0)

Now let's look at the teams with the highest Elo ratings. The defending champion, Villanova, is the top school in this list.

In [19]:
# Team table ordered from the highest to the lowest Elo rating.
df_team.sort_values(by='elo',ascending=False)

Now let's train a model on the data using scikit-learn's logistic regression.

In [20]:
# Hold out 20 % of the samples for evaluation. random_state is fixed so that
# the split, and the accuracies reported below, are reproducible between runs.
x_train, x_test, y_train, y_test = train_test_split(np.array(x), np.array(y), test_size=0.2, random_state=42)

In [21]:
# Shapes of the train/test features and labels, in that order.
tuple(part.shape for part in (x_train, y_train, x_test, y_test))

In [22]:
# Logistic Regression
logreg = LogisticRegression()
logreg.fit(x_train,y_train)
acc_logreg = round(logreg.score(x_train, y_train) * 100, 2)
acc_cv = round(logreg.score(x_test, y_test) * 100, 2)
print(acc_logreg,acc_cv)

Collect the average stats from the 2017 season and predict the tournament results.

In [23]:
sub = pd.read_csv("../input/SampleSubmission.csv")
# Each Id is split on "_"; fields 1 and 2 are the two team ids of the matchup.
sub["Team1"] = [int(game_id.split("_")[1]) for game_id in sub["Id"]]
sub["Team2"] = [int(game_id.split("_")[2]) for game_id in sub["Id"]]

In [24]:
# One feature row per requested matchup, laid out like the training rows:
# [elo difference, then the difference of 2017 season averages for each feature].
x_input = []
# Names are collected in lists and assigned as whole columns afterwards,
# instead of growing the frame cell-by-cell with .at inside the loop.
team_names1 = []
team_names2 = []
stats_2017 = season_stat[2017]
for team1, team2 in zip(sub['Team1'], sub['Team2']):
    # NOTE(review): assumes row i of df_team holds Team_Id 1101+i -- confirm against Teams.csv.
    team_names1.append(df_team.at[team1-1101,'Team_Name'])
    team_names2.append(df_team.at[team2-1101,'Team_Name'])
    x_temp = [df_team.at[team1-1101,'elo']-df_team.at[team2-1101,'elo']]
    for feature in features:
        x_temp.append(np.mean(stats_2017[team1][feature])-np.mean(stats_2017[team2][feature]))
    x_input.append(x_temp)
sub['Team_Name1'] = team_names1
sub['Team_Name2'] = team_names2

In [25]:
# predict_proba returns one column per class; column 1 is kept as the prediction.
class_probabilities = logreg.predict_proba(np.array(x_input))
sub['Pred'] = class_probabilities[:, 1]

In [26]:
# First ten matchups with their predicted probabilities.
sub.head(10)

In [27]:
# Keep only the Id and Pred columns, then write them out without the index.
sub = sub.loc[:, ['Id','Pred']]
sub.to_csv("submission.csv", index=False)