In [1]:
import pandas as pd

In [2]:
df = pd.read_csv("NBA_TRAIN.csv")
df_test = pd.read_csv("NBA_TEST.csv")

## 1. 잘하거나 꾸준한 선수 feature 생성

In [3]:
len(df["shoot player"].unique())

482

In [4]:
df.loc[df["current shot outcome"]=="SCORED","CSO"] = 1
df.loc[df["current shot outcome"]=="MISSED","CSO"] = 0

In [5]:
total = df.groupby("shoot player").sum()
total_score = total["CSO"].sort_values(ascending=False)

In [6]:
total_score

shoot player
Russell Westbrook     422.0
Karl-Anthony Towns    398.0
Anthony Davis         373.0
LeBron James          363.0
Kyrie Irving          356.0
                      ...  
Brice Johnson           0.0
Gary Neal               0.0
Elijah Millsap          0.0
Aaron Harrison          0.0
Larry Sanders           0.0
Name: CSO, Length: 482, dtype: float64

In [7]:
average = df.groupby("shoot player").mean()
percentage = average["CSO"].sort_values(ascending=False)

In [8]:
pd.DataFrame(percentage).rename(columns={"CSO": "PERCENT"})

Unnamed: 0_level_0,PERCENT
shoot player,Unnamed: 1_level_1
Jarnell Stokes,1.000000
Walter Tavares,0.750000
Tyson Chandler,0.701613
Lucas Nogueira,0.694118
DeAndre Jordan,0.687023
...,...
Mike Tobey,0.000000
Ben Bentil,0.000000
Gary Neal,0.000000
Elijah Millsap,0.000000


In [9]:
df1 = pd.DataFrame(percentage).rename(columns={"CSO": "PERCENT"}).sort_index()
df2 = pd.DataFrame(total_score).sort_index()
df3 = df1.join(df2)
df4 = df3.sort_values(by="CSO",ascending=False)[:100]
top_players = df4.index[:25]
guaranteed_players = df4[df4["PERCENT"] > 0.5].index
print(len(top_players))
print(len(guaranteed_players))

25
25


In [10]:
df["self previous shot"].value_counts()

MISSED    56595
SCORED    47240
Name: self previous shot, dtype: int64

## 2. 포지션 One-hot 인코딩으로 변경

In [11]:

df = pd.read_csv("NBA_TRAIN.csv")
df_test = pd.read_csv("NBA_TEST.csv")




from sklearn.preprocessing import OneHotEncoder
onehotencoder1 = OneHotEncoder()
X = onehotencoder1.fit_transform(df[["player position"]]).toarray()
positions =  pd.DataFrame(X,columns=onehotencoder1.categories_)
df = pd.concat([df, positions], axis=1)
X = onehotencoder1.transform(df_test[["player position"]]).toarray()
positions = pd.DataFrame(X,columns=onehotencoder1.categories_)

## 3. 슛타입 정리

In [12]:
with pd.option_context('display.max_rows', None, 'display.max_columns', None):  # more options can be specified also
    print(df["shot type"].value_counts())

Jump Shot                                    47020
Layup                                         7939
Pullup Jump Shot                              7369
Driving Layup                                 6682
Floating Jump Shot                            2274
Step Back Jump Shot                           2263
Hook Shot                                     2085
Tip Layup Shot                                2029
Cutting Layup Shot                            1928
Running Layup                                 1825
Turnaround Jump Shot                          1621
Driving Floating Jump Shot                    1591
Dunk                                          1385
Fadeaway Jumper                               1367
Putback Layup                                 1132
Driving Finger Roll Layup                     1119
Cutting Dunk Shot                             1041
Running Jump Shot                              920
Reverse Layup                                  897
Turnaround Hook Shot           

In [13]:
def clean(shottype):
    shot_type = shottype.split("'")[-1]
    shot_type = shot_type.split(".")[-1]
    shot_type = shot_type.replace(" ","").lower()
    shot_type = shot_type.replace("morris","").replace("iii","").replace("amoute","").replace("iv","").replace("ii","")

    if "bank" in shot_type:
        return "bankshot"
    elif "putback" in shot_type:
        return "putback"
    elif "3pt" in shot_type:
        return "3ptshot"
    elif "cutting" in shot_type:
        return "cutting"
    elif "alleyoop" in shot_type:
        return "alleyoop"
    else:
        return shot_type
with pd.option_context('display.max_rows', None, 'display.max_columns', None):  # more options can be specified also
    print(df["shot type"].apply(clean).value_counts())

jumpshot                  47612
layup                      8142
pullupjumpshot             7511
dringlayup                 6792
cutting                    3262
floatingjumpshot           2317
stepbackjumpshot           2304
bankshot                   2229
hookshot                   2098
tiplayupshot               2089
runninglayup               1874
turnaroundjumpshot         1672
dringfloatingjumpshot      1613
putback                    1487
dunk                       1428
fadeawayjumper             1410
alleyoop                   1278
dringfingerrolllayup       1134
3ptshot                     942
runningjumpshot             933
reverselayup                929
turnaroundhookshot          899
dringreverselayup           859
dringdunk                   747
runningdunk                 683
dringhookshot               655
turnaroundfadeawayshot      467
turnaroundfadeaway          455
runningpull-upjumpshot      301
fingerrolllayup             294
runningfingerrolllayup      243
tipdunks

## 전처리 최종

In [14]:
from sklearn.preprocessing import LabelEncoder
from sklearn.preprocessing import LabelBinarizer


df = pd.read_csv("NBA_TRAIN.csv")
df_test = pd.read_csv("NBA_TEST.csv")

def is_topplayer(x):
    if x in top_players:
        return 1
    else :
        return 0
def is_guaranteed_players(x):
    if x in guaranteed_players:
        return 1
    else :
        return 0
def get_distance(x, y):
    new_x = min(abs((933)-x),x)
    new_y = abs((500/2)-y)
    return (new_x**2+new_y**2)**0.5
def time_to_sec(time):
    m,s = time.split(":")
    return int(m)*60+int(s)

def clean(shottype):
    shot_type = shottype.split("'")[-1]
    shot_type = shot_type.split(".")[-1]
    shot_type = shot_type.replace(" ","").lower()
    shot_type = shot_type.replace("morris","").replace("iii","").replace("amoute","").replace("iv","").replace("ii","")

    if "bank" in shot_type:
        return "bankshot"
    elif "putback" in shot_type:
        return "putback"
    elif "3pt" in shot_type:
        return "3ptshot"
    elif "cutting" in shot_type:
        return "cutting"
    elif "alleyoop" in shot_type:
        return "alleyoop"
    else:
        return shot_type
    
    
    return shot_type



enc = LabelEncoder()
X = enc.fit_transform(df["home team"])
df["home team"] = X
X = enc.transform(df_test["home team"])
df_test["home team"] = X
X = enc.transform(df["away team"])
df["away team"] = X
X = enc.transform(df_test["away team"])
df_test["away team"] = X
enc2 = LabelEncoder()
df["shot type"] = df["shot type"].apply(clean)
X = enc2.fit_transform(df["shot type"])
df["shot type"] = X
df_test["shot type"] = df_test["shot type"].apply(clean)
X = enc2.transform(df_test["shot type"])
df_test["shot type"] = X

def preprocesing(df):
    median = df["time from last shot"].median()
    df["time from last shot"].fillna(median, inplace=True)
    median = df["location x"].median()
    df["location x"].fillna(median, inplace=True)
    median = df["location y"].median()
    df["location y"].fillna(median, inplace=True)
    
    df["distance"] = df.apply(lambda x: get_distance(x["location x"], x["location y"]), axis=1)

    df["is_topplayer"] = df["shoot player"].apply(is_topplayer)
    df["is_guaranteed_players"] = df["shoot player"].apply(is_topplayer)


    df.loc[df['home game']!="Yes","HG"] = 0
    df.loc[df['home game']=="Yes","HG"] = 1
    df.loc[df['self previous shot']!="SCORED","SPS"] = 0
    df.loc[df['self previous shot']=="SCORED","SPS"] = 1
    df.loc[df['opponent previous shot']!="SCORED","OPS"] = 0
    df.loc[df['opponent previous shot']=="SCORED","OPS"] = 1
    

    try:
        df.loc[df['current shot outcome']!="SCORED","CSO"] = 0
        df.loc[df['current shot outcome']=="SCORED","CSO"] = 1
        df.drop(columns=["current shot outcome"], inplace=True)
    except:
        pass
    
    try:
        df.drop(columns=["id"], inplace=True)
    except:
        pass



    df['day'] = pd.to_datetime(df['date']).dt.dayofweek 
    df.drop(columns="date",axis=1, inplace=True)
    df.drop(columns="shoot player",axis=1, inplace=True)
    df['seconds'] = df['time'].apply(time_to_sec)
    df.drop(columns=["time"], inplace=True)
    
    

    df.drop(columns=["self previous shot", "opponent previous shot"], inplace=True)
    df.drop(columns=["home game"], inplace=True)
    df = pd.get_dummies(df)
    return df
new_df_test = preprocesing(df_test)
new_df = preprocesing(df)

In [15]:
new_df.head()

Unnamed: 0,location x,home team,shot type,points,away team,location y,time from last shot,quarter,distance,is_topplayer,...,CSO,day,seconds,player position_C,player position_F,player position_G,player position_PF,player position_PG,player position_SF,player position_SG
0,676.0,18,17,2,4,225.0,9.0,1,258.21309,0,...,1.0,6,648,0,0,0,1,0,0,0
1,59.0,13,18,2,6,230.0,45.0,1,62.297673,0,...,0.0,3,110,0,0,0,0,1,0,0
2,50.0,2,18,2,3,269.0,46.0,3,53.488316,0,...,1.0,4,246,1,0,0,0,0,0,0
3,194.0,13,20,2,24,357.0,31.0,4,221.551348,0,...,1.0,1,13,0,0,0,0,0,0,1
4,35.0,14,17,2,19,449.0,27.0,3,202.054448,0,...,0.0,4,698,0,0,0,1,0,0,0


In [16]:
new_df.columns

Index(['location x', 'home team', 'shot type', 'points', 'away team',
       'location y', 'time from last shot', 'quarter', 'distance',
       'is_topplayer', 'is_guaranteed_players', 'HG', 'SPS', 'OPS', 'CSO',
       'day', 'seconds', 'player position_C', 'player position_F',
       'player position_G', 'player position_PF', 'player position_PG',
       'player position_SF', 'player position_SG'],
      dtype='object')

In [17]:
train_y = new_df['CSO']
train_X = new_df.drop("CSO", axis=1) # 훈련 세트를 위해 레이블 삭제
test_X = new_df_test

In [18]:
train_X.shape

(105036, 23)

In [19]:
test_X.shape

(105036, 23)

In [20]:
new_df.corr()["CSO"].sort_values()

distance                -0.198998
points                  -0.137947
shot type               -0.082057
time from last shot     -0.043894
player position_SG      -0.021611
player position_G       -0.021242
player position_PG      -0.017733
quarter                 -0.016209
player position_SF      -0.014868
OPS                     -0.014041
seconds                 -0.014024
SPS                     -0.008499
player position_F       -0.005570
location x              -0.001357
day                     -0.000700
home team                0.001715
away team                0.001719
location y               0.002802
is_topplayer             0.008259
is_guaranteed_players    0.008259
HG                       0.008688
player position_PF       0.014180
player position_C        0.060157
CSO                      1.000000
Name: CSO, dtype: float64

## 4. 정규화로 마무리

In [21]:
from sklearn.preprocessing import MinMaxScaler
scaler = MinMaxScaler()
train_X = scaler.fit_transform(train_X)
test_X = scaler.transform(test_X)

## 모든 Classifier 한번 테스트

In [22]:
from sklearn.utils import all_estimators

estimators = all_estimators(type_filter='classifier')

all_regs = []
for name, RegressorClass in estimators:
    
    try:
        reg = RegressorClass()
        all_regs.append(reg)
        print(name)
    except:
        pass

AdaBoostClassifier
BaggingClassifier
BernoulliNB
CalibratedClassifierCV
CategoricalNB
ComplementNB
DecisionTreeClassifier
DummyClassifier
ExtraTreeClassifier
ExtraTreesClassifier
GaussianNB
GaussianProcessClassifier
GradientBoostingClassifier
HistGradientBoostingClassifier
KNeighborsClassifier
LabelPropagation
LabelSpreading
LinearDiscriminantAnalysis
LinearSVC
LogisticRegression
LogisticRegressionCV
MLPClassifier
MultinomialNB
NearestCentroid
NuSVC
PassiveAggressiveClassifier
Perceptron
QuadraticDiscriminantAnalysis
RadiusNeighborsClassifier
RandomForestClassifier
RidgeClassifier
RidgeClassifierCV
SGDClassifier
SVC


## 베이스라인 모델로 검증
- 전처리 또는 feature engineering을 변경하면서 이구간에서 최대한 성능 향상

In [23]:
from sklearn.model_selection import cross_val_score
from sklearn.model_selection import GridSearchCV
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score
from sklearn.ensemble import HistGradientBoostingClassifier

rfc=RandomForestClassifier(random_state=42)

scores = cross_val_score(rfc, train_X, train_y, cv=5)
print(scores)

[0.64975248 0.65135431 0.64159566 0.64559433 0.64435664]


## 여러 Classifier 검증

In [24]:
results = []
SUPER_SLOW_REGRESSION = ["SVC","NuSVC","GaussianProcessClassifier","LabelPropagation","LabelSpreading"]
for reg in all_regs:
    reg_name = reg.__class__.__name__ 
    if reg_name not in SUPER_SLOW_REGRESSION:
        try:

            scores = cross_val_score(reg, train_X, train_y,  cv=5)

            if not scores.mean():
                break
            print("{}: Score {}".format(reg.__class__.__name__, scores.mean()))
            result = {
                "Name":reg.__class__.__name__, 
                "Score":scores.mean()
            }
            results.append(result)
        except Exception as e:
            print(e)
            pass
    
    

AdaBoostClassifier: Score 0.645331062985533
BaggingClassifier: Score 0.6209966183433362
BernoulliNB: Score 0.5623691025850702
CalibratedClassifierCV: Score 0.5988898565926107
index 1 is out of bounds for axis 1 with size 1
ComplementNB: Score 0.5610076416699457
DecisionTreeClassifier: Score 0.5712137054304351
DummyClassifier: Score 0.5051410863263184




ExtraTreeClassifier: Score 0.5491546370446061
ExtraTreesClassifier: Score 0.6291747154894257
GaussianNB: Score 0.5865703004703289
GradientBoostingClassifier: Score 0.6521953839707658
HistGradientBoostingClassifier: Score 0.6592691720900635
KNeighborsClassifier: Score 0.5541718617458635
LinearDiscriminantAnalysis: Score 0.5987184896769078
LinearSVC: Score 0.5987184883173349
LogisticRegression: Score 0.5988803368630142
LogisticRegressionCV: Score 0.599870492521787




MLPClassifier: Score 0.6332399762948491
MultinomialNB: Score 0.5698998408973385
NearestCentroid: Score 0.5503160617298313
PassiveAggressiveClassifier: Score 0.5156904046346427
Perceptron: Score 0.5249723009676788




QuadraticDiscriminantAnalysis: Score 0.5241537193328841
No neighbors found for test samples array([15376]), you can try using larger radius, giving a label for outliers, or considering removing them from your dataset.
RandomForestClassifier: Score 0.644788394892197
RidgeClassifier: Score 0.5987470502252703
RidgeClassifierCV: Score 0.5987470502252703
SGDClassifier: Score 0.5951387754149045


## 가장 잘 나온 모델 튜닝
- HistGradientBoostingClassifier

In [None]:
param_grid = { 
    'learning_rate':(0.1,0.01,0.005),
    'max_iter':(1000,2000),
    'max_leaf_nodes':(None,50,60,70),
    'tol':(1e-6,1e-7,1e-8)
}


#instantiate the gridsearch
hbc = HistGradientBoostingClassifier()
hgb_grid = GridSearchCV(estimator =hbc, param_grid= param_grid, n_jobs=-1, 
 cv=5, scoring='accuracy',
 verbose=3, refit=True)
#fit on the grid 
hgb_grid.fit(train_X, train_y)

Fitting 5 folds for each of 72 candidates, totalling 360 fits


[Parallel(n_jobs=-1)]: Using backend LokyBackend with 4 concurrent workers.
[Parallel(n_jobs=-1)]: Done  24 tasks      | elapsed:  3.3min
[Parallel(n_jobs=-1)]: Done 120 tasks      | elapsed:  8.1min


In [None]:
# Print the best parameters found
print(hgb_grid.best_params_)
# Print the best scores found
print(hgb_grid.best_score_)

## 최종 예측

In [None]:
hbc = HistGradientBoostingClassifier(learning_rate= 0.01, max_iter=1000, max_leaf_nodes=50, tol=1e-07, random_state=42)
hbc.fit(train_X, train_y)

In [None]:

predicts = hbc.predict(test_X)

In [None]:
predicts

In [None]:
result = pd.DataFrame(predicts, columns=["current shot outcome"])

In [None]:
result["id"] = result.index
result

In [None]:
result["current shot outcome"].value_counts()

In [None]:
result.loc[result["current shot outcome"]==1,"current shot outcome"]= "SCORED"

In [None]:
result.loc[result["current shot outcome"]==0,"current shot outcome"]= "MISSED"

In [None]:
result

In [None]:
result = result[['id',"current shot outcome"]]
result

In [None]:
result.to_csv("submission.csv",index=False)