# 빅데이터 분석 프로그래밍 과제
## NBA 슛 결과 예측

### 1. 데이터 로딩

In [1]:
import pandas as pd

In [2]:
train_df = pd.read_csv("NBA_TRAIN.csv")

### 2. 데이터 분석

In [3]:
train_df.head()

Unnamed: 0,self previous shot,player position,home game,location x,opponent previous shot,home team,shot type,points,away team,location y,time,date,shoot player,time from last shot,quarter,current shot outcome
0,MISSED,PF,No,676.0,MISSED,NOP,Jump Shot,2,CHI,225.0,10:48,2017-04-02,Bobby Portis,9.0,1,SCORED
1,MISSED,PG,Yes,59.0,SCORED,LAL,Layup,2,DAL,230.0,1:50,2016-12-29,D'Angelo Russell,45.0,1,MISSED
2,SCORED,C,No,50.0,SCORED,BRO,Layup,2,CHA,269.0,4:06,2016-11-04,Cody Zeller,46.0,3,SCORED
3,SCORED,SG,No,194.0,MISSED,LAL,Pullup Jump Shot,2,POR,357.0,0:13,2017-01-10,Allen Crabbe,,4,SCORED
4,SCORED,PF,No,35.0,SCORED,MEM,Jump Shot,2,NYK,449.0,11:38,2017-04-07,Kyle O'Quinn,27.0,3,MISSED


In [4]:
train_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 105036 entries, 0 to 105035
Data columns (total 16 columns):
self previous shot        103835 non-null object
player position           105036 non-null object
home game                 105036 non-null object
location x                104834 non-null float64
opponent previous shot    104260 non-null object
home team                 105036 non-null object
shot type                 105036 non-null object
points                    105036 non-null int64
away team                 105036 non-null object
location y                104834 non-null float64
time                      105036 non-null object
date                      105036 non-null object
shoot player              105036 non-null object
time from last shot       100068 non-null float64
quarter                   105036 non-null int64
current shot outcome      105036 non-null object
dtypes: float64(3), int64(2), object(11)
memory usage: 12.8+ MB


In [5]:
train_df.describe()

Unnamed: 0,location x,points,location y,time from last shot,quarter
count,104834.0,105036.0,104834.0,100068.0,105036.0
mean,468.636568,2.315949,249.394462,33.2353,2.482815
std,345.224986,0.464895,111.120327,19.804537,1.137619
min,0.0,2.0,0.0,0.0,1.0
25%,94.0,2.0,197.0,22.0,1.0
50%,342.0,2.0,250.0,31.0,2.0
75%,843.0,3.0,302.0,41.0,3.0
max,933.0,3.0,500.0,228.0,8.0


In [6]:
for c in train_df.columns:
    if train_df[c].dtype=="object":
        print ("---- %s ---" % c)
        print (train_df[c].value_counts())

---- self previous shot ---
MISSED    56595
SCORED    47240
Name: self previous shot, dtype: int64
---- player position ---
SG    23310
PG    20643
SF    17966
PF    17333
C     16512
G      4719
F      4553
Name: player position, dtype: int64
---- home game ---
No     52523
Yes    52513
Name: home game, dtype: int64
---- opponent previous shot ---
SCORED     54452
MISSED     45376
BLOCKED     4432
Name: opponent previous shot, dtype: int64
---- home team ---
NOP    3685
HOU    3665
NYK    3627
DET    3625
BRO    3619
GSW    3610
DEN    3599
PHX    3569
OKL    3550
CLE    3549
CHI    3541
BOS    3541
ATL    3539
PHI    3534
ORL    3528
LAL    3525
CHA    3500
IND    3487
TOR    3471
POR    3451
WAS    3443
LAC    3429
SAS    3409
MIA    3405
SAC    3396
DAL    3382
MIN    3381
UTA    3344
MIL    3328
MEM    3304
Name: home team, dtype: int64
---- shot type ---
Jump Shot                                47020
Layup                                     7939
Pullup Jump Shot                 

In [7]:
train_df.columns

Index(['self previous shot', 'player position', 'home game', 'location x',
       'opponent previous shot', 'home team', 'shot type', 'points',
       'away team', 'location y', 'time', 'date', 'shoot player',
       'time from last shot', 'quarter', 'current shot outcome'],
      dtype='object')

### 3. 데이터 전처리

In [8]:

def time_converter(time):
    m,s = time.split(":")
    return int(m)*60+int(s)

def get_organized_df(df):
    cols = ['self previous shot', 'player position', 'home game', 'location x',
       'opponent previous shot', 'shot type', 'points',
        'location y', 'time', 'time from last shot', 'quarter']
    if 'current shot outcome' in df.columns:
        cols.append('current shot outcome')
    new_df = df[cols]
    
    rename_dict = {
    'self previous shot':'SPS', 
    'player position':'PP', 
    'home game':'HG', 
    'location x':'loc_x',
    'opponent previous shot':'OPS', 
    'shot type':'ST', 
    'location y':'loc_y', 
    'time from last shot':'TLS'}
    if 'current shot outcome' in df.columns:
        rename_dict['current shot outcome']='CSO'
    #컬럼 명을 짧게 변경
    new_df = new_df.rename(columns=rename_dict)
    
    #텍스트로 된 값을 숫자로 변경
    new_df.loc[new_df['SPS']=="SCORED",'SPS'] = 1
    new_df.loc[new_df['SPS']=="MISSED",'SPS'] = 0
    new_df.loc[new_df['OPS']=="BLOCKED",'OPS'] = 2
    new_df.loc[new_df['OPS']=="SCORED",'OPS'] = 1
    new_df.loc[new_df['OPS']=="MISSED",'OPS'] = 0
    if 'current shot outcome' in df.columns:
        new_df.loc[new_df['CSO']=="SCORED",'CSO'] = 1
        new_df.loc[new_df['CSO']=="MISSED",'CSO'] = 0
    new_df.loc[new_df['HG']=="Yes",'HG'] = 1
    new_df.loc[new_df['HG']=="No",'HG'] = 0
    new_df.loc[new_df['PP']=="PG",'PP'] = 0
    new_df.loc[new_df['PP']=="SG",'PP'] = 1
    new_df.loc[new_df['PP']=="SF",'PP'] = 2
    new_df.loc[new_df['PP']=="PF",'PP'] = 3
    new_df.loc[new_df['PP']=="C",'PP'] = 4
    new_df.loc[new_df['PP']=="F",'PP'] = 5
    new_df.loc[new_df['PP']=="G",'PP'] = 6
    
    #분:초 형태로 된 값을 초로 변경
    new_df['time'] = new_df['time'].apply(time_converter)
    
    
    return new_df

In [9]:
# 위에서 정의한 함수를 적용해서 데이터 전처리
new_df = get_organized_df(train_df)

In [10]:
new_df.head()

Unnamed: 0,SPS,PP,HG,loc_x,OPS,ST,points,loc_y,time,TLS,quarter,CSO
0,0,3,0,676.0,0,Jump Shot,2,225.0,648,9.0,1,1
1,0,0,1,59.0,1,Layup,2,230.0,110,45.0,1,0
2,1,4,0,50.0,1,Layup,2,269.0,246,46.0,3,1
3,1,1,0,194.0,0,Pullup Jump Shot,2,357.0,13,,4,1
4,1,3,0,35.0,1,Jump Shot,2,449.0,698,27.0,3,0


In [11]:
new_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 105036 entries, 0 to 105035
Data columns (total 12 columns):
SPS        103835 non-null object
PP         105036 non-null int64
HG         105036 non-null int64
loc_x      104834 non-null float64
OPS        104260 non-null object
ST         105036 non-null object
points     105036 non-null int64
loc_y      104834 non-null float64
time       105036 non-null int64
TLS        100068 non-null float64
quarter    105036 non-null int64
CSO        105036 non-null int64
dtypes: float64(3), int64(6), object(3)
memory usage: 9.6+ MB


In [12]:
#결측값 제거
new_df = new_df.dropna()
new_df.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 99685 entries, 0 to 105035
Data columns (total 12 columns):
SPS        99685 non-null object
PP         99685 non-null int64
HG         99685 non-null int64
loc_x      99685 non-null float64
OPS        99685 non-null object
ST         99685 non-null object
points     99685 non-null int64
loc_y      99685 non-null float64
time       99685 non-null int64
TLS        99685 non-null float64
quarter    99685 non-null int64
CSO        99685 non-null int64
dtypes: float64(3), int64(6), object(3)
memory usage: 9.9+ MB


### 4. 훈련 및 검증

In [13]:
#데이터 수 선택
num_of_row = 80000
#Feature 선택
chosen_feature = ['PP','HG','OPS','points','time','TLS','loc_x','loc_y']
X = new_df[chosen_feature].to_numpy()[:num_of_row]
#클래스 선택
y = new_df['CSO'].to_numpy()[:num_of_row]

In [14]:
X

array([[3, 0, 0, ..., 9.0, 676.0, 225.0],
       [0, 1, 1, ..., 45.0, 59.0, 230.0],
       [4, 0, 1, ..., 46.0, 50.0, 269.0],
       ...,
       [1, 1, 0, ..., 31.0, 264.0, 382.0],
       [1, 0, 1, ..., 44.0, 84.0, 87.0],
       [3, 0, 1, ..., 42.0, 53.0, 260.0]], dtype=object)

In [15]:
from sklearn.metrics import accuracy_score, log_loss
from sklearn.tree import DecisionTreeClassifier
from sklearn.model_selection import train_test_split


In [16]:
y=y.astype('int')
#학습/테스트 용으로 데이터 분리
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.33, random_state=42)

#Decision Tree Classifier적용
clf = DecisionTreeClassifier()
clf.fit(X_train, y_train)

name = clf.__class__.__name__
print(name)
print('****Results****')
train_predictions = clf.predict(X_test)
acc = accuracy_score(y_test, train_predictions)
print("Accuracy: {:.4%}".format(acc))

DecisionTreeClassifier
****Results****
Accuracy: 53.6818%


### 5. 예측

In [17]:
#예측할 데이터 불러오기
test_df = pd.read_csv("NBA_TEST.csv")

In [18]:
#마찬가지로 전처리 적용
new_df2 = get_organized_df(test_df)
new_df2.head()

Unnamed: 0,SPS,PP,HG,loc_x,OPS,ST,points,loc_y,time,TLS,quarter
0,,3,1,107.0,1,Floating Jump Shot,2,252.0,51,,1
1,0.0,4,1,52.0,1,Cutting Dunk Shot,2,250.0,104,30.0,1
2,0.0,1,1,225.0,0,Jump Shot,3,447.0,160,24.0,1
3,0.0,0,1,62.0,1,Layup,2,234.0,236,34.0,1
4,1.0,1,1,102.0,0,Floating Jump Shot,2,184.0,258,22.0,1


In [19]:
# 결측값 채우기
new_df2.fillna(0,inplace=True)

In [20]:
# Feature 선택
X = new_df2[chosen_feature].to_numpy()


In [21]:
X

array([[  3.,   1.,   1., ...,   0., 107., 252.],
       [  4.,   1.,   1., ...,  30.,  52., 250.],
       [  1.,   1.,   0., ...,  24., 225., 447.],
       ...,
       [  1.,   1.,   0., ...,  50., 701.,  86.],
       [  4.,   1.,   0., ...,   2., 888., 250.],
       [  3.,   1.,   0., ...,  76., 660., 318.]])

In [22]:
# 미리 훈련된 model(clf)를 갖고 새로 들어온 데이터에 대해 예측
result = clf.predict(X)


In [23]:
result

array([1, 1, 0, ..., 0, 1, 0])

In [24]:
# 결과 값과 id 값 매칭
df = pd.DataFrame(result,columns=["current shot outcome"])
df['id'] = test_df['id']
df.loc[df['current shot outcome']==1,'current shot outcome'] = "SCORED"
df.loc[df['current shot outcome']==0,'current shot outcome'] = "MISSED"

In [25]:
# 제출할 파일 생성
df.to_csv("submission.csv",index=False,columns=['id','current shot outcome'])