# 빅데이터 분석 프로그래밍 과제
## NBA 슛 결과 예측

### 1. 데이터 로딩

In [None]:
import pandas as pd

In [None]:
# Load the training set from a CSV file located in the working directory.
train_df = pd.read_csv("NBA_TRAIN.csv")

### 2. 데이터 분석

In [None]:
# Preview the first rows of the raw training data.
train_df.head()

In [None]:
# Print the frame summary (column dtypes and non-null counts).
train_df.info()

In [None]:
# Show summary statistics using the pandas `describe` defaults.
train_df.describe()

In [None]:
# Print the value frequencies of every text column in the training data.
# Text columns are detected as either object dtype or a dedicated string
# dtype: a bare `dtype == "object"` comparison stops matching once pandas
# infers its string dtype for text, and the loop would then print nothing.
for c in train_df.columns:
    col_dtype = train_df[c].dtype
    is_text = (pd.api.types.is_object_dtype(col_dtype)
               or pd.api.types.is_string_dtype(col_dtype))
    if is_text:
        print ("---- %s ---" % c)
        print (train_df[c].value_counts())

In [None]:
# List the column names of the raw training data.
train_df.columns

### 3. 데이터 전처리

In [None]:

def time_converter(time):
    """Convert a ``"minutes:seconds"`` string into a total number of seconds.

    Args:
        time: Text with exactly one ``":"`` separating two integer fields.

    Returns:
        The integer ``minutes * 60 + seconds``.

    Raises:
        ValueError: If the text does not split into exactly two parts or a
            part is not a valid integer.
    """
    minutes, seconds = time.split(":")
    total_seconds = 60 * int(minutes) + int(seconds)
    return total_seconds

def get_organized_df(df):
    """Select, rename and numerically encode the shot features of ``df``.

    Args:
        df: Raw shot DataFrame. It must contain the feature columns listed in
            ``cols`` below. ``'current shot outcome'`` is optional and is
            carried along (renamed to ``'CSO'``) only when present.

    Returns:
        A new DataFrame with the selected columns under short names, the
        categorical text columns replaced by integer codes, and ``'time'``
        converted from ``"minutes:seconds"`` text to seconds. Values without
        a code and missing values are passed through unchanged, so callers
        can still drop or fill them afterwards. ``df`` is not modified.
    """
    cols = ['self previous shot', 'player position', 'home game', 'location x',
            'opponent previous shot', 'shot type', 'points',
            'location y', 'time', 'time from last shot', 'quarter']
    has_outcome = 'current shot outcome' in df.columns
    if has_outcome:
        cols.append('current shot outcome')

    # Shorten the column names.
    rename_dict = {
        'self previous shot': 'SPS',
        'player position': 'PP',
        'home game': 'HG',
        'location x': 'loc_x',
        'opponent previous shot': 'OPS',
        'shot type': 'ST',
        'location y': 'loc_y',
        'time from last shot': 'TLS',
    }
    if has_outcome:
        rename_dict['current shot outcome'] = 'CSO'
    new_df = df[cols].rename(columns=rename_dict)

    # Integer codes for each categorical text column.
    value_codes = {
        'SPS': {"SCORED": 1, "MISSED": 0},
        'OPS': {"BLOCKED": 2, "SCORED": 1, "MISSED": 0},
        'HG': {"Yes": 1, "No": 0},
        'PP': {"PG": 0, "SG": 1, "SF": 2, "PF": 3, "C": 4, "F": 5, "G": 6},
    }
    if has_outcome:
        value_codes['CSO'] = {"SCORED": 1, "MISSED": 0}

    # Build each encoded column as a new Series instead of writing integers
    # into the existing text column in place: in-place assignment of a
    # non-string value is rejected by pandas' string dtype. Falling back to
    # the original value keeps unknown categories and NaN untouched.
    for col, codes in value_codes.items():
        new_df[col] = new_df[col].map(
            lambda value, codes=codes: codes.get(value, value))

    # Convert "minutes:seconds" text to seconds. Missing values are skipped
    # so they survive until the caller's dropna()/fillna() step instead of
    # crashing inside time_converter.
    new_df['time'] = new_df['time'].map(time_converter, na_action='ignore')

    return new_df

In [None]:
# Preprocess the training data with the function defined above.
new_df = get_organized_df(train_df)

In [None]:
# Preview the first rows of the preprocessed training data.
new_df.head()

In [None]:
# Print the frame summary to see dtypes and non-null counts after preprocessing.
new_df.info()

In [None]:
# Drop every row that contains a missing value, then re-check the summary.
new_df = new_df.dropna()
new_df.info()

### 4. 훈련 및 검증

In [None]:
# Maximum number of rows (taken from the top of the frame) used for modelling.
num_of_row = 80000
# Feature columns fed to the model.
chosen_feature = ['PP','HG','OPS','points','time','TLS','loc_x','loc_y']
# Slice the rows once, then pull the feature matrix and the class labels
# out of the same subset.
selected_rows = new_df.iloc[:num_of_row]
X = selected_rows[chosen_feature].to_numpy()
y = selected_rows['CSO'].to_numpy()

In [None]:
# Display the feature matrix.
X

In [None]:
from sklearn.metrics import accuracy_score, log_loss
from sklearn.tree import DecisionTreeClassifier
from sklearn.model_selection import train_test_split


In [None]:
# Cast the labels to integers before handing them to scikit-learn.
y = y.astype('int')
# Split the data into training and held-out parts (33% held out, fixed seed).
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.33, random_state=42)

# Fit a decision tree classifier. The tree is seeded like the split above so
# that the reported accuracy and the later predictions are reproducible
# across runs; an unseeded tree can differ from run to run.
clf = DecisionTreeClassifier(random_state=42)
clf.fit(X_train, y_train)

name = clf.__class__.__name__
print(name)
print('****Results****')
# NOTE: despite its name, this holds predictions for the held-out split.
train_predictions = clf.predict(X_test)
acc = accuracy_score(y_test, train_predictions)
print("Accuracy: {:.4%}".format(acc))

### 5. 예측

In [None]:
# Load the data to predict on from a CSV file in the working directory.
test_df = pd.read_csv("NBA_TEST.csv")

In [None]:
# Apply the same preprocessing to the prediction data and preview the result.
new_df2 = get_organized_df(test_df)
new_df2.head()

In [None]:
# Fill missing values with 0 in place. Unlike the training data, rows are not
# dropped here, presumably because every row needs a prediction (confirm).
new_df2.fillna(0,inplace=True)

In [None]:
# Select the same feature columns used for training.
# NOTE: this rebinds X, replacing the training feature matrix.
X = new_df2[chosen_feature].to_numpy()


In [None]:
# Display the feature matrix built from the prediction data.
X

In [None]:
# Predict on the new data with the previously trained model (clf).
result = clf.predict(X)


In [None]:
# Display the predicted class labels.
result

In [None]:
# Pair each prediction with its id and turn the numeric classes back into the
# original text labels.
outcome_labels = {1: "SCORED", 0: "MISSED"}
df = pd.DataFrame(result,columns=["current shot outcome"])
df['id'] = test_df['id']
# Build the label column as a new Series rather than writing strings into the
# integer column through .loc: assigning a value of an incompatible dtype in
# place is deprecated in pandas. Any class without a label is left unchanged.
df['current shot outcome'] = df['current shot outcome'].map(
    lambda value: outcome_labels.get(value, value))

In [None]:
# Write the submission file with the id column first and no index column.
df.to_csv("submission.csv",index=False,columns=['id','current shot outcome'])