# Logistic Regression 과제

#### 데이터 출처

moneyball 데이터

In [1]:
from sklearn.linear_model import LogisticRegression
import matplotlib.pyplot as plt
import pandas as pd
import numpy as np
import seaborn as sns
import warnings
from statsmodels.stats.outliers_influence import variance_inflation_factor
# from sklearn.decomposition import PCA
from sklearn.model_selection import train_test_split

plt.style.use('seaborn')
warnings.filterwarnings('ignore')
%matplotlib inline

In [2]:
data_dir = '../../data/moneyball/baseball.csv'
df_1 = pd.read_csv(data_dir)
df_1.head()

Unnamed: 0,Team,League,Year,RS,RA,W,OBP,SLG,BA,Playoffs,RankSeason,RankPlayoffs,G,OOBP,OSLG
0,ARI,NL,2012,734,688,81,0.328,0.418,0.259,0,,,162,0.317,0.415
1,ATL,NL,2012,700,600,94,0.32,0.389,0.247,1,4.0,5.0,162,0.306,0.378
2,BAL,AL,2012,712,705,93,0.311,0.417,0.247,1,5.0,4.0,162,0.315,0.403
3,BOS,AL,2012,734,806,69,0.315,0.415,0.26,0,,,162,0.331,0.428
4,CHC,NL,2012,613,759,61,0.302,0.378,0.24,0,,,162,0.335,0.424


* RS : Runs Scored - 득점? 
* RA : Rns Allowed - 피득점?
* W : Wins
* OBP : On-Base Percentage - how frequently a batter reaches base 출루율
* SLG: Slugging Percentage - 베이스 별로 가중치를 따로 두고 전체 합산 한 후 total base로 나눔
* BA : Batting Average - 타율 
* G : Games Played
* OOBP : Opponent On-Base Percentage 
* OSLG : Opponent Slugging Percentage

* target data : Playoffs

## EDA & Preprocessing

In [3]:
df_1.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1232 entries, 0 to 1231
Data columns (total 15 columns):
Team            1232 non-null object
League          1232 non-null object
Year            1232 non-null int64
RS              1232 non-null int64
RA              1232 non-null int64
W               1232 non-null int64
OBP             1232 non-null float64
SLG             1232 non-null float64
BA              1232 non-null float64
Playoffs        1232 non-null int64
RankSeason      244 non-null float64
RankPlayoffs    244 non-null float64
G               1232 non-null int64
OOBP            420 non-null float64
OSLG            420 non-null float64
dtypes: float64(7), int64(6), object(2)
memory usage: 144.5+ KB


* 팀, 리그 등의 범주형 변수, 그 외 연속형 변수처럼 보이는 feature들이 있습니다.
* 결측치가 존재하는 feature들도 보입니다.

In [4]:
df_1.describe()

Unnamed: 0,Year,RS,RA,W,OBP,SLG,BA,Playoffs,RankSeason,RankPlayoffs,G,OOBP,OSLG
count,1232.0,1232.0,1232.0,1232.0,1232.0,1232.0,1232.0,1232.0,244.0,244.0,1232.0,420.0,420.0
mean,1988.957792,715.081981,715.081981,80.904221,0.326331,0.397342,0.259273,0.198052,3.122951,2.717213,161.918831,0.332264,0.419743
std,14.819625,91.534294,93.079933,11.458139,0.015013,0.033267,0.012907,0.398693,1.738349,1.095234,0.624365,0.015295,0.02651
min,1962.0,463.0,472.0,40.0,0.277,0.301,0.214,0.0,1.0,1.0,158.0,0.294,0.346
25%,1976.75,652.0,649.75,73.0,0.317,0.375,0.251,0.0,2.0,2.0,162.0,0.321,0.401
50%,1989.0,711.0,709.0,81.0,0.326,0.396,0.26,0.0,3.0,3.0,162.0,0.331,0.419
75%,2002.0,775.0,774.25,89.0,0.337,0.421,0.268,0.0,4.0,4.0,162.0,0.343,0.438
max,2012.0,1009.0,1103.0,116.0,0.373,0.491,0.294,1.0,8.0,5.0,165.0,0.384,0.499


* OOBP와 OSLG에 결측치가 너무나도 많습니다. 섣불리 drop하기보다 통계치를 활용하여 전처리하겠습니다.
* 범주형 변수들은 인코딩해주고 나머지 분석에 필요없어 보이는 feature들을 걸러내겠습니다.
* 또한 Win 변수 외에 Playoff진출에 영향을 주는 feature들을 발견하기 위하여 W를 제거하겠습니다.

In [5]:
print(df_1.isnull().sum())
df_1['OOBP'].fillna(df_1['OOBP'].mean(), inplace=True)
df_1['OSLG'].fillna(df_1['OSLG'].mean(), inplace=True)
df_1.League.replace({'AL':0, 'NL':1}, inplace=True)
drop_list = ['Team','Year','RankSeason','RankPlayoffs','G','W']
df_1.drop(drop_list, axis=1, inplace=True)
df_1.head()

Team              0
League            0
Year              0
RS                0
RA                0
W                 0
OBP               0
SLG               0
BA                0
Playoffs          0
RankSeason      988
RankPlayoffs    988
G                 0
OOBP            812
OSLG            812
dtype: int64


Unnamed: 0,League,RS,RA,OBP,SLG,BA,Playoffs,OOBP,OSLG
0,1,734,688,0.328,0.418,0.259,0,0.317,0.415
1,1,700,600,0.32,0.389,0.247,1,0.306,0.378
2,0,712,705,0.311,0.417,0.247,1,0.315,0.403
3,0,734,806,0.315,0.415,0.26,0,0.331,0.428
4,1,613,759,0.302,0.378,0.24,0,0.335,0.424


In [6]:
from sklearn.model_selection import train_test_split

df_y = df_1['Playoffs']
df_x = df_1.drop(['Playoffs'], axis=1)
X_train, X_test, y_train, y_test = train_test_split(df_x, df_y, test_size = 0.2, random_state = 0)

# Q1. train_test_split module
Q. train_test_split() 함수에 들어가는 각각의 인자 값은 무엇을 의미하는가?
train_test_split(arrays, test_size, random_state)
- arrays : 독립변수와 종속변수 배열들.
- test_size : 검증용 데이터 개수. 1보다 작은 실수이면 비율을 나타낸다. train_size는 학습용 데이터 개수로, 둘 중 하나만 있으면 된다.
- random_state : 난수 시드

In [7]:
from sklearn.preprocessing import StandardScaler
sc_X = StandardScaler()
X_train = sc_X.fit_transform(X_train)
X_test = sc_X.transform(X_test)
warnings.filterwarnings(action='once')

## Q2. Scaling
Scaling을 통해 우리가 하고자 하는 것은 무엇인가요? 

- 스케일링은 자료 집합에 적용되는 전처리 과정으로 모든 자료에 선형 변환을 적용하여 전체 자료의 분포를 평균 0, 분산 1이 되도록 만드는 과정이다.스케일링은 자료의 오버플로우(overflow)나 언더플로우(underflow)를 방지하고 독립 변수의 공분산 행렬의 조건수(condition number)를 감소시켜 최적화 과정에서의 안정성 및 수렴 속도를 향상시킨다.



## Q3. LogisticRegression() 모델을 만들어주세요. 그리고 만든 모델 인자값에 들어가는 값들의 의미를 설명해주세요.
- random_state : 데이터를 섞을 때 사용하는 랜덤 번호 생성기의 시드 값
- solver : 최적화 알고리즘 선택 {‘newton-cg’, ‘lbfgs’, ‘liblinear’, ‘sag’, ‘saga’}
- multi_class :  binary problem -> 'ovr', else ->‘multinomial'

In [8]:
from sklearn.linear_model import LogisticRegression
LR = LogisticRegression(random_state=0, solver='liblinear', multi_class='ovr').fit(X_train, y_train) 

In [9]:
LR.predict(X_test) 

array([1, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 1, 0, 0,
       0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
       1, 0, 1, 0, 0, 0, 1, 0, 1, 0, 0, 0, 1, 0, 0, 0, 0, 0, 1, 0, 0, 0,
       0, 0, 0, 0, 0, 1, 0, 0, 1, 0, 1, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0,
       0, 0, 0, 0, 0, 1, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0,
       0, 1, 0, 1, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0,
       0, 1, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
       0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 1, 1, 0, 0,
       0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 0, 0, 0,
       0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 0, 1, 1,
       0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0,
       0, 0, 0, 1, 0], dtype=int64)

In [13]:
print('Train : ', LR.score(X_train, y_train))
print('Test : ', LR.score(X_test, y_test))

Train :  0.8746618575293057
Test :  0.8536585365853658


## Q4. data를 교차검증 해주세요.(10-fold cross_validation)

- 10-fold cross_validation을 위한 인자값을 입력해주세요.
- kfold = selection.KFold("교차검증을 위한 인자 만들기")
- 교차검증 결과를 출력하고 해석합니다.


In [14]:
from sklearn.model_selection import KFold
kf = KFold(n_splits=10, shuffle=True, random_state=0)

# count = 0
train = []
test = []
for i, j  in kf.split(df_x):
#     print('Iter: %d'%(count))
#     count += 1
    
    X_train, X_test = df_x.iloc[i], df_x.iloc[j]
    y_train, y_test = df_y.iloc[i], df_y.iloc[j]
#     print(LR.score(X_train, y_train))
#     print(LR.score(X_test, y_test))
    train.append(LR.score(X_train, y_train))
    test.append(LR.score(X_test, y_test))

print('Train : ',np.mean(train))
print('Test : ', np.mean(test))
    

Train :  0.872565374211001
Test :  0.8726003147128246


* k-fold를 적용한 모델은 test시 좀더 우수한 것으로 보입니다.

### ref
 * https://datascienceschool.net/view-notebook/266d699d748847b3a3aa7b9805b846ae/
 * https://computer-nerd.tistory.com/66