# Santander Customer Transaction Prediction

In [94]:
import numpy as np
import pandas as pd
import warnings

warnings.filterwarnings('ignore')

In [95]:
data = pd.read_csv('train.csv')

In [96]:
data.shape # 86103개의 데이터

(200000, 202)

In [97]:
# 결측치 확인
data.isnull().sum()

ID_code    0
target     0
var_0      0
var_1      0
var_2      0
          ..
var_195    0
var_196    0
var_197    0
var_198    0
var_199    0
Length: 202, dtype: int64

In [98]:
# 결측치가 있는 행 제거
data = data.dropna()

In [100]:
data.shape

(200000, 202)

## 10000개 데이터만 사용

In [101]:
rand = np.random.choice(len(data), 10000, replace=False)
data = data.iloc[rand]
data.shape

(10000, 202)

In [102]:
# feature와 label 분리
X = data.drop(['ID_code','target'],axis=1)
y = data['target']

In [103]:
from sklearn.model_selection import train_test_split

X_train, X_test, y_train, y_test = train_test_split(X,y, test_size=0.2, random_state=1)


In [104]:
from sklearn.preprocessing import StandardScaler

scaler = StandardScaler()
scaler.fit(X_train)
X_train_std = scaler.transform(X_train)
X_test_std  = scaler.transform(X_test)

X_train_std.shape, X_test_std.shape

((8000, 200), (2000, 200))

## PCA를 적용하여 차원 축소 

In [105]:
# 누적 설명률이 70%~80% 이상인 지점을 찾아 주성분의 개수로 설정한다.
from sklearn.decomposition import PCA

comp_list = [10,30,50,70,90,110,130,150]
for i in comp_list:
    pca_tmp = PCA(n_components = i)
    pca_tmp.fit_transform(data[data.columns[2:]].values)
    print( 'Cumulative explained variation for {} principal components:'.format(i),np.sum(pca_tmp.explained_variance_ratio_))    
    

Cumulative explained variation for 10 principal components: 0.29129649246789413
Cumulative explained variation for 30 principal components: 0.550681862618698
Cumulative explained variation for 50 principal components: 0.7306666622888012
Cumulative explained variation for 70 principal components: 0.8340887497014857
Cumulative explained variation for 90 principal components: 0.9026529294455234
Cumulative explained variation for 110 principal components: 0.9497538440601236
Cumulative explained variation for 130 principal components: 0.9777089437008771
Cumulative explained variation for 150 principal components: 0.993518206362304


> 약 83%의 설명률을 보이는 주성분 개수를 70개로 설정

In [106]:
n_comp = 70
pca = PCA(n_components=n_comp)
pca.fit(X_train_std)
X_train_pca = pca.transform(X_train_std)
X_test_pca  = pca.transform(X_test_std)

X_train_pca.shape, X_test_pca.shape

((8000, 70), (2000, 70))

## SVM hyperparameter search

In [107]:
from sklearn.svm import SVC
from sklearn import metrics # model evaluation
from sklearn.model_selection import GridSearchCV

In [108]:
model_svm = SVC()

In [109]:
tuned_parameters_linear = {
 'C': (np.arange(0.1,1,0.1)) , 'kernel': ['linear'] 
}

tuned_parameters_kernel = {
 'C': (np.arange(0.1,1,0.1)) , 'gamma': [0.01,0.02,0.03,0.04,0.05], 'kernel': ['rbf']
}

In [110]:
svm_linear = GridSearchCV(model_svm, tuned_parameters_linear, cv=10, scoring='accuracy')
svm_kernel = GridSearchCV(model_svm, tuned_parameters_kernel, cv=10, scoring='accuracy')


In [111]:
svm_linear.fit(X_train_pca, y_train)
svm_kernel.fit(X_train_pca, y_train)

print('linear best score:',svm_linear.best_score_)
print('kernel best score:',svm_kernel.best_score_)

linear best score: 0.9005
kernel best score: 0.900625


In [112]:
print('linear:',svm_linear.best_params_)
print('kernel:',svm_kernel.best_params_)

linear: {'C': 0.1, 'kernel': 'linear'}
kernel: {'C': 0.9, 'gamma': 0.01, 'kernel': 'rbf'}


> kernel을 적용하여 SVM을 학습시켜 보겠다.

In [113]:
svc = SVC(C=0.9, gamma=0.01, kernel='rbf')
svc.fit(X_train_pca, y_train)
y_pred = svc.predict(X_test_pca)
print('Accuracy Score:', metrics.accuracy_score(y_test, y_pred))

Accuracy Score: 0.9025


In [114]:
'''
    질문) 0.9025의 정확도는 일반적으로 '낮은 편'으로 분류되는 지 궁금합니다!
'''

"\n    질문) 0.9025의 정확도는 일반적으로 '낮은 편'으로 분류되는 지 궁금합니다!\n"

---
## Stocastic Gradieng Descent Classifier
- linear 모델에만 사용 가능

In [115]:
train = pd.read_csv('train.csv')
test  = pd.read_csv('test.csv')

In [116]:
train.shape, test.shape

((200000, 202), (200000, 201))

In [117]:
# 결측치 확인
train.isnull().sum()

ID_code    0
target     0
var_0      0
var_1      0
var_2      0
          ..
var_195    0
var_196    0
var_197    0
var_198    0
var_199    0
Length: 202, dtype: int64

In [118]:
test.isnull().sum()

ID_code    0
var_0      0
var_1      0
var_2      0
var_3      0
          ..
var_195    0
var_196    0
var_197    0
var_198    0
var_199    0
Length: 201, dtype: int64

In [119]:
X_train = train[train.columns[2:]]
X_test  = test[test.columns[1:]]
y_train = train['target']

In [120]:
# 정규화
from sklearn.preprocessing import StandardScaler

scaler = StandardScaler()
X_train_std = scaler.fit_transform(X_train)
X_test_std  = scaler.fit_transform(X_test)

X_train_std.shape, X_test_std.shape

((200000, 200), (200000, 200))

In [139]:
from sklearn.linear_model import SGDClassifier

clf = SGDClassifier(alpha=0.0001) # default
clf.fit(X_train_std, y_train)
y_pred = clf.predict(X_test_std)

clf2 = SGDClassifier(alpha=0.001) # alpha 수정 
clf2.fit(X_train_std, y_train)
y_pred2 = clf2.predict(X_test_std)

clf3 = SGDClassifier(alpha=0.0001) # 표준화하지 않은 데이터
clf3.fit(X_train, y_train)
y_pred3 = clf.predict(X_test)

In [140]:
result = pd.DataFrame()
result['ID_code'] = test['ID_code']
result['target']  = y_pred

result2 = pd.DataFrame()
result2['ID_code'] = test['ID_code']
result2['target'] = y_pred2

result3 = pd.DataFrame()
result3['ID_code'] = test['ID_code']
result3['target'] = y_pred3

In [141]:
result.to_csv('result.csv', header=['ID_code', 'target'], index=False)   # 0.57148
result2.to_csv('result2.csv', header=['ID_code', 'target'], index=False) # 0.50020
result3.to_csv('result3.csv', header=['ID_code', 'target'], index=False) # 0.50000

전반적으로 매우 저조한 예측률...을... 보인다...