In [1]:
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


In [2]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split

from sklearn.tree import DecisionTreeClassifier # 의사결정나무
from sklearn.linear_model import LogisticRegression # 로지스틱 회귀
from sklearn.ensemble import RandomForestClassifier

In [3]:
df = pd.read_csv("/content/drive/MyDrive/AMEX/data/train_data.csv", nrows=500000)
df = df.sort_values(['customer_ID', 'S_2'])
df = df.drop_duplicates(subset='customer_ID', keep='last')
train_labels =  pd.read_csv("/content/drive/MyDrive/AMEX/data/train_labels.csv") 

# Add labels 
df = df.merge(train_labels, on = 'customer_ID', how='left')

# Reponse column
y = "target"

# Features 목록
features = list(df.columns)
features.remove('customer_ID')
features.remove('S_2')
features.remove(y)

In [4]:
df.isnull().sum()

customer_ID        0
S_2                0
P_2              268
D_39               0
B_1                0
               ...  
D_142          34155
D_143            220
D_144              0
D_145            220
target             0
Length: 191, dtype: int64

In [5]:
df=df.fillna(-1)

In [6]:
df.isnull().sum()

customer_ID    0
S_2            0
P_2            0
D_39           0
B_1            0
              ..
D_142          0
D_143          0
D_144          0
D_145          0
target         0
Length: 191, dtype: int64

### split

In [None]:
# train = df.drop(columns=['customer_ID','S_2', 'target'])
# test = train['target']
# seed = 1004
# train_x, train_y, test_x, test_y = train_test_split(train, test, stratify=test, test_size = 0.4, random_state = seed)

In [46]:
seed = 1004
train, test = train_test_split(df, test_size=0.4, random_state = seed)

In [47]:
train_id = train['customer_ID']
train_x = train.drop(columns=['customer_ID','S_2', 'target'])
train_y = train['target']

In [48]:
test_id = test['customer_ID']
test_x = test.drop(columns=['customer_ID','S_2', 'target'])
test_y = test['target']

### one-hot encoding

In [49]:
# pd.get_dummies(df, dummy_na=True)
train_x = pd.get_dummies(train_x)
test_x = pd.get_dummies(test_x)

# modeling

## modeling (LogisticRegression)

In [42]:
lr = LogisticRegression(penalty='l2', C=10)
lr.fit(train_x, train_y)
train_score = lr.score(train_x, train_y)
test_score = lr.score(test_x, test_y)
print('train : ',train_score)
print('test : ',test_score)

train :  0.8940129693862162
test :  0.8888888888888888


STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression


In [43]:
# from sklearn.model_selection import GridSearchCV

# params = {'penalty':['l2', 'l1'],
#          'C':[0.01, 0.1, 1, 5, 10]}

# grid_clf = GridSearchCV(lr, param_grid=params, scoring='accuracy', cv=3)
# grid_clf.fit(train_x, train_y)
# print('최적 하이퍼 파라미터 :{0}, 최적 평균 정확도 :{1:.3f}'.format(grid_clf.best_params_,
#                                                    grid_clf.best_score_))

## modeling (DecisionTree)

In [None]:
tr = DecisionTreeClassifier()
tr.fit(train_x, train_y)
train_score = tr.score(train_x, train_y)
test_score = tr.score(test_x, test_y)
print('train : ',train_score)
print('test : ',test_score)

DecisionTreeClassifier()

## modeling(RandomForest)

In [50]:
rf = RandomForestClassifier(random_state=77)
rf.fit(train_x,train_y)
train_score = rf.score(train_x, train_y)
test_score = rf.score(test_x, test_y)
print('train : ',train_score)
print('test : ',test_score)

train :  1.0
test :  0.8934732778380987


In [61]:
rf = RandomForestClassifier(random_state=100, n_estimators=200)
rf.fit(train_x,train_y)
train_score = rf.score(train_x, train_y)
test_score = rf.score(test_x, test_y)
print('train : ',train_score)
print('test : ',test_score)

train :  1.0
test :  0.8921462178791169


In [62]:
rf = RandomForestClassifier(random_state=150, n_estimators=200)
rf.fit(train_x,train_y)
train_score = rf.score(train_x, train_y)
test_score = rf.score(test_x, test_y)
print('train : ',train_score)
print('test : ',test_score)

train :  1.0
test :  0.8926287851369284


In [63]:
esti = [150, 200, 250]
deeep = [10, 50, 100, 200]

for d in deeep:
  for n in esti:
    rf = RandomForestClassifier(random_state=77, n_estimators=n, max_depth=d)
    rf.fit(train_x,train_y)
    train_score = rf.score(train_x, train_y)
    test_score = rf.score(test_x, test_y)
    print('train : ',train_score, '/ n :',n,'/d : ',d)
    print('test : ',test_score)
  print()

train :  0.9429341269202928 / n : 150 /d :  10
test :  0.8911207624562674
train :  0.9421700313681332 / n : 200 /d :  10
test :  0.8913017251779467
train :  0.9419689535912491 / n : 250 /d :  10
test :  0.8916033297140789

train :  1.0 / n : 150 /d :  50
test :  0.8928097478586078
train :  1.0 / n : 200 /d :  50
test :  0.8940161660031367
train :  1.0 / n : 250 /d :  50
test :  0.8931716733019665

train :  1.0 / n : 150 /d :  100
test :  0.8932923151164194
train :  1.0 / n : 200 /d :  100
test :  0.8940161660031367
train :  1.0 / n : 250 /d :  100
test :  0.8935335987453251

train :  1.0 / n : 150 /d :  200
test :  0.8932923151164194
train :  1.0 / n : 200 /d :  200
test :  0.8940161660031367
train :  1.0 / n : 250 /d :  200
test :  0.8935335987453251



# 예측값 구하기

In [20]:
predict_lr = lr.predict(test_x)
predict_lr

array([0, 0, 0, ..., 1, 0, 0])

In [21]:
predict_tr = tr.predict(test_x)
predict_tr

array([0, 0, 0, ..., 0, 0, 0])

In [22]:
predict_rf = rf.predict(test_x)
predict_rf

array([0, 0, 0, ..., 1, 0, 0])

- 예측값

In [23]:
test_x_lr = test_x.copy()
test_x_tr = test_x.copy()
test_x_rf = test_x.copy()

# 예측값 결과 비교 확인

In [24]:
from sklearn.metrics import accuracy_score # 정확도 함수

In [25]:
print('LogisticRegression : ',accuracy_score(test_y,predict_lr))
print()
print('DecisionTree : ',accuracy_score(test_y,predict_tr))
print()
print('RandomForest : ',accuracy_score(test_y,predict_rf))

LogisticRegression :  0.8916636506213054

DecisionTree :  0.8374351550247315

RandomForest :  0.8783930510314875


기본 모델에서는 Logistic - RandomForest - DecisionTree