In [1]:
import warnings
warnings.filterwarnings('ignore')

In [2]:
import pandas as pd
import numpy as np
# train 변수 : 문제 가져오기
data_train = pd.read_csv('data/kaggle/train.csv')
# test 변수 : 정답 가져오기
data_test = pd.read_csv('data/kaggle/test.csv')


In [3]:
# 필요없는 컬럼 drop
data_train = data_train.drop(['no','fnlwgt'],axis=1)
data_test = data_test.drop(['no','fnlwgt'],axis=1)


In [4]:
# 테스트 데이터와 트레인 데이터 일단 먼저 붙이기
data_concat = pd.concat([data_train,data_test])

In [5]:
 # 트레인 데이터의 정답
data_train['income']

0        0
1        0
2        0
3        0
4        1
        ..
29300    0
29301    0
29302    0
29303    0
29304    0
Name: income, Length: 29305, dtype: int64

In [6]:
data_concat

Unnamed: 0,age,workclass,education,education-num,marital-status,occupation,relationship,race,sex,capital-gain,capital-loss,hours-per-week,native-country,income
0,25,Private,11th,7,Divorced,Machine-op-inspct,Not-in-family,White,Male,0,0,40,United-States,0.0
1,39,Private,Some-college,10,Divorced,Other-service,Not-in-family,White,Female,0,1721,55,United-States,0.0
2,35,Private,Bachelors,13,Never-married,Handlers-cleaners,Not-in-family,Asian-Pac-Islander,Female,0,0,50,Haiti,0.0
3,64,Private,Assoc-voc,11,Divorced,Tech-support,Not-in-family,White,Female,0,0,40,United-States,0.0
4,24,Private,Some-college,10,Married-civ-spouse,Transport-moving,Husband,White,Male,0,0,70,United-States,1.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
19532,45,Self-emp-not-inc,HS-grad,9,Married-civ-spouse,Craft-repair,Husband,White,Male,0,0,60,United-States,
19533,48,Private,HS-grad,9,Never-married,Exec-managerial,Not-in-family,White,Female,0,0,50,United-States,
19534,63,Private,Some-college,10,Married-civ-spouse,Prof-specialty,Husband,White,Male,4386,0,40,United-States,
19535,18,Private,11th,7,Never-married,Sales,Own-child,White,Female,0,0,20,United-States,


In [7]:
# 정답 데이터
income_raw = data_concat['income']

In [8]:
# 전처리 되기 이전의 feature들 (전처리 되기전 문제들)
features_raw = data_concat.drop('income', axis = 1)

In [9]:
skewed = ['capital-gain', 'capital-loss']
features_log_transformed = pd.DataFrame(data = features_raw)
features_log_transformed[skewed] = features_raw[skewed].apply(lambda x: np.log(x + 1))

예를 들어, 'capital-gain'이라는 변수는 소득에 대한 정보를 나타내는데, 값이 작은 샘플들이 많고, 큰 값들은 소수에 해당하는 경우가 있습니다. 이런 경우에는 데이터의 분포가 왜곡되어 있을 수 있습니다.

로그 변환은 이런 경우에 주로 사용됩니다. 로그 변환은 각 데이터 포인트의 값을 로그 함수를 적용하여 변환합니다. 이는 큰 값들에 대해 작은 값으로 압축하고, 작은 값들에 대해 더 큰 차이를 만들어줍니다. 이로써 분포를 더 넓게 펼치고, 왜곡을 줄이는 효과를 얻을 수 있습니다.

예시를 들어보겠습니다. 원래 'capital-gain' 변수의 값이 [0, 100, 1000, 10000]이라고 가정해봅시다. 이 값들에 로그 변환을 적용하면, [0, 4.61, 6.91, 9.21]으로 변환됩니다. 이렇게 되면, 원래 값이 100과 10000 사이에 큰 차이가 있었지만, 로그 변환 후에는 값들이 더 가까워지고 분포가 더 균일해집니다.

In [10]:
# min max 스케일러를 이용하여 특징 벡터 (문제들) 스케일링 (최소 최대 정규화)
from sklearn.preprocessing import MinMaxScaler

scaler = MinMaxScaler() 
numerical = ['age', 'education-num', 'capital-gain', 'capital-loss', 'hours-per-week']

features_log_minmax_transform = pd.DataFrame(data = features_log_transformed)
features_log_minmax_transform[numerical] = scaler.fit_transform(features_log_transformed[numerical])


In [11]:
features_final = pd.get_dummies(features_log_minmax_transform)

income = np.where(income_raw.values == '>50K', 1, 0)

encoded = list(features_final.columns)
print("{} total features after one-hot encoding.".format(len(encoded)))

print(encoded)

107 total features after one-hot encoding.
['age', 'education-num', 'capital-gain', 'capital-loss', 'hours-per-week', 'workclass_ ?', 'workclass_ Federal-gov', 'workclass_ Local-gov', 'workclass_ Never-worked', 'workclass_ Private', 'workclass_ Self-emp-inc', 'workclass_ Self-emp-not-inc', 'workclass_ State-gov', 'workclass_ Without-pay', 'education_ 10th', 'education_ 11th', 'education_ 12th', 'education_ 1st-4th', 'education_ 5th-6th', 'education_ 7th-8th', 'education_ 9th', 'education_ Assoc-acdm', 'education_ Assoc-voc', 'education_ Bachelors', 'education_ Doctorate', 'education_ HS-grad', 'education_ Masters', 'education_ Preschool', 'education_ Prof-school', 'education_ Some-college', 'marital-status_ Divorced', 'marital-status_ Married-AF-spouse', 'marital-status_ Married-civ-spouse', 'marital-status_ Married-spouse-absent', 'marital-status_ Never-married', 'marital-status_ Separated', 'marital-status_ Widowed', 'occupation_ ?', 'occupation_ Adm-clerical', 'occupation_ Armed-For

In [12]:
print(income)

0


In [13]:
# 훈련 데이터의 피쳐 (train, test를 합친 상태이기 때문에 train 데이터와 test 데이터를 다시 분리함)
X_train = features_final.iloc[:29305,:]
X_train

Unnamed: 0,age,education-num,capital-gain,capital-loss,hours-per-week,workclass_ ?,workclass_ Federal-gov,workclass_ Local-gov,workclass_ Never-worked,workclass_ Private,...,native-country_ Portugal,native-country_ Puerto-Rico,native-country_ Scotland,native-country_ South,native-country_ Taiwan,native-country_ Thailand,native-country_ Trinadad&Tobago,native-country_ United-States,native-country_ Vietnam,native-country_ Yugoslavia
0,0.109589,0.400000,0.0,0.000000,0.397959,0,0,0,0,1,...,0,0,0,0,0,0,0,1,0,0
1,0.301370,0.600000,0.0,0.889219,0.551020,0,0,0,0,1,...,0,0,0,0,0,0,0,1,0,0
2,0.246575,0.800000,0.0,0.000000,0.500000,0,0,0,0,1,...,0,0,0,0,0,0,0,0,0,0
3,0.643836,0.666667,0.0,0.000000,0.397959,0,0,0,0,1,...,0,0,0,0,0,0,0,1,0,0
4,0.095890,0.600000,0.0,0.000000,0.704082,0,0,0,0,1,...,0,0,0,0,0,0,0,1,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
29300,0.041096,0.533333,0.0,0.000000,0.397959,0,0,0,0,1,...,0,1,0,0,0,0,0,0,0,0
29301,0.054795,0.533333,0.0,0.000000,0.397959,0,0,0,0,1,...,0,0,0,0,0,0,0,1,0,0
29302,0.013699,0.333333,0.0,0.000000,0.397959,0,0,0,0,0,...,0,0,0,0,0,0,0,1,0,0
29303,0.219178,0.400000,0.0,0.000000,0.653061,0,0,0,0,1,...,0,0,0,0,0,0,0,1,0,0


In [14]:
### 테스트 데이터의 피쳐
X_test = features_final.iloc[29305:,:]
X_test

Unnamed: 0,age,education-num,capital-gain,capital-loss,hours-per-week,workclass_ ?,workclass_ Federal-gov,workclass_ Local-gov,workclass_ Never-worked,workclass_ Private,...,native-country_ Portugal,native-country_ Puerto-Rico,native-country_ Scotland,native-country_ South,native-country_ Taiwan,native-country_ Thailand,native-country_ Trinadad&Tobago,native-country_ United-States,native-country_ Vietnam,native-country_ Yugoslavia
0,0.013699,0.600000,0.000000,0.000000,0.153061,1,0,0,0,0,...,0,0,0,0,0,0,0,1,0,0
1,0.164384,0.533333,0.000000,0.000000,0.397959,0,0,0,0,1,...,0,0,0,0,0,0,0,1,0,0
2,0.383562,0.800000,0.000000,0.000000,0.397959,0,0,0,0,1,...,0,0,0,0,0,0,0,1,0,0
3,0.342466,0.800000,0.000000,0.000000,0.397959,0,0,0,0,0,...,0,0,0,0,0,0,0,1,0,0
4,0.328767,1.000000,0.000000,0.901146,0.397959,0,0,0,0,1,...,0,0,0,0,0,0,0,1,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
19532,0.383562,0.533333,0.000000,0.000000,0.602041,0,0,0,0,0,...,0,0,0,0,0,0,0,1,0,0
19533,0.424658,0.533333,0.000000,0.000000,0.500000,0,0,0,0,1,...,0,0,0,0,0,0,0,1,0,0
19534,0.630137,0.600000,0.728434,0.000000,0.397959,0,0,0,0,1,...,0,0,0,0,0,0,0,1,0,0
19535,0.013699,0.400000,0.000000,0.000000,0.193878,0,0,0,0,1,...,0,0,0,0,0,0,0,1,0,0


In [15]:
# 훈련 데이터 정답
y_train = data_train['income']
y_train

0        0
1        0
2        0
3        0
4        1
        ..
29300    0
29301    0
29302    0
29303    0
29304    0
Name: income, Length: 29305, dtype: int64

In [16]:
# lgb 모델 사용하여 예측
from lightgbm import LGBMClassifier

lgb = LGBMClassifier(random_state=200, n_jobs=-1, reg_alpha=0.9, reg_lambda=0.2, n_estimators=200)

# lgb 모델 훈련
lgb.fit(X_train, y_train)

# 훈련한 lgb 모델로 예측한 결과
y_pred = lgb.predict(X_test)

rs =  pd.read_csv('./data/kaggle/sample_submission.csv')
rs['income'] = y_pred
rs.to_csv('50K_01.csv', index=False)


In [17]:
# 방식은 이거임 -> 테스트 데이터도 훈련 데이터로 이용하자 테스트 데이터에 존재 하지 않는 정답 데이터는 처음에 훈련 데이터로 예측한 테스트 데이터 값을 정답 데이터로 사용하자
# 훈련 데이터와, 테스트 데이터 합치기 (테스트 데이터 정답은 훈련된 모델의 예측한 결과를 사용)
X_train_2 = pd.concat([X_train,X_test])
X_train_2

Unnamed: 0,age,education-num,capital-gain,capital-loss,hours-per-week,workclass_ ?,workclass_ Federal-gov,workclass_ Local-gov,workclass_ Never-worked,workclass_ Private,...,native-country_ Portugal,native-country_ Puerto-Rico,native-country_ Scotland,native-country_ South,native-country_ Taiwan,native-country_ Thailand,native-country_ Trinadad&Tobago,native-country_ United-States,native-country_ Vietnam,native-country_ Yugoslavia
0,0.109589,0.400000,0.000000,0.000000,0.397959,0,0,0,0,1,...,0,0,0,0,0,0,0,1,0,0
1,0.301370,0.600000,0.000000,0.889219,0.551020,0,0,0,0,1,...,0,0,0,0,0,0,0,1,0,0
2,0.246575,0.800000,0.000000,0.000000,0.500000,0,0,0,0,1,...,0,0,0,0,0,0,0,0,0,0
3,0.643836,0.666667,0.000000,0.000000,0.397959,0,0,0,0,1,...,0,0,0,0,0,0,0,1,0,0
4,0.095890,0.600000,0.000000,0.000000,0.704082,0,0,0,0,1,...,0,0,0,0,0,0,0,1,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
19532,0.383562,0.533333,0.000000,0.000000,0.602041,0,0,0,0,0,...,0,0,0,0,0,0,0,1,0,0
19533,0.424658,0.533333,0.000000,0.000000,0.500000,0,0,0,0,1,...,0,0,0,0,0,0,0,1,0,0
19534,0.630137,0.600000,0.728434,0.000000,0.397959,0,0,0,0,1,...,0,0,0,0,0,0,0,1,0,0
19535,0.013699,0.400000,0.000000,0.000000,0.193878,0,0,0,0,1,...,0,0,0,0,0,0,0,1,0,0


In [18]:
# 방식은 이거임 -> 테스트 데이터도 훈련 데이터로 이용하자 테스트 데이터에 존재 하지 않는 정답 데이터는 처음에 훈련 데이터로 예측한 테스트 데이터 값을 정답 데이터로 사용하자
# 훈련 데이터와, 테스트 데이터 합치기 (테스트 데이터 정답은 훈련된 모델의 예측한 결과를 사용)
X_train_2 = pd.concat([X_train,X_test])

In [19]:
y_train

0        0
1        0
2        0
3        0
4        1
        ..
29300    0
29301    0
29302    0
29303    0
29304    0
Name: income, Length: 29305, dtype: int64

In [20]:
y_1=y_train.to_numpy()

In [21]:
y_t = np.concatenate([y_1, y_pred])

In [22]:
y_t = list(y_t)

In [23]:
y_train_change=pd.Series(y_t)

In [24]:
### 모델 훈련
lgb.fit(X_train_2,y_train_change)

LGBMClassifier(n_estimators=200, random_state=200, reg_alpha=0.9,
               reg_lambda=0.2)

In [25]:
# lgb 모델로 예측한 결과
pre1 = lgb.predict(X_test)
# kaggle 양식 불러오기
rs =  pd.read_csv('./data/kaggle/sample_submission.csv')
rs['income'] = pre1 # 내 모델의 예측 결과를 양식에 넣기
rs.to_csv('Test01.csv', index = False)

In [26]:
# 위 과정 반복 -> 모델 예측한 결과중 더 좋은 결과를 다시 테스트 데이터의 정답으로 활용 + 트레인 데이터와 다시 합치고 다시 모델 학습
y_t = np.concatenate([y_1, pre1])
y_t = list(y_t)
y_train_change=pd.Series(y_t)

lgb.fit(X_train_2,y_train_change)

pre2 = lgb.predict(X_test)
# kaggle 양식 불러오기
rs =  pd.read_csv('./data/kaggle/sample_submission.csv')
rs['income'] = pre2 # 내 모델의 예측 결과를 양식에 넣기
rs.to_csv('Test02.csv', index = False)

In [27]:
y_t = np.concatenate([y_1, pre2])
y_t = list(y_t)
y_train_change=pd.Series(y_t)

lgb.fit(X_train_2,y_train_change)

pre3 = lgb.predict(X_test)
# kaggle 양식 불러오기
rs =  pd.read_csv('./data/kaggle/sample_submission.csv')
rs['income'] = pre3 # 내 모델의 예측 결과를 양식에 넣기
rs.to_csv('Last01.csv', index = False)

In [28]:
from sklearn.model_selection import cross_val_score

scores = cross_val_score(lgb, X_train_2, y_train_change, cv=5, scoring='accuracy')

# 교차 검증 결과 출력
print("교차 검증 정확도:", scores)
print("평균 정확도:", scores.mean())

교차 검증 정확도: [0.88064285 0.88217832 0.87889025 0.97757985 0.99140049]
평균 정확도: 0.9221383533749179


In [29]:
y_t = np.concatenate([y_1, pre3])
y_t = list(y_t)
y_train_change=pd.Series(y_t)

lgb.fit(X_train_2,y_train_change)

pre4 = lgb.predict(X_test)
# kaggle 양식 불러오기
rs =  pd.read_csv('./data/kaggle/sample_submission.csv')
rs['income'] = pre4 # 내 모델의 예측 결과를 양식에 넣기
rs.to_csv('Test04.csv', index = False)


In [30]:

scores = cross_val_score(lgb, X_train_2, y_train_change, cv=5, scoring='accuracy')

# 교차 검증 결과 출력
print("교차 검증 정확도:", scores)
print("평균 정확도:", scores.mean())

교차 검증 정확도: [0.88033576 0.8813594  0.87889025 0.97798935 0.99109337]
평균 정확도: 0.9219336262252631


In [31]:
y_t = np.concatenate([y_1, pre4])
y_t = list(y_t)
y_train_change=pd.Series(y_t)

lgb.fit(X_train_2,y_train_change)

pre5 = lgb.predict(X_test)
# kaggle 양식 불러오기
rs =  pd.read_csv('./data/kaggle/sample_submission.csv')
rs['income'] = pre5 # 내 모델의 예측 결과를 양식에 넣기
rs.to_csv('Test05.csv', index = False)

scores = cross_val_score(lgb, X_train_2, y_train_change, cv=5, scoring='accuracy')

# 교차 검증 결과 출력
print("교차 검증 정확도:", scores)
print("평균 정확도:", scores.mean())

교차 검증 정확도: [0.88033576 0.88105231 0.87919738 0.97880835 0.99170762]
평균 정확도: 0.9222202827996664


In [32]:
y_t = np.concatenate([y_1, pre5])
y_t = list(y_t)
y_train_change=pd.Series(y_t)

lgb.fit(X_train_2,y_train_change)

pre6 = lgb.predict(X_test)
# kaggle 양식 불러오기
rs =  pd.read_csv('./data/kaggle/sample_submission.csv')
rs['income'] = pre6 # 내 모델의 예측 결과를 양식에 넣기
rs.to_csv('Last06.csv', index = False)

scores = cross_val_score(lgb, X_train_2, y_train_change, cv=5, scoring='accuracy')

# 교차 검증 결과 출력
print("교차 검증 정확도:", scores)
print("평균 정확도:", scores.mean())

교차 검증 정확도: [0.88064285 0.88156413 0.879095   0.97880835 0.99068387]
평균 정확도: 0.9221588409708996


In [33]:
y_t = np.concatenate([y_1, pre6])
y_t = list(y_t)
y_train_change=pd.Series(y_t)

lgb.fit(X_train_2,y_train_change)

pre7 = lgb.predict(X_test)
# kaggle 양식 불러오기
rs =  pd.read_csv('./data/kaggle/sample_submission.csv')
rs['income'] = pre7 # 내 모델의 예측 결과를 양식에 넣기
rs.to_csv('Last07.csv', index = False)

scores = cross_val_score(lgb, X_train_2, y_train_change, cv=5, scoring='accuracy')

# 교차 검증 결과 출력
print("교차 검증 정확도:", scores)
print("평균 정확도:", scores.mean())

교차 검증 정확도: [0.88094994 0.88064285 0.87837838 0.9790131  0.99099099]
평균 정확도: 0.9219950533826058


In [34]:
y_t = np.concatenate([y_1, pre7])
y_t = list(y_t)
y_train_change=pd.Series(y_t)

lgb.fit(X_train_2,y_train_change)

pre8 = lgb.predict(X_test)
# kaggle 양식 불러오기
rs =  pd.read_csv('./data/kaggle/sample_submission.csv')
rs['income'] = pre8 # 내 모델의 예측 결과를 양식에 넣기
rs.to_csv('Last08.csv', index = False)

scores = cross_val_score(lgb, X_train_2, y_train_change, cv=5, scoring='accuracy')

# 교차 검증 결과 출력
print("교차 검증 정확도:", scores)
print("평균 정확도:", scores.mean())

교차 검증 정확도: [0.87900502 0.88084758 0.87889025 0.97952498 0.99027437]
평균 정확도: 0.9217084387265573


In [35]:
y_t = np.concatenate([y_1, pre8])
y_t = list(y_t)
y_train_change=pd.Series(y_t)

lgb.fit(X_train_2,y_train_change)

pre9 = lgb.predict(X_test)
# kaggle 양식 불러오기
rs =  pd.read_csv('./data/kaggle/sample_submission.csv')
rs['income'] = pre9 # 내 모델의 예측 결과를 양식에 넣기
rs.to_csv('Last09.csv', index = False)

scores = cross_val_score(lgb, X_train_2, y_train_change, cv=5, scoring='accuracy')

# 교차 검증 결과 출력
print("교차 검증 정확도:", scores)
print("평균 정확도:", scores.mean())

교차 검증 정확도: [0.8799263  0.88023339 0.87981163 0.97921785 0.99211712]
평균 정확도: 0.9222612579916296


In [36]:
y_t = np.concatenate([y_1, pre9])
y_t = list(y_t)
y_train_change=pd.Series(y_t)

lgb.fit(X_train_2,y_train_change)

pre10 = lgb.predict(X_test)
# kaggle 양식 불러오기
rs =  pd.read_csv('./data/kaggle/sample_submission.csv')
rs['income'] = pre10 # 내 모델의 예측 결과를 양식에 넣기
rs.to_csv('Last10.csv', index = False)

scores = cross_val_score(lgb, X_train_2, y_train_change, cv=5, scoring='accuracy')

# 교차 검증 결과 출력
print("교차 검증 정확도:", scores)
print("평균 정확도:", scores.mean())

교차 검증 정확도: [0.88013103 0.8799263  0.878276   0.98054873 0.99109337]
평균 정확도: 0.921995084821372


In [37]:
y_t = np.concatenate([y_1, pre10])
y_t = list(y_t)
y_train_change=pd.Series(y_t)

lgb.fit(X_train_2,y_train_change)

pre11 = lgb.predict(X_test)
# kaggle 양식 불러오기
rs =  pd.read_csv('./data/kaggle/sample_submission.csv')
rs['income'] = pre11 # 내 모델의 예측 결과를 양식에 넣기
rs.to_csv('Last11.csv', index = False)

scores = cross_val_score(lgb, X_train_2, y_train_change, cv=5, scoring='accuracy')

# 교차 검증 결과 출력
print("교차 검증 정확도:", scores)
print("평균 정확도:", scores.mean())

교차 검증 정확도: [0.87972157 0.88033576 0.878276   0.97972973 0.99150287]
평균 정확도: 0.921913184739472


In [38]:
y_t = np.concatenate([y_1, pre11])
y_t = list(y_t)
y_train_change=pd.Series(y_t)

lgb.fit(X_train_2,y_train_change)

pre12 = lgb.predict(X_test)
# kaggle 양식 불러오기
rs =  pd.read_csv('./data/kaggle/sample_submission.csv')
rs['income'] = pre12 # 내 모델의 예측 결과를 양식에 넣기
rs.to_csv('Last12.csv', index = False)

scores = cross_val_score(lgb, X_train_2, y_train_change, cv=5, scoring='accuracy')

# 교차 검증 결과 출력
print("교차 검증 정확도:", scores)
print("평균 정확도:", scores.mean())

교차 검증 정확도: [0.88013103 0.8799263  0.87766175 0.97829648 0.99088862]
평균 정확도: 0.9213808342071215


In [39]:
y_t = np.concatenate([y_1, pre12])
y_t = list(y_t)
y_train_change=pd.Series(y_t)

lgb.fit(X_train_2,y_train_change)

pre13 = lgb.predict(X_test)
# kaggle 양식 불러오기
rs =  pd.read_csv('./data/kaggle/sample_submission.csv')
rs['income'] = pre13 # 내 모델의 예측 결과를 양식에 넣기
rs.to_csv('Last13.csv', index = False)

scores = cross_val_score(lgb, X_train_2, y_train_change, cv=5, scoring='accuracy')

# 교차 검증 결과 출력
print("교차 검증 정확도:", scores)
print("평균 정확도:", scores.mean())

교차 검증 정확도: [0.87972157 0.88002866 0.877457   0.97911548 0.99068387]
평균 정확도: 0.9214013155153497


In [40]:
y_t = np.concatenate([y_1, pre13])
y_t = list(y_t)
y_train_change=pd.Series(y_t)

lgb.fit(X_train_2,y_train_change)

pre14 = lgb.predict(X_test)
# kaggle 양식 불러오기
rs =  pd.read_csv('./data/kaggle/sample_submission.csv')
rs['income'] = pre14 # 내 모델의 예측 결과를 양식에 넣기
rs.to_csv('Last14.csv', index = False)

scores = cross_val_score(lgb, X_train_2, y_train_change, cv=5, scoring='accuracy')

# 교차 검증 결과 출력
print("교차 검증 정확도:", scores)
print("평균 정확도:", scores.mean())

교차 검증 정확도: [0.88146177 0.88023339 0.87837838 0.98013923 0.99027437]
평균 정확도: 0.9220974263890632


In [41]:
y_t = np.concatenate([y_1, pre14])
y_t = list(y_t)
y_train_change=pd.Series(y_t)

lgb.fit(X_train_2,y_train_change)

pre15 = lgb.predict(X_test)
# kaggle 양식 불러오기
rs =  pd.read_csv('./data/kaggle/sample_submission.csv')
rs['income'] = pre15 # 내 모델의 예측 결과를 양식에 넣기
rs.to_csv('Last15.csv', index = False)

scores = cross_val_score(lgb, X_train_2, y_train_change, cv=5, scoring='accuracy')

# 교차 검증 결과 출력
print("교차 검증 정확도:", scores)
print("평균 정확도:", scores.mean())

교차 검증 정확도: [0.88043812 0.87951684 0.87714988 0.98013923 0.99119574]
평균 정확도: 0.9216879616101645


In [42]:
y_t = np.concatenate([y_1, pre15])
y_t = list(y_t)
y_train_change=pd.Series(y_t)

lgb.fit(X_train_2,y_train_change)

pre16 = lgb.predict(X_test)
# kaggle 양식 불러오기
rs =  pd.read_csv('./data/kaggle/sample_submission.csv')
rs['income'] = pre16 # 내 모델의 예측 결과를 양식에 넣기
rs.to_csv('Last16.csv', index = False)

scores = cross_val_score(lgb, X_train_2, y_train_change, cv=5, scoring='accuracy')

# 교차 검증 결과 출력
print("교차 검증 정확도:", scores)
print("평균 정확도:", scores.mean())

교차 검증 정확도: [0.88094994 0.87900502 0.87817363 0.98054873 0.99047912]
평균 정확도: 0.9218312867534897


In [43]:
y_t = np.concatenate([y_1, pre16])
y_t = list(y_t)
y_train_change=pd.Series(y_t)

lgb.fit(X_train_2,y_train_change)

pre17 = lgb.predict(X_test)
# kaggle 양식 불러오기
rs =  pd.read_csv('./data/kaggle/sample_submission.csv')
rs['income'] = pre17 # 내 모델의 예측 결과를 양식에 넣기
rs.to_csv('Last17.csv', index = False)

scores = cross_val_score(lgb, X_train_2, y_train_change, cv=5, scoring='accuracy')

# 교차 검증 결과 출력
print("교차 검증 정확도:", scores)
print("평균 정확도:", scores.mean())

교차 검증 정확도: [0.88054049 0.87972157 0.87848075 0.9790131  0.99191237]
평균 정확도: 0.9219336555681116


In [44]:
y_t = np.concatenate([y_1, pre17])
y_t = list(y_t)
y_train_change=pd.Series(y_t)

lgb.fit(X_train_2,y_train_change)

pre18 = lgb.predict(X_test)
# kaggle 양식 불러오기
rs =  pd.read_csv('./data/kaggle/sample_submission.csv')
rs['income'] = pre18 # 내 모델의 예측 결과를 양식에 넣기
rs.to_csv('Last18.csv', index = False)

scores = cross_val_score(lgb, X_train_2, y_train_change, cv=5, scoring='accuracy')

# 교차 검증 결과 출력
print("교차 검증 정확도:", scores)
print("평균 정확도:", scores.mean())

교차 검증 정확도: [0.8816665  0.87941447 0.87817363 0.98044636 0.99314087]
평균 정확도: 0.9225683644354952


In [45]:
y_t = np.concatenate([y_1, pre18])
y_t = list(y_t)
y_train_change=pd.Series(y_t)

lgb.fit(X_train_2,y_train_change)

pre19 = lgb.predict(X_test)
# kaggle 양식 불러오기
rs =  pd.read_csv('./data/kaggle/sample_submission.csv')
rs['income'] = pre19 # 내 모델의 예측 결과를 양식에 넣기
rs.to_csv('Last19.csv', index = False)

scores = cross_val_score(lgb, X_train_2, y_train_change, cv=5, scoring='accuracy')

# 교차 검증 결과 출력
print("교차 검증 정확도:", scores)
print("평균 정확도:", scores.mean())

교차 검증 정확도: [0.88187123 0.87880029 0.87848075 0.97850123 0.99201474]
평균 정확도: 0.9219336471844406


In [46]:
y_t = np.concatenate([y_1, pre19])
y_t = list(y_t)
y_train_change=pd.Series(y_t)

lgb.fit(X_train_2,y_train_change)

pre20 = lgb.predict(X_test)
# kaggle 양식 불러오기
rs =  pd.read_csv('./data/kaggle/sample_submission.csv')
rs['income'] = pre20 # 내 모델의 예측 결과를 양식에 넣기
rs.to_csv('Last20.csv', index = False)

scores = cross_val_score(lgb, X_train_2, y_train_change, cv=5, scoring='accuracy')

# 교차 검증 결과 출력
print("교차 검증 정확도:", scores)
print("평균 정확도:", scores.mean())

교차 검증 정확도: [0.88105231 0.88023339 0.87837838 0.97993448 0.99201474]
평균 정확도: 0.9223226599979594


In [47]:
y_t = np.concatenate([y_1, pre20])
y_t = list(y_t)
y_train_change=pd.Series(y_t)

lgb.fit(X_train_2,y_train_change)

pre21 = lgb.predict(X_test)
# kaggle 양식 불러오기
rs =  pd.read_csv('./data/kaggle/sample_submission.csv')
rs['income'] = pre21 # 내 모델의 예측 결과를 양식에 넣기
rs.to_csv('Last21.csv', index = False)

scores = cross_val_score(lgb, X_train_2, y_train_change, cv=5, scoring='accuracy')

# 교차 검증 결과 출력
print("교차 검증 정확도:", scores)
print("평균 정확도:", scores.mean())

교차 검증 정확도: [0.88156413 0.87951684 0.879095   0.97911548 0.99221949]
평균 정확도: 0.9223021891693198


In [48]:
y_t = np.concatenate([y_1, pre21])
y_t = list(y_t)
y_train_change=pd.Series(y_t)

lgb.fit(X_train_2,y_train_change)

pre22 = lgb.predict(X_test)
# kaggle 양식 불러오기
rs =  pd.read_csv('./data/kaggle/sample_submission.csv')
rs['income'] = pre22 # 내 모델의 예측 결과를 양식에 넣기
rs.to_csv('Last22.csv', index = False)

scores = cross_val_score(lgb, X_train_2, y_train_change, cv=5, scoring='accuracy')

# 교차 검증 결과 출력
print("교차 검증 정확도:", scores)
print("평균 정확도:", scores.mean())

교차 검증 정확도: [0.88084758 0.87941447 0.87817363 0.97880835 0.99242424]
평균 정확도: 0.9219336555681114


In [49]:
y_t = np.concatenate([y_1, pre22])
y_t = list(y_t)
y_train_change=pd.Series(y_t)

lgb.fit(X_train_2,y_train_change)

pre23 = lgb.predict(X_test)
# kaggle 양식 불러오기
rs =  pd.read_csv('./data/kaggle/sample_submission.csv')
rs['income'] = pre23 # 내 모델의 예측 결과를 양식에 넣기
rs.to_csv('Last23.csv', index = False)

scores = cross_val_score(lgb, X_train_2, y_train_change, cv=5, scoring='accuracy')

# 교차 검증 결과 출력
print("교차 검증 정확도:", scores)
print("평균 정확도:", scores.mean())

교차 검증 정확도: [0.88094994 0.87951684 0.87807125 0.9798321  0.99191237]
평균 정확도: 0.9220565014991262


In [50]:
y_t = np.concatenate([y_1, pre23])
y_t = list(y_t)
y_train_change=pd.Series(y_t)

lgb.fit(X_train_2,y_train_change)

pre24 = lgb.predict(X_test)
# kaggle 양식 불러오기
rs =  pd.read_csv('./data/kaggle/sample_submission.csv')
rs['income'] = pre24 # 내 모델의 예측 결과를 양식에 넣기
rs.to_csv('Last24.csv', index = False)

scores = cross_val_score(lgb, X_train_2, y_train_change, cv=5, scoring='accuracy')

# 교차 검증 결과 출력
print("교차 검증 정확도:", scores)
print("평균 정확도:", scores.mean())

교차 검증 정확도: [0.8816665  0.87890265 0.87735463 0.97952498 0.99191237]
평균 정확도: 0.9218722242189331


In [51]:
y_t = np.concatenate([y_1, pre24])
y_t = list(y_t)
y_train_change=pd.Series(y_t)

lgb.fit(X_train_2,y_train_change)

pre25 = lgb.predict(X_test)
# kaggle 양식 불러오기
rs =  pd.read_csv('./data/kaggle/sample_submission.csv')
rs['income'] = pre25 # 내 모델의 예측 결과를 양식에 넣기
rs.to_csv('Last25.csv', index = False)

scores = cross_val_score(lgb, X_train_2, y_train_change, cv=5, scoring='accuracy')

# 교차 검증 결과 출력
print("교차 검증 정확도:", scores)
print("평균 정확도:", scores.mean())

교차 검증 정확도: [0.88094994 0.87941447 0.87817363 0.9794226  0.99191237]
평균 정확도: 0.9219746035131438


In [52]:
y_t = np.concatenate([y_1, pre25])
y_t = list(y_t)
y_train_change=pd.Series(y_t)

lgb.fit(X_train_2,y_train_change)

pre26 = lgb.predict(X_test)
# kaggle 양식 불러오기
rs =  pd.read_csv('./data/kaggle/sample_submission.csv')
rs['income'] = pre26 # 내 모델의 예측 결과를 양식에 넣기
rs.to_csv('Last26.csv', index = False)

scores = cross_val_score(lgb, X_train_2, y_train_change, cv=5, scoring='accuracy')

# 교차 검증 결과 출력
print("교차 검증 정확도:", scores)
print("평균 정확도:", scores.mean())

교차 검증 정확도: [0.88084758 0.8796192  0.87776413 0.98024161 0.99221949]
평균 정확도: 0.9221384015810262
