# [T-Academy X KaKr] 성인 인구조사 소득 예측 대회

## 목적
 - 1994년 미국 성인을 대상으로 한 조사를 통해 연소득이 $50,000 이 넘는지 안 넘는지 구분하기

# Setting

In [None]:
# t

In [None]:
import pandas as pd
import pandas_profiling
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt
pd.options.display.max_rows=150
%matplotlib inline

import os
# Print the full path of every file found under the Kaggle input directory.
for folder, _subdirs, files in os.walk('/kaggle/input'):
    for file_name in files:
        print(os.path.join(folder, file_name))
        
# import os
# os.listdir('../input/')

In [None]:
# Competition data directory; the three CSVs below are read from it.
path = '/kaggle/input/kakr-4th-competition/'

# Train frame, test frame and the submission template, in that order.
df, df_t, submission = (
    pd.read_csv(path + csv_name)
    for csv_name in ('train.csv', 'test.csv', 'sample_submission.csv')
)

for frame in (df, df_t):
    print(frame.shape)

In [None]:
# Print a summary (columns, dtypes, non-null counts) of the training frame.
df.info()

# Data Processing (step.01)

In [None]:
# Per the original author, `education` maps 1:1 onto `education_num`,
# so the text column is dropped. A quick check of that claim:
# df['education'].value_counts().values - df['education_num'].value_counts().values
df = df.drop(columns='education')

# Boolean target derived from `income`: True when the value is '>50K'.
df['is_income_over_50K'] = df['income'].eq('>50K')

print(df.shape)
df.head(2)

In [None]:
# # fnlwgt 컬럼은 사후 층화 가중치로서, 한 표본이 몇 개를 대표하는지 의미.
# # 'https://m.blog.naver.com/stat833/221747335785'
# df_fnlwgt = df.head(5)
# df_fnlwgt.drop(df_fnlwgt.index[0], inplace = True)
# df_fnlwgt

###  Label 인코딩 (LabelEncoder)

In [None]:
## train data

from sklearn.ensemble import RandomForestRegressor
from sklearn.preprocessing import LabelEncoder

# Fit one LabelEncoder per object-dtype column of the training frame.
dtypes = df.dtypes
encoders = {
    column: LabelEncoder().fit(df[column])
    for column in df.columns
    if str(dtypes[column]) == 'object'
}

# Work on a copy so `df` keeps its original string values.
df_num = df.copy()
for column, encoder in encoders.items():
    df_num[column] = encoder.transform(df[column])

print(df_num.shape)
df_num.tail(3)

In [None]:
## Test data: label-encode every object-dtype column.
# NOTE: the encoders are fit on `df_t` itself and overwrite the `encoders`
# dict, so the integer codes are not guaranteed to match the train codes.
dtypes = df_t.dtypes
encoders = {
    column: LabelEncoder().fit(df_t[column])
    for column in df_t.columns
    if str(dtypes[column]) == 'object'
}

# Work on a copy so `df_t` keeps its original string values.
df_num_t = df_t.copy()
for column, encoder in encoders.items():
    df_num_t[column] = encoder.transform(df_t[column])

print(df_num_t.shape)
df_num_t.tail(3)

In [None]:
# Correlation heatmap over the label-encoded training frame.
# Author's reading of the plot: age, education_num, marital_status,
# relationship, sex, capital_gain, capital_loss and hours_per_week
# appear to be related.
plt.figure(figsize=(10, 10))
sns.heatmap(
    data=df_num.corr(),
    annot=True,
    fmt='.2f',
    linewidths=.5,
    cmap='Blues',
)

# Data Processing (step.02)

### Capital_gain & Capital_loss

In [None]:
# Add boolean "any capital gain / any capital loss" flags, first to the
# train frame and then to the test frame, showing a preview after each.
for frame in (df, df_t):
    frame['capital_gain_bool'] = frame['capital_gain'].gt(0)
    frame['capital_loss_bool'] = frame['capital_loss'].gt(0)
    print(frame.shape)
    display(frame[['capital_loss', 'capital_loss_bool']].head())

# EDA (step.03)

## 03-1. sex + marital_status + education_num

In [None]:
# Feature 3: male, married (Married-civ-spouse) and education_num of 14 or more.
df['Male_married_14y_edu'] = (
    df['sex'].eq('Male')
    & df['marital_status'].eq('Married-civ-spouse')
    & df['education_num'].ge(14)
)
print(df.shape)
df[['sex', 'marital_status', 'education_num', 'Male_married_14y_edu']].head()

In [None]:
# Test frame: same feature as for train (male, Married-civ-spouse,
# education_num of 14 or more).
df_t['Male_married_14y_edu'] = (
    (df_t['sex'] == 'Male')
    & (df_t['marital_status'] == 'Married-civ-spouse')
    & (df_t['education_num'] >= 14)
)
# Report the shape of the frame that was just modified (the test frame).
print(df_t.shape)
df_t[['sex', 'marital_status', 'education_num', 'Male_married_14y_edu']].head()

In [None]:
# Cross-count of the engineered flag against the target.
# Author's note: 968 of the 1,235 flagged people earn more than $50K a year.
df.value_counts(subset=['Male_married_14y_edu', 'is_income_over_50K'])

## 03-2. occupation + marital_status + workclass

In [None]:
# Occupation feature: Exec-managerial or Prof-specialty, and Married-civ-spouse.
df['ceo_prof_m'] = (
    df['occupation'].isin(['Exec-managerial', 'Prof-specialty'])
    & df['marital_status'].eq('Married-civ-spouse')
)
print(df.shape)
df[['occupation', 'marital_status', 'ceo_prof_m']].head()

In [None]:
# Test frame: Exec-managerial or Prof-specialty, and Married-civ-spouse.
df_t['ceo_prof_m'] = (
    df_t['occupation'].isin(['Exec-managerial', 'Prof-specialty'])
    & df_t['marital_status'].eq('Married-civ-spouse')
)
print(df_t.shape)
df_t[['occupation', 'marital_status', 'ceo_prof_m']].head()

In [None]:
# Cross-count of the occupation/marriage flag against the target.
df.value_counts(subset=['ceo_prof_m', 'is_income_over_50K'])

# PREPROCESSING (step.04)

In [None]:
# 1. age: bin into 10 quantile buckets; labels=False stores the integer
# bucket index instead of an interval label.
age_deciles = pd.qcut(df['age'], q=10, labels=False)
df['age_bin'] = age_deciles

print(df.shape)
df[['age', 'age_bin']]

In [None]:
# Test frame: bin age with the TRAIN decile edges so that a given age maps
# to the same `age_bin` index in both frames. Computing quantiles on the
# test frame itself would produce different edges, and therefore codes the
# model was not trained on.
age_edges = pd.qcut(df['age'], 10, retbins=True)[1].astype(float)
# Open the outer edges so test ages outside the train range still get a bin.
age_edges[0], age_edges[-1] = -np.inf, np.inf
df_t['age_bin'] = pd.cut(
    df_t['age'], bins=age_edges, labels=False, include_lowest=True
)

print(df_t.shape)
df_t[['age', 'age_bin']]

In [None]:
# 2. hours_per_week: add a log(x + 1) transformed copy of the column,
# to the train frame first and then to the test frame.
for frame in (df, df_t):
    frame['hours_per_week_log'] = np.log(frame['hours_per_week'] + 1)

In [None]:
# Side-by-side distributions of the raw and log-transformed weekly hours.
figure, (ax1, ax2) = plt.subplots(1, 2, figsize=(18, 4))

sns.distplot(df["hours_per_week"], ax=ax1)
sns.distplot(df["hours_per_week_log"], ax=ax2)

In [None]:
from sklearn.ensemble import RandomForestRegressor
from sklearn.preprocessing import LabelEncoder
# Label-encode the training frame again so that `df_num` also contains the
# columns engineered since the first encoding pass.
dtypes = df.dtypes
encoders = {
    column: LabelEncoder().fit(df[column])
    for column in df.columns
    if str(dtypes[column]) == 'object'
}

# Work on a copy so `df` keeps its original string values.
df_num = df.copy()
for column, encoder in encoders.items():
    df_num[column] = encoder.transform(df[column])

print(df_num.shape)
df_num.tail(10)

In [None]:
# Label-encode the test frame again (encoders are re-fit on `df_t` and
# replace the `encoders` dict built for the training frame).
dtypes = df_t.dtypes
encoders = {
    column: LabelEncoder().fit(df_t[column])
    for column in df_t.columns
    if str(dtypes[column]) == 'object'
}

# Work on a copy so `df_t` keeps its original string values.
df_num_t = df_t.copy()
for column, encoder in encoders.items():
    df_num_t[column] = encoder.transform(df_t[column])

print(df_num_t.shape)
df_num_t.tail(10)

# Feature Engineering (step.05)

In [None]:
# One-hot encode the categorical columns of both frames.
to_encode = ['workclass', 'marital_status',
             'occupation', 'relationship', 'race', 'sex', 'native_country']

encoded = pd.get_dummies(df[to_encode])
encoded_t = pd.get_dummies(df_t[to_encode])

# A category present in only one frame (e.g. native_country
# 'Holand-Netherlands') yields a dummy column in that frame only. Give both
# frames the same columns in the same order, filling the missing dummies
# with 0. Appending the column to one frame by hand leaves the column ORDER
# different, which misaligns features when the model predicts on the test set.
dummy_columns = encoded.columns.union(encoded_t.columns)
encoded = encoded.reindex(columns=dummy_columns, fill_value=0)
encoded_t = encoded_t.reindex(columns=dummy_columns, fill_value=0)

In [None]:
# Show the column labels currently present in the training frame.
df.columns

In [None]:
# Numeric / boolean columns passed to the model as-is (the one-hot
# encoded categoricals are concatenated separately).
features = [
    'education_num',
    'capital_gain_bool',
    'age_bin',
    'hours_per_week_log',
    'Male_married_14y_edu',
    'ceo_prof_m',
]

# Name of the boolean label column.
target = 'is_income_over_50K'

In [None]:
# Keep only the selected feature columns from the train and test frames.
preprocessed, preprocessed_t = df[features], df_t[features]

In [None]:
# Shapes of the selected train / test feature frames.
for frame in (preprocessed, preprocessed_t):
    print(frame.shape)

In [None]:
# Design matrices: selected features side by side with the dummy columns.
X_train = pd.concat([preprocessed, encoded], axis='columns')
X_test = pd.concat([preprocessed_t, encoded_t], axis='columns')

# Label vector taken from the training frame.
y_train = df[target]

for part in (X_train, X_test, y_train):
    print(part.shape)

# Model Tuning & Evaluation (step.06)

# Model Tuning & Evaluation (step.06-1)

In [None]:
# 데이터 분석 라이브러리
import numpy as np
import pandas as pd

# 시각화 라이브러리
import matplotlib as mpl
import matplotlib.pyplot as plt
import seaborn as sns
from plotnine import *

# 모델링 라이브러리
from category_encoders.ordinal import OrdinalEncoder
from sklearn.model_selection import KFold
from lightgbm import LGBMClassifier

# 기타 라이브러리
import random
import gc
import os

# Print the full path of every file found under the Kaggle input directory.
for folder, _subdirs, files in os.walk('/kaggle/input'):
    for file_name in files:
        print(os.path.join(folder, file_name))

# Use seaborn's "whitegrid" theme for the plots that follow.
sns.set_style("whitegrid")

In [None]:
# Sanity check of the model inputs before training.
for part in (X_train, X_test, y_train):
    print(part.shape)

In [None]:
# # path = '/kaggle/input/kakr-4th-competition/'

# # train = pd.read_csv(path + 'train.csv')
# # test = pd.read_csv(path + 'test.csv')
# # sample_submission = pd.read_csv(path + 'sample_submission.csv')

# # print(train.shape)
# print(test.shape)

In [None]:
# train.drop(['id'], axis=1, inplace=True)
# test.drop(['id'], axis=1, inplace=True)

In [None]:
# y = train['income'] != '<=50K'
# X = train.drop(['income'], axis=1)

In [None]:
# # 라벨 인코더 생성
# LE_encoder = OrdinalEncoder(list(X.columns))

# # train, test 데이터에 인코딩 적용
# X = LE_encoder.fit_transform(X, y)
# test = LE_encoder.transform(test)

In [None]:
# Number of cross-validation folds; also the divisor used when averaging
# the per-fold test predictions.
NFOLDS = 5
folds = KFold(n_splits=NFOLDS)

columns = X_train.columns
# NOTE(review): `split` appears to return a one-shot iterator, so this cell
# must be re-run before the training loop can iterate the folds again.
splits = folds.split(X_train, y_train)

# Accumulator for the averaged test predictions, one slot per test row.
y_preds = np.zeros(len(X_test))

# One row per feature; the training loop adds one importance column per fold.
feature_importances = pd.DataFrame({'feature': columns})

In [None]:
# A single classifier object that is re-fit on every fold.
model = LGBMClassifier(objective='binary', verbose=400, random_state=91)

for fold_no, (train_index, valid_index) in enumerate(splits, start=1):
    print('Fold: ', fold_no)

    # Split the training data into this fold's fit / validation parts.
    X_fit, y_fit = X_train.iloc[train_index], y_train.iloc[train_index]
    X_val, y_val = X_train.iloc[valid_index], y_train.iloc[valid_index]

    model.fit(
        X_fit,
        y_fit,
        eval_metric='f1',
        eval_set=[(X_fit, y_fit), (X_val, y_val)],
        verbose=True,
    )

    feature_importances[f'fold_{fold_no}'] = model.feature_importances_

    # Each fold contributes its predicted labels as integers; dividing by
    # NFOLDS turns the running sum into the mean over folds.
    y_preds += model.predict(X_test).astype(int) / NFOLDS

    # Drop the fold's slices before the next iteration.
    del X_fit, X_val, y_fit, y_val
    gc.collect()

In [None]:
# `sample_submission` is not assigned by any active cell (the template is
# loaded as `submission`), so build it from that frame instead of failing
# with a NameError. A copy keeps the loaded template untouched.
sample_submission = submission.copy()

# `y_preds` holds the mean of the per-fold 0/1 predictions; a row is
# labelled 1 when that mean exceeds 0.5, otherwise 0. Vectorized
# replacement for the row-by-row `iterrows` thresholding, same result.
sample_submission['prediction'] = (y_preds > 0.5).astype(int)

sample_submission.to_csv('submission_10_28_2.csv', index=False)

In [None]:
# Bar chart of how many test rows were assigned to each predicted class.
fig, ax = plt.subplots(1, 1, figsize=(8, 4))
col = 'prediction'
value_counts = sample_submission[col].value_counts()
sns.countplot(x=col, data=sample_submission, palette="Set2",
              edgecolor='black', order=value_counts.index, ax=ax)

# Write each count just above its bar. Iterating the counts directly avoids
# depending on the column names produced by `value_counts().reset_index()`,
# which differ between pandas versions (the old lookup would pick up the
# class label instead of the count on newer pandas).
for i, count in enumerate(value_counts):
    ax.text(i - 0.05, count + 150, count)