# imports

In [None]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns; sns.set()
import warnings; warnings.filterwarnings(action='ignore')

# load datasets
from sklearn import datasets

# model_selection.splitter
from sklearn.model_selection import train_test_split       # function

# model_selection.model_validation
from sklearn.model_selection import KFold, StratifiedKFold # cross-validator
from sklearn.model_selection import cross_val_score

# model_selection.hyper_parameter_optimizer
from sklearn.model_selection import GridSearchCV

# models/estimators
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier

# models : boost
import xgboost as xgb
import lightgbm

# preprocessing.encoding
from sklearn.preprocessing import LabelEncoder, OneHotEncoder # pd.get_dummies()

# preprocessing.scaling
from sklearn.preprocessing import MinMaxScaler, StandardScaler, RobustScaler

# preprocessing.binarizing
from sklearn.preprocessing import Binarizer

# metrics.scores
from sklearn.metrics import accuracy_score, precision_score, recall_score
from sklearn.metrics import f1_score, roc_auc_score

# metrics.curves
from sklearn.metrics import precision_recall_curve, plot_precision_recall_curve
from sklearn.metrics import roc_curve, plot_roc_curve

# metrics.reports
from sklearn.metrics import confusion_matrix, plot_confusion_matrix
from sklearn.metrics import classification_report

# mymodule
from mymodule import check_nan, encode, mapping, split_xy
from mymodule import pr_curve, ra_curve, my_eval, max_eval, split_fit_score

# read dataset

In [None]:
df = pd.read_csv('./code_test/kote_train.csv')

# check dataset

## basic

In [None]:
df.head()

In [None]:
df.info()

## target balance check

In [None]:
df['target'].value_counts()

## missing val check ; nan_cols

In [None]:
# Show only the columns that contain missing values, largest count first.
# The per-column NaN count is computed once and reused for both the mask and
# the values being displayed.
nan_counts = df.isna().sum()
nan_counts[nan_counts != 0].sort_values(ascending=False)

In [None]:
nan_cols = check_nan(df)

# EDA & feature engineering

In [None]:
# Column glossary
# NOTE(review): the first entry is written as 'id' here, while drop_cols further
# down uses 'enrollee_id' -- confirm the actual column name in the CSV.
# 'id': enrollee ID
# 'city': city code
# 'city_development_index': city development index
# 'gender': gender
# 'relevent_experience': relevant experience
# 'enrolled_university': university course enrolled in
# 'education_level': education level
# 'major_discipline': major
# 'experience': experience (years)
# 'company_size': number of employees at the current company
# 'company_type': type of the current company
# 'last_new_job': years in the current job
# 'training_hours': training hours completed
# 'target': target variable, 1 if looking for another job, otherwise 0

## encoding

In [None]:
# No inherent order (one-hot, get_dummies) : gender, major_discipline
# Ordered levels (LabelEncoder)            : enrolled_university, education_level, experience, last_new_job

# get_dummies   : label + one-hot + fillna in one step; creates one 0/1 column per unique feature value; always handle missing values beforehand; DataFrame input only
# label, onehot : accept a DataFrame, array, list, etc. as input

LabelEncoder()

In [None]:
object_cols = encode(df)

In [None]:
df.head()

In [None]:
# Print the value distribution of every column listed in object_cols
# (the result of encode(df)), with a dashed rule before each one.
divider = '-' * 30
for name in object_cols:
    print(divider)
    print(df[name].value_counts())

### get_dummies 연습

In [None]:
# One-hot encode the two nominal columns into a separate practice frame.
# pd.get_dummies returns a new DataFrame and leaves `df` untouched, so no
# explicit copy is needed before the call.
# drop_first=True drops the first level of each encoded column.
df2 = pd.get_dummies(df, columns=['gender', 'major_discipline'], drop_first=True)
df2.info()

### LabelEncoder 연습

In [None]:
# Label-encode the columns at positions 1..10 of df2, each with its own
# freshly fitted LabelEncoder, then show the resulting dtypes.
for col in df2.columns[1:11]:
    encoder = LabelEncoder()
    df2[col] = encoder.fit_transform(df2[col])
df2.info()

In [None]:
# city
# Drop the first five characters of every value (presumably a textual prefix
# such as "city_" -- confirm against the raw data) and keep the rest as int32.
city_suffix = df['city'].apply(lambda code: code[5:])
df['city'] = city_suffix.astype('int32')
df['city'].unique()

In [None]:
# gender
mapping(df, 'gender')

In [None]:
# relevent_experience
mapping(df, 'relevent_experience')

In [None]:
# enrolled_university
# Ordinal codes by enrollment intensity. The mapping has its own name so this
# cell does not rebind the builtin `dict`.
# Series.map turns any value that is not a key of the mapping into NaN.
enrolled_university_codes = {'no_enrollment': 0, 'Part time course': 1, 'Full time course': 2}
df['enrolled_university'] = df['enrolled_university'].map(enrolled_university_codes)
df['enrolled_university'].value_counts()

In [None]:
# education_level
# Ordinal codes from lowest to highest level. The mapping has its own name so
# this cell does not rebind the builtin `dict`.
# Series.map turns any value that is not a key of the mapping into NaN.
education_level_codes = {
    'Primary School': 0,
    'High School': 1,
    'Graduate': 2,
    'Masters': 3,
    'Phd': 4,
}
df['education_level'] = df['education_level'].map(education_level_codes)
df['education_level'].value_counts()

In [None]:
# experience
# Years of experience as integers: '1'..'20' map to themselves, and the
# open-ended buckets '<1' and '>20' map to 0 and 21.
# Built with a comprehension rather than two parallel hand-typed lists, and
# under its own name so this cell does not rebind the builtin `dict`.
experience_codes = {str(years): years for years in range(1, 21)}
experience_codes['<1'] = 0
experience_codes['>20'] = 21

# Series.map turns any value that is not a key of the mapping into NaN.
df['experience'] = df['experience'].map(experience_codes)
df['experience'].value_counts()

In [None]:
# major_discipline
mapping(df, 'major_discipline')

In [None]:
# company_size
# Ordinal codes from the smallest to the largest head-count bucket. The mapping
# has its own name so this cell does not rebind the builtin `dict`.
# The keys are reproduced exactly as written in the data (including '10/49').
# Series.map turns any value that is not a key of the mapping into NaN.
company_size_codes = {
    '<10': 0,
    '10/49': 1,
    '50-99': 2,
    '100-500': 3,
    '500-999': 4,
    '1000-4999': 5,
    '5000-9999': 6,
    '10000+': 7,
}
df['company_size'] = df['company_size'].map(company_size_codes)
df['company_size'].value_counts()

In [None]:
# company_type
mapping(df, 'company_type')

In [None]:
# last_new_job
# 'never' maps to 0, '1'..'4' map to themselves and the open-ended '>4' bucket
# maps to 5. The mapping has its own name so this cell does not rebind the
# builtin `dict`.
# Series.map turns any value that is not a key of the mapping into NaN.
last_new_job_codes = {'never': 0, '1': 1, '2': 2, '3': 3, '4': 4, '>4': 5}
df['last_new_job'] = df['last_new_job'].map(last_new_job_codes)
df['last_new_job'].value_counts()

In [None]:
encode(df)

## fillna

In [None]:
nan_cols

In [None]:
# Fill missing values with each column's most frequent value.
# mode() can return several values on ties; the first one is used.
for name in nan_cols:
    most_frequent = df[name].mode()[0]
    df[name] = df[name].fillna(most_frequent)

In [None]:
check_nan(df)

## binning

## score v0

In [None]:
df.info()

In [None]:
# Baseline score (v0) with a 500-tree random forest and a fixed seed.
# NOTE(review): split_xy and split_fit_score come from mymodule and are not
# visible here -- presumably they separate the 'target' column and then
# split/fit/evaluate at each threshold in th_list; confirm against mymodule.
X, y = split_xy(df, 'target')
model = RandomForestClassifier(n_estimators=500, random_state=0)
split_fit_score(X, y, model, th_list=[0.2925, 0.295]) # best_threshold=0.295

## plots

In [None]:
# Count plots for the listed columns plus the target, one subplot per entry
# of plot_cols on a 5x2 grid.
plot_cols = ['gender', 'relevent_experience', 'enrolled_university',
             'education_level', 'major_discipline', 'experience',
             'company_size', 'company_type', 'last_new_job', 'target']

f, ax = plt.subplots(5, 2, figsize=(15, 20))
# ax.flat walks the grid row by row, so entry i lands at row i // 2, column i % 2.
for axis, col in zip(ax.flat, plot_cols):
    # The column is passed as `x=`: since seaborn 0.12 only `data` may be given
    # positionally, so a positional column name fails there.
    sns.countplot(x=col, data=df, ax=axis)

## corr 상관분석

In [None]:
# Correlation analysis: measures the linear relationship between two continuous
# variables (x, y); it does not explain causation
# ~ 0.1 : no relationship
# ~ 0.3 : weak correlation
# ~ 0.7 : strong correlation
# ~ 1.0 : very strong correlation, multicollinearity

# Multicollinearity: has a negative effect on the analysis
# 1. drop : drop features starting with the least correlated ones
# 2. PCA (dimensionality reduction) : merges features; the original data cannot be inferred back

# Quantitative data : numeric (continuous, discrete)
# Qualitative data  : categorical (nominal, ordinal)

# Pearson correlation coefficient  : continuous numeric values, the usual choice
# Spearman correlation coefficient : categorical numeric values

# Variance   : how spread out a single variable is
# Covariance : how two variables are spread out together

# String columns do not raise an error but are excluded from the analysis: always encode first
# NOTE(review): the line above describes older pandas; pandas >= 2.0 raises on
# non-numeric columns unless numeric_only=True is passed -- confirm the installed version.
# Handling missing values before the correlation analysis is preferable, but the analysis also runs with missing values present

In [None]:
df.corr()

In [None]:
# Annotated heatmap of the full pairwise correlation matrix of df,
# drawn on a freshly created 18x15 figure.
corr_matrix = df.corr()
fig, ax = plt.subplots(figsize=(18, 15))
sns.heatmap(corr_matrix, annot=True, fmt='.2f', cmap='Blues')

In [None]:
# Heatmap restricted to the 7 rows of the correlation matrix that have the
# largest values in the 'target' column.
fig, ax = plt.subplots(figsize=(18, 7))
top_target_corr = df.corr().nlargest(7, 'target')
sns.heatmap(top_target_corr, annot=True, fmt='.2f', cmap='Blues')

## drop cols

In [None]:
drop_cols = ['enrollee_id']

# 문제 정의

주체 : 데이터 분석 관련 비즈니스 회사

목적 : 회사가 운영하는 교육과정에 등록한 데이터 분석가를 채용하고 관리하기 위함

목표 : 누가 이 일을 계속 하고싶어 하는지

In [None]:
pd.DataFrame([0, 1])