In [1]:
# 기본
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns

# 경고 뜨지 않게 설정
import warnings
warnings.filterwarnings('ignore')

# 그래프 설정
sns.set()

# 그래프 기본 설정
plt.rcParams['font.family'] = 'Malgun Gothic'
# plt.rcParams['font.family'] = 'AppleGothic'
plt.rcParams['figure.figsize'] = 12, 6
plt.rcParams['font.size'] = 14
plt.rcParams['axes.unicode_minus'] = False

# 데이터 전처리 알고리즘
from sklearn.preprocessing import LabelEncoder
from sklearn.preprocessing import StandardScaler

# 학습용과 검증용으로 나누는 함수
from sklearn.model_selection import train_test_split

# 교차 검증
from sklearn.model_selection import cross_val_score
from sklearn.model_selection import cross_validate
from sklearn.model_selection import KFold
from sklearn.model_selection import StratifiedKFold

# 평가함수
# 분류용
from sklearn.metrics import accuracy_score
from sklearn.metrics import precision_score
from sklearn.metrics import recall_score
from sklearn.metrics import f1_score
from sklearn.metrics import roc_auc_score

# 회귀용
from sklearn.metrics import r2_score
from sklearn.metrics import mean_squared_error

# 모델의 최적의 하이퍼 파라미터를 찾기 위한 도구
from sklearn.model_selection import GridSearchCV

# 머신러닝 알고리즘 - 분류
from sklearn.neighbors import KNeighborsClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.svm import SVC
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.ensemble import AdaBoostClassifier
from sklearn.ensemble import GradientBoostingClassifier
from lightgbm import LGBMClassifier
from xgboost import XGBClassifier
from sklearn.ensemble import VotingClassifier

# 머신러닝 알고리즘 - 회귀
from sklearn.neighbors import KNeighborsRegressor
from sklearn.linear_model import LinearRegression
from sklearn.linear_model import Ridge
from sklearn.linear_model import Lasso
from sklearn.linear_model import ElasticNet
from sklearn.svm import SVR
from sklearn.tree import DecisionTreeRegressor
from sklearn.ensemble import RandomForestRegressor
from sklearn.ensemble import AdaBoostRegressor
from sklearn.ensemble import GradientBoostingRegressor
from lightgbm import LGBMRegressor
from xgboost import XGBRegressor
from sklearn.ensemble import VotingRegressor

# 학습 모델 저장을 위한 라이브러리
import pickle

### 작업 개요
- 각 컬럼들의 데이터를 결과 데이터와 상관관계가 높은 데이터를 생성한다.
- 결과 데이터를 기반으로 데이터가 만들어지기 때문에 상관관계가 높아질 수 밖에 없다.
- 예측할 데이터도 동일한 형태를 기반으로 생성해야 한다.
- 예측할 데이터도 이미 가지고 있는 상황에서 사용하기 좋다.
- 하지만 미래에 발생할 데이터를 실시간으로 예측하는 시스템 구축시에도 꽤 좋은 결과를 보였었습니다.
- 여기에서는 결과데이터가 사망/생존이기 때문에 생존률 데이터를 발생시킨다.

### 데이터를 불러온다.

In [2]:
# 데이터를 읽어온다.
train_df = pd.read_csv('data/train100.csv')
test_df = pd.read_csv('data/test100.csv')

In [3]:
# 제외할 컬럼들을 제외한다.
# 만약 Fare가 없다고 오류가 나면 코드에서 Fare 없애주세요
train_df.drop(['Fare', 'FareGrade', 'CabinArea'], axis=1, inplace=True)
test_df.drop(['Fare', 'FareGrade', 'CabinArea'], axis=1, inplace=True)

display(train_df.columns)
display(test_df.columns)

Index(['Survived', 'Pclass', 'Sex', 'SibSp', 'Parch', 'Embarked', 'LastName',
       'Title', 'Ages'],
      dtype='object')

Index(['Pclass', 'Sex', 'SibSp', 'Parch', 'Embarked', 'LastName', 'Title',
       'Ages'],
      dtype='object')

In [4]:
# 두 데이터 프레임을 합친다.
all_df = pd.concat([train_df, test_df])
all_df.reset_index(inplace=True, drop=True)
all_df

Unnamed: 0,Survived,Pclass,Sex,SibSp,Parch,Embarked,LastName,Title,Ages
0,0.0,3,male,1,0,S,Braund,Mr,20.0
1,1.0,1,female,1,0,C,Cumings,Mrs,30.0
2,1.0,3,female,0,0,S,Heikkinen,Miss,20.0
3,1.0,1,female,1,0,S,Futrelle,Mrs,30.0
4,0.0,3,male,0,0,S,Allen,Mr,30.0
...,...,...,...,...,...,...,...,...,...
1304,,3,male,0,0,S,Spector,Mr,20.0
1305,,1,female,0,0,C,Oliva y Ocana,Dona,30.0
1306,,3,male,0,0,S,Saether,Mr,30.0
1307,,3,male,0,0,S,Ware,Mr,20.0


In [5]:
# 문자열 -> 숫자
sex_encoder = LabelEncoder()
embarked_encoder = LabelEncoder()
lastname_encoder = LabelEncoder()
title_encoder = LabelEncoder()

sex_encoder.fit(all_df['Sex'])
embarked_encoder.fit(all_df['Embarked'])
lastname_encoder.fit(all_df['LastName'])
title_encoder.fit(all_df['Title'])

all_df['Sex'] = sex_encoder.transform(all_df['Sex'])
all_df['Embarked'] = embarked_encoder.transform(all_df['Embarked'])
all_df['LastName'] = lastname_encoder.transform(all_df['LastName'])
all_df['Title'] = title_encoder.transform(all_df['Title'])

all_df

Unnamed: 0,Survived,Pclass,Sex,SibSp,Parch,Embarked,LastName,Title,Ages
0,0.0,3,1,1,0,2,100,12,20.0
1,1.0,1,0,1,0,0,182,13,30.0
2,1.0,3,0,0,0,2,329,9,20.0
3,1.0,1,0,1,0,2,267,13,30.0
4,0.0,3,1,0,0,2,15,12,30.0
...,...,...,...,...,...,...,...,...,...
1304,,3,1,0,0,2,753,12,20.0
1305,,1,0,0,0,0,593,3,30.0
1306,,3,1,0,0,2,699,12,30.0
1307,,3,1,0,0,2,827,12,20.0


In [6]:
# 데이터 프레임을 복제한다.
temp = all_df.copy()
temp

Unnamed: 0,Survived,Pclass,Sex,SibSp,Parch,Embarked,LastName,Title,Ages
0,0.0,3,1,1,0,2,100,12,20.0
1,1.0,1,0,1,0,0,182,13,30.0
2,1.0,3,0,0,0,2,329,9,20.0
3,1.0,1,0,1,0,2,267,13,30.0
4,0.0,3,1,0,0,2,15,12,30.0
...,...,...,...,...,...,...,...,...,...
1304,,3,1,0,0,2,753,12,20.0
1305,,1,0,0,0,0,593,3,30.0
1306,,3,1,0,0,2,699,12,30.0
1307,,3,1,0,0,2,827,12,20.0


In [7]:
# 각 컬럼 데이터를 생존률로 변경한다.
temp['Pclass'] = temp.groupby('Pclass')['Survived'].transform('mean')
temp['Sex'] = temp.groupby('Sex')['Survived'].transform('mean')
temp['SibSp'] = temp.groupby('SibSp')['Survived'].transform('mean')
temp['Parch'] = temp.groupby('Parch')['Survived'].transform('mean')
temp['Embarked'] = temp.groupby('Embarked')['Survived'].transform('mean')
temp['LastName'] = temp.groupby('LastName')['Survived'].transform('mean')
temp['Title'] = temp.groupby('Title')['Survived'].transform('mean')
temp['Ages'] = temp.groupby('Ages')['Survived'].transform('mean')

temp

Unnamed: 0,Survived,Pclass,Sex,SibSp,Parch,Embarked,LastName,Title,Ages
0,0.0,0.242363,0.188908,0.535885,0.343658,0.339009,0.0,0.156673,0.313351
1,1.0,0.629630,0.742038,0.535885,0.343658,0.553571,1.0,0.792000,0.441624
2,1.0,0.242363,0.742038,0.345395,0.343658,0.339009,1.0,0.697802,0.313351
3,1.0,0.629630,0.742038,0.535885,0.343658,0.339009,0.5,0.792000,0.441624
4,0.0,0.242363,0.188908,0.345395,0.343658,0.339009,0.5,0.156673,0.441624
...,...,...,...,...,...,...,...,...,...
1304,,0.242363,0.188908,0.345395,0.343658,0.339009,,0.156673,0.313351
1305,,0.629630,0.742038,0.345395,0.343658,0.553571,,,0.441624
1306,,0.242363,0.188908,0.345395,0.343658,0.339009,,0.156673,0.441624
1307,,0.242363,0.188908,0.345395,0.343658,0.339009,,0.156673,0.313351


In [8]:
# 생존율을 기반으로 간단한 통계값들을 구한다.
temp['mean'] = 0
temp['max'] = 0
temp['min'] = 0
temp['indivisual'] = 0
temp['wealth'] = 0
temp['diff'] = 0

In [10]:
a1 = temp.mean(axis=1)
a2 = temp.max(axis=1)
a3 = temp.min(axis=1)
a4 = (temp['Sex'] + temp['Ages'] + temp['SibSp'] + temp['Parch'] + temp['LastName'] + temp['Title']) / 6
a5 = (temp['Pclass'] + temp['Embarked']) / 2
a6 = temp['max'] - temp['min']

In [11]:
# 생존율을 기반으로 간단한 통계값들을 구한다.
temp['mean'] = a1
temp['max'] = a2
temp['min'] = a3
temp['indivisual'] = a4
temp['wealth'] = a5
temp['diff'] = a6
temp

Unnamed: 0,Survived,Pclass,Sex,SibSp,Parch,Embarked,LastName,Title,Ages,mean,max,min,indivisual,wealth,diff
0,0.0,0.242363,0.188908,0.535885,0.343658,0.339009,0.0,0.156673,0.313351,0.141323,0.535885,0.0,0.256413,0.290686,0
1,1.0,0.629630,0.742038,0.535885,0.343658,0.553571,1.0,0.792000,0.441624,0.402560,1.000000,0.0,0.642534,0.591601,0
2,1.0,0.242363,0.742038,0.345395,0.343658,0.339009,1.0,0.697802,0.313351,0.334908,1.000000,0.0,0.573707,0.290686,0
3,1.0,0.629630,0.742038,0.535885,0.343658,0.339009,0.5,0.792000,0.441624,0.354923,1.000000,0.0,0.559201,0.484319,0
4,0.0,0.242363,0.188908,0.345395,0.343658,0.339009,0.5,0.156673,0.441624,0.170509,0.500000,0.0,0.329376,0.290686,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1304,,0.242363,0.188908,0.345395,0.343658,0.339009,,0.156673,0.313351,0.148412,0.345395,0.0,,0.290686,0
1305,,0.629630,0.742038,0.345395,0.343658,0.553571,,,0.441624,0.254660,0.742038,0.0,,0.591601,0
1306,,0.242363,0.188908,0.345395,0.343658,0.339009,,0.156673,0.441624,0.158279,0.441624,0.0,,0.290686,0
1307,,0.242363,0.188908,0.345395,0.343658,0.339009,,0.156673,0.313351,0.148412,0.345395,0.0,,0.290686,0


In [12]:
temp.isna().sum()

Survived      418
Pclass          0
Sex             0
SibSp           0
Parch           2
Embarked        0
LastName      230
Title           1
Ages            0
mean            0
max             0
min             0
indivisual    232
wealth          0
diff            0
dtype: int64

In [14]:
# 구한 값을 원본 데이터 프레임에 붙혀준다.
all_df['mean'] = temp['mean']
all_df['max'] = temp['max']
all_df['min'] = temp['min']
all_df['wealth'] = temp['wealth']
all_df['diff'] = temp['diff']
all_df

Unnamed: 0,Survived,Pclass,Sex,SibSp,Parch,Embarked,LastName,Title,Ages,mean,max,min,wealth,diff
0,0.0,3,1,1,0,2,100,12,20.0,0.141323,0.535885,0.0,0.290686,0
1,1.0,1,0,1,0,0,182,13,30.0,0.402560,1.000000,0.0,0.591601,0
2,1.0,3,0,0,0,2,329,9,20.0,0.334908,1.000000,0.0,0.290686,0
3,1.0,1,0,1,0,2,267,13,30.0,0.354923,1.000000,0.0,0.484319,0
4,0.0,3,1,0,0,2,15,12,30.0,0.170509,0.500000,0.0,0.290686,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1304,,3,1,0,0,2,753,12,20.0,0.148412,0.345395,0.0,0.290686,0
1305,,1,0,0,0,0,593,3,30.0,0.254660,0.742038,0.0,0.591601,0
1306,,3,1,0,0,2,699,12,30.0,0.158279,0.441624,0.0,0.290686,0
1307,,3,1,0,0,2,827,12,20.0,0.148412,0.345395,0.0,0.290686,0


In [18]:
# 학습용과 예측용으로 데이터를 다시 나눈다.
train_df = all_df.query('Survived.notna()')
test_df = all_df.query('Survived.isna()')
test_df.drop('Survived', axis=1, inplace=True)

train_df.reset_index(inplace=True, drop=True)
test_df.reset_index(inplace=True, drop=True)

display(train_df.isna().sum())
display(test_df.isna().sum())

Survived    0
Pclass      0
Sex         0
SibSp       0
Parch       0
Embarked    0
LastName    0
Title       0
Ages        0
mean        0
max         0
min         0
wealth      0
diff        0
dtype: int64

Pclass      0
Sex         0
SibSp       0
Parch       0
Embarked    0
LastName    0
Title       0
Ages        0
mean        0
max         0
min         0
wealth      0
diff        0
dtype: int64

In [21]:
# 처리가 완료된 데이터를 저장한다.
train_df.to_csv('data/train200.csv', index=False)
test_df.to_csv('data/test200.csv', index=False)

In [22]:
temp1 = pd.read_csv('data/train200.csv')
temp2 = pd.read_csv('data/test200.csv')

display(temp1)
display(temp2)

Unnamed: 0,Survived,Pclass,Sex,SibSp,Parch,Embarked,LastName,Title,Ages,mean,max,min,wealth,diff
0,0.0,3,1,1,0,2,100,12,20.0,0.141323,0.535885,0.0,0.290686,0
1,1.0,1,0,1,0,0,182,13,30.0,0.402560,1.000000,0.0,0.591601,0
2,1.0,3,0,0,0,2,329,9,20.0,0.334908,1.000000,0.0,0.290686,0
3,1.0,1,0,1,0,2,267,13,30.0,0.354923,1.000000,0.0,0.484319,0
4,0.0,3,1,0,0,2,15,12,30.0,0.170509,0.500000,0.0,0.290686,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
886,0.0,2,1,0,0,2,535,15,20.0,0.133543,0.472826,0.0,0.405918,0
887,1.0,1,0,0,0,2,294,9,10.0,0.344411,1.000000,0.0,0.484319,0
888,0.0,3,0,1,2,2,383,9,20.0,0.224697,0.742038,0.0,0.290686,0
889,1.0,1,1,0,0,0,69,12,20.0,0.302079,1.000000,0.0,0.591601,0


Unnamed: 0,Pclass,Sex,SibSp,Parch,Embarked,LastName,Title,Ages,mean,max,min,wealth,diff
0,3,1,0,0,1,401,12,30.0,0.204159,0.750000,0.0,0.315986,0
1,3,0,1,0,2,846,13,40.0,0.259767,0.792000,0.0,0.290686,0
2,2,1,0,0,1,552,12,60.0,0.170220,0.472826,0.0,0.431218,0
3,3,1,0,0,2,854,12,20.0,0.148412,0.345395,0.0,0.290686,0
4,3,0,1,1,2,342,13,20.0,0.322535,1.000000,0.0,0.290686,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...
413,3,1,0,0,2,753,12,20.0,0.148412,0.345395,0.0,0.290686,0
414,1,0,0,0,0,593,3,30.0,0.254660,0.742038,0.0,0.591601,0
415,3,1,0,0,2,699,12,30.0,0.158279,0.441624,0.0,0.290686,0
416,3,1,0,0,2,827,12,20.0,0.148412,0.345395,0.0,0.290686,0
