In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns

from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier, GradientBoostingClassifier
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.metrics import accuracy_score

In [2]:
data = pd.read_csv('titanic/train.csv')
data.head()

Unnamed: 0,PassengerId,Survived,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked
0,1,0,3,"Braund, Mr. Owen Harris",male,22.0,1,0,A/5 21171,7.25,,S
1,2,1,1,"Cumings, Mrs. John Bradley (Florence Briggs Th...",female,38.0,1,0,PC 17599,71.2833,C85,C
2,3,1,3,"Heikkinen, Miss. Laina",female,26.0,0,0,STON/O2. 3101282,7.925,,S
3,4,1,1,"Futrelle, Mrs. Jacques Heath (Lily May Peel)",female,35.0,1,0,113803,53.1,C123,S
4,5,0,3,"Allen, Mr. William Henry",male,35.0,0,0,373450,8.05,,S


In [3]:
data['Embarked'].fillna('S', inplace = True)
data['Fare'].fillna(0, inplace=True)
data['Fare'] = data['Fare'].map(lambda x : np.log(x) if x > 0 else 0)

In [4]:
data['Initial'] = data['Name'].str.extract('([A-Za-z]+)\.')
data['Initial'].replace(['Mlle','Mme','Ms','Dr','Major','Lady','Countess','Jonkheer','Col','Rev','Capt','Sir','Don', 'Dona'],['Miss','Miss','Miss','Mr','Mr','Mrs','Mrs','Other','Other','Other','Mr','Mr','Mr','Other'],inplace=True)
mapping = {
    "Mr":0,
    "Miss":1,
    "Mrs" : 1,
    "Master":2,
    "Other":3
}

data['Initial'] = data['Initial'].map(mapping)

In [5]:
mapping_sex = {
    'male' : 0,
    'female': 1
}

mapping_em = {
    'S' :0,
    'C' :1,
    'Q' :2
}


data['Sex'] = data['Sex'].map(mapping_sex)
data['Embarked'] = data['Embarked'].map(mapping_em)


data.drop(['PassengerId', "Ticket", "Cabin", "Name"], axis = 1, inplace = True)

In [6]:
data.groupby('Initial')['Age'].mean()

Initial
0    32.739609
1    27.834615
2     4.574167
3    45.888889
Name: Age, dtype: float64

In [7]:
data.loc[ (data['Age'].isnull()) & (data['Initial'] == 0), 'Age' ] = 32
data.loc[ (data['Age'].isnull()) & (data['Initial'] == 1), 'Age' ] = 28
data.loc[ (data['Age'].isnull()) & (data['Initial'] == 2), 'Age' ] = 5
data.loc[ (data['Age'].isnull()) & (data['Initial'] == 3), 'Age' ] = 45

In [8]:
data.head()

Unnamed: 0,Survived,Pclass,Sex,Age,SibSp,Parch,Fare,Embarked,Initial
0,0,3,0,22.0,1,0,1.981001,0,0
1,1,1,1,38.0,1,0,4.266662,1,1
2,1,3,1,26.0,0,0,2.070022,0,1
3,1,1,1,35.0,1,0,3.972177,0,1
4,0,3,0,35.0,0,0,2.085672,0,0


In [9]:
y = data['Survived']
X = data.drop('Survived', axis = 1)

In [10]:
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size = 0.2, random_state=0)

In [12]:
rf = RandomForestClassifier(random_state=0)
rf.fit(X_train, y_train)

In [13]:
pred = rf.predict(X_test)
print("정확도 :{0:.3f}".format(accuracy_score(y_test, pred)))

정확도 :0.832


In [16]:
gb = GradientBoostingClassifier(random_state=0)
gb.fit(X_train, y_train)

In [17]:
gb_param_grid = {
    'n_estimators' : [100, 200],
    'max_depth' : [6, 8, 10, 12],
    'min_samples_leaf' : [3, 5, 7, 10],
    'min_samples_split' : [2, 3, 5, 10]
}

In [18]:
gb_grid = GridSearchCV(gb, param_grid = gb_param_grid, scoring="accuracy", n_jobs= -1, verbose = 1)
gb_grid.fit(X_train, y_train)

Fitting 5 folds for each of 128 candidates, totalling 640 fits


In [21]:
gb_grid.best_score_

0.8272234807446074

In [22]:
gb_grid.best_params_

{'max_depth': 6,
 'min_samples_leaf': 10,
 'min_samples_split': 2,
 'n_estimators': 100}

In [None]:
#가장 좋은 파라미터들로 모델 생성
#타이타닉의 테스트 데이터를 입력
#결과를 제출, 점수, 등수, 코드를 함께 카페 제출

In [55]:
test = pd.read_csv('titanic/test.csv')
test.head()

Unnamed: 0,PassengerId,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked
0,892,3,"Kelly, Mr. James",male,34.5,0,0,330911,7.8292,,Q
1,893,3,"Wilkes, Mrs. James (Ellen Needs)",female,47.0,1,0,363272,7.0,,S
2,894,2,"Myles, Mr. Thomas Francis",male,62.0,0,0,240276,9.6875,,Q
3,895,3,"Wirz, Mr. Albert",male,27.0,0,0,315154,8.6625,,S
4,896,3,"Hirvonen, Mrs. Alexander (Helga E Lindqvist)",female,22.0,1,1,3101298,12.2875,,S


In [56]:
test['Embarked'].fillna('S', inplace = True)
test['Fare'].fillna(0, inplace=True)
test['Fare'] = data['Fare'].map(lambda x : np.log(x) if x > 0 else 0)

In [57]:
test['Initial'] = test['Name'].str.extract('([A-Za-z]+)\.')
test['Initial'].replace(['Mlle','Mme','Ms','Dr','Major','Lady','Countess','Jonkheer','Col','Rev','Capt','Sir','Don', 'Dona'],['Miss','Miss','Miss','Mr','Mr','Mrs','Mrs','Other','Other','Other','Mr','Mr','Mr','Other'],inplace=True)
mapping = {
    "Mr":0,
    "Miss":1,
    "Mrs" : 1,
    "Master":2,
    "Other":3
}

test['Initial'] = test['Initial'].map(mapping)

In [58]:
test['Sex'] = test['Sex'].map(mapping_sex)
test['Embarked'] = test['Embarked'].map(mapping_em)


test.drop(['PassengerId', "Ticket", "Cabin", "Name"], axis = 1, inplace = True)

In [59]:
test.loc[ (test['Age'].isnull()) & (test['Initial'] == 0), 'Age' ] = 32
test.loc[ (test['Age'].isnull()) & (test['Initial'] == 1), 'Age' ] = 30
test.loc[ (test['Age'].isnull()) & (test['Initial'] == 2), 'Age' ] = 7
test.loc[ (test['Age'].isnull()) & (test['Initial'] == 3), 'Age' ] = 42

In [62]:
mysubmission=pd.read_csv("titanic/gender_submission.csv")
mysubmission

Unnamed: 0,PassengerId,Survived
0,892,0
1,893,1
2,894,0
3,895,0
4,896,1
...,...,...
413,1305,0
414,1306,1
415,1307,0
416,1308,0


In [63]:
pred1=gb_grid.predict(test)
mysubmission["Survived"] = pred1

In [64]:
mysubmission.to_csv("mysubmission_gb.csv",index=False)
pred2=rf.predict(test)

In [65]:
mysubmission["Survived"] = pred2
mysubmission.to_csv("mysubmission_rf.csv",index=False)

In [None]:
"""
데이터 불균형 : 클래스가 어느 한 쪽으로만 일방적으로 존재
해결방법
1) 오버 샘플링 : 클래스가 적은 쪽의 데이터를 랜덤 복원 샘플링하여 복사 붙여넣기를 반복하여 두 클래스의 
                 비율을 비슷하게 함
2) 언더 샘플링 : 클래스가 많은 쪽의 데이터를 랜덤 샘플링하여 삭제하기를 반복하여 두 쿨래스의 비율이
                 비슷하게 함
3) 오버 & 언더 샘플링
ex) Y : 1000건 vs N : 10건 => 1010 / 2 = 505, Y는 505건이 될때까지 언더샘플링 수행, 
                      N은 505건이 될 때까지 오버샘플링
4) SMOTE 알고리즘 #데이터를 늘리는거
기존 데이터를 적절하게 혼합하여 새로운 데이터를 생성하는 방법


"""

In [66]:
import pandas as pd
import numpy as np 
import matplotlib.pyplot as plt

In [67]:
card_df = pd.read_csv('creditcard.csv')
card_df.head(3)

Unnamed: 0,Time,V1,V2,V3,V4,V5,V6,V7,V8,V9,...,V21,V22,V23,V24,V25,V26,V27,V28,Amount,Class
0,0.0,-1.359807,-0.072781,2.536347,1.378155,-0.338321,0.462388,0.239599,0.098698,0.363787,...,-0.018307,0.277838,-0.110474,0.066928,0.128539,-0.189115,0.133558,-0.021053,149.62,0
1,0.0,1.191857,0.266151,0.16648,0.448154,0.060018,-0.082361,-0.078803,0.085102,-0.255425,...,-0.225775,-0.638672,0.101288,-0.339846,0.16717,0.125895,-0.008983,0.014724,2.69,0
2,1.0,-1.358354,-1.340163,1.773209,0.37978,-0.503198,1.800499,0.791461,0.247676,-1.514654,...,0.247998,0.771679,0.909412,-0.689281,-0.327642,-0.139097,-0.055353,-0.059752,378.66,0


In [68]:
from sklearn.model_selection import train_test_split 

In [None]:
train_test_split(card_df[:,:-1])

In [72]:
x_features=card_df.iloc[:,:-1] #x, 284807 rows × 30 columns
y_target=card_df.iloc[:,-1] #y, Length: 284807

In [77]:
xtrain, xtest, ytrain, ytest = train_test_split(x_features,y_target, 
                                                test_size=0.3, random_state=20231023, stratify=y_target) 
#x데이터가 먼저오고 두번째인수는 y데이터와야함, 세번째 인수는 테스트 사이즈
#stratify 층화추출 원데이터의 클래스비율 유지하면서 나눠준다

In [78]:
ytrain.value_counts()

0    199020
1       344
Name: Class, dtype: int64

In [79]:
ytest.value_counts()

0    85295
1      148
Name: Class, dtype: int64

In [80]:
def get_preprocessed_df(df=None):
    df_copy = df.copy()
    df_copy.drop('Time', axis=1, inplace=True)
    return df_copy

In [81]:
df_copy=get_preprocessed_df(card_df)

In [82]:
df_copy

Unnamed: 0,V1,V2,V3,V4,V5,V6,V7,V8,V9,V10,...,V21,V22,V23,V24,V25,V26,V27,V28,Amount,Class
0,-1.359807,-0.072781,2.536347,1.378155,-0.338321,0.462388,0.239599,0.098698,0.363787,0.090794,...,-0.018307,0.277838,-0.110474,0.066928,0.128539,-0.189115,0.133558,-0.021053,149.62,0
1,1.191857,0.266151,0.166480,0.448154,0.060018,-0.082361,-0.078803,0.085102,-0.255425,-0.166974,...,-0.225775,-0.638672,0.101288,-0.339846,0.167170,0.125895,-0.008983,0.014724,2.69,0
2,-1.358354,-1.340163,1.773209,0.379780,-0.503198,1.800499,0.791461,0.247676,-1.514654,0.207643,...,0.247998,0.771679,0.909412,-0.689281,-0.327642,-0.139097,-0.055353,-0.059752,378.66,0
3,-0.966272,-0.185226,1.792993,-0.863291,-0.010309,1.247203,0.237609,0.377436,-1.387024,-0.054952,...,-0.108300,0.005274,-0.190321,-1.175575,0.647376,-0.221929,0.062723,0.061458,123.50,0
4,-1.158233,0.877737,1.548718,0.403034,-0.407193,0.095921,0.592941,-0.270533,0.817739,0.753074,...,-0.009431,0.798278,-0.137458,0.141267,-0.206010,0.502292,0.219422,0.215153,69.99,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
284802,-11.881118,10.071785,-9.834783,-2.066656,-5.364473,-2.606837,-4.918215,7.305334,1.914428,4.356170,...,0.213454,0.111864,1.014480,-0.509348,1.436807,0.250034,0.943651,0.823731,0.77,0
284803,-0.732789,-0.055080,2.035030,-0.738589,0.868229,1.058415,0.024330,0.294869,0.584800,-0.975926,...,0.214205,0.924384,0.012463,-1.016226,-0.606624,-0.395255,0.068472,-0.053527,24.79,0
284804,1.919565,-0.301254,-3.249640,-0.557828,2.630515,3.031260,-0.296827,0.708417,0.432454,-0.484782,...,0.232045,0.578229,-0.037501,0.640134,0.265745,-0.087371,0.004455,-0.026561,67.88,0
284805,-0.240440,0.530483,0.702510,0.689799,-0.377961,0.623708,-0.686180,0.679145,0.392087,-0.399126,...,0.265245,0.800049,-0.163298,0.123205,-0.569159,0.546668,0.108821,0.104533,10.00,0


In [83]:
def get_train_test_dataset(df=None):
    df_copy = get_preprocessed_df(df)
    X_features = df_copy.iloc[:, :-1]
    y_target = df_copy.iloc[:, -1]
    X_train, X_test, y_train, y_test = \
    train_test_split(X_features, y_target, test_size=0.3, random_state=0, stratify=y_target)
    return X_train, X_test, y_train, y_test

X_train, X_test, y_train, y_test = get_train_test_dataset(card_df)

In [84]:
print(y_train.value_counts()/y_train.shape[0] * 100)

0    99.827451
1     0.172549
Name: Class, dtype: float64


In [85]:
print(y_test.value_counts()/y_test.shape[0] * 100)

0    99.826785
1     0.173215
Name: Class, dtype: float64


In [86]:
from sklearn.linear_model import LogisticRegression

In [89]:
lr_clf = LogisticRegression()

In [90]:
lr_clf.fit(X_train, y_train)

STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(


In [93]:
lr_clf.predict(X_test)
pd.Series(lr_clf.predict(X_test)).value_counts()

0    85341
1      102
dtype: int64

In [95]:
lr_clf.predict_proba(X_test) #proba 확률로 출력해주는함수다
lr_clf.predict_proba(X_test)[:,1]

array([0.00134831, 0.00012345, 0.00019178, ..., 0.00024669, 0.00074666,
       0.00013475])