In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns

from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier, GradientBoostingClassifier
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.metrics import accuracy_score

In [2]:
data = pd.read_csv('titanic/train.csv')
data.head()

Unnamed: 0,PassengerId,Survived,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked
0,1,0,3,"Braund, Mr. Owen Harris",male,22.0,1,0,A/5 21171,7.25,,S
1,2,1,1,"Cumings, Mrs. John Bradley (Florence Briggs Th...",female,38.0,1,0,PC 17599,71.2833,C85,C
2,3,1,3,"Heikkinen, Miss. Laina",female,26.0,0,0,STON/O2. 3101282,7.925,,S
3,4,1,1,"Futrelle, Mrs. Jacques Heath (Lily May Peel)",female,35.0,1,0,113803,53.1,C123,S
4,5,0,3,"Allen, Mr. William Henry",male,35.0,0,0,373450,8.05,,S


In [3]:
data['Embarked'].fillna('S', inplace = True)
data['Fare'].fillna(0, inplace=True)
data['Fare'] = data['Fare'].map(lambda x : np.log(x) if x > 0 else 0)

In [4]:
data['Initial'] = data['Name'].str.extract('([A-Za-z]+)\.')
data['Initial'].replace(['Mlle','Mme','Ms','Dr','Major','Lady','Countess','Jonkheer','Col','Rev','Capt','Sir','Don', 'Dona'],['Miss','Miss','Miss','Mr','Mr','Mrs','Mrs','Other','Other','Other','Mr','Mr','Mr','Other'],inplace=True)
mapping = {
    "Mr":0,
    "Miss":1,
    "Mrs" : 1,
    "Master":2,
    "Other":3
}

data['Initial'] = data['Initial'].map(mapping)

In [5]:
mapping_sex = {
    'male' : 0,
    'female': 1
}

mapping_em = {
    'S' :0,
    'C' :1,
    'Q' :2
}


data['Sex'] = data['Sex'].map(mapping_sex)
data['Embarked'] = data['Embarked'].map(mapping_em)


data.drop(['PassengerId', "Ticket", "Cabin", "Name"], axis = 1, inplace = True)

In [6]:
data.groupby('Initial')['Age'].mean()

Initial
0    32.739609
1    27.834615
2     4.574167
3    45.888889
Name: Age, dtype: float64

In [7]:
data.loc[ (data['Age'].isnull()) & (data['Initial'] == 0), 'Age' ] = 32
data.loc[ (data['Age'].isnull()) & (data['Initial'] == 1), 'Age' ] = 28
data.loc[ (data['Age'].isnull()) & (data['Initial'] == 2), 'Age' ] = 5
data.loc[ (data['Age'].isnull()) & (data['Initial'] == 3), 'Age' ] = 45

In [8]:
data.head()

Unnamed: 0,Survived,Pclass,Sex,Age,SibSp,Parch,Fare,Embarked,Initial
0,0,3,0,22.0,1,0,1.981001,0,0
1,1,1,1,38.0,1,0,4.266662,1,1
2,1,3,1,26.0,0,0,2.070022,0,1
3,1,1,1,35.0,1,0,3.972177,0,1
4,0,3,0,35.0,0,0,2.085672,0,0


In [9]:
y = data['Survived']
X = data.drop('Survived', axis = 1)

In [10]:
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size = 0.2, random_state=0)

In [12]:
rf = RandomForestClassifier(random_state=0)
rf.fit(X_train, y_train)

In [13]:
pred = rf.predict(X_test)
print("정확도 :{0:.3f}".format(accuracy_score(y_test, pred)))

정확도 :0.832


In [16]:
gb = GradientBoostingClassifier(random_state=0)
gb.fit(X_train, y_train)

In [17]:
gb_param_grid = {
    'n_estimators' : [100, 200],
    'max_depth' : [6, 8, 10, 12],
    'min_samples_leaf' : [3, 5, 7, 10],
    'min_samples_split' : [2, 3, 5, 10]
}

In [18]:
gb_grid = GridSearchCV(gb, param_grid = gb_param_grid, scoring="accuracy", n_jobs= -1, verbose = 1)
gb_grid.fit(X_train, y_train)

Fitting 5 folds for each of 128 candidates, totalling 640 fits


In [21]:
gb_grid.best_score_

0.8272234807446074

In [22]:
gb_grid.best_params_

{'max_depth': 6,
 'min_samples_leaf': 10,
 'min_samples_split': 2,
 'n_estimators': 100}

In [None]:
#가장 좋은 파라미터들로 모델 생성
#타이타닉의 테스트 데이터를 입력
#결과를 제출, 점수, 등수, 코드를 함께 카페 제출

In [55]:
test = pd.read_csv('titanic/test.csv')
test.head()

Unnamed: 0,PassengerId,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked
0,892,3,"Kelly, Mr. James",male,34.5,0,0,330911,7.8292,,Q
1,893,3,"Wilkes, Mrs. James (Ellen Needs)",female,47.0,1,0,363272,7.0,,S
2,894,2,"Myles, Mr. Thomas Francis",male,62.0,0,0,240276,9.6875,,Q
3,895,3,"Wirz, Mr. Albert",male,27.0,0,0,315154,8.6625,,S
4,896,3,"Hirvonen, Mrs. Alexander (Helga E Lindqvist)",female,22.0,1,1,3101298,12.2875,,S


In [56]:
test['Embarked'].fillna('S', inplace = True)
test['Fare'].fillna(0, inplace=True)
test['Fare'] = data['Fare'].map(lambda x : np.log(x) if x > 0 else 0)

In [57]:
test['Initial'] = test['Name'].str.extract('([A-Za-z]+)\.')
test['Initial'].replace(['Mlle','Mme','Ms','Dr','Major','Lady','Countess','Jonkheer','Col','Rev','Capt','Sir','Don', 'Dona'],['Miss','Miss','Miss','Mr','Mr','Mrs','Mrs','Other','Other','Other','Mr','Mr','Mr','Other'],inplace=True)
mapping = {
    "Mr":0,
    "Miss":1,
    "Mrs" : 1,
    "Master":2,
    "Other":3
}

test['Initial'] = test['Initial'].map(mapping)

In [58]:
test['Sex'] = test['Sex'].map(mapping_sex)
test['Embarked'] = test['Embarked'].map(mapping_em)


test.drop(['PassengerId', "Ticket", "Cabin", "Name"], axis = 1, inplace = True)

In [59]:
test.loc[ (test['Age'].isnull()) & (test['Initial'] == 0), 'Age' ] = 32
test.loc[ (test['Age'].isnull()) & (test['Initial'] == 1), 'Age' ] = 30
test.loc[ (test['Age'].isnull()) & (test['Initial'] == 2), 'Age' ] = 7
test.loc[ (test['Age'].isnull()) & (test['Initial'] == 3), 'Age' ] = 42

In [62]:
mysubmission=pd.read_csv("titanic/gender_submission.csv")
mysubmission

Unnamed: 0,PassengerId,Survived
0,892,0
1,893,1
2,894,0
3,895,0
4,896,1
...,...,...
413,1305,0
414,1306,1
415,1307,0
416,1308,0


In [63]:
pred1=gb_grid.predict(test)
mysubmission["Survived"] = pred1

In [64]:
mysubmission.to_csv("mysubmission_gb.csv",index=False)
pred2=rf.predict(test)

In [65]:
mysubmission["Survived"] = pred2
mysubmission.to_csv("mysubmission_rf.csv",index=False)

In [None]:
"""
데이터 불균형 : 클래스가 어느 한 쪽으로만 일방적으로 존재
해결방법
1) 오버 샘플링 : 클래스가 적은 쪽의 데이터를 랜덤 복원 샘플링하여 복사 붙여넣기를 반복하여 두 클래스의 
                 비율을 비슷하게 함
2) 언더 샘플링 : 클래스가 많은 쪽의 데이터를 랜덤 샘플링하여 삭제하기를 반복하여 두 쿨래스의 비율이
                 비슷하게 함
3) 오버 & 언더 샘플링
ex) Y : 1000건 vs N : 10건 => 1010 / 2 = 505, Y는 505건이 될때까지 언더샘플링 수행, 
                      N은 505건이 될 때까지 오버샘플링
4) SMOTE 알고리즘 #데이터를 늘리는거
기존 데이터를 적절하게 혼합하여 새로운 데이터를 생성하는 방법


"""

In [1]:
import pandas as pd
import numpy as np 
import matplotlib.pyplot as plt

In [2]:
card_df = pd.read_csv('creditcard.csv')
card_df.head(3)

Unnamed: 0,Time,V1,V2,V3,V4,V5,V6,V7,V8,V9,...,V21,V22,V23,V24,V25,V26,V27,V28,Amount,Class
0,0.0,-1.359807,-0.072781,2.536347,1.378155,-0.338321,0.462388,0.239599,0.098698,0.363787,...,-0.018307,0.277838,-0.110474,0.066928,0.128539,-0.189115,0.133558,-0.021053,149.62,0
1,0.0,1.191857,0.266151,0.16648,0.448154,0.060018,-0.082361,-0.078803,0.085102,-0.255425,...,-0.225775,-0.638672,0.101288,-0.339846,0.16717,0.125895,-0.008983,0.014724,2.69,0
2,1.0,-1.358354,-1.340163,1.773209,0.37978,-0.503198,1.800499,0.791461,0.247676,-1.514654,...,0.247998,0.771679,0.909412,-0.689281,-0.327642,-0.139097,-0.055353,-0.059752,378.66,0


In [3]:
from sklearn.model_selection import train_test_split 

In [5]:
x_features=card_df.iloc[:,:-1] #x, 284807 rows × 30 columns
y_target=card_df.iloc[:,-1] #y, Length: 284807

In [6]:
xtrain, xtest, ytrain, ytest = train_test_split(x_features,y_target, 
                                                test_size=0.3, random_state=20231023, stratify=y_target) 
#x데이터가 먼저오고 두번째인수는 y데이터와야함, 세번째 인수는 테스트 사이즈
#stratify 층화추출 원데이터의 클래스비율 유지하면서 나눠준다

In [9]:
ytrain.value_counts()

0    199020
1       344
Name: Class, dtype: int64

In [10]:
ytest.value_counts()

0    85295
1      148
Name: Class, dtype: int64

In [11]:
def get_preprocessed_df(df=None):
    df_copy = df.copy()
    df_copy.drop('Time', axis=1, inplace=True)
    return df_copy

In [12]:
df_copy=get_preprocessed_df(card_df)

In [13]:
df_copy

Unnamed: 0,V1,V2,V3,V4,V5,V6,V7,V8,V9,V10,...,V21,V22,V23,V24,V25,V26,V27,V28,Amount,Class
0,-1.359807,-0.072781,2.536347,1.378155,-0.338321,0.462388,0.239599,0.098698,0.363787,0.090794,...,-0.018307,0.277838,-0.110474,0.066928,0.128539,-0.189115,0.133558,-0.021053,149.62,0
1,1.191857,0.266151,0.166480,0.448154,0.060018,-0.082361,-0.078803,0.085102,-0.255425,-0.166974,...,-0.225775,-0.638672,0.101288,-0.339846,0.167170,0.125895,-0.008983,0.014724,2.69,0
2,-1.358354,-1.340163,1.773209,0.379780,-0.503198,1.800499,0.791461,0.247676,-1.514654,0.207643,...,0.247998,0.771679,0.909412,-0.689281,-0.327642,-0.139097,-0.055353,-0.059752,378.66,0
3,-0.966272,-0.185226,1.792993,-0.863291,-0.010309,1.247203,0.237609,0.377436,-1.387024,-0.054952,...,-0.108300,0.005274,-0.190321,-1.175575,0.647376,-0.221929,0.062723,0.061458,123.50,0
4,-1.158233,0.877737,1.548718,0.403034,-0.407193,0.095921,0.592941,-0.270533,0.817739,0.753074,...,-0.009431,0.798278,-0.137458,0.141267,-0.206010,0.502292,0.219422,0.215153,69.99,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
284802,-11.881118,10.071785,-9.834783,-2.066656,-5.364473,-2.606837,-4.918215,7.305334,1.914428,4.356170,...,0.213454,0.111864,1.014480,-0.509348,1.436807,0.250034,0.943651,0.823731,0.77,0
284803,-0.732789,-0.055080,2.035030,-0.738589,0.868229,1.058415,0.024330,0.294869,0.584800,-0.975926,...,0.214205,0.924384,0.012463,-1.016226,-0.606624,-0.395255,0.068472,-0.053527,24.79,0
284804,1.919565,-0.301254,-3.249640,-0.557828,2.630515,3.031260,-0.296827,0.708417,0.432454,-0.484782,...,0.232045,0.578229,-0.037501,0.640134,0.265745,-0.087371,0.004455,-0.026561,67.88,0
284805,-0.240440,0.530483,0.702510,0.689799,-0.377961,0.623708,-0.686180,0.679145,0.392087,-0.399126,...,0.265245,0.800049,-0.163298,0.123205,-0.569159,0.546668,0.108821,0.104533,10.00,0


In [15]:
def get_train_test_dataset(df=None):
    df_copy = get_preprocessed_df(df)
    X_features = df_copy.iloc[:, :-1]
    y_target = df_copy.iloc[:, -1]
    X_train, X_test, y_train, y_test = \
    train_test_split(X_features, y_target, test_size=0.3, random_state=0, stratify=y_target)
    return X_train, X_test, y_train, y_test

X_train, X_test, y_train, y_test = get_train_test_dataset(card_df)

In [24]:
X_train.head()
X_test.head()
y_train.head()
y_test.head()
X_train.shape #(199364, 29)
X_test.shape# (85443, 29)

(85443, 29)

In [16]:
print(y_train.value_counts()/y_train.shape[0] * 100)

0    99.827451
1     0.172549
Name: Class, dtype: float64


In [17]:
print(y_test.value_counts()/y_test.shape[0] * 100)

0    99.826785
1     0.173215
Name: Class, dtype: float64


In [25]:
from sklearn.linear_model import LogisticRegression

In [26]:
lr_clf = LogisticRegression()

In [27]:
lr_clf.fit(X_train, y_train)

STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(


In [28]:
lr_clf.predict(X_test)
pd.Series(lr_clf.predict(X_test)).value_counts()

0    85341
1      102
dtype: int64

In [30]:
lr_clf.predict_proba(X_test) #proba 확률로 출력해주는함수다
lr_clf.predict_proba(X_test)

array([[9.98651690e-01, 1.34831002e-03],
       [9.99876546e-01, 1.23454097e-04],
       [9.99808217e-01, 1.91783045e-04],
       ...,
       [9.99753313e-01, 2.46686772e-04],
       [9.99253336e-01, 7.46664288e-04],
       [9.99865249e-01, 1.34751489e-04]])

In [31]:
pip install lightgbm

Collecting lightgbm
  Obtaining dependency information for lightgbm from https://files.pythonhosted.org/packages/b3/f8/ee33e36194eb03a76eccf3adac3fba51f0e56fbd20609bb531659d48d3cb/lightgbm-4.1.0-py3-none-win_amd64.whl.metadata
  Downloading lightgbm-4.1.0-py3-none-win_amd64.whl.metadata (19 kB)
Downloading lightgbm-4.1.0-py3-none-win_amd64.whl (1.3 MB)
   ---------------------------------------- 0.0/1.3 MB ? eta -:--:--
   ------------------- -------------------- 0.7/1.3 MB 20.8 MB/s eta 0:00:01
   ---------------------------------------  1.3/1.3 MB 27.7 MB/s eta 0:00:01
   ---------------------------------------- 1.3/1.3 MB 16.7 MB/s eta 0:00:00
Installing collected packages: lightgbm
Successfully installed lightgbm-4.1.0
Note: you may need to restart the kernel to use updated packages.


In [33]:
from lightgbm import  LGBMClassifier

In [34]:
lgbm_clf=LGBMClassifier(n_estimators=1000, num_leaves=64, boost_from_average=False)
#boost_from_average 는 데이터가 불균형 분포되어 있을때 False, 균형일경우 True

In [36]:
lgbm_clf.fit(X_train, y_train)

[LightGBM] [Info] Number of positive: 344, number of negative: 199020
[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.008553 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 7395
[LightGBM] [Info] Number of data points in the train set: 199364, number of used features: 29












In [38]:
pred=lgbm_clf.predict(X_test) #test데이터에대한 예측결과
pred

array([0, 0, 0, ..., 0, 0, 0], dtype=int64)

In [39]:
lgbm_clf.predict_proba(X_test)

array([[9.99999997e-01, 3.49830240e-09],
       [1.00000000e+00, 1.08517297e-10],
       [1.00000000e+00, 2.23232317e-10],
       ...,
       [9.99999996e-01, 3.66393546e-09],
       [9.99999999e-01, 6.63996306e-10],
       [1.00000000e+00, 1.08138237e-10]])

In [40]:
X_train.head()

Unnamed: 0,V1,V2,V3,V4,V5,V6,V7,V8,V9,V10,...,V20,V21,V22,V23,V24,V25,V26,V27,V28,Amount
211605,-8.367621,7.402969,-5.114191,-2.966792,-0.985904,-1.660018,0.397816,1.00825,5.290976,9.315735,...,3.589299,-0.557927,0.349087,0.301734,0.66233,1.145939,-0.012273,1.513736,0.669504,0.69
231628,-1.290662,0.764452,-0.169233,-1.688528,1.984718,-0.71721,1.667635,-0.228295,-0.959071,-0.679429,...,0.05762,0.138615,0.390374,-0.678774,0.292254,1.425865,0.265341,0.051405,0.103879,55.47
110643,-0.60686,0.233923,2.633361,-0.155762,-0.32885,-0.070593,0.173023,0.113394,0.680147,-0.686651,...,-0.129287,0.007393,0.265045,-0.040615,0.406088,-0.498823,0.121851,-0.04684,-0.087259,11.5
11478,1.374168,-0.330389,0.812971,-0.183112,-1.074771,-0.933788,-0.544238,-0.348237,0.614925,0.080408,...,0.078163,-0.134742,0.016793,0.00261,0.714527,0.536175,-0.307487,-0.003555,0.013024,10.0
243724,-2.146505,-7.603113,-4.928224,1.449173,-2.083871,-0.984759,3.629816,-1.302973,-1.434148,-0.021909,...,4.009181,1.274986,-0.85767,-2.109682,0.169338,-0.322888,0.777822,-0.552258,0.304777,2290.05


In [41]:
X_train.describe() 

Unnamed: 0,V1,V2,V3,V4,V5,V6,V7,V8,V9,V10,...,V20,V21,V22,V23,V24,V25,V26,V27,V28,Amount
count,199364.0,199364.0,199364.0,199364.0,199364.0,199364.0,199364.0,199364.0,199364.0,199364.0,...,199364.0,199364.0,199364.0,199364.0,199364.0,199364.0,199364.0,199364.0,199364.0,199364.0
mean,0.000386,-0.000637,0.000587,-0.001239,0.000596,0.000806,-0.000127,0.0019,0.000387,-0.000203,...,0.000546,0.001181,0.00077,-0.000907,0.000708,0.00016,-0.001173,-0.000476,0.000252,88.286313
std,1.959976,1.658734,1.512912,1.413217,1.362707,1.322962,1.216585,1.191216,1.098032,1.086637,...,0.774205,0.736703,0.724805,0.63009,0.605543,0.52083,0.482453,0.40137,0.33083,248.033917
min,-56.40751,-72.715728,-32.965346,-5.683171,-35.18212,-26.160506,-43.557242,-73.216718,-13.434066,-24.588262,...,-28.009635,-34.830382,-10.933144,-44.807735,-2.836627,-10.295397,-2.604551,-22.565679,-11.710896,0.0
25%,-0.92149,-0.598848,-0.889918,-0.849806,-0.690631,-0.767513,-0.556268,-0.207917,-0.642713,-0.535148,...,-0.211459,-0.227817,-0.540206,-0.162056,-0.353481,-0.31718,-0.328089,-0.070961,-0.0532,5.64
50%,0.015893,0.065727,0.180635,-0.019842,-0.054758,-0.273142,0.040109,0.023008,-0.050546,-0.091521,...,-0.062514,-0.02899,0.007585,-0.011301,0.041293,0.016244,-0.053516,0.001207,0.011043,22.0
75%,1.316633,0.804333,1.028314,0.744065,0.609794,0.400874,0.570132,0.32772,0.597585,0.456096,...,0.133886,0.186863,0.529286,0.147035,0.440205,0.351141,0.240123,0.090888,0.078216,77.0
max,2.45493,22.057729,9.382558,16.875344,34.801666,21.550496,36.877368,20.007208,15.594995,23.745136,...,39.420904,27.202839,10.50309,22.528412,4.022866,6.07085,3.463246,12.152401,33.847808,19656.53


In [42]:
import numpy as np

In [44]:
#이상치 제거
q25=np.percentile(X_train['V1'].values, 25)# 25%센트지점 값
q75=np.percentile(X_train['V1'].values, 75)
iqr=q75-q25
iqr15= iqr*1.5
#표준화

In [47]:
lowest_val = q25-iqr15 #하한 바운더리
highest_val = q75+iqr15 #상한 바운더리

In [53]:
X_train['V1'][(X_train['V1'] < lowest_val) | (X_train['V1'] > highest_val)].index #이상치


Int64Index([211605, 151631,  76190, 207772, 220170, 109519, 166031, 283347,
             18527, 259701,
            ...
            222507,  72745,  20946,  77564, 169338, 125442, 264145, 216442,
             20260,  19760],
           dtype='int64', length=4928)

In [None]:
#타이타닉

In [None]:
#SMOTE 오버샘플링

In [54]:
pip install -U imbalanced-learn

Collecting imbalanced-learn
  Obtaining dependency information for imbalanced-learn from https://files.pythonhosted.org/packages/a3/9e/fbe60a768502af54563dcb59ca7856f5a8833b3ad5ada658922e1ab09b7f/imbalanced_learn-0.11.0-py3-none-any.whl.metadata
  Downloading imbalanced_learn-0.11.0-py3-none-any.whl.metadata (8.3 kB)
Downloading imbalanced_learn-0.11.0-py3-none-any.whl (235 kB)
   ---------------------------------------- 0.0/235.6 kB ? eta -:--:--
   --------------------------------------  235.5/235.6 kB 15.0 MB/s eta 0:00:01
   ---------------------------------------- 235.6/235.6 kB 7.3 MB/s eta 0:00:00
Installing collected packages: imbalanced-learn
  Attempting uninstall: imbalanced-learn
    Found existing installation: imbalanced-learn 0.10.1
    Uninstalling imbalanced-learn-0.10.1:
      Successfully uninstalled imbalanced-learn-0.10.1
Successfully installed imbalanced-learn-0.11.0
Note: you may need to restart the kernel to use updated packages.


In [55]:
from imblearn.over_sampling import SMOTE

In [56]:
smote=SMOTE(random_state=0)

In [58]:
X_train_over, y_train_over = smote.fit_resample(X_train, y_train)

In [59]:
X_train.shape

(199364, 29)

In [60]:
X_train_over.shape

(398040, 29)

In [62]:
pd.Series(y_train_over).value_counts()

0    199020
1    199020
Name: Class, dtype: int64

In [63]:
lr_clf=LogisticRegression()

In [64]:
lr_clf.fit(X_train_over, y_train_over)

STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(


In [66]:
lr_clf.predict(X_test)

array([0, 0, 0, ..., 0, 0, 0], dtype=int64)

In [67]:
import numpy as np

from sklearn.neighbors import KNeighborsClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.ensemble import AdaBoostClassifier
from sklearn.tree import DecisionTreeClassifier
from sklearn.linear_model import LogisticRegression

from sklearn.datasets import load_breast_cancer
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score

In [74]:
cancer_data=load_breast_cancer()
cancer_data
X_data=cancer_data.data
y_label=cancer_data.target

In [76]:
np.shape(X_data) #(569,30)

(569, 30)

In [78]:
X_train, X_test, y_train, y_test=train_test_split(X_data, y_label,
                                                  test_size=0.2,
                                                  random_state=20231024)

In [82]:
# 스태킹(stack)
# 기본모델(knn, rf, dt, adaboost)
knn_clf=KNeighborsClassifier(n_neighbors=5)
rf_clf=RandomForestClassifier(n_estimators=100, random_state=42)
dt_clf=DecisionTreeClassifier()
ada_clf=AdaBoostClassifier(n_estimators=100)

In [83]:
knn_clf.fit(X_train, y_train)
rf_clf.fit(X_train, y_train)
dt_clf.fit(X_train, y_train)
ada_clf.fit(X_train, y_train)

In [84]:
knn_pred=knn_clf.predict(X_test)
rf_pred=rf_clf.predict(X_test)
dt_pred=dt_clf.predict(X_test)
ada_pred=ada_clf.predict(X_test)

In [85]:
print(accuracy_score(y_test, knn_pred))
print(accuracy_score(y_test, rf_pred))
print(accuracy_score(y_test, dt_pred))
print(accuracy_score(y_test, ada_pred))


0.9298245614035088
0.9473684210526315
0.9385964912280702
0.9736842105263158


In [88]:
knn_pred.shape #(114,)

(114,)

In [90]:
pred=np.array([knn_pred, rf_pred, dt_pred, ada_pred])#(114,) => (4,114)
pred.shape

(4, 114)

In [93]:
pred=np.transpose(pred) # (4,114) 행렬을 (114,4)로 바꿔주기위해 transpose
pred #114,4

array([[1, 1, 1, 1],
       [0, 0, 0, 0],
       [1, 1, 1, 1],
       [1, 1, 1, 1],
       [0, 0, 0, 0],
       [0, 0, 0, 0],
       [0, 0, 0, 0],
       [1, 1, 1, 1],
       [0, 0, 0, 0],
       [1, 1, 1, 1],
       [1, 1, 1, 1],
       [0, 0, 0, 0],
       [0, 0, 0, 0],
       [1, 1, 1, 1],
       [1, 1, 1, 1],
       [1, 1, 1, 1],
       [0, 0, 0, 0],
       [1, 1, 1, 1],
       [1, 1, 1, 1],
       [1, 1, 1, 1],
       [1, 1, 1, 1],
       [0, 0, 0, 0],
       [0, 0, 0, 0],
       [0, 0, 0, 0],
       [1, 1, 1, 1],
       [0, 0, 0, 0],
       [1, 1, 1, 1],
       [1, 1, 1, 1],
       [0, 0, 0, 0],
       [0, 0, 0, 0],
       [1, 1, 1, 1],
       [0, 0, 0, 0],
       [0, 0, 0, 0],
       [0, 0, 0, 0],
       [0, 0, 0, 0],
       [0, 1, 1, 1],
       [1, 1, 1, 1],
       [1, 1, 1, 1],
       [0, 1, 1, 1],
       [0, 0, 0, 0],
       [1, 1, 1, 1],
       [1, 1, 1, 1],
       [1, 1, 1, 1],
       [1, 1, 1, 1],
       [1, 1, 1, 1],
       [1, 1, 1, 1],
       [1, 1, 1, 1],
       [1, 1,

In [94]:
#최종 스태킹 모델 : 로지스틱 회귀

In [96]:
lr_final=LogisticRegression()

In [97]:
lr_final.fit(pred, y_test)

In [98]:
final = lr_final.predict(pred)
final

array([1, 0, 1, 1, 0, 0, 0, 1, 0, 1, 1, 0, 0, 1, 1, 1, 0, 1, 1, 1, 1, 0,
       0, 0, 1, 0, 1, 1, 0, 0, 1, 0, 0, 0, 0, 1, 1, 1, 1, 0, 1, 1, 1, 1,
       1, 1, 1, 1, 0, 1, 1, 0, 1, 0, 1, 1, 1, 0, 1, 1, 0, 1, 1, 1, 1, 1,
       1, 0, 1, 0, 1, 0, 0, 1, 0, 0, 1, 0, 1, 1, 1, 1, 0, 1, 1, 0, 1, 1,
       0, 1, 1, 0, 0, 0, 0, 1, 1, 0, 0, 1, 1, 0, 1, 1, 0, 1, 0, 1, 1, 0,
       1, 1, 0, 1])

In [99]:
accuracy_score(final, y_test)

0.9649122807017544

In [None]:
"""
*서로다른 알고리즘으로 형성 : 보팅 , 하나의 알고리즘으로 형성 : 배깅
*부스팅(정형 데이터에 적합) : 이전 분류기의 학습 결과 바탕으로 다음 분류기의 학습 데이터의 샘플 가중치 조정해 학습하는 방법
-XGBoost가 GradientBoost보다 오버피팅 방지에 적합하고, 속도가 훨씬 빠르다
-XGBoost는 GridSearchCv할시에는 파라미터가 많다보니 오래걸린다.
-XGBoost는 균형 잡힌 트리 이용(양쪽 트리에 대한 깊이 균등하게 = Level-wise tree growth) -> 오버피팅 줄이기 위해 -> 속도는 느려짐
-LIghtGradiendBoost는 GridSearchCv할시에 속도는 빠른데 자료가 많으면 과대적합에 걸린다 -> 10000개 데이터 미만 있어야 오키
-LightGBM은 Loss 가 가장 큰 노드를 선택하여 Subtree를 구성한다 = Leaf-wise tree growth ->속도 빠름
*스태킹
각각의 모델에다가 test데이터를 집어넣어 예측을 하는데 이러한 데이터를 다시 training 데이터로 사용
    -> Svm      -> 예측값
data-> RF       -> 예측값 => training data로 사용
    -> Light GBM-> 예측값
Cv기반 stacking = 각각의 모델들이 교차검증 기반으로 학습데이터 생성
*데이터 불균형 : 클래스가 어느 한쪽으로만 일방적으로 존재
해결방법
1) 오버 샘플링 : 클래스가 적은 쪽의 데이터를 랜덤 복원 샘플링하여 복사 붙여넣기를 반복하여 두 클래스의 비율을 비슷하게함
2) 언더 샘플링 : 클래스가 많은 쪽의 데이터를 랜덤 샘플링하여 삭제하기를 반복하여 두 클래스의 비율을 비슷하게 함
3) 오버 & 언더 샘플링
ex) Y:1000건 vs N:10건 => 1010/2 = 505, Y는 505건이 될때까지 언더 샘플링, N은 505건이 될때까지 오버 샘플링한다.
4) SMOTE 알고리즘
기존 데이터를 적절하게 혼합하여 새로운 데이터를 생성하는 방법
ROC커브 이용해 모델 정확도 확인
ROC커브 : 민감도(SE)와 1-특이도(SP)로 그린 그림
 => 1-특이도(SP)=1- TN/TN+FP = FP/TN+FP = 실제 음성인 것과 양성으로 잘못 예측한 것의 비율 = 위양성율 (FPR)
AUC커브는 Roc커브의 아래쪽 면적이고 이 값은 클수록 좋은 모델이다
민감도(Sensitivity=SE) = TP/TP+FN = 맞춘 양성/실제 양성 = FN이 작을수록 정확성이 증가, 민감도는 클수록 좋다
특이도(Specificity=SP) = TN/TN+FP = 맞춘 음성/실제 음성 = FP이 작을수록 좋고, 특이도가 클수록 좋은 모델
*Linear Regression(선형회귀) = 데이터들의 분포가 직선형태
- cost function값이 작다 = 직선과 점사이의 거리가 가깝다
- cost가 작을수록 좋은 모델이라 할수 있음
- h(x) - y = 예측값 -실제값 = ERROR
- 우리가 찾는값 = cost를 작게하는 h(x)값 -> W,b를 찾는다
- W랑 b를 조금씩 바꿔가면서 cost값을 줄인다
- cost가 최솟값을 갖았다 판단되면 중단
*sigmoid 함수 : 모든 값을 0~1사이로 나타낸다 -> 0.5보다 크면 1, 작으면 0, 기존의 cost 함수 사용 못해서 log함수 그래프로 사용
*Softmax Cross-Entropy : 다중분류할경우 사용
- softmax function : 각각의 분류기로부터 값을 전달받아 확률로 변환해주는 함수 -> 가장 높은 확률을 갖는 값을 정답으로 예측
                     이후 가장 높은 값의 확률을 w,b를 조정해 1로 나머지는 0으로 만듬
linear regression -> sigmoid이용한 binaryclassification(logits) ->  softmax
*Association Rule Mining(연관분석)
Itemset ; 항목집합 -> 하나이상의 아이템들의 집합
Support count : 항목집합의 발생 빈도수 => 항목집합의 아이템이 모두 다있는 거래가 몇건인지
Support : 항목집합에 대한 카운트(Support count)를 전체 거래에 대해 비중으로 나타낸것
Frequent itemset: 빈발항목 집합 => 최소 지지 인계값(mnsup)이상에 해당하는 항목집합
                  ex) mnsup를 2로 설정하면 support count가 2이상일경우 빈발항목 집합이라 한다
Association Rule(연관규칙) : x라고 하는 항목집합에 대한 구매가 발생됐을때 y라는 항목에 대해 구매가 이뤄진다
Support(지지도) : X,Y 라는 두개의 아이템 집합을 모두 포함하는 거래가 전체중에 얼마나 있나 비율
confidence(신뢰도) : 조건부 확률 -> x상품이 구매됐을때 y가 구매될 확률이 얼마나 되나? -> X라고 하는 항목집합의 거래 수가 분모로 온다
                   -> x,y둘다 포함하는 집합 수/ x만 포함하는 집합 수
연관분석 필요성 : 대용량 데이터 베이스에서 기존에는 발견할수 없는 아이템간의 관계를 발견할수 있다.

ex) minsup=0.1 =>최소 마트에온 고객 10명중 1명은 꼭 사는것? 잘 설정해줘야한다.
Frequent Itemset Generation: support >= minsup

Rule Generation : 빈발아이템셋으로부터 룰이 나온다

지지도를 만족하면 빈발 항목 집합(frequent)
지지도를 만족하지 못하면 비빈발 항목 집합(Infrequent)
"""

In [None]:
"""
temset = 항목집합 ex) 1itemset = {milk}, 3itemset = {milk, bred, diaper}
Supprt count = itemset의 빈도수
Support = 항목집합의 빈도수를 전체 거래수로 나눈것
Frequent itemset = 빈발 항목집합. minimum support 최소 지지도(threshold(mnsup)) 이상에
해당하는 항목 집합을 빈발 항목집합이라고 한다.

지지도(support) : P(A∩B)
신뢰도(confidence) : P(A∩B)/P(A)
향상도(lift) : P(A∩B)/P(A)P(B)

향상도 값에 따른 관계와 의미
1이다 : 두 품목간에 연괏넝이 없는 서로 독립적이 관계
1보다작다: 두 품목이 서로 음의 상관관계(품목 A와 품목 B는 연관성이 없음)
1보다크다: 두 품목이 서로 양의 상관과계
          (품목 B를 구매할 확률보다 품목 A를 구매하 후에 품목 B를 구매할 확률이 더 높다,
           즉, 품목 A와 B의 연관성이 매우 높음)
연관분석의 필요성 : 대용량 데이터 베이스에서 기존에는 발견할 수 없는 아이템간의 관계를 발견할 수 있다.
"""

In [None]:
#연관규칙

In [103]:
pip install mlxtend

Collecting mlxtend
  Obtaining dependency information for mlxtend from https://files.pythonhosted.org/packages/73/da/d5d77a9a7a135c948dbf8d3b873655b105a152d69e590150c83d23c3d070/mlxtend-0.23.0-py3-none-any.whl.metadata
  Downloading mlxtend-0.23.0-py3-none-any.whl.metadata (7.3 kB)
Downloading mlxtend-0.23.0-py3-none-any.whl (1.4 MB)
   ---------------------------------------- 0.0/1.4 MB ? eta -:--:--
   ------------- -------------------------- 0.5/1.4 MB 15.9 MB/s eta 0:00:01
   ------------------------------------- -- 1.4/1.4 MB 21.6 MB/s eta 0:00:01
   ---------------------------------------  1.4/1.4 MB 18.4 MB/s eta 0:00:01
   ---------------------------------------- 1.4/1.4 MB 13.2 MB/s eta 0:00:00
Installing collected packages: mlxtend
Successfully installed mlxtend-0.23.0
Note: you may need to restart the kernel to use updated packages.


In [104]:
import pandas as pd
from mlxtend.preprocessing import TransactionEncoder
from mlxtend.frequent_patterns import apriori, association_rules

In [105]:
dataset=[['사과','치즈','생수'],
['생수','호두','치즈','고등어'],
['수박','사과','생수'],
['생수','호두','치즈','옥수수']]

In [106]:
te = TransactionEncoder()

In [107]:
te_ary=te.fit(dataset).transform(dataset)

In [111]:
te.columns_
df=pd.DataFrame(te_ary, columns=te.columns_)
df

Unnamed: 0,고등어,사과,생수,수박,옥수수,치즈,호두
0,False,True,True,False,False,True,False
1,True,False,True,False,False,True,True
2,False,True,True,True,False,False,False
3,False,False,True,False,True,True,True


In [118]:
type(apriori(df, min_support=0.5, use_colnames=True))

pandas.core.frame.DataFrame

In [116]:
freq_itemsets=apriori(df, min_support=0.5, use_colnames=True)

In [117]:
freq_itemsets

Unnamed: 0,support,itemsets
0,0.5,(사과)
1,1.0,(생수)
2,0.75,(치즈)
3,0.5,(호두)
4,0.5,"(사과, 생수)"
5,0.75,"(치즈, 생수)"
6,0.5,"(생수, 호두)"
7,0.5,"(치즈, 호두)"
8,0.5,"(치즈, 생수, 호두)"


In [121]:
res=association_rules(freq_itemsets, metric='lift')
res[res['lift']>1]

Unnamed: 0,antecedents,consequents,antecedent support,consequent support,support,confidence,lift,leverage,conviction,zhangs_metric
6,(치즈),(호두),0.75,0.5,0.5,0.666667,1.333333,0.125,1.5,1.0
7,(호두),(치즈),0.5,0.75,0.5,1.0,1.333333,0.125,inf,0.5
8,"(치즈, 생수)",(호두),0.75,0.5,0.5,0.666667,1.333333,0.125,1.5,1.0
10,"(생수, 호두)",(치즈),0.5,0.75,0.5,1.0,1.333333,0.125,inf,0.5
11,(치즈),"(생수, 호두)",0.75,0.5,0.5,0.666667,1.333333,0.125,1.5,1.0
13,(호두),"(치즈, 생수)",0.5,0.75,0.5,1.0,1.333333,0.125,inf,0.5


In [None]:
pip install mlxtend


In [None]:
import pandas as pd
import numpy as np
from mlxtend.preprocessing import TransactionEncoder
from mlxtend.frequent_patterns import apriori, association_rules

In [122]:
dataset = [['Milk', 'Onion', 'Nutmeg', 'Eggs', 'Yogurt'],
           ['Onion', 'Nutmeg', 'Eggs', 'Yogurt'],
           ['Milk', 'Apple', 'Eggs'],
           ['Milk', 'Unicorn', 'Corn', 'Yogurt'],
           ['Corn', 'Onion', 'Onion', 'Ice cream', 'Eggs']]

In [125]:
"""
마트의 데이터분석가
- 번들, 유통, 대체 등
"""

'\n마트의 데이터분석가\n- 번들, 유통, 대체 등\n'

In [126]:
te = TransactionEncoder() #이중리스트를 바꿔주는것

In [131]:
te_ary=te.fit(dataset).transform(dataset)
te_ary

array([[False, False,  True, False,  True,  True,  True, False,  True],
       [False, False,  True, False, False,  True,  True, False,  True],
       [ True, False,  True, False,  True, False, False, False, False],
       [False,  True, False, False,  True, False, False,  True,  True],
       [False,  True,  True,  True, False, False,  True, False, False]])

In [128]:
te.columns_
df=pd.DataFrame(te_ary, columns=te.columns_)
df

Unnamed: 0,Apple,Corn,Eggs,Ice cream,Milk,Nutmeg,Onion,Unicorn,Yogurt
0,False,False,True,False,True,True,True,False,True
1,False,False,True,False,False,True,True,False,True
2,True,False,True,False,True,False,False,False,False
3,False,True,False,False,True,False,False,True,True
4,False,True,True,True,False,False,True,False,False


In [136]:
freq_itemsets=apriori(df, min_support=0.4, use_colnames=True)
freq_itemsets

Unnamed: 0,support,itemsets
0,0.4,(Corn)
1,0.8,(Eggs)
2,0.6,(Milk)
3,0.4,(Nutmeg)
4,0.6,(Onion)
5,0.6,(Yogurt)
6,0.4,"(Eggs, Milk)"
7,0.4,"(Eggs, Nutmeg)"
8,0.6,"(Eggs, Onion)"
9,0.4,"(Eggs, Yogurt)"


In [154]:
res=association_rules(freq_itemsets, metric='lift')
res[res['lift']>2.0]

Unnamed: 0,antecedents,consequents,antecedent support,consequent support,support,confidence,lift,leverage,conviction,zhangs_metric
23,"(Eggs, Yogurt)",(Nutmeg),0.4,0.4,0.4,1.0,2.5,0.24,inf,1.0
26,(Nutmeg),"(Eggs, Yogurt)",0.4,0.4,0.4,1.0,2.5,0.24,inf,1.0
35,"(Onion, Yogurt)",(Nutmeg),0.4,0.4,0.4,1.0,2.5,0.24,inf,1.0
38,(Nutmeg),"(Onion, Yogurt)",0.4,0.4,0.4,1.0,2.5,0.24,inf,1.0
41,"(Eggs, Onion, Yogurt)",(Nutmeg),0.4,0.4,0.4,1.0,2.5,0.24,inf,1.0
45,"(Eggs, Nutmeg)","(Onion, Yogurt)",0.4,0.4,0.4,1.0,2.5,0.24,inf,1.0
46,"(Eggs, Yogurt)","(Onion, Nutmeg)",0.4,0.4,0.4,1.0,2.5,0.24,inf,1.0
47,"(Onion, Nutmeg)","(Eggs, Yogurt)",0.4,0.4,0.4,1.0,2.5,0.24,inf,1.0
48,"(Onion, Yogurt)","(Eggs, Nutmeg)",0.4,0.4,0.4,1.0,2.5,0.24,inf,1.0
52,(Nutmeg),"(Eggs, Onion, Yogurt)",0.4,0.4,0.4,1.0,2.5,0.24,inf,1.0


In [None]:
"""
높은 향상도를 보여주는 품목
달걀,요거트 <-> 넛맥 2.5
양파,요거트 <-> 넛맥 2.5
달걀,양파,요거트 <-> 넛맥 2.5
달걀,넛맥 <-> 양파,요거트 2.5
달걀,요거트 <-> 양파,넛맥 2.5
"""
"""
품목별 지지도
달걀 0.8
넛맥 0.4
양파 0.6
우유 0.6
요거트 0.6

위 값으로 보았을때 넛맥은 다른 상품과 같이 팔았을때 잘 팔리는것을 볼 수 있다.

"""