In [None]:
!pip install imbalanced-learn

import numpy as np
import pandas as pd
import sklearn
from sklearn import metrics
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier
from sklearn.ensemble import AdaBoostClassifier
from sklearn.model_selection import train_test_split
from sklearn.linear_model import Ridge, Lasso, ElasticNet
from sklearn.metrics import confusion_matrix
from sklearn.metrics import accuracy_score, roc_auc_score, roc_curve
import statsmodels.api as sm
import matplotlib.pyplot as plt
import time

from collections import Counter
from imblearn.under_sampling import RandomUnderSampler

In [None]:
# Location of the raw webtoon dataset (an absolute local Windows path).
WEBTOON_CSV_PATH = 'C:/Users/dltjw/Downloads/webtoonData.csv'

# Read the dataset and show it as the cell output.
webtoon = pd.read_csv(WEBTOON_CSV_PATH)
webtoon

In [None]:
# Drop the identifier column; it is not part of the feature list selected later.
# Rebinding (instead of inplace=True) together with errors="ignore" makes the
# cell safe to re-run: a second execution no longer raises KeyError.
webtoon = webtoon.drop(columns=["titleId"], errors="ignore")

In [None]:
# One-hot encode contentGenre and typeGenre in a single pass.
# The dummy columns are appended after the untouched columns, contentGenre
# first and typeGenre second, using the prefixes 'contGenre' and 'typeGenre'.
webtoon = pd.get_dummies(
    webtoon,
    columns=['contentGenre', 'typeGenre'],
    prefix={'contentGenre': 'contGenre', 'typeGenre': 'typeGenre'},
)


In [None]:
# Add an explicit constant column for the b0 (intercept) term of the regression model.
# has_constant="add" inserts the column even when a constant column already exists.
webtoon = sm.add_constant(data=webtoon, prepend=True, has_constant="add")


In [None]:
# Re-order the columns: constant and base features first, then the
# content-genre dummies, then the type-genre dummies, and the target
# 'isPublic' last (the X/y split relies on the target being the last column).
base_cols = ['const', 'totalStar', 'heartLog', 'starDif',
             'starParRatio', 'viewRatio', 'dateInterval', 'words0', 'words1']
content_genre_cols = ['contGenre_action', 'contGenre_comic', 'contGenre_daily',
                      'contGenre_drama', 'contGenre_fantasy', 'contGenre_historical',
                      'contGenre_pure', 'contGenre_sensibility', 'contGenre_sports',
                      'contGenre_thrill']
type_genre_cols = ['typeGenre_스토리', 'typeGenre_에피소드', 'typeGenre_옴니버스']

webtoon = webtoon[base_cols + content_genre_cols + type_genre_cols + ['isPublic']]


In [None]:
# Split into features and target: every column except the last one is a
# feature, and 'isPublic' is the target.
X = webtoon.iloc[:, :-1]
y = webtoon['isPublic']

# Check class imbalance: sum of the target divided by the number of rows.
positive_share = y.sum() / len(y)
positive_share

In [None]:
# Threshold (cut-off) function
def cut_off(y, threshold):
    """Convert continuous scores into 0/1 class labels.

    Parameters
    ----------
    y : numpy.ndarray or pandas.Series
        Scores to binarise. The input is not modified.
    threshold : float
        Scores strictly greater than this value map to 1; every other score
        (including one exactly equal to the threshold) maps to 0.

    Returns
    -------
    Same container type as ``y`` with integer dtype, holding only 0 and 1.
    """
    # One vectorised comparison. The earlier two-step in-place masking
    # re-tested the freshly written 1s against the threshold (so any
    # threshold above 1 turned every label into 0) and left values equal to
    # the threshold un-binarised.
    return (y > threshold).astype(int)

In [None]:
# Accuracy (acc) function

def acc(cfmat):
    """Return the accuracy implied by a confusion matrix.

    Parameters
    ----------
    cfmat : array-like of shape (k, k)
        Confusion matrix whose diagonal holds the correctly classified counts.

    Returns
    -------
    Sum of the diagonal divided by the sum of all entries.
    """
    cfmat = np.asarray(cfmat)
    # np.trace sums the whole diagonal, so this works for any number of
    # classes instead of reading only cells [0, 0] and [1, 1].
    return np.trace(cfmat) / np.sum(cfmat)

### 8:2 Undersampling & Lasso

In [None]:
# Under-sample to an 8:2 class split.
undersample = RandomUnderSampler(
    sampling_strategy=0.25,  # minority / majority = 1 / 4, i.e. 8:2
    random_state=121818,
)
X_under, y_under = undersample.fit_resample(X, y)

# Show the resulting class counts.
class_counts = Counter(y_under)
print(class_counts)

In [None]:
# Repeat a stratified 80/20 split 50 times (the split seed changes with every
# repetition) and collect accuracy and ROC AUC of a thresholded Lasso model.
accList = []
aucList = []
for i in range(50):
    train_X, test_X, train_y, test_y = train_test_split(
        X_under, y_under, stratify=y_under, train_size=0.8, test_size=0.2,
        random_state=121818 + i)

    # Fit Lasso (L1-penalised linear regression) on the binary target.
    ll = Lasso(alpha=0.01)
    ll.fit(train_X, train_y)
    pred_Y_prob = ll.predict(test_X)          # continuous scores
    pred_Y_class = cut_off(pred_Y_prob, 0.5)  # hard 0/1 labels

    # Accuracy is taken from the confusion matrix of the hard labels.
    cfmat = confusion_matrix(test_y, pred_Y_class)
    accList.append(acc(cfmat))

    # AUC is computed from the continuous scores. Feeding the thresholded
    # labels to roc_curve yields a single operating point, so the trapezoid
    # area was not the real ROC AUC (and was not comparable with the
    # probability-based AUC used for the RandomForest models).
    aucList.append(roc_auc_score(test_y, pred_Y_prob))

    # Show the confusion matrix of the first repetition only.
    if i == 0:
        print(cfmat)

In [None]:
# Mean ROC AUC over all collected repetitions.
np.mean(aucList)

In [None]:
# Mean accuracy over all collected repetitions.
np.mean(accList)

### 2:1 Undersampling & Lasso

In [None]:
# Under-sample to a 2:1 class split.
undersample = RandomUnderSampler(
    sampling_strategy=0.5,  # minority / majority = 1 / 2
    random_state=1220,
)
X_under2, y_under2 = undersample.fit_resample(X, y)

# Show the resulting class counts.
class_counts2 = Counter(y_under2)
print(class_counts2)

In [None]:
# Containers for per-repetition accuracy and AUC of the 2:1 experiment.
accList2, aucList2 = [], []

In [None]:
# (Re)initialise the result lists in this cell so that re-running it does not
# append a second batch of 50 results to the lists of a previous run.
accList2 = []
aucList2 = []
for i in range(50):
    train_X2, test_X2, train_y2, test_y2 = train_test_split(
        X_under2, y_under2, stratify=y_under2, train_size=0.8, test_size=0.2,
        random_state=1220 + i)

    # Fit Lasso (L1-penalised linear regression) on the binary target.
    ll = Lasso(alpha=0.01)
    ll.fit(train_X2, train_y2)
    pred_Y_prob = ll.predict(test_X2)         # continuous scores
    pred_Y_class = cut_off(pred_Y_prob, 0.5)  # hard 0/1 labels

    # Accuracy is taken from the confusion matrix of the hard labels.
    cfmat2 = confusion_matrix(test_y2, pred_Y_class)
    accList2.append(acc(cfmat2))

    # AUC is computed from the continuous scores; thresholded labels give a
    # single ROC operating point and therefore not the real ROC AUC.
    aucList2.append(roc_auc_score(test_y2, pred_Y_prob))

    # Show the confusion matrix of the first repetition only.
    if i == 0:
        print(cfmat2)

In [None]:
# Mean ROC AUC over all collected repetitions of the 2:1 experiment.
np.mean(aucList2)

In [None]:
# Mean accuracy over all collected repetitions of the 2:1 experiment.
np.mean(accList2)

### 1:1 Undersampling & Lasso

In [None]:
# Under-sample to a 1:1 (balanced) class split.
undersample = RandomUnderSampler(
    sampling_strategy=1,  # minority / majority = 1
    random_state=121818,
)
X_under3, y_under3 = undersample.fit_resample(X, y)

# Show the resulting class counts.
class_counts3 = Counter(y_under3)
print(class_counts3)


In [None]:

# Containers for per-repetition accuracy and AUC of the 1:1 experiment.
accList3, aucList3 = [], []


In [None]:
# (Re)initialise the result lists in this cell so that re-running it does not
# append a second batch of 50 results to the lists of a previous run.
accList3 = []
aucList3 = []
for i in range(50):
    train_X3, test_X3, train_y3, test_y3 = train_test_split(
        X_under3, y_under3, stratify=y_under3, train_size=0.8, test_size=0.2,
        random_state=121818 + i)

    # Fit Lasso (L1-penalised linear regression) on the binary target.
    ll = Lasso(alpha=0.01)
    ll.fit(train_X3, train_y3)
    pred_Y_prob = ll.predict(test_X3)         # continuous scores
    pred_Y_class = cut_off(pred_Y_prob, 0.5)  # hard 0/1 labels

    # Accuracy is taken from the confusion matrix of the hard labels.
    cfmat3 = confusion_matrix(test_y3, pred_Y_class)
    accList3.append(acc(cfmat3))

    # AUC is computed from the continuous scores; thresholded labels give a
    # single ROC operating point and therefore not the real ROC AUC.
    aucList3.append(roc_auc_score(test_y3, pred_Y_prob))

    # Show the confusion matrix of the first repetition only.
    if i == 0:
        print(cfmat3)

In [None]:
# Print the mean AUC first, then the mean accuracy, of the 1:1 experiment.
for scores in (aucList3, accList3):
    print(np.mean(scores))

### 9:1 Train/Test Split (No Undersampling) & RandomForest

In [None]:
# RandomForest on the full (not under-sampled) data: 50 stratified 90/10
# splits, with the same seed driving both the split and the forest.
accList4 = []
aucList4 = []
for i in range(50):
    seed = 121818 + i
    X_train4, X_test4, y_train4, y_test4 = train_test_split(
        X, y, stratify=y, train_size=0.9, test_size=0.1, random_state=seed)

    rf = RandomForestClassifier(random_state=seed)
    rf.fit(X_train4, y_train4)

    # Accuracy of the hard predictions on the held-out part.
    y_pred = rf.predict(X_test4)
    accList4.append(accuracy_score(y_test4, y_pred))
    cfmat = confusion_matrix(y_test4, y_pred)

    # ROC AUC from the second predict_proba column.
    y_pred_proba = rf.predict_proba(X_test4)[:, 1]
    aucList4.append(roc_auc_score(y_test4, y_pred_proba))

    # Show the confusion matrix of the first repetition only.
    if i == 0:
        print(cfmat)

In [None]:
# Print the mean AUC first, then the mean accuracy, of the full-data RandomForest.
for scores in (aucList4, accList4):
    print(np.mean(scores))

### 4:3 Undersampling & RandomForest

In [None]:
# Containers for per-repetition accuracy and AUC of the under-sampled RandomForest.
accList5, aucList5 = [], []


In [None]:
# Under-sample to a 4:3 class split. sampling_strategy is the
# minority / majority ratio, so 4:3 corresponds to 3 / 4 = 0.75 (the value
# 0.7 used before did not match the 4:3 split this section describes).
undersample = RandomUnderSampler(sampling_strategy=0.75, random_state=121818)
X_under, y_under = undersample.fit_resample(X, y)
# Show the resulting class counts so the ratio can be checked.
print(Counter(y_under))


In [None]:
# (Re)initialise the result lists in this cell so that re-running it does not
# append a second batch of 50 results to the lists of a previous run.
accList5 = []
aucList5 = []
for i in range(50):
    # Vary the split seed with the repetition, as the other experiments do.
    # With a fixed seed all 50 repetitions reused one and the same split, so
    # the averages only reflected the randomness of the forest itself.
    X_train_under, X_test_under, y_train_under, y_test_under = train_test_split(
        X_under, y_under, stratify=y_under, test_size=0.2,
        random_state=121818 + i)

    rf = RandomForestClassifier(random_state=121818 + i)
    rf.fit(X_train_under, y_train_under)

    # Accuracy of the hard predictions on the held-out part.
    accList5.append(rf.score(X_test_under, y_test_under))

    y_pred = rf.predict(X_test_under)
    cfmat = confusion_matrix(y_test_under, y_pred)

    # ROC AUC from the second predict_proba column.
    y_pred_proba = rf.predict_proba(X_test_under)[:, 1]
    auc = roc_auc_score(y_test_under, y_pred_proba)
    aucList5.append(auc)

    # Show the confusion matrix of the first repetition only.
    if i == 0:
        print(cfmat)

In [None]:
# Print the mean AUC first, then the mean accuracy, of the under-sampled RandomForest.
for scores in (aucList5, accList5):
    print(np.mean(scores))

### 8:2 Undersampling & AdaBoost

In [None]:
# Under-sample to an 8:2 class split for the AdaBoost experiment.
undersample = RandomUnderSampler(
    sampling_strategy=0.25,  # minority / majority = 1 / 4, i.e. 8:2
    random_state=121818,
)
X_under, y_under = undersample.fit_resample(X, y)

# Show the resulting class counts.
class_counts = Counter(y_under)
print(class_counts)

In [None]:
# AdaBoost on the 8:2 under-sampled data: 50 stratified 80/20 splits, with
# the split seed changing every repetition.
accList6 = []
aucList6 = []
for i in range(50):
    train_X6, test_X6, train_y6, test_y6 = train_test_split(
        X_under, y_under, stratify=y_under, train_size=0.8, test_size=0.2,
        random_state=121818 + i)

    # Seed the classifier as well so that a re-run reproduces the numbers.
    abc = AdaBoostClassifier(n_estimators=50,
                             learning_rate=1,
                             random_state=121818 + i)
    model = abc.fit(train_X6, train_y6)
    pred_Y_class = model.predict(test_X6)
    pred_Y_prob = model.predict_proba(test_X6)[:, 1]

    # Accuracy is taken from the confusion matrix of the hard labels.
    cfmat = confusion_matrix(test_y6, pred_Y_class)
    accList6.append(acc(cfmat))

    # AUC is computed from the predicted probabilities; hard labels give a
    # single ROC operating point and therefore not the real ROC AUC.
    aucList6.append(roc_auc_score(test_y6, pred_Y_prob))

    # Show the confusion matrix of the first repetition only.
    if i == 0:
        print(cfmat)

In [None]:

# Print the mean AUC first, then the mean accuracy, of the AdaBoost experiment.
for scores in (aucList6, accList6):
    print(np.mean(scores))

### 변수 선택

In [None]:
# Baseline ("before variable selection") model on the full feature set.
# The features, target and under-sample are rebuilt here instead of reusing
# whatever X / X_under an earlier cell left behind, so the baseline always
# uses all columns and the same 0.75 sampling ratio and seed as the model
# fitted without the type-genre columns.
X, y = webtoon.iloc[:, :-1], webtoon['isPublic']
undersample = RandomUnderSampler(sampling_strategy=0.75, random_state=121818)
X_under, y_under = undersample.fit_resample(X, y)

# random_state is fixed so the results before and after variable selection
# can be compared.
X_train_under, X_test_under, y_train_under, y_test_under = train_test_split(
    X_under, y_under, stratify=y_under, test_size=0.2, random_state=121818)

rf_under = RandomForestClassifier(random_state=121818)
rf_under.fit(X_train_under, y_train_under)

# ROC AUC from the second predict_proba column; shown as the cell output.
y_pred_proba = rf_under.predict_proba(X_test_under)[:,1]
roc_auc_score(y_test_under, y_pred_proba)

In [None]:
# Inspect feature importances.
# The names are taken from the training frame the forest was fitted on rather
# than from the global X, which a later cell overwrites with a reduced column
# set; with X the two columns could end up with different lengths.
feature_df = pd.DataFrame({
    'feature': X_train_under.columns,
    'importance': rf_under.feature_importances_,
})
feature_df

In [None]:
# confusion matrix
y_under_pred = rf_under.predict(X_test_under)
# Printed explicitly: this is not the last expression of the cell, so the
# notebook would otherwise discard the matrix without displaying it.
print(confusion_matrix(y_test_under, y_under_pred))

# Exclude the type-genre columns (applies to the statements that follow).
# Build a copy of the data without the three type-genre dummy columns.
type_genre_cols = ['typeGenre_스토리', 'typeGenre_에피소드', 'typeGenre_옴니버스']
noType = webtoon.drop(columns=type_genre_cols)

# NOTE: this overwrites the global X / y with the reduced feature set.
X = noType.iloc[:, :-1]
y = noType['isPublic']

# Under-sample with minority / majority = 0.75.
undersample = RandomUnderSampler(sampling_strategy=0.75, random_state=121818)
X_under, y_under = undersample.fit_resample(X, y)

# Stratified 80/20 split with a fixed seed.
X_train_under, X_test_under, y_train_under, y_test_under = train_test_split(
    X_under,
    y_under,
    stratify=y_under,
    test_size=0.2,
    random_state=121818,
)

rf_under = RandomForestClassifier(random_state=121818)
rf_under.fit(X_train_under, y_train_under)

# ROC AUC from the second predict_proba column; shown as the cell output.
y_pred_proba = rf_under.predict_proba(X_test_under)[:, 1]
roc_auc_score(y_test_under, y_pred_proba)

In [None]:
# Confusion matrix of the current rf_under model on its test split
# (true labels first, predicted labels second).
y_under_pred = rf_under.predict(X_test_under)
confusion_matrix(y_true=y_test_under, y_pred=y_under_pred)

In [None]:
y_pred_proba # predicted probabilities (second predict_proba column of the most recent model)

In [None]:
print(*y_test_under) # actual values of the test split, printed space-separated on one line

In [None]:
# NOTE(review): .loc looks the row up by index label, so this raises KeyError
# unless label 99 is present in this test split's index - confirm.
X_test_under.loc[99] # inspect a single webtoon