In [1]:
#Base & visualization
import pandas as pd
import random
import os
import numpy as np
import warnings
import matplotlib.pylab as plt
import seaborn as sns

#sklearn module & utils
from tqdm.notebook import tqdm
from sklearn.feature_selection import SelectPercentile
from sklearn.model_selection import StratifiedKFold , KFold, train_test_split, cross_val_score, cross_validate
from sklearn.feature_selection import SelectPercentile
warnings.filterwarnings('ignore') 

# hyperparameter
import optuna
from optuna.samplers import TPESampler

#Scaling
from sklearn.preprocessing import StandardScaler, LabelEncoder, MinMaxScaler, RobustScaler

# Encoding
import category_encoders as ce

#Sampling
from imblearn.over_sampling import BorderlineSMOTE

#Modeling
from sklearn.ensemble import BaggingClassifier, GradientBoostingClassifier, VotingClassifier,RandomForestClassifier
from sklearn.tree import DecisionTreeClassifier
from sklearn.linear_model import RidgeClassifier, RidgeClassifierCV
from sklearn.svm import SVC
from lightgbm import LGBMClassifier
from sklearn.metrics import accuracy_score, f1_score
from xgboost import XGBClassifier
from pathlib import Path
DATA_PATH = Path('dataset')

### 데이터 자르기

In [3]:
# Load the raw train and test tables from the dataset directory.
train, test = (
    pd.read_csv(DATA_PATH / 'train.csv'),
    pd.read_csv(DATA_PATH / 'test.csv'),
)

In [47]:
# Split both tables by product family: T_31 and O_31 are processed together
# (suffix "TO"), A_31 on its own (suffix "A").
train_TO_idx = train["PRODUCT_CODE"].isin(["T_31", "O_31"])
train_A_idx = train["PRODUCT_CODE"] == "A_31"
test_TO_idx = test["PRODUCT_CODE"].isin(["T_31", "O_31"])
test_A_idx = test["PRODUCT_CODE"] == "A_31"

# Take explicit copies: these subsets are modified in place further down
# (exclude() drops columns with inplace=True). Doing that on a bare
# boolean-mask selection can trigger pandas' SettingWithCopyWarning, which
# this notebook silences globally, so the problem would go unnoticed.
train_TO = train[train_TO_idx].copy()
train_A = train[train_A_idx].copy()
test_TO = test[test_TO_idx].copy()
test_A = test[test_A_idx].copy()

In [48]:
def exclude(df, df_test):
    """Drop columns that are constant (or entirely missing) in the training frame.

    A column is removed when it has fewer than two distinct non-null values
    in ``df``. ``PRODUCT_CODE`` is always kept. The same columns are removed
    from ``df_test`` when they exist there.

    Both frames are modified in place and also returned, so existing callers
    that rebind the result keep working.

    Parameters
    ----------
    df : pandas.DataFrame
        Training frame; decides which columns are dropped.
    df_test : pandas.DataFrame
        Test frame; follows the training frame's decision.

    Returns
    -------
    tuple of (pandas.DataFrame, pandas.DataFrame)
        The same ``df`` and ``df_test`` objects after dropping.
    """
    constant_cols = [
        col for col in df.columns
        if col != "PRODUCT_CODE" and df[col].nunique() < 2
    ]
    df.drop(columns=constant_cols, inplace=True)
    # errors="ignore": a constant training column may not exist in the test
    # frame at all (e.g. a target column); that must not raise KeyError.
    df_test.drop(columns=constant_cols, inplace=True, errors="ignore")
    return df, df_test

In [None]:
# Remove columns that are constant within each product group; the training
# subset decides which columns go, the test subset follows.
train_TO, test_TO = exclude(train_TO, test_TO)
train_A, test_A = exclude(train_A, test_A)

In [26]:
# Persist the per-product splits. Create the target directory first so the
# cell also works on a fresh checkout where split_dataset/ does not exist.
os.makedirs("split_dataset", exist_ok=True)
train_TO.to_csv("split_dataset/train_TO.csv", index=False)
test_TO.to_csv("split_dataset/test_TO.csv", index=False)
train_A.to_csv("split_dataset/train_A.csv", index=False)
test_A.to_csv("split_dataset/test_A.csv", index=False)

In [52]:
# Reload the four per-product splits from disk. The files were written with
# index=False, so each frame comes back with a fresh default index.
train_TO, test_TO, train_A, test_A = (
    pd.read_csv("split_dataset/train_TO.csv"),
    pd.read_csv("split_dataset/test_TO.csv"),
    pd.read_csv("split_dataset/train_A.csv"),
    pd.read_csv("split_dataset/test_A.csv"),
)

In [53]:
def scale(df, df_test):
    """Standardize the feature columns of a train/test pair.

    A StandardScaler is fitted on ``df`` only and then applied to both
    frames. The scaled columns are the non-object columns of ``df`` with the
    first two skipped. Both frames are modified in place and also returned.
    """
    non_object_cols = df.select_dtypes(exclude=['object']).columns.to_list()
    # NOTE(review): the [2:] slice presumably skips Y_Class / Y_Quality;
    # this relies on column order, confirm against the CSV layout.
    feature_cols = non_object_cols[2:]

    standardizer = StandardScaler().fit(df[feature_cols])
    df[feature_cols] = standardizer.transform(df[feature_cols])
    df_test[feature_cols] = standardizer.transform(df_test[feature_cols])
    return df, df_test


# Scale each product group independently (separate scaler per group).
train_TO, test_TO = scale(train_TO, test_TO)
train_A, test_A = scale(train_A, test_A)


In [55]:
# Pearson correlation of every numeric column with the Y_Quality target,
# computed separately per product group.
# Restrict to numeric/bool columns explicitly: the frames still contain
# string columns (e.g. PRODUCT_CODE), and DataFrame.corr() raises on those
# from pandas 2.0 on instead of silently skipping them.
corr_TO = train_TO.select_dtypes(include=["number", "bool"]).corr()['Y_Quality']
corr_A = train_A.select_dtypes(include=["number", "bool"]).corr()['Y_Quality']
# Skip the first two entries by position.
# NOTE(review): presumably these are Y_Class and Y_Quality themselves;
# this relies on column order, confirm.
corr_TO = corr_TO.iloc[2:]
corr_A = corr_A.iloc[2:]


In [57]:
# Order the correlation coefficients ascending (most negative first).
corr_TO = corr_TO.sort_values()
corr_A = corr_A.sort_values()

In [64]:
# Persist both correlation tables. The feature-engineering section reads
# correlation_TO.csv as well as correlation_A.csv and looks the feature names
# up in a column called "Column", so label the index accordingly and write
# both files.
os.makedirs("correlation", exist_ok=True)
pd.DataFrame(corr_TO).to_csv("correlation/correlation_TO.csv", index_label="Column")
pd.DataFrame(corr_A).to_csv("correlation/correlation_A.csv", index_label="Column")

### 파생변수 생성

In [2]:
# Start feature engineering from the saved per-product splits.
X_TO, test_TO = (
    pd.read_csv("split_dataset/train_TO.csv"),
    pd.read_csv("split_dataset/test_TO.csv"),
)

X_A, test_A = (
    pd.read_csv("split_dataset/train_A.csv"),
    pd.read_csv("split_dataset/test_A.csv"),
)

In [3]:
# Per-feature correlation with Y_Quality, one table per product group.
correlation_TO, correlation_A = (
    pd.read_csv("correlation/correlation_TO.csv"),
    pd.read_csv("correlation/correlation_A.csv"),
)

In [4]:
# Rank by correlation strength regardless of sign: replace each coefficient
# with its absolute value.
correlation_TO["Y_Quality"] = correlation_TO["Y_Quality"].abs()

correlation_A["Y_Quality"] = correlation_A["Y_Quality"].abs()

In [5]:
# Names of the 10 features with the largest absolute correlation to
# Y_Quality, in ascending order of strength (strongest last).
# Rows with an undefined (NaN) correlation are removed first: sort_values
# places NaN last, so they would otherwise be picked as the "top" features.
TO_imp = (
    correlation_TO.dropna(subset=["Y_Quality"])
    .sort_values("Y_Quality")
    .iloc[-10:]['Column']
    .tolist()
)
A_imp = (
    correlation_A.dropna(subset=["Y_Quality"])
    .sort_values("Y_Quality")
    .iloc[-10:]['Column']
    .tolist()
)

In [6]:
def fill_median(df , test_df):
    """Impute missing values in both frames with medians of the training frame.

    The median of every non-object column of ``df`` is computed and used to
    fill NaNs in ``df`` and, for the columns it shares, in ``test_df``.
    Both frames are modified in place and also returned.
    """
    # The statistics (medians) come from the training frame only.
    numeric_cols = df.select_dtypes(exclude=['object']).columns.to_list()
    train_medians = {name: df[name].median() for name in numeric_cols}
    for frame in (df, test_df):
        frame.fillna(train_medians, inplace=True)
    return df, test_df

# Impute each product group separately, using that group's training medians.
X_TO, test_TO = fill_median(X_TO, test_TO)
X_A, test_A   = fill_median(X_A, test_A)

In [7]:
def scaling(df, test_df):
    """Standardize the feature columns of ``df`` and ``test_df``.

    The scaler is fitted on the training frame and reused for the test
    frame. Only non-object columns are scaled, with the first two skipped.
    Both frames are modified in place and also returned.
    """
    scaler = StandardScaler()
    # NOTE(review): the [2:] slice presumably skips Y_Class / Y_Quality;
    # this relies on column order, confirm.
    cols_to_scale = df.select_dtypes(exclude=['object']).columns.to_list()[2:]

    train_scaled = scaler.fit_transform(df[cols_to_scale])
    test_scaled = scaler.transform(test_df[cols_to_scale])

    df[cols_to_scale] = train_scaled
    test_df[cols_to_scale] = test_scaled
    return df, test_df


# One scaler per product group.
X_TO, test_TO = scaling(X_TO, test_TO)
X_A, test_A = scaling(X_A, test_A)

In [8]:
# Show the selected feature names for both product groups.
TO_imp, A_imp

(['X_529',
  'X_532',
  'X_124',
  'X_125',
  'X_530',
  'X_90',
  'X_121',
  'X_73',
  'X_120',
  'X_699'],
 ['X_1012',
  'X_1010',
  'X_335',
  'X_367',
  'X_368',
  'X_318',
  'X_1523',
  'X_1524',
  'X_1525',
  'X_1407'])

###  Simple Cal TO

In [None]:
from itertools import combinations, product
import operator
import numpy as np
import matplotlib.pyplot as plt
from scipy.stats import wasserstein_distance as wd
from scipy.stats import ks_2samp as ks , energy_distance as ed

# Identifier / target columns kept aside. .copy() makes this an independent
# frame, so the column assignments in the loop below do not write into a
# selection of X_TO.
Additional_cols = X_TO[['PRODUCT_ID', 'Y_Class', 'Y_Quality', 'TIMESTAMP', 'LINE', "PRODUCT_CODE"]].copy()
test_Additional_cols = test_TO[['PRODUCT_ID', 'TIMESTAMP', 'LINE', "PRODUCT_CODE"]]
num_features = test_TO.select_dtypes(exclude=['object']).columns.to_list()

target = X_TO["Y_Class"]

# Row masks for the three target classes.
idx2 = X_TO.Y_Class == 2
idx1 = X_TO.Y_Class == 1
idx0 = X_TO.Y_Class == 0

stores = []
newcol = ["+" , "-" , "x"]
# Element-wise operation behind each symbol in newcol.
pair_ops = {"+": operator.add, "-": operator.sub, "x": operator.mul}

a = list(combinations(TO_imp , 2))

# One axis per candidate feature: one row per feature pair, one column per
# operation.
fig , axes = plt.subplots(nrows=len(a) , ncols=len(newcol),
                          figsize=(20,80))
plt.subplots_adjust(left=0.05, bottom=0.01, right=0.99,
                    top=0.99, wspace=None, hspace=0.7)

ax = axes.flatten()
combination = list(map(list, product(a, newcol)))  # len(a) * len(newcol) candidates

# Step 1: build every pairwise feature on both train and test.
for (one , two) , cal in combination:
    name = "{} {} {}".format(one , cal , two)
    X_TO[name] = pair_ops[cal](X_TO.loc[: , one] , X_TO.loc[: , two])
    test_TO[name] = pair_ops[cal](test_TO.loc[:, one] , test_TO.loc[:, two])
    stores.append(name)
    Additional_cols[name] = X_TO[name]

# Step 2: keep a feature only if its class-conditional distributions are
# pairwise separated (Wasserstein distance > 0.5 for all three class pairs);
# plot the kept ones and drop the rest from both frames.
rejected = []
for idx, col in enumerate(stores):
    target2 = X_TO.loc[idx2 , [col]].dropna()
    target1 = X_TO.loc[idx1 , [col]].dropna()
    target0 = X_TO.loc[idx0 , [col]].dropna()

    # 1-D value arrays for the distance functions. (np.squeeze on a one-row
    # frame collapses it to a scalar, which the distance functions reject.)
    vals0 , vals1 , vals2 = (t[col].to_numpy() for t in (target0 , target1 , target2))

    wdist_1 = wd(vals1 , vals0)
    wdist_2 = wd(vals1 , vals2)
    wdist_3 = wd(vals0 , vals2)
    if wdist_1 > 0.5 and wdist_2 > .5 and wdist_3 > .5 :
        # NOTE(review): sns.distplot is deprecated in recent seaborn releases;
        # kept here so the rendered figure does not change.
        sns.distplot(target0 , ax = ax[idx])
        sns.distplot(target1 , ax = ax[idx])
        sns.distplot(target2 , ax = ax[idx])

        kdist , b = ks(vals1 , vals0)
        wdist = wdist_1  # same class-1 vs class-0 distance as computed above
        edist = ed(vals1 , vals0)
        # Title uses this feature's own name (col); `name` still holds the
        # last feature built in step 1.
        msg = "[{}] | ed : {} | ks : {} | wd : {}".format(col ,
                                                    np.round(edist,3),
                                                    np.round(kdist,3),
                                                    np.round(wdist,3),
                                                    )
        ax[idx].set_title(msg , fontsize=  10)
    else:
        rejected.append(col)

# Remove all rejected features in one pass.
X_TO.drop(columns=rejected, inplace=True)
test_TO.drop(columns=rejected, inplace=True)

plt.savefig("./New.png")
plt.show()

In [37]:
# Sanity check: train/test shapes after the pairwise-feature step.
print(X_TO.shape, test_TO.shape)

(349, 606) (243, 604)


### POLY-Method

In [59]:
from sklearn.preprocessing import PolynomialFeatures

# Interaction-only products (no powers, no bias term) of the selected TO
# features, up to degree 4. The expansion is fitted on train and reused for test.
important = TO_imp
poly = PolynomialFeatures(degree=4, interaction_only=True, include_bias=False)
poly_result = poly.fit_transform(X_TO[important])
poly_result_test = poly.transform(test_TO[important])

# Generic positional names: poly_0, poly_1, ...
cols = list(map("poly_{}".format, range(poly_result.shape[1])))
cols_test = list(map("poly_{}".format, range(poly_result_test.shape[1])))

output = pd.DataFrame(poly_result, columns=cols)
output_test = pd.DataFrame(poly_result_test, columns=cols_test)
output.shape


(349, 385)

In [60]:
from scipy.stats import wasserstein_distance as wd

# Row masks for the three target classes.
idx2 = X_TO.Y_Class == 2
idx1 = X_TO.Y_Class == 1
idx0 = X_TO.Y_Class == 0

# Keep a polynomial feature only if all three class-conditional distributions
# are pairwise separated (Wasserstein distance > 0.5).
store = []
for col in cols:
    # Select each class as a Series so it stays 1-D even when a class has a
    # single row (np.squeeze on a one-row frame collapses to a scalar).
    target2 = output.loc[idx2 , col].dropna()
    target1 = output.loc[idx1 , col].dropna()
    target0 = output.loc[idx0 , col].dropna()

    wdist_1 = wd(target1 , target0)
    wdist_2 = wd(target1 , target2)
    wdist_3 = wd(target0 , target2)
    if wdist_1 > .5 and wdist_2 > .5 and wdist_3 > .5:
        store.append(col)
        Additional_cols[col] = output[col]
        X_TO[col] = output[col]
        test_TO[col] = output_test[col]

len(store)  # number of polynomial features kept

83

In [None]:

# Size the grid from the number of kept features (4 panels per row) instead
# of a fixed 21x4, so more than 84 features cannot run off the end of the
# axes array. Each row keeps the original height (42 / 21 = 2).
n_plot_cols = 4
n_plot_rows = max(1, -(-len(store) // n_plot_cols))  # ceiling division
fig , axes = plt.subplots(nrows=n_plot_rows , ncols=n_plot_cols,
                          figsize=(20, 2 * n_plot_rows), squeeze=False)
plt.subplots_adjust(left=0.05, bottom=0.01, right=0.99,
                    top=0.99, wspace=None, hspace=0.7)
ax = axes.flatten()
for idx , col in enumerate(store):
    target2 = output.loc[idx2 , [col]].dropna()
    target1 = output.loc[idx1 , [col]].dropna()
    target0 = output.loc[idx0 , [col]].dropna()

    # 1-D arrays for the distance functions (np.squeeze on a one-row frame
    # would collapse it to a scalar).
    vals0 = target0[col].to_numpy()
    vals1 = target1[col].to_numpy()

    # Class 1 vs class 0 under three different distances.
    kdist , b = ks(vals1 , vals0)
    wdist = wd(vals1 , vals0)
    edist = ed(vals1 , vals0)

    sns.distplot(target0 , ax = ax[idx])
    sns.distplot(target1 , ax = ax[idx])
    sns.distplot(target2 , ax = ax[idx])

    msg = "[{}] | ed : {} | ks : {} | wd : {}".format(col ,
                                                np.round(edist,3),
                                                np.round(kdist,3),
                                                np.round(wdist,3),
                                               )
    ax[idx].set_title(msg , fontsize=  10)
plt.savefig("./New2.png")
plt.show()

In [68]:
# Sanity check: train/test shapes after adding the kept polynomial features.
print(X_TO.shape, test_TO.shape)

(349, 689) (243, 687)


In [70]:
# Persist the TO frames including the engineered features.
X_TO.to_csv("split_dataset/train_add_TO.csv", index=False)
test_TO.to_csv("split_dataset/test_add_TO.csv", index=False)

In [83]:
# Correlation of every remaining numeric column (original and engineered)
# with Y_Quality for the TO group.
corr_X_TO = X_TO.drop(columns="Y_Class")
# Restrict to numeric/bool columns explicitly: X_TO still holds string
# columns (e.g. PRODUCT_CODE), and DataFrame.corr() raises on those from
# pandas 2.0 on instead of silently skipping them.
C = corr_X_TO.select_dtypes(include=["number", "bool"]).corr()[["Y_Quality"]]
# NOTE(review): iloc[1:] presumably drops Y_Quality's correlation with
# itself; this relies on it being the first numeric column, confirm.
C = C.iloc[1:]
C = C.sort_values("Y_Quality")
C.to_csv("correlation/correlation_add_TO.csv")

### Simple Cal A

In [None]:
from itertools import combinations, product
import operator
import numpy as np
import matplotlib.pyplot as plt
from scipy.stats import wasserstein_distance as wd
from scipy.stats import ks_2samp as ks , energy_distance as ed

# Identifier / target columns kept aside. .copy() makes this an independent
# frame, so the column assignments in the loop below do not write into a
# selection of X_A.
Additional_cols = X_A[['PRODUCT_ID', 'Y_Class', 'Y_Quality', 'TIMESTAMP', 'LINE', "PRODUCT_CODE"]].copy()
test_Additional_cols = test_A[['PRODUCT_ID', 'TIMESTAMP', 'LINE', "PRODUCT_CODE"]]
num_features = test_A.select_dtypes(exclude=['object']).columns.to_list()

target = X_A["Y_Class"]

# Row masks for the three target classes.
idx2 = X_A.Y_Class == 2
idx1 = X_A.Y_Class == 1
idx0 = X_A.Y_Class == 0

stores = []
newcol = ["+" , "-" , "x"]
# Element-wise operation behind each symbol in newcol.
pair_ops = {"+": operator.add, "-": operator.sub, "x": operator.mul}

a = list(combinations(A_imp , 2))

# One axis per candidate feature: one row per feature pair, one column per
# operation.
fig , axes = plt.subplots(nrows=len(a) , ncols=len(newcol),
                          figsize=(20,80))
plt.subplots_adjust(left=0.05, bottom=0.01, right=0.99,
                    top=0.99, wspace=None, hspace=0.7)

ax = axes.flatten()
combination = list(map(list, product(a, newcol)))  # len(a) * len(newcol) candidates

# Step 1: build every pairwise feature on both train and test.
for (one , two) , cal in combination:
    name = "{} {} {}".format(one , cal , two)
    X_A[name] = pair_ops[cal](X_A.loc[: , one] , X_A.loc[: , two])
    test_A[name] = pair_ops[cal](test_A.loc[:, one] , test_A.loc[:, two])
    stores.append(name)
    Additional_cols[name] = X_A[name]

# Step 2: keep a feature only if its class-conditional distributions are
# pairwise separated (Wasserstein distance > 0.5 for all three class pairs);
# plot the kept ones and drop the rest from both frames.
rejected = []
for idx, col in enumerate(stores):
    target2 = X_A.loc[idx2 , [col]].dropna()
    target1 = X_A.loc[idx1 , [col]].dropna()
    target0 = X_A.loc[idx0 , [col]].dropna()

    # 1-D value arrays for the distance functions. (np.squeeze on a one-row
    # frame collapses it to a scalar, which the distance functions reject.)
    vals0 , vals1 , vals2 = (t[col].to_numpy() for t in (target0 , target1 , target2))

    wdist_1 = wd(vals1 , vals0)
    wdist_2 = wd(vals1 , vals2)
    wdist_3 = wd(vals0 , vals2)
    if wdist_1 > 0.5 and wdist_2 > .5 and wdist_3 > .5 :
        sns.distplot(target0 , ax = ax[idx])
        sns.distplot(target1 , ax = ax[idx])
        sns.distplot(target2 , ax = ax[idx])

        kdist , b = ks(vals1 , vals0)
        wdist = wdist_1  # same class-1 vs class-0 distance as computed above
        edist = ed(vals1 , vals0)
        # Title uses this feature's own name (col); `name` still holds the
        # last feature built in step 1.
        msg = "[{}] | ed : {} | ks : {} | wd : {}".format(col ,
                                                    np.round(edist,3),
                                                    np.round(kdist,3),
                                                    np.round(wdist,3),
                                                    )
        ax[idx].set_title(msg , fontsize=  10)
    else:
        rejected.append(col)

# Remove all rejected features in one pass.
X_A.drop(columns=rejected, inplace=True)
test_A.drop(columns=rejected, inplace=True)

plt.show()

In [12]:
from sklearn.preprocessing import PolynomialFeatures

# Interaction-only products (no powers, no bias term) of the selected A
# features, up to degree 4. The expansion is fitted on train and reused for test.
important = A_imp
poly = PolynomialFeatures(degree=4, interaction_only=True, include_bias=False)
poly_result = poly.fit_transform(X_A[important])
poly_result_test = poly.transform(test_A[important])

# Generic positional names: poly_0, poly_1, ...
cols = list(map("poly_{}".format, range(poly_result.shape[1])))
cols_test = list(map("poly_{}".format, range(poly_result_test.shape[1])))

output = pd.DataFrame(poly_result, columns=cols)
output_test = pd.DataFrame(poly_result_test, columns=cols_test)
output.shape


(249, 385)

In [13]:
from scipy.stats import wasserstein_distance as wd

# Row masks for the three target classes.
idx2 = X_A.Y_Class == 2
idx1 = X_A.Y_Class == 1
idx0 = X_A.Y_Class == 0

# Keep a polynomial feature only if all three class-conditional distributions
# are pairwise separated (Wasserstein distance > 0.5).
store = []
for col in cols:
    # Select each class as a Series so it stays 1-D even when a class has a
    # single row (np.squeeze on a one-row frame collapses to a scalar).
    target2 = output.loc[idx2 , col].dropna()
    target1 = output.loc[idx1 , col].dropna()
    target0 = output.loc[idx0 , col].dropna()

    wdist_1 = wd(target1 , target0)
    wdist_2 = wd(target1 , target2)
    wdist_3 = wd(target0 , target2)
    if wdist_1 > .5 and wdist_2 > .5 and wdist_3 > .5:
        store.append(col)
        Additional_cols[col] = output[col]
        X_A[col] = output[col]
        test_A[col] = output_test[col]

len(store)  # number of polynomial features kept

16

In [None]:

# Size the grid from the number of kept features (4 panels per row) instead
# of a fixed 21x4: the figure no longer consists mostly of empty panels when
# few features are kept, and more than 84 features cannot run off the end of
# the axes array. Each row keeps the original height (42 / 21 = 2).
n_plot_cols = 4
n_plot_rows = max(1, -(-len(store) // n_plot_cols))  # ceiling division
fig , axes = plt.subplots(nrows=n_plot_rows , ncols=n_plot_cols,
                          figsize=(20, 2 * n_plot_rows), squeeze=False)
plt.subplots_adjust(left=0.05, bottom=0.01, right=0.99,
                    top=0.99, wspace=None, hspace=0.7)
ax = axes.flatten()
for idx , col in enumerate(store):
    target2 = output.loc[idx2 , [col]].dropna()
    target1 = output.loc[idx1 , [col]].dropna()
    target0 = output.loc[idx0 , [col]].dropna()

    # 1-D arrays for the distance functions (np.squeeze on a one-row frame
    # would collapse it to a scalar).
    vals0 = target0[col].to_numpy()
    vals1 = target1[col].to_numpy()

    # Class 1 vs class 0 under three different distances.
    kdist , b = ks(vals1 , vals0)
    wdist = wd(vals1 , vals0)
    edist = ed(vals1 , vals0)

    sns.distplot(target0 , ax = ax[idx])
    sns.distplot(target1 , ax = ax[idx])
    sns.distplot(target2 , ax = ax[idx])

    msg = "[{}] | ed : {} | ks : {} | wd : {}".format(col ,
                                                np.round(edist,3),
                                                np.round(kdist,3),
                                                np.round(wdist,3),
                                               )
    ax[idx].set_title(msg , fontsize=  10)
# NOTE(review): this writes to the same ./New2.png as the TO polynomial
# plot, so running both sections overwrites the TO figure; confirm intended.
plt.savefig("./New2.png")
plt.show()

In [15]:
# Sanity check: train/test shapes for the A group after feature engineering.
print(X_A.shape, test_A.shape)

(249, 1906) (67, 1904)


In [29]:
# Persist the A frames including the engineered features.
X_A.to_csv("split_dataset/train_add_A.csv", index=False)
test_A.to_csv("split_dataset/test_add_A.csv", index=False)

NameError: name 'X_A' is not defined

In [17]:
# Correlation of every remaining numeric column (original and engineered)
# with Y_Quality for the A group.
corr_X_A = X_A.drop(columns="Y_Class")
# Restrict to numeric/bool columns explicitly: X_A still holds string
# columns (e.g. PRODUCT_CODE), and DataFrame.corr() raises on those from
# pandas 2.0 on instead of silently skipping them.
C = corr_X_A.select_dtypes(include=["number", "bool"]).corr()[["Y_Quality"]]
# NOTE(review): iloc[1:] presumably drops Y_Quality's correlation with
# itself; this relies on it being the first numeric column, confirm.
C = C.iloc[1:]
C = C.sort_values("Y_Quality")
C.to_csv("correlation/correlation_add_A.csv")

In [39]:
# Reload the TO test frame with engineered features from disk.
test_TO = pd.read_csv("split_dataset/test_add_TO.csv")

In [41]:
# Both product groups name their polynomial features poly_<i>, but the
# features are built from different inputs (TO_imp vs A_imp). Prefix the TO
# ones so they are not merged with the A ones when the groups are concatenated.
# - Match on the "poly_" prefix rather than the substring "poly", so that
#   re-running the cell does not turn TO_poly_<i> into TO_TO_poly_<i>.
# - Rename the TO training frame as well, so train and test end up with the
#   same column names.
for frame in (X_TO, test_TO):
    frame.rename(
        columns={col: "TO_" + col for col in frame.columns if col.startswith("poly_")},
        inplace=True,
    )

In [44]:
# Stack the TO and A groups back into single train/test tables (row-wise;
# columns that exist in only one group are filled with NaN for the other).
total_train = pd.concat([X_TO, X_A])
total_test  = pd.concat([test_TO, test_A])

In [46]:
# Order the rows of both tables by PRODUCT_ID.
total_train = total_train.sort_values("PRODUCT_ID")
total_test = total_test.sort_values("PRODUCT_ID")

In [49]:
# Write the combined train/test tables with all engineered features.
total_train.to_csv("train_add.csv", index=False)
total_test.to_csv("test_add.csv", index=False)

In [12]:
import pandas as pd
# Predicted classes of three submission files, for comparison below.
best = pd.read_csv("best.csv")["Y_Class"]
cur = pd.read_csv("submission.csv")["Y_Class"]
PCA = pd.read_csv("submission_PCA.csv")["Y_Class"]

In [13]:
# Percentage of rows where the current submission agrees with the best one.
# Rows are compared by position over the actual row count instead of a
# hard-coded 310, so the cell keeps working if the test set size changes.
if len(best) != len(cur):
    raise ValueError("submissions have different lengths: {} vs {}".format(len(best), len(cur)))
match = int((best.to_numpy() == cur.to_numpy()).sum())
print((match / len(best)) * 100)
    

94.83870967741936


In [14]:
# Percentage of rows where the PCA submission agrees with the best one.
# Rows are compared by position over the actual row count instead of a
# hard-coded 310, so the cell keeps working if the test set size changes.
if len(best) != len(PCA):
    raise ValueError("submissions have different lengths: {} vs {}".format(len(best), len(PCA)))
match = int((best.to_numpy() == PCA.to_numpy()).sum())
print((match / len(best)) * 100)

93.2258064516129


In [15]:
# Predicted class distribution of the current submission.
cur.value_counts()

1    253
0     32
2     25
Name: Y_Class, dtype: int64

In [16]:
# Predicted class distribution of the best submission.
best.value_counts()

1    244
0     40
2     26
Name: Y_Class, dtype: int64

In [6]:
# Predicted class distribution of the PCA submission.
PCA.value_counts()

1    251
0     38
2     21
Name: Y_Class, dtype: int64

In [57]:
# Number of test rows per product code in the raw test set.
tmp = pd.read_csv("dataset/test.csv")
tmp['PRODUCT_CODE'].value_counts()

T_31    239
A_31     67
O_31      4
Name: PRODUCT_CODE, dtype: int64