In [1]:
import pandas as pd
import gc
import matplotlib.pyplot as plt
import numpy as np
import seaborn as sns
from sklearn.decomposition import PCA
from sklearn.preprocessing import RobustScaler, LabelEncoder, StandardScaler
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score, roc_auc_score, confusion_matrix
from sklearn.feature_selection import VarianceThreshold
import lightgbm as lgb
from sklearn.impute import KNNImputer

In [15]:
train = pd.read_csv("train.csv")
test = pd.read_csv("test.csv")
train.shape

(598, 2881)

In [2]:
train = pd.read_csv("train.csv")
test = pd.read_csv("test.csv")
y = train['Y_Class']

for col in train:
    if train[col].nunique() < 2:
        train.drop(columns=col, inplace=True)
        test.drop(columns=col, inplace=True)

num_features = test.select_dtypes(exclude=['object']).columns.to_list()

scaler = StandardScaler()
train[num_features] = scaler.fit_transform(train[num_features])
test[num_features] = scaler.transform(test[num_features])


#from math import *
corr = pd.read_csv('../correlation/correlation.csv')
# Y_Quality 제거
corr = corr.iloc[:-1,:]
important = list(corr[abs(corr['correlation'])>=0.1]['feature'])
#important
train = train[important]
test = test[important]

# 중복된 변수 제거
dup = ~train.T.duplicated()
train = train.loc[:, dup]
test = test.loc[:, dup]



In [3]:
def nan_process(df):
    nans_df = df.isna()
    nans_groups={}
    for col in df.columns:
        cur_group = nans_df[col].sum()
        try:
            nans_groups[cur_group].append(col)
        except:
            nans_groups[cur_group]=[col]
    del nans_df; x=gc.collect()

    pop_list = []
    for k,v in nans_groups.items():
        if len(v) <= 2:
            pop_list.append(k)
    for p in pop_list:
        nans_groups.pop(p)

    keys = list(nans_groups.keys())
    print(len(keys))
    return keys, nans_groups

train_keys, train_nan_groups = nan_process(train)

17


In [None]:
for key in train_keys:
    print(key, len(train_nan_groups[key]))

In [4]:
imputer = KNNImputer()
train_clone = train.copy()
num_features = test.select_dtypes(exclude=['object']).columns.to_list()
train_clone[num_features] = imputer.fit_transform(train_clone[num_features])

In [None]:
def make_corr(Vs,Vtitle=''):
    cols = Vs
    plt.figure(figsize=(40,40))
    sns.heatmap(np.abs(train_clone[cols].corr(method= 'kendall')), cmap='RdBu_r', annot=True, center=0.0)
    if Vtitle!='': plt.title(Vtitle,fontsize=5)
    else: plt.title(Vs[0]+' - '+Vs[-1],fontsize=1)
    plt.show()

Vs = [2093, 2023, 2702, 2017, 2701, 1703, 2700, 1945]

for i in range(len(Vs)):
    Vs[i] = "X_" + str(Vs[i])
make_corr(Vs)

### PCA ALGORITHM

In [5]:
# PCA 클래스 설정
class PCA_transform:
  def __init__(self):
    self.cols_list = []
    self.pca_list = []
    self.n_pca_list = []
    self.size = 0
  
  # PCA 클래스의 학습 및 input 값 변환
  def fit_transform(self, X_input, col, n_pca):
    X_pca = X_input[col]
    # n차원으로 차원 축소, target 정보는 제외
    pca = PCA(n_components = n_pca, random_state=25)

    # PCA 학습
    pca.fit(X_pca)

    # PCA transform 후 데이터프레임으로 자료형 변경
    X_pca = pca.transform(X_pca)
    X_pca = pd.DataFrame(X_pca, columns = self.naming(n_pca))
    
    X_input = pd.concat([X_input, X_pca], axis = 1)
    X_input = X_input.drop(col, axis = 1)

    self.cols_list.append(col)
    self.pca_list.append(pca)
    self.n_pca_list.append(n_pca)
    self.size += 1

    return X_input

  # 학습된 PCA 값으로 transform
  def transform(self, X_input):
    for idx in range(self.size):
      X_input = self._idx_transform(X_input, idx)
    
    return X_input

  # n번째 PCA 변환
  def _idx_transform(self, X_input, idx):
    X_pca = X_input[self.cols_list[idx]]

    # pca transform 후 데이터프레임으로 자료형 변경
    X_pca = self.pca_list[idx].transform(X_pca)
    X_pca = pd.DataFrame(X_pca, columns = self.naming(self.n_pca_list[idx], idx))

    X_input = pd.concat([X_input, X_pca], axis = 1)
    X_input = X_input.drop(self.cols_list[idx], axis = 1)

    return X_input

  # PCA 된 컬럼 이름 규칙
  def naming(self, number, name = None):
    if (name is None):
      name = self.size
    names = []
    for idx in range(number):
      names.append(f'PCA_{str(name)}_{idx}')
    return names


In [6]:
imputer = KNNImputer()

train_clone = train.copy()
test_clone = test.copy()

num_features = test.select_dtypes(exclude=['object']).columns.to_list()
train_clone[num_features] = imputer.fit_transform(train_clone[num_features])
test_clone[num_features] = imputer.transform(test_clone[num_features])

In [7]:
import random
import os
def seed_everything(seed):
    random.seed(seed)
    os.environ['PYTHONHASHSEED'] = str(seed)
    np.random.seed(seed)
seed_everything(25) 
SEED = 25

In [8]:
correlation_groups = [
    [1218, 1482, 1517, 1549, 1483, 1547, 1548, 1208, 1093, 1087, 1144, 1511, 1203, 1202, 1542],
    [306, 364, 332, 366, 365, 333, 295, 321, 320, 331, 270, 302, 296, 299, 370, 253, 293, 254, 297, 294, 300, 291, 248],
    [2187, 2286, 2241, 2278, 2345, 2610, 2366, 2353, 2303, 2368, 2407, 2216, 2377, 2133, 2327, 2603, 2123, 2139, 2237, 2392, 2164, 2102, 2081],
    [2000, 2014, 1988, 2030, 1982, 2016, 2006, 2010, 2038, 1992, 1990, 2034, 1986, 2003, 2009, 1993, 2029, 1981, 2035, 1975, 2013, 1971, 2037, 1989, 2031, 1983],
    [1417, 1593, 1412, 1597, 1413, 1596, 1598, 1591, 1411, 1599, 1594, 1606, 1410, 1601, 1415, 1600, 1414, 1602, 1592],
    [1942, 1940, 1928, 1785, 2777, 2778, 1906, 1886, 1884, 1779, 1904, 1932, 1934, 1874, 1912, 1896, 1936, 1920, 1914, 1898, 1880, 1868, 1890, 1867, 1878, 1876, 1888, 1815, 1882, 1894, 1892, 1902, 1900, 1938, 1910, 1908, 1944],
    [2813, 1895, 2786, 212, 1921, 2483, 2842, 2783, 2522, 2806, 1927, 2484, 2478, 1887, 2498, 2506, 1889, 2539, 1911, 2500, 1903, 2487, 2814, 1752, 1905, 220, 2509, 1897, 1881, 211, 1885, 1883, 1913, 2708, 1919, 2495, 2707, 2025, 2706, 1764, 245, 2512, 1935, 2019, 1899, 1917, 2095, 1893, 2819, 1943, 1909, 2829, 2826, 1762, 1929, 1925, 1923, 2468, 1432, 226, 1873, 2533, 1879, 1937]
]

for i in range(len(correlation_groups)):
    for j in range(len(correlation_groups[i])):
        correlation_groups[i][j] = "X_" + str(correlation_groups[i][j])

In [9]:
import optuna
from sklearn.model_selection import cross_val_score
from sklearn.model_selection import train_test_split, KFold
from sklearn.metrics import f1_score
from lightgbm import LGBMClassifier
#params = study.best_params
params = {'num_leaves': 17,
    'max_depth': 11,
    'learning_rate': 0.07028290319049474,
    'n_estimators': 78,
    'class_weight': 'balanced',
    'min_child_samples': 12,
    'subsample': 0.831632859850219,
    'colsample_bytree': 0.9362544923583181,
    'reg_alpha': 0.01941513921336218,
    'reg_lambda': 0.0021722692515700652}

# -- Define the objective function
def objective(trial):
    imputer = KNNImputer()
    train_clone = train.copy()
    num_features = test.select_dtypes(exclude=['object']).columns.to_list()
    train_clone[num_features] = imputer.fit_transform(train_clone[num_features])
    pca_5 = PCA_transform()
    for k in range(len(correlation_groups)):
        pca_n_components=trial.suggest_int(f"{k}_pca_n_components", 2, len(correlation_groups[k])) # suggest an integer from 2 to 30
        train_clone = pca_5.fit_transform(train_clone, correlation_groups[k], pca_n_components)
   
    kf = KFold(n_splits=10, shuffle=True, random_state=25)

    clfs = []
    scores = []
    for train_index, test_index in kf.split(train_clone, y):
        X_train, X_valid = train_clone.values[train_index], train_clone.values[test_index]
        y_train, y_valid = y.values[train_index], y.values[test_index]
        model = LGBMClassifier(random_state=25, verbose=-1, **params)
        model.fit(X_train, y_train, eval_set=[(X_valid, y_valid)])
        pred = model.predict(X_valid)
        f1 = f1_score(y_valid, pred, average='macro')
        scores.append(f1)
        clfs.append(model)

    # -- Evaluate the score by cross-validation
    f1 = np.mean(scores)
    return f1

study = optuna.create_study(direction="maximize") # maximise the score during tuning
study.optimize(objective, n_trials=10000) # run the objective function 100 times

print(study.best_trial) # print the best performing pipeline

[32m[I 2023-02-20 00:53:08,215][0m A new study created in memory with name: no-name-9436a22b-079a-47d7-8832-a3ec65efed38[0m
[32m[I 2023-02-20 00:53:12,536][0m Trial 0 finished with value: 0.6502525826980523 and parameters: {'0_pca_n_components': 5, '1_pca_n_components': 23, '2_pca_n_components': 20, '3_pca_n_components': 21, '4_pca_n_components': 9, '5_pca_n_components': 28, '6_pca_n_components': 31}. Best is trial 0 with value: 0.6502525826980523.[0m
[32m[I 2023-02-20 00:53:16,918][0m Trial 1 finished with value: 0.6471872163571478 and parameters: {'0_pca_n_components': 13, '1_pca_n_components': 17, '2_pca_n_components': 22, '3_pca_n_components': 20, '4_pca_n_components': 3, '5_pca_n_components': 37, '6_pca_n_components': 50}. Best is trial 0 with value: 0.6502525826980523.[0m
[32m[I 2023-02-20 00:53:21,285][0m Trial 2 finished with value: 0.6597647281019652 and parameters: {'0_pca_n_components': 13, '1_pca_n_components': 3, '2_pca_n_components': 11, '3_pca_n_components': 1

FrozenTrial(number=5514, state=TrialState.COMPLETE, values=[0.7023416498182075], datetime_start=datetime.datetime(2023, 2, 20, 7, 52, 43, 67878), datetime_complete=datetime.datetime(2023, 2, 20, 7, 52, 47, 735618), params={'0_pca_n_components': 3, '1_pca_n_components': 7, '2_pca_n_components': 8, '3_pca_n_components': 4, '4_pca_n_components': 2, '5_pca_n_components': 2, '6_pca_n_components': 2}, user_attrs={}, system_attrs={}, intermediate_values={}, distributions={'0_pca_n_components': IntDistribution(high=15, log=False, low=2, step=1), '1_pca_n_components': IntDistribution(high=23, log=False, low=2, step=1), '2_pca_n_components': IntDistribution(high=23, log=False, low=2, step=1), '3_pca_n_components': IntDistribution(high=26, log=False, low=2, step=1), '4_pca_n_components': IntDistribution(high=19, log=False, low=2, step=1), '5_pca_n_components': IntDistribution(high=37, log=False, low=2, step=1), '6_pca_n_components': IntDistribution(high=64, log=False, low=2, step=1)}, trial_id=55

In [14]:
from sklearn.model_selection import train_test_split, RandomizedSearchCV, KFold, StratifiedKFold
from sklearn.metrics import f1_score
from lightgbm import LGBMClassifier

# KNN
imputer = KNNImputer()
train_clone = train.copy()
test_clone = test.copy()
num_features = test.select_dtypes(exclude=['object']).columns.to_list()
train_clone[num_features] = imputer.fit_transform(train_clone[num_features])
test_clone[num_features] = imputer.transform(test_clone[num_features])

# PCA
pca_5 = PCA_transform()
for k in range(len(correlation_groups)):
    print(k)
    name = f"{k}_pca_n_components"
    pca_n_components = study.best_params[name]
    train_clone = pca_5.fit_transform(train_clone, correlation_groups[k], pca_n_components)
    
test_clone = pca_5.transform(test_clone)
    
kf = KFold(n_splits=10, shuffle=True, random_state=25)

clfs = []
scores = []
for train_index, test_index in kf.split(train_clone, y):
    #print(train_index)
    X_train, X_valid = train_clone.values[train_index], train_clone.values[test_index]
    y_train, y_valid = y.values[train_index], y.values[test_index]
    
    #params = study.best_params
    params = {'num_leaves': 17,
        'max_depth': 11,
        'learning_rate': 0.07028290319049474,
        'n_estimators': 78,
        'class_weight': 'balanced',
        'min_child_samples': 12,
        'subsample': 0.831632859850219,
        'colsample_bytree': 0.9362544923583181,
        'reg_alpha': 0.01941513921336218,
        'reg_lambda': 0.0021722692515700652}
            
    model = LGBMClassifier(random_state=25, verbose=-1, **params)
    model.fit(X_train, y_train, eval_set=[(X_valid, y_valid)])
    pred = model.predict(X_valid)
    f1 = f1_score(y_valid, pred, average='macro')
    scores.append(f1)
    clfs.append(model)
print('Mean F1:', np.mean(scores))
#0.682161892591053

0
1
2
3
4
5
6
Mean F1: 0.7023416498182075


In [12]:
print(train_clone.shape, test_clone.shape)

(598, 461) (310, 461)


In [13]:
preds = []
for i, model in enumerate(clfs):
    pred = model.predict_proba(test_clone)
    if i == 0:
        preds = pred
    else:
        preds += pred
print(preds)
final_pred = np.argmax(preds, axis=1)

submit = pd.read_csv('../sample_submission.csv')
submit['Y_Class'] = final_pred
submit.to_csv('../submission_PCA.csv', index=False)
final_pred

[[0.45425805 9.33685003 0.20889192]
 [0.71302682 7.6111705  1.67580268]
 [0.71067541 7.94870078 1.34062381]
 [1.89460357 7.51215178 0.59324465]
 [1.38294052 7.22607906 1.39098042]
 [0.45833424 5.97568985 3.56597591]
 [2.35721703 5.38116972 2.26161326]
 [7.15499994 2.08929132 0.75570875]
 [6.17447438 3.28464426 0.54088136]
 [0.27394492 1.83821676 7.88783832]
 [0.99255389 8.45707698 0.55036913]
 [1.39072116 7.39231592 1.21696292]
 [0.04251154 3.98225601 5.97523245]
 [6.52968297 1.46309158 2.00722545]
 [9.35867549 0.5193257  0.12199881]
 [5.17774394 4.6452079  0.17704815]
 [1.65358383 7.45884748 0.8875687 ]
 [0.19017591 9.64074356 0.16908053]
 [0.06234491 9.54034329 0.3973118 ]
 [0.06907096 9.62306568 0.30786337]
 [0.33360529 9.58723941 0.0791553 ]
 [0.10816394 8.71687578 1.17496028]
 [0.26429084 9.58280652 0.15290264]
 [0.75612283 9.0100105  0.23386667]
 [0.30896229 9.06970923 0.62132848]
 [0.05025031 9.90837643 0.04137325]
 [0.1279031  9.73688743 0.13520947]
 [0.08428192 9.85027467 0.06

array([1, 1, 1, 1, 1, 1, 1, 0, 0, 2, 1, 1, 2, 0, 0, 0, 1, 1, 1, 1, 1, 1,
       1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 0, 0, 2, 1, 1, 0, 0, 1,
       1, 1, 1, 0, 1, 0, 1, 1, 0, 0, 0, 1, 1, 1, 0, 1, 1, 0, 0, 0, 0, 0,
       0, 1, 1, 1, 1, 0, 1, 1, 1, 1, 1, 1, 2, 1, 1, 1, 2, 1, 1, 1, 0, 1,
       0, 0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 2,
       1, 1, 1, 1, 2, 2, 1, 0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 0,
       0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 2, 1, 1, 1, 2, 1, 1,
       1, 1, 1, 1, 2, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
       1, 2, 1, 2, 2, 1, 1, 1, 1, 1, 1, 2, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
       1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
       1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
       1, 1, 1, 1, 1, 1, 2, 2, 0, 0, 2, 0, 0, 1, 1, 1, 1, 1, 2, 1, 1, 2,
       1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 2, 0, 1, 0, 0, 0,
       1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,

In [None]:
#train_add = ["PRODUCT_ID", "Y_Class", "Y_Quality", "TIMESTAMP", "LINE", "PRODUCT_CODE"]
#test_add = ["PRODUCT_ID","TIMESTAMP", "LINE", "PRODUCT_CODE"]

#save_train = pd.concat([train[train_add], train_clone], axis=1)
#save_test = pd.concat([test[test_add], test_clone], axis=1)

#save_train.to_csv("train_PCA.csv", index=False)
#save_test.to_csv("test_PCA.csv", index=False)

### IEEE FRUAD ALGORITHM

In [None]:
def reduce_group(grps):
    use = []
    for g in grps:
        mx = 0; vx = g[0]
        for gg in g:
            n = train[str(gg)].nunique()
            if n > mx:
                mx = n
                vx = gg
            #print(str(gg)+'-'+str(n),', ',end='')
        use.append(vx)
        #print()
    return list(set(use))

use_columns = []
for idx in range(1, len(train_keys)):
    correlated = []
    Vs = train_nan_groups[train_keys[idx]]
    is_visited = [False] * len(Vs)
    corr = train[Vs].corr()
    for i in range(len(Vs)):
        tmp = []
        if is_visited[i] == False:
            is_visited[i] = True
            tmp.append(Vs[i])
            for j in range(i+1, len(Vs)):
                ratio = abs(corr.iloc[i, j])
                if ratio >= .95:
                    is_visited[j]=True
                    tmp.append(Vs[j])
            correlated.append(tmp)
        else:
            pass
    new_correlated = []
    for v in Vs:
        tmp = None
        for cor in correlated:
            if v in cor:
                if tmp is None:
                    tmp = cor
                else:
                    tmp.extend(cor)

        new_correlated.append(tmp)
    #print(correlated)
    #print(new_correlated)
    #for cor in correlated:
    #    if len(cor) > 1:
    #        pca_5 = PCA_transform()
    #        train = pca_5.fit_transform(train, cor, 1)
    #        test = pca_5.transform(test)
    final = reduce_group(correlated)
    final.sort(key=lambda x : int(x.split('_')[1]))
    #print("Final", final)
    use_columns.extend(final)
    #make_corr(Vs, train_keys[idx])


In [None]:
train_save_columns = ["PRODUCT_ID", "Y_Class", "Y_Quality", "TIMESTAMP", "LINE", "PRODUCT_CODE"] + use_columns
test_save_columns = ["PRODUCT_ID","TIMESTAMP", "LINE", "PRODUCT_CODE"] + use_columns

In [None]:
len(train_save_columns)

In [None]:
train = train_clone[train_save_columns]
test = test_clone[test_save_columns]

### 249 NAN

In [None]:
#train = train[train_save_columns]
#test = test[test_save_columns]

train.to_csv("train_reduce.csv", index=False)
test.to_csv("test_reduce.csv", index=False)


In [None]:
#Create train and validation set
train_x, valid_x, train_y, valid_y = train_test_split(X, y, test_size=0.2, shuffle=True, random_state=1302)

In [None]:
# lightgbm을 구현하여 shap value를 예측할 것
# ligthgbm 구현

# library
import lightgbm as lgb  # 없을 경우 cmd/anaconda prompt에서 install
from math import sqrt
from sklearn.metrics import mean_squared_error

# lightgbm model
lgb_dtrain = lgb.Dataset(data = train_x, label = train_y) # LightGBM 모델에 맞게 변환
lgb_param = {'max_depth': 10,
            'learning_rate': 0.01, # Step Size
            'n_estimators': 1000, # Number of trees
            'objective': 'regression'} # 목적 함수 (L2 Loss)
lgb_model = lgb.train(params = lgb_param, train_set = lgb_dtrain) # 학습 진행
lgb_model_predict = lgb_model.predict(valid_x) # test data 예측
print("RMSE: {}".format(sqrt(mean_squared_error(lgb_model_predict, valid_y)))) # RMSE

In [None]:
# shap value 
import shap
explainer = shap.TreeExplainer(lgb_model) # Tree model Shap Value 확인 객체 지정
shap_values = explainer.shap_values(valid_x) # Shap Values 계산

In [None]:
shap.initjs() # javascript 초기화 (graph 초기화)
shap.summary_plot(shap_values, valid_x, plot_type = "bar")

In [None]:
vals= np.abs(shap_values).mean(0)
feature_importance = pd.DataFrame(list(zip(train_x.columns,vals)),columns=['col_name','feature_importance_vals'])
feature_importance.sort_values(by=['feature_importance_vals'],ascending=False,inplace=True)
feature_importance.to_csv("SHAP_values.csv", index=False)

In [None]:
X.drop(columns=list(feature_importance[feature_importance['feature_importance_vals'] == 0]['col_name']), inplace=True)

In [None]:
import pandas as pd
feature_importance = pd.read_csv("SHAP_values.csv")
train = pd.read_csv("train.csv")
test = pd.read_csv("test.csv")
#train =train[["Y_Quality", "Y_Class", "PRODUCT_CODE", "LINE"] + list(feature_importance[feature_importance['feature_importance_vals'] == 0]['col_name'])]
#train.to_csv("tmp.csv", index=False)
#test = test[["PRODUCT_CODE", 'LINE'] + list(feature_importance[feature_importance['feature_importance_vals'] == 0]['col_name'])]
#test.to_csv("tmp_test.csv", index=False)

## 다중 공산성 처리

In [None]:
import statsmodels.api as sm

train = pd.read_csv("train_mice.csv")
test = pd.read_csv('test_mice.csv')

for col in train:
    if train[col].nunique() < 2:
        train.drop(columns=col, inplace=True)
        test.drop(columns=col, inplace=True)

# 중복된 변수 제거
dup = ~train.T.duplicated()
train = train.loc[:, dup]
test = test.loc[:, dup]

y = train[["Y_Quality"]]
X = train.drop(columns=["PRODUCT_ID","Y_Class", "Y_Quality", "TIMESTAMP"])
test_X = test.drop(columns=["PRODUCT_ID", "TIMESTAMP"])

num_features = X.select_dtypes(exclude=['object']).columns.to_list()

scaler = StandardScaler()
X[num_features] = scaler.fit_transform(X[num_features])
test_X[num_features] = scaler.transform(test_X[num_features])

# qualitative to quantitative
qual_col = ['PRODUCT_CODE']

for i in qual_col:
    le = LabelEncoder()
    le = le.fit(X[i])
    X[i] = le.transform(X[i])
    
    for label in np.unique(test_X[i]): 
        if label not in le.classes_: 
            le.classes_ = np.append(le.classes_, label)
    test_X[i] = le.transform(test_X[i]) 
    

In [None]:
from statsmodels.stats.outliers_influence import variance_inflation_factor
LINE = ["T010305", "T010306", "T050304", "T050307", "T100304", "T100306"]
line_x = X[X["LINE"] == LINE[0]].drop(columns=["LINE"]).dropna(axis=1, how="all")
vif = pd.DataFrame()
vif["VIF Factor"] = [variance_inflation_factor(line_x.values, i) for i in range(line_x.shape[1])]
vif['features'] = line_x.columns

vif0 = vif.sort_values("VIF Factor").reset_index(drop=True)
vif0

In [None]:
from statsmodels.stats.outliers_influence import variance_inflation_factor
LINE = ["T010305", "T010306", "T050304", "T050307", "T100304", "T100306"]
line_x = X[X["LINE"] == LINE[1]].drop(columns=["LINE"]).dropna(axis=1, how="all")
vif = pd.DataFrame()
vif["VIF Factor"] = [variance_inflation_factor(line_x.values, i) for i in range(line_x.shape[1])]
vif['features'] = line_x.columns

vif1 = vif.sort_values("VIF Factor").reset_index(drop=True)
vif1

In [None]:
from statsmodels.stats.outliers_influence import variance_inflation_factor
LINE = ["T010305", "T010306", "T050304", "T050307", "T100304", "T100306"]
line_x = X[X["LINE"] == LINE[2]].drop(columns=["LINE"]).dropna(axis=1, how="all")
vif = pd.DataFrame()
vif["VIF Factor"] = [variance_inflation_factor(line_x.values, i) for i in range(line_x.shape[1])]
vif['features'] = line_x.columns

vif2 = vif.sort_values("VIF Factor").reset_index(drop=True)
vif2

In [None]:
from statsmodels.stats.outliers_influence import variance_inflation_factor
LINE = ["T010305", "T010306", "T050304", "T050307", "T100304", "T100306"]
line_x = X[X["LINE"] == LINE[3]].drop(columns=["LINE"]).dropna(axis=1, how="all")
vif = pd.DataFrame()
vif["VIF Factor"] = [variance_inflation_factor(line_x.values, i) for i in range(line_x.shape[1])]
vif['features'] = line_x.columns

vif3 = vif.sort_values("VIF Factor").reset_index(drop=True)
vif3

In [None]:
from statsmodels.stats.outliers_influence import variance_inflation_factor
LINE = ["T010305", "T010306", "T050304", "T050307", "T100304", "T100306"]
line_x = X[X["LINE"] == LINE[4]].drop(columns=["LINE"]).dropna(axis=1, how="all")
vif = pd.DataFrame()
vif["VIF Factor"] = [variance_inflation_factor(line_x.values, i) for i in range(line_x.shape[1])]
vif['features'] = line_x.columns

vif4 = vif.sort_values("VIF Factor").reset_index(drop=True)
vif4

In [None]:
from statsmodels.stats.outliers_influence import variance_inflation_factor
LINE = ["T010305", "T010306", "T050304", "T050307", "T100304", "T100306"]
line_x = X[X["LINE"] == LINE[5]].drop(columns=["LINE"]).dropna(axis=1, how="all")
vif = pd.DataFrame()
vif["VIF Factor"] = [variance_inflation_factor(line_x.values, i) for i in range(line_x.shape[1])]
vif['features'] = line_x.columns

vif5 = vif.sort_values("VIF Factor").reset_index(drop=True)
vif5

In [None]:
json_output = {}
json_output[LINE[0]] = vif0[vif0['VIF Factor'] > 10]['features'].to_list()
json_output[LINE[1]] = vif1[vif1['VIF Factor'] > 10]['features'].to_list()
json_output[LINE[2]] = vif2[vif2['VIF Factor'] > 10]['features'].to_list()
json_output[LINE[3]] = vif3[vif3['VIF Factor'] > 10]['features'].to_list()
json_output[LINE[4]] = vif4[vif4['VIF Factor'] > 10]['features'].to_list()
json_output[LINE[5]] = vif5[vif5['VIF Factor'] > 10]['features'].to_list()

import json
with open("VIF.json", 'w') as fp:
    json.dump(json_output, fp)