In [1]:
import statsmodels.api as sm
from sklearn.decomposition import PCA
from mpl_toolkits.mplot3d import Axes3D
import numpy as np 
from sklearn.ensemble import GradientBoostingClassifier,RandomForestClassifier,AdaBoostClassifier,VotingClassifier,HistGradientBoostingClassifier
from sklearn.metrics import accuracy_score,r2_score,recall_score,roc_auc_score,confusion_matrix,classification_report
from sklearn.svm import SVC
from sklearn.neighbors import KNeighborsClassifier
from xgboost import XGBClassifier
from lightgbm import LGBMClassifier
import lightgbm as lgb
import warnings
from sklearn.decomposition import PCA
from imblearn.over_sampling import SMOTE
from sklearn.model_selection import StratifiedKFold
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.decomposition import TruncatedSVD
from sklearn.exceptions import DataConversionWarning
from catboost import CatBoostClassifier,Pool
from catboost.utils import eval_metric
warnings.filterwarnings("ignore", category=DataConversionWarning)
warnings.filterwarnings("ignore")
from IPython.display import display, HTML
from sklearn.metrics import auc
from sklearn.metrics import precision_recall_curve
from sklearn.model_selection import cross_val_score
from catboost import CatBoostClassifier
from sklearn.model_selection import GridSearchCV,RandomizedSearchCV
from sklearn.tree import DecisionTreeClassifier
from scipy.stats import ttest_ind,chi2_contingency
from sklearn.naive_bayes import GaussianNB
from sklearn.utils import class_weight

import pandas as pd 
from sklearn.preprocessing import PowerTransformer, FunctionTransformer
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from scipy.stats import skew
from sklearn.model_selection import train_test_split
import seaborn as sns
from matplotlib import pyplot as plt
import os

In [2]:
train = pd.read_csv("train.csv")
test = pd.read_csv("test.csv")

### Feature Engineering

In [3]:
train['X_Area_Diff'] = train['X_Maximum'] - train['X_Minimum']
train['Perimeter_Ratio'] = train['X_Perimeter'] / train['Y_Perimeter']
train['Is_Steel_A300'] = train['TypeOfSteel_A300'].apply(lambda x: 1 if x == 1 else 0)
train['Combined_Edges_Index'] = train['Edges_Index'] + train['Edges_X_Index'] + train['Edges_Y_Index']

In [4]:
test['X_Area_Diff'] = test['X_Maximum'] - test['X_Minimum']
test['Perimeter_Ratio'] = test['X_Perimeter'] / test['Y_Perimeter']
test['Is_Steel_A300'] = test['TypeOfSteel_A300'].apply(lambda x: 1 if x == 1 else 0)
test['Combined_Edges_Index'] = test['Edges_Index'] + test['Edges_X_Index'] + test['Edges_Y_Index']

### Column Transformer

In [5]:
unique_counts = train.nunique()
#Threshold to distinguish continous and categorical
threshold = 12
continuous_vars_temp = unique_counts[unique_counts > threshold].index.tolist()
#categorical_vars = unique_counts[unique_counts <= threshold].index.tolist()
if 'id' in continuous_vars_temp:
    continuous_vars_temp.remove('id')

In [6]:
skew_threshold = 0.5  
# Define the columns for transformation
columns_to_transform = ['X_Minimum', 'X_Maximum', 'Y_Minimum', 'Y_Maximum', 'Pixels_Areas',
                        'X_Perimeter', 'Y_Perimeter', 'Sum_of_Luminosity', 'Minimum_of_Luminosity',
                        'Maximum_of_Luminosity', 'Length_of_Conveyer', 'Steel_Plate_Thickness',
                        'Edges_Index', 'Empty_Index', 'Square_Index', 'Outside_X_Index',
                        'Edges_X_Index', 'Edges_Y_Index', 'LogOfAreas', 'Log_X_Index',
                        'Log_Y_Index', 'Orientation_Index', 'Luminosity_Index', 'SigmoidOfAreas',
                        'X_Area_Diff', 'Perimeter_Ratio', 'Combined_Edges_Index']

# Calculate skewness for each column
skewness = train[columns_to_transform].apply(lambda x: x.skew())
power_transformer = PowerTransformer(method='yeo-johnson', standardize=True)
function_transformer = FunctionTransformer(np.log1p, validate=True)  
transformer = ColumnTransformer([
    ('power_transform', power_transformer, skewness[skewness > skew_threshold].index),
    ('log_transform', function_transformer, skewness[skewness <= skew_threshold].index)
])

train[columns_to_transform] = transformer.fit_transform(train[columns_to_transform])

In [7]:
unique_counts = test.nunique()
#Threshold to distinguish continous and categorical
threshold = 12
continuous_vars_test = unique_counts[unique_counts > threshold].index.tolist()
#categorical_vars = unique_counts[unique_counts <= threshold].index.tolist()
if 'id' in continuous_vars_test:
    continuous_vars_test.remove('id')

In [8]:
skew_threshold = 0.5  
# Define the columns for transformation
columns_to_transform = ['X_Minimum', 'X_Maximum', 'Y_Minimum', 'Y_Maximum', 'Pixels_Areas',
                        'X_Perimeter', 'Y_Perimeter', 'Sum_of_Luminosity', 'Minimum_of_Luminosity',
                        'Maximum_of_Luminosity', 'Length_of_Conveyer', 'Steel_Plate_Thickness',
                        'Edges_Index', 'Empty_Index', 'Square_Index', 'Outside_X_Index',
                        'Edges_X_Index', 'Edges_Y_Index', 'LogOfAreas', 'Log_X_Index',
                        'Log_Y_Index', 'Orientation_Index', 'Luminosity_Index', 'SigmoidOfAreas',
                        'X_Area_Diff', 'Perimeter_Ratio', 'Combined_Edges_Index']

# Calculate skewness for each column
skewness = test[columns_to_transform].apply(lambda x: x.skew())
power_transformer = PowerTransformer(method='yeo-johnson', standardize=True)
function_transformer = FunctionTransformer(np.log1p, validate=True)  
transformer = ColumnTransformer([
    ('power_transform', power_transformer, skewness[skewness > skew_threshold].index),
    ('log_transform', function_transformer, skewness[skewness <= skew_threshold].index)
])

test[columns_to_transform] = transformer.fit_transform(test[columns_to_transform])

In [9]:
numerical_columns = train.select_dtypes(include='number').columns

### Remove Outliers

In [10]:
def remove_outliers_replace(data, columns, threshold=1.5):
    data_no_outliers = data.copy()

    for column in columns:
        Q1 = data_no_outliers[column].quantile(0.25)
        Q3 = data_no_outliers[column].quantile(0.75)
        IQR = Q3 - Q1

        lower_bound = Q1 - threshold * IQR
        upper_bound = Q3 + threshold * IQR

        is_outlier = (data_no_outliers[column] < lower_bound) | (data_no_outliers[column] > upper_bound)

        if data_no_outliers[column].dtype == 'O':  # Categorical column
            median_value = data_no_outliers.loc[~is_outlier, column].mode().iloc[0]
            data_no_outliers.loc[is_outlier, column] = median_value
        else:  # Numerical column
            mean_value = data_no_outliers.loc[~is_outlier, column].mean()
            data_no_outliers.loc[is_outlier, column] = mean_value

    return data_no_outliers

columns_to_remove_outliers_replace = continuous_vars_temp
train = remove_outliers_replace(train, columns_to_remove_outliers_replace)
test = remove_outliers_replace(test, columns_to_remove_outliers_replace)

In [11]:
train.drop(columns='id',axis = 1,inplace = True)

In [12]:
X = train.drop(['Pastry', 'Z_Scratch', 'K_Scatch', 'Stains', 'Dirtiness', 'Bumps', 'Other_Faults'], axis=1)
y = train[['Pastry', 'Z_Scratch', 'K_Scatch', 'Stains', 'Dirtiness', 'Bumps', 'Other_Faults']]

### Hypothesis Testing

In [13]:
import pandas as pd
from scipy.stats import ttest_ind

target_columns = ['Pastry', 'Z_Scratch', 'K_Scatch', 'Stains', 'Dirtiness', 'Bumps', 'Other_Faults']

# Perform pairwise t-tests for each target column with all other columns
p_values = {}
for target in target_columns:
    p_values[target] = {}  # Initialize a dictionary for storing p-values for this target
    for column in train.columns:
        if column != target:  # Exclude the target column itself
            t_stat, p_value = ttest_ind(train[target], train[column])
            p_values[target][column] = p_value

# Convert the nested dictionary to a DataFrame for better visualization
p_values_df = pd.DataFrame(p_values)


In [14]:
target_columns = ['Pastry', 'Z_Scratch', 'K_Scatch', 'Stains', 'Dirtiness', 'Bumps', 'Other_Faults']

# Perform pairwise t-tests for each target column with all other columns
significant_columns = {}
for target in target_columns:
    p_values = {}  # Initialize a dictionary for storing p-values for this target
    for column in train.columns:
        if column != target:  # Exclude the target column itself
            t_stat, p_value = ttest_ind(train[target], train[column])
            p_values[column] = p_value
    
    # Filter columns based on p-value threshold (e.g., 0.05)
    significant_columns[target] = [col for col, p_val in p_values.items() if p_val <= 0.05]

# Display the number of significant columns for each target column
for target, cols in significant_columns.items():
    print(f"Number of significant columns for '{target}': {len(cols)}")

Number of significant columns for 'Pastry': 37
Number of significant columns for 'Z_Scratch': 36
Number of significant columns for 'K_Scatch': 37
Number of significant columns for 'Stains': 34
Number of significant columns for 'Dirtiness': 34
Number of significant columns for 'Bumps': 37
Number of significant columns for 'Other_Faults': 36


In [15]:
test.drop(columns=['id'], inplace=True)

In [16]:
train = X
target = y

In [17]:
test.shape, train.shape, target.shape

((12814, 31), (19219, 31), (19219, 7))

In [74]:
train.shape

(19219, 31)

## Predict 7 columns with one model

In [19]:
from pycaret.classification import *

In [29]:
for i in target.columns:
    print(target[i].value_counts())

0    17753
1     1466
Name: Pastry, dtype: int64
0    18069
1     1150
Name: Z_Scratch, dtype: int64
0    15787
1     3432
Name: K_Scatch, dtype: int64
0    18651
1      568
Name: Stains, dtype: int64
0    18734
1      485
Name: Dirtiness, dtype: int64
0    14456
1     4763
Name: Bumps, dtype: int64
0    12661
1     6558
Name: Other_Faults, dtype: int64


In [32]:
_ = setup(data=train, target=target[['Pastry','Z_Scratch','K_Scatch','Stains','Dirtiness','Bumps','Other_Faults']])


ValueError: The least populated class in y has only 1 member, which is too few. The minimum number of groups for any class cannot be less than 2.

## Individual Models for Individual Columns

In [75]:
from pycaret.regression import setup, compare_models

In [79]:
_ = setup(data=pd.concat([train, pd.DataFrame(target['Pastry'])], axis=1), target = 'Pastry')

Unnamed: 0,Description,Value
0,Session id,8442
1,Target,Pastry
2,Target type,Regression
3,Original data shape,"(19219, 32)"
4,Transformed data shape,"(19219, 32)"
5,Transformed train set shape,"(13453, 32)"
6,Transformed test set shape,"(5766, 32)"
7,Numeric features,31
8,Preprocess,True
9,Imputation type,simple


In [80]:
compare_models()

Unnamed: 0,Model,MAE,MSE,RMSE,R2,RMSLE,MAPE,TT (Sec)
gbr,Gradient Boosting Regressor,0.1208,0.0588,0.2425,0.1457,0.1699,0.7906,4.171
lightgbm,Light Gradient Boosting Machine,0.1175,0.0595,0.2438,0.1362,0.1717,0.7635,0.748
catboost,CatBoost Regressor,0.1222,0.06,0.2448,0.1293,0.1727,0.7647,9.036
et,Extra Trees Regressor,0.126,0.061,0.2469,0.1142,0.1757,0.7726,1.666
ridge,Ridge Regression,0.1364,0.0617,0.2484,0.1034,0.1745,0.8281,0.041
lr,Linear Regression,0.1364,0.0617,0.2484,0.1033,0.1745,0.828,1.375
br,Bayesian Ridge,0.1357,0.0618,0.2484,0.1032,0.1744,0.83,0.042
rf,Random Forest Regressor,0.125,0.0618,0.2486,0.1017,0.1771,0.7723,8.364
lar,Least Angle Regression,0.1371,0.062,0.249,0.099,0.1751,0.8278,0.072
omp,Orthogonal Matching Pursuit,0.1352,0.0643,0.2535,0.0664,0.1774,0.8634,0.041


In [81]:
_ = setup(data=pd.concat([train, pd.DataFrame(target['Z_Scratch'])], axis=1), target = 'Z_Scratch')

Unnamed: 0,Description,Value
0,Session id,2076
1,Target,Z_Scratch
2,Target type,Regression
3,Original data shape,"(19219, 32)"
4,Transformed data shape,"(19219, 32)"
5,Transformed train set shape,"(13453, 32)"
6,Transformed test set shape,"(5766, 32)"
7,Numeric features,31
8,Preprocess,True
9,Imputation type,simple


In [82]:
compare_models()

In [83]:
_ = setup(data=pd.concat([train, pd.DataFrame(target['K_Scatch'])], axis=1), target = 'K_Scatch')

Unnamed: 0,Description,Value
0,Session id,295
1,Target,K_Scatch
2,Target type,Regression
3,Original data shape,"(19219, 32)"
4,Transformed data shape,"(19219, 32)"
5,Transformed train set shape,"(13453, 32)"
6,Transformed test set shape,"(5766, 32)"
7,Numeric features,31
8,Preprocess,True
9,Imputation type,simple


In [84]:
compare_models()

Unnamed: 0,Model,MAE,MSE,RMSE,R2,RMSLE,MAPE,TT (Sec)
gbr,Gradient Boosting Regressor,0.0679,0.0332,0.1817,0.7735,0.1261,0.1823,2.624
catboost,CatBoost Regressor,0.0699,0.0335,0.1827,0.7713,0.1271,0.1805,7.204
et,Extra Trees Regressor,0.068,0.0335,0.1828,0.7712,0.1276,0.1866,1.093
lightgbm,Light Gradient Boosting Machine,0.0664,0.0336,0.1829,0.7706,0.127,0.1809,0.293
rf,Random Forest Regressor,0.0682,0.0344,0.1852,0.7654,0.1297,0.1887,5.98
xgboost,Extreme Gradient Boosting,0.0739,0.0382,0.195,0.7394,0.1348,0.1921,1.249
knn,K Neighbors Regressor,0.0621,0.0386,0.1961,0.7363,0.1376,0.1656,0.076
ridge,Ridge Regression,0.12,0.042,0.2047,0.7134,0.1489,0.2334,0.032
br,Bayesian Ridge,0.12,0.042,0.2047,0.7134,0.1489,0.2335,0.032
lr,Linear Regression,0.1198,0.042,0.2047,0.7133,0.1489,0.233,0.039


In [85]:
_ = setup(data=pd.concat([train, pd.DataFrame(target['Stains'])], axis=1), target = 'Stains')

Unnamed: 0,Description,Value
0,Session id,4136
1,Target,Stains
2,Target type,Regression
3,Original data shape,"(19219, 32)"
4,Transformed data shape,"(19219, 32)"
5,Transformed train set shape,"(13453, 32)"
6,Transformed test set shape,"(5766, 32)"
7,Numeric features,31
8,Preprocess,True
9,Imputation type,simple


In [86]:
compare_models()

Unnamed: 0,Model,MAE,MSE,RMSE,R2,RMSLE,MAPE,TT (Sec)


<catboost.core.CatBoostRegressor at 0x1d96b37c580>

In [87]:
_ = setup(data=pd.concat([train, pd.DataFrame(target['Dirtiness'])], axis=1), target = 'Dirtiness')

Unnamed: 0,Description,Value
0,Session id,4767
1,Target,Dirtiness
2,Target type,Regression
3,Original data shape,"(19219, 32)"
4,Transformed data shape,"(19219, 32)"
5,Transformed train set shape,"(13453, 32)"
6,Transformed test set shape,"(5766, 32)"
7,Numeric features,31
8,Preprocess,True
9,Imputation type,simple


In [88]:
compare_models()

Unnamed: 0,Model,MAE,MSE,RMSE,R2,RMSLE,MAPE,TT (Sec)
gbr,Gradient Boosting Regressor,0.0464,0.0225,0.1496,0.0939,0.1047,0.8673,3.016
catboost,CatBoost Regressor,0.0499,0.0228,0.1508,0.0775,0.1064,0.8493,6.803
lightgbm,Light Gradient Boosting Machine,0.0486,0.0234,0.1529,0.0532,0.1082,0.8489,0.366
ridge,Ridge Regression,0.0536,0.0236,0.1534,0.0482,0.1071,0.9247,0.029
lr,Linear Regression,0.0537,0.0236,0.1534,0.0481,0.1071,0.9241,0.05
br,Bayesian Ridge,0.053,0.0237,0.1536,0.0458,0.1071,0.9287,0.035
et,Extra Trees Regressor,0.0505,0.0237,0.1537,0.0422,0.1095,0.8605,0.869
lar,Least Angle Regression,0.0555,0.0238,0.1541,0.0394,0.1079,0.921,0.033
rf,Random Forest Regressor,0.0505,0.0239,0.1543,0.0356,0.1101,0.8678,5.99
omp,Orthogonal Matching Pursuit,0.0509,0.0244,0.1558,0.0182,0.1083,0.9555,0.019


Processing:   0%|          | 0/85 [00:00<?, ?it/s]

In [89]:
_ = setup(data=pd.concat([train, pd.DataFrame(target['Bumps'])], axis=1), target = 'Bumps')

Unnamed: 0,Description,Value
0,Session id,6764
1,Target,Bumps
2,Target type,Regression
3,Original data shape,"(19219, 32)"
4,Transformed data shape,"(19219, 32)"
5,Transformed train set shape,"(13453, 32)"
6,Transformed test set shape,"(5766, 32)"
7,Numeric features,31
8,Preprocess,True
9,Imputation type,simple


In [90]:
compare_models()

Unnamed: 0,Model,MAE,MSE,RMSE,R2,RMSLE,MAPE,TT (Sec)
lightgbm,Light Gradient Boosting Machine,0.2945,0.1495,0.3866,0.1962,0.2718,0.5892,0.412
gbr,Gradient Boosting Regressor,0.3042,0.1496,0.3867,0.1959,0.2718,0.6095,2.628
catboost,CatBoost Regressor,0.2979,0.1512,0.3887,0.1872,0.2737,0.5859,7.057
rf,Random Forest Regressor,0.3083,0.154,0.3924,0.1718,0.2774,0.599,8.284
et,Extra Trees Regressor,0.3064,0.1549,0.3935,0.1671,0.2783,0.594,1.618
ridge,Ridge Regression,0.3248,0.1605,0.4006,0.1369,0.2818,0.6466,0.031
lr,Linear Regression,0.3247,0.1605,0.4006,0.1368,0.2819,0.6464,0.033
br,Bayesian Ridge,0.3251,0.1607,0.4007,0.1363,0.2819,0.6484,0.025
xgboost,Extreme Gradient Boosting,0.3055,0.1641,0.4051,0.1172,0.2848,0.5866,1.268
lar,Least Angle Regression,0.3269,0.1659,0.4072,0.1066,0.2852,0.6443,0.036


Processing:   0%|          | 0/85 [00:00<?, ?it/s]

In [91]:
_ = setup(data=pd.concat([train, pd.DataFrame(target['Other_Faults'])], axis=1), target = 'Other_Faults')

Unnamed: 0,Description,Value
0,Session id,6759
1,Target,Other_Faults
2,Target type,Regression
3,Original data shape,"(19219, 32)"
4,Transformed data shape,"(19219, 32)"
5,Transformed train set shape,"(13453, 32)"
6,Transformed test set shape,"(5766, 32)"
7,Numeric features,31
8,Preprocess,True
9,Imputation type,simple


In [92]:
compare_models()

Unnamed: 0,Model,MAE,MSE,RMSE,R2,RMSLE,MAPE,TT (Sec)
gbr,Gradient Boosting Regressor,0.4044,0.2027,0.4502,0.0987,0.3166,0.5904,2.609
lightgbm,Light Gradient Boosting Machine,0.3992,0.2042,0.4519,0.0919,0.3177,0.5822,0.339
catboost,CatBoost Regressor,0.3981,0.2062,0.454,0.0833,0.3194,0.5798,6.787
ada,AdaBoost Regressor,0.4182,0.2071,0.4551,0.0791,0.3216,0.5932,0.193
rf,Random Forest Regressor,0.4073,0.2078,0.4558,0.076,0.3222,0.5775,9.722
et,Extra Trees Regressor,0.4052,0.2084,0.4565,0.0733,0.3223,0.5766,2.172
ridge,Ridge Regression,0.4186,0.2098,0.4581,0.067,0.3221,0.6115,0.03
br,Bayesian Ridge,0.4196,0.2098,0.458,0.067,0.322,0.6129,0.034
lr,Linear Regression,0.4186,0.2099,0.4581,0.0669,0.3221,0.6115,0.046
huber,Huber Regressor,0.4032,0.2129,0.4614,0.0534,0.3194,0.6531,0.356


Processing:   0%|          | 0/85 [00:00<?, ?it/s]