In [165]:
import pandas as pd 
import numpy as np 
from ydata_profiling import ProfileReport
import matplotlib.pyplot as plt
import seaborn as sns
%matplotlib inline

In [166]:
data = pd.read_csv('data/train.csv', index_col=[0])

def Overview(data) :
    profile = ProfileReport(data)
    profile.to_file('output.html')


# Overview(data)

In [167]:
data.shape

(136429, 13)

In [168]:
data = data[~data.duplicated()]

In [169]:
data.head()

Unnamed: 0_level_0,Product ID,Type,Air temperature [K],Process temperature [K],Rotational speed [rpm],Torque [Nm],Tool wear [min],Machine failure,TWF,HDF,PWF,OSF,RNF
id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1
0,L50096,L,300.6,309.6,1596,36.1,140,0,0,0,0,0,0
1,M20343,M,302.6,312.1,1759,29.1,200,0,0,0,0,0,0
2,L49454,L,299.3,308.5,1805,26.5,25,0,0,0,0,0,0
3,L53355,L,301.0,310.9,1524,44.3,197,0,0,0,0,0,0
4,M24050,M,298.0,309.0,1641,35.4,34,0,0,0,0,0,0


In [170]:
def SingleColumn(data, column):
    if column not in data.columns:
        print(f"Column '{column}' not found in the DataFrame.")
        return

    plt.figure(figsize=(14, 8))
    plt.subplot(121)
    plt.title(f"The Histplot of {column}")
    if data[column].dtype == 'object':
        sns.countplot(data=data, x=column)
    else:
        sns.histplot(data=data, x=column)
    plt.subplot(122)
    plt.title(f"The Pie of {column}")
    plt.pie(data[column].value_counts().values, labels=data[column].value_counts().index, autopct='%1.1f%%', startangle=90)
    plt.tight_layout()
    plt.show()

In [171]:
# SingleColumn(data, 'TWF')
# SingleColumn(data, 'Air temperature [K]')

In [172]:
# for column in data.columns :
#     SingleColumn(data, column)

In [173]:
data.columns

Index(['Product ID', 'Type', 'Air temperature [K]', 'Process temperature [K]',
       'Rotational speed [rpm]', 'Torque [Nm]', 'Tool wear [min]',
       'Machine failure', 'TWF', 'HDF', 'PWF', 'OSF', 'RNF'],
      dtype='object')

In [174]:
# In data there is very imbalance
# And Data is normally distributed


In [175]:
import plotly.express as px
smapleData = data.sample(700)
px.scatter(smapleData, x='Air temperature [K]', color='Machine failure')

In [176]:
data['Machine failure'].value_counts()

Machine failure
0    133166
1      2129
Name: count, dtype: int64

In [177]:
data.isna().sum()

Product ID                 0
Type                       0
Air temperature [K]        0
Process temperature [K]    0
Rotational speed [rpm]     0
Torque [Nm]                0
Tool wear [min]            0
Machine failure            0
TWF                        0
HDF                        0
PWF                        0
OSF                        0
RNF                        0
dtype: int64

In [178]:
data.info()

<class 'pandas.core.frame.DataFrame'>
Index: 135295 entries, 0 to 136428
Data columns (total 13 columns):
 #   Column                   Non-Null Count   Dtype  
---  ------                   --------------   -----  
 0   Product ID               135295 non-null  object 
 1   Type                     135295 non-null  object 
 2   Air temperature [K]      135295 non-null  float64
 3   Process temperature [K]  135295 non-null  float64
 4   Rotational speed [rpm]   135295 non-null  int64  
 5   Torque [Nm]              135295 non-null  float64
 6   Tool wear [min]          135295 non-null  int64  
 7   Machine failure          135295 non-null  int64  
 8   TWF                      135295 non-null  int64  
 9   HDF                      135295 non-null  int64  
 10  PWF                      135295 non-null  int64  
 11  OSF                      135295 non-null  int64  
 12  RNF                      135295 non-null  int64  
dtypes: float64(3), int64(8), object(2)
memory usage: 14.5+ MB


In [179]:
data.drop(['Product ID'], axis=1, inplace=True)

In [206]:
from sklearn.compose import ColumnTransformer
from sklearn.preprocessing import OrdinalEncoder
from sklearn.model_selection import train_test_split
data.columns

Index(['Type', 'Air temperature [K]', 'Process temperature [K]',
       'Rotational speed [rpm]', 'Torque [Nm]', 'Tool wear [min]',
       'Machine failure', 'TWF', 'HDF', 'PWF', 'OSF', 'RNF'],
      dtype='object')

In [207]:
inputs = data.drop(['Machine failure'], axis=1)
targets = data['Machine failure']

In [208]:
X_train, X_test, y_train, y_test = train_test_split(inputs, targets, test_size=0.2, random_state=42)

In [209]:
from sklearn.preprocessing import StandardScaler
trf1 = ColumnTransformer([
    ('ordinal', OrdinalEncoder(), ['Type']),
    
], remainder='passthrough')

In [210]:
X_train = trf1.fit_transform(X_train)

In [211]:
X_test = trf1.transform(X_test)


In [212]:
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import roc_auc_score
from sklearn.ensemble import RandomForestClassifier
from xgboost import XGBClassifier
from lightgbm import LGBMClassifier
from catboost import CatBoostClassifier

In [213]:

model = CatBoostClassifier(random_state=23, n_estimators=200, verbose=False)
model.fit(X_train, y_train)
pred = model.predict(X_test)
roc_auc_score(y_test, pred)

0.892468677953461

In [188]:
def ModelSelction (X_train, y_train, X_test, y_test) :
    model_dict = {
        "lgb" : LogisticRegression(),
        "random" : RandomForestClassifier(random_state=42, n_estimators=200),
        "xgb" : XGBClassifier(random_state=42, n_estimators=200),
        "lgb" : LGBMClassifier(random_state=42, n_estimators=200),
        "cat" : CatBoostClassifier(random_state=42, n_estimators=200, verbose=False)
    }

    for model_name, model in model_dict.items() :
        print("Model Name is :   ",  model_name)
        model.fit(X_train, y_train)
        pred = model.predict(X_test)
        print(roc_auc_score(y_test, pred))


ModelSelction(X_train, y_train, X_test, y_test)


Model Name is :    lgb
[LightGBM] [Info] Number of positive: 1691, number of negative: 106545
[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.007665 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 942
[LightGBM] [Info] Number of data points in the train set: 108236, number of used features: 11
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.015623 -> initscore=-4.143247
[LightGBM] [Info] Start training from score -4.143247
0.8901667907661734
Model Name is :    random


KeyboardInterrupt: 

In [None]:
def Hyperparameter(model_name, **params):
    model_dict = {
        "log": LogisticRegression(),
        # "random": RandomForestClassifier(random_state=42, **params),
        "xgb": XGBClassifier(random_state=42, **params),
        "lgb": LGBMClassifier(random_state=42, **params),
        # "cat": CatBoostClassifier(random_state=42, verbose=False, **params)
    }
    
    model = model_dict[model_name]

    model.fit(X_train, y_train)
    y_pred = model.predict(X_test)
    print(roc_auc_score(y_test, y_pred))




In [None]:
Hyperparameter('xgb', n_estimators=200, learning_rate=0.1, max_depth=7, min_child_weight=1, gamma=0.1, subsample=0.8, colsample_bytree=0.8)

In [None]:
Hyperparameter('lgb', n_estimators=200, learning_rate=0.1, max_depth=6, min_child_samples=20, subsample=0.7, colsample_bytree=0.7)

In [None]:
Hyperparameter('cat', n_estimators=400, learning_rate=0.1, depth=7, min_data_in_leaf=20, subsample=0.9, colsample_bylevel=0.9)

In [None]:
# Best model is LightBGM 

# Now perform PCA
# from sklearn.decomposition import PCA

In [214]:
best_params = {
    "n_estimators":200,
    "learning_rate" : 0.1, 
    "max_depth" : 6,
    "min_child_samples" : 20,
    "subsample" : 0.7, 
    "colsample_bytree":0.7
}

In [215]:
model = LGBMClassifier(random_state=42, **best_params)
model.fit(X_train, y_train)
pred = (model.predict(X_test))
roc_auc_score(y_test, pred)


[LightGBM] [Info] Number of positive: 1691, number of negative: 106545
[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.016558 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 942
[LightGBM] [Info] Number of data points in the train set: 108236, number of used features: 11
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.015623 -> initscore=-4.143247
[LightGBM] [Info] Start training from score -4.143247


0.892468677953461

In [None]:
test_data = pd.read_csv("data/test.csv", index_col=[0])
test_data.drop(['Product ID'], axis=1,  inplace=True)


In [218]:
data = trf1.fit_transform(inputs)

In [219]:
test_data = trf1.transform(test_data)

In [228]:
model = LGBMClassifier(random_state=42, **best_params)
model.fit(data, targets)
pred = (model.predict_proba(test_data)[:,1])

[LightGBM] [Info] Number of positive: 2129, number of negative: 133166
[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.028442 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 943
[LightGBM] [Info] Number of data points in the train set: 135295, number of used features: 11
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.015736 -> initscore=-4.135944
[LightGBM] [Info] Start training from score -4.135944


In [229]:
pred

array([0.00055525, 0.00139817, 0.00018957, ..., 0.00048838, 0.00018671,
       0.00181612])

In [230]:
predictions = pd.read_csv("data/sample_submission.csv")

In [231]:
predictions['Machine failure'] = pred

In [232]:
predictions

Unnamed: 0,id,Machine failure
0,136429,0.000555
1,136430,0.001398
2,136431,0.000190
3,136432,0.000407
4,136433,0.001697
...,...,...
90949,227378,0.000466
90950,227379,0.000197
90951,227380,0.000488
90952,227381,0.000187


In [227]:
predictions.to_csv("prediction.csv", index=False)