In [None]:
# https://www.kaggle.com/competitions/playground-series-s3e23/data


import pandas as pd 
import numpy as np  
from ydata_profiling import ProfileReport
import seaborn as sns
import matplotlib.pyplot as plt 
%matplotlib inline

In [None]:
# Load the training set; the first CSV column is used as the DataFrame index.
data = pd.read_csv('data/train.csv', index_col=[0])

def Overview(data, output_path="output.html"):
    """Generate a ydata-profiling HTML report for a DataFrame.

    Parameters
    ----------
    data : pandas.DataFrame
        Frame to profile.
    output_path : str, default "output.html"
        File the HTML report is written to. The default keeps the location
        the notebook has always used.

    Returns
    -------
    None
        The report is written to ``output_path`` as a side effect.
    """
    profile = ProfileReport(df=data, dark_mode=True)
    profile.to_file(output_path)

# The call is left disabled; uncomment it to (re)generate the report.
# Overview(data)

In [None]:
# Data analysis
#
# The full training set is large, so the exploratory cells below work on a
# random sample of it instead.
#
# A fixed seed keeps the sample -- and therefore every plot and the correlation
# ranking derived from it -- identical across "Restart & Run All".
# min(...) avoids the ValueError DataFrame.sample raises when asked for more
# rows than the frame holds.

sample_data = data.sample(n=min(10000, len(data)), random_state=42)
sample_data.head()

In [None]:
# Univariate data analysis

def UnivariateAnalysis(data, column, max_slices=10):
    """Plot one column's distribution as a histogram and a pie chart.

    Parameters
    ----------
    data : pandas.DataFrame
        Frame containing ``column``.
    column : str
        Name of the column to plot.
    max_slices : int or None, default 10
        Largest number of distinct values drawn as individual pie wedges.
        When the column has more distinct values than this, only the
        ``max_slices`` most frequent are kept and the rest are summed into a
        single ``'other'`` wedge. Pass ``None`` to draw one wedge per distinct
        value.

    Returns
    -------
    None
        The figure is rendered with ``plt.show()``.
    """
    fig, (hist_ax, pie_ax) = plt.subplots(1, 2, figsize=(12, 6))

    # Left panel: histogram (the title now matches the plot type).
    sns.histplot(data=data, x=column, ax=hist_ax)
    hist_ax.set_title(f'Histogram of {column}')

    # Right panel: share of each distinct value.
    counts = data[column].value_counts()
    if max_slices is not None and len(counts) > max_slices:
        # value_counts() orders by frequency (most frequent first), so the
        # leading rows are the ones worth showing individually. A continuous
        # column would otherwise produce one unreadable wedge per value.
        leading = counts.iloc[:max_slices]
        remainder = pd.Series(
            {'other': counts.iloc[max_slices:].sum()}, name=counts.name
        )
        counts = pd.concat([leading, remainder])
    counts.plot(kind='pie', autopct='%1.1f%%', ax=pie_ax)
    pie_ax.set_title(f'Pie Chart of {column}')

    fig.tight_layout()
    plt.show()

In [None]:
# Distribution of the 'defects' column (used as the target later on) in the sample.
UnivariateAnalysis(sample_data, 'defects')

In [None]:
# Correlation of every column with 'defects', strongest positive first,
# with the self-correlation entry removed.
corr_df = (
    sample_data.corr()['defects']
    .sort_values(ascending=False)
    .drop(['defects'])
)
# Keep the ten highest-ranked columns plus the single lowest-ranked one.
corr_df = pd.concat([corr_df.iloc[:10], corr_df.iloc[-1:]])
corr_df.index

In [None]:
# Plot the univariate distribution of every column selected in corr_df.
for feature_name in corr_df.index:
    UnivariateAnalysis(sample_data, feature_name)

In [None]:
# The plots above show heavy skew in the data; to check the actual skewness we
# use a density plot together with stats.probplot (a Q-Q plot).

from scipy import stats
def CheckSkewness(data, column):
    """Draw a density plot and a normal Q-Q plot of ``data[column]`` side by side.

    Parameters
    ----------
    data : pandas.DataFrame
        Frame containing ``column``.
    column : str
        Name of the column to inspect.

    Returns
    -------
    None
        The figure is rendered with ``plt.show()``.
    """
    values = data[column]
    fig, (density_ax, qq_ax) = plt.subplots(nrows=1, ncols=2, figsize=(12, 6))

    # Left panel: kernel density estimate of the column.
    sns.kdeplot(values, ax=density_ax)
    density_ax.set_title("Density Plot on " + column)

    # Right panel: quantiles of the column against a normal distribution.
    stats.probplot(values, dist='norm', plot=qq_ax)
    qq_ax.set_title(f"Q-Q Plot of {column}")

    fig.tight_layout()
    plt.show()

In [None]:
# Skewness check for the 'loc' column of the sample.
CheckSkewness(sample_data, 'loc')

In [None]:
# Skewness check for every column selected in corr_df.
# NOTE(review): this runs on the full `data` frame, whereas the other
# exploratory cells use `sample_data` -- confirm that is intended.
for feature_name in corr_df.index:
    CheckSkewness(data, feature_name)

In [None]:
# All of these columns are skewed, so we should consider a binning technique.
# Print how often each distinct value occurs, column by column.
for name, values in data.items():
    print(name)
    print(values.value_counts().sort_values(ascending=False))

In [None]:
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import KBinsDiscretizer
from sklearn.compose import ColumnTransformer

In [None]:
from sklearn.preprocessing import LabelEncoder

# Features are every column except the target.
inputs = data.drop(columns=['defects'])

# Encode the target column as integer class labels.
le = LabelEncoder()
targets = le.fit_transform(data['defects'])

# Hold out 20% of the rows for evaluation; the fixed seed keeps the split stable.
X_train, X_test, y_train, y_test = train_test_split(
    inputs,
    targets,
    test_size=0.2,
    random_state=42,
)

In [None]:
# Number of missing values in each column.
data.isna().sum()

In [None]:
# Summary statistics of the columns.
data.describe()

In [None]:
from sklearn.preprocessing import PowerTransformer, FunctionTransformer

In [None]:
# Alternative preprocessing: PowerTransformer (default settings) on the listed
# columns, any other column passed through unchanged, pandas output.
# NOTE(review): trf1 is not used by any of the visible cells below.
trf1 = ColumnTransformer(
    transformers=[
        (
            "power",
            PowerTransformer(),
            [
                'loc', 'v(g)', 'ev(g)', 'iv(g)',
                'n', 'v', 'l', 'd', 'i', 'e', 'b', 't',
                'lOCode', 'lOComment', 'lOBlank', 'locCodeAndComment',
                'uniq_Op', 'uniq_Opnd', 'total_Op', 'total_Opnd',
                'branchCount',
            ],
        ),
    ],
    remainder='passthrough',
    verbose_feature_names_out=False,
)
trf1 = trf1.set_output(transform='pandas')

In [None]:

# log1p on the listed columns, any other column passed through unchanged,
# pandas output so column names and the row index are kept.
trf2 = ColumnTransformer(
    transformers=[
        (
            "function",
            FunctionTransformer(func=np.log1p),
            [
                'loc', 'v(g)', 'ev(g)', 'iv(g)',
                'n', 'v', 'l', 'd', 'i', 'e', 'b', 't',
                'lOCode', 'lOComment', 'lOBlank', 'locCodeAndComment',
                'uniq_Op', 'uniq_Opnd', 'total_Op', 'total_Opnd',
                'branchCount',
            ],
        ),
    ],
    remainder='passthrough',
    verbose_feature_names_out=False,
).set_output(transform='pandas')

# Always transform the *raw* rows taken from `inputs`, selected by the index
# labels of the current split. Feeding X_train back into the transformer
# (X_train = f(X_train)) would apply log1p a second time whenever this cell is
# re-run; looking the rows up in `inputs` makes the cell idempotent.
assert inputs.index.is_unique, "row lookup by index label needs a unique index"
X_train = trf2.fit_transform(inputs.loc[X_train.index])
X_test = trf2.transform(inputs.loc[X_test.index])

# Preview the transformed training features (must be the cell's last expression
# to be displayed).
X_train.head()


In [None]:
# Skewness check for every column of the (transformed) training features.
for feature_name in X_train.columns:
    CheckSkewness(X_train, feature_name)

In [None]:
# After the log1p transform the data should be less skewed
# (TODO: confirm against the plots above).

from sklearn.metrics import roc_auc_score


In [None]:
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier
from xgboost import XGBClassifier
from lightgbm import LGBMClassifier
from catboost import CatBoostClassifier
from sklearn.neighbors import KNeighborsClassifier

In [None]:
def TrainModel():
    """Fit several classifiers and report each one's ROC AUC on the test split.

    Reads the notebook-level ``X_train``, ``y_train``, ``X_test`` and
    ``y_test``; it takes no arguments.

    Returns
    -------
    dict
        Maps each model's short name (``"log"``, ``"lgb"``, ``"cat"``,
        ``"xgb"``) to its ROC AUC on the test split.
    """
    lgb_dict = {
        "learning_rate": 0.2,
        "n_estimators": 200,
        "boosting_type": 'dart',
        "reg_alpha": 0.4,
    }

    xgb_dict = {
        "n_estimators": 200,
        "learning_rate": 0.2,
        "eval_metric": 'auc',
        "objective": "binary:logistic",
    }

    cat_dict = {
        "n_estimators": 200,
        "learning_rate": 0.2,
    }

    model_dict = {
        "log": LogisticRegression(max_iter=1000),
        "lgb": LGBMClassifier(random_state=42, **lgb_dict),
        "cat": CatBoostClassifier(random_state=42, verbose=False, **cat_dict),
        "xgb": XGBClassifier(random_state=42, **xgb_dict),
    }

    model_score = {}

    for model_name, model in model_dict.items():
        model.fit(X_train, y_train)
        # ROC AUC measures how well the model *ranks* samples, so it must be
        # fed a continuous score. Hard 0/1 labels from predict() collapse the
        # curve to a single operating point and understate the AUC.
        # Column 1 of predict_proba is the probability of class label 1.
        positive_proba = model.predict_proba(X_test)[:, 1]
        roc_auc = roc_auc_score(y_test, positive_proba)
        print(f"ROC AUC Score of model {model_name}:", roc_auc)
        model_score[model_name] = roc_auc

    return model_score


TrainModel()

In [None]:
from sklearn.metrics import roc_curve, auc


def KNNModel(X_test, X_train, y_train, y_test, n_neighbors=200):
    """Fit a k-nearest-neighbours classifier and report its test ROC AUC.

    Note the argument order: the *test* features come first.

    Parameters
    ----------
    X_test : pandas.DataFrame
        Test features (converted with ``to_numpy()``).
    X_train : pandas.DataFrame
        Training features (converted with ``to_numpy()``).
    y_train : array-like
        Training labels.
    y_test : array-like
        Test labels.
    n_neighbors : int, default 200
        Number of neighbours used by the classifier; the default is the value
        that was previously hard-coded.

    Returns
    -------
    float
        ROC AUC on the test split.
    """
    X_train = X_train.to_numpy()
    X_test = X_test.to_numpy()
    knn = KNeighborsClassifier(n_neighbors=n_neighbors)
    knn.fit(X_train, y_train)

    # Score with the probability of class label 1: ROC AUC needs a continuous
    # ranking score, not the hard labels returned by predict().
    positive_proba = knn.predict_proba(X_test)[:, 1]

    roc_auc = roc_auc_score(y_test, positive_proba)
    print("ROC AUC Score of model :", roc_auc)
    return roc_auc


KNNModel(X_test, X_train,y_train,  y_test)


In [None]:
# Load the competition test set.
# NOTE(review): unlike the train.csv load, no index_col is passed here, so the
# first CSV column (presumably the row id) stays a regular column -- confirm
# the downstream transformer is meant to ignore it.
test_data = pd.read_csv('data/test.csv')
test_data.shape

In [None]:
# Final model: refit the transformer on *all* training inputs and train on them.
# The transformed frame gets its own name instead of overwriting `data`:
# clobbering the raw frame would break every earlier cell that reads it
# (for example data.drop(['defects'], ...)) when it is re-run.
train_features = trf2.fit_transform(inputs)

# NOTE: test_data is replaced by its transformed version here; re-running this
# cell without reloading test.csv first would apply the transform twice.
test_data = trf2.transform(test_data)

model = LGBMClassifier(random_state=42)
model.fit(train_features, targets)
test_data.shape

In [None]:
# predict_proba returns one column per class, ordered like model.classes_.
# The target was label-encoded to 0/1, so column 1 holds the probability of
# the positive class ('defects'). Column 0 is the probability of *no* defect
# and would invert the ranking of the submitted predictions.
y_pred = model.predict_proba(test_data)[:, 1]

In [None]:
# Number of predictions produced.
len(y_pred)

In [None]:
# Sanity check: there must be exactly one prediction per test row.
# (The empty `predictions` frame that used to be created here was dead code:
# `predictions` is assigned from sample_submission.csv before it is first used.)
assert len(test_data.index) == len(y_pred), "prediction count does not match the number of test rows"
len(test_data.index)

In [None]:
# Use the sample submission file as the template for the output frame.
predictions = pd.read_csv('data/sample_submission.csv')

In [None]:
# Fill the 'defects' column with the model output; values are assigned by
# position, so the row order is taken as-is from the template.
# NOTE(review): assumes sample_submission.csv lists rows in the same order as test.csv -- confirm.
predictions['defects'] = y_pred
predictions.shape

In [None]:
# Write the submission file without the DataFrame index.
predictions.to_csv('prediction.csv', index=False)

In [None]:
# Final (rows, columns) shape of the submission frame.
predictions.shape