In [3]:
import pandas as pd 
import numpy as np 
import os 
from sklearn.preprocessing import OneHotEncoder
from sklearn.preprocessing import LabelEncoder, MinMaxScaler
from sklearn.compose import make_column_transformer,make_column_selector
from sklearn.neighbors import KNeighborsClassifier
from sklearn.model_selection import StratifiedKFold 
from sklearn.linear_model import LogisticRegression
from sklearn.pipeline import Pipeline
from sklearn.model_selection import GridSearchCV
os.chdir(r"D:\kaggla_comp\New folder")

In [4]:
train = pd.read_csv("train.csv", index_col=0)

train['Previous qualification'] = train['Previous qualification'].astype(object)
train["Mother's qualification"] = train["Mother's qualification"].astype(object)
train["Father's qualification"] = train["Father's qualification"].astype(object)
train["Mother's occupation"] = train["Mother's occupation"].astype(object)
train["Mother's occupation"] = train["Mother's occupation"].astype(object)
train['Application order'] = train['Application order'].astype(object)
train['Application mode'] = train['Application mode'].astype(object)
train['Marital status'] = train['Marital status'].astype(object)
train['Nacionality'] = train['Nacionality'].astype(object)
train['Course'] = train['Course'].astype(object)


In [5]:
# Initialize a LabelEncoder object
le = LabelEncoder()

# Fit the LabelEncoder to the 'Target' column of the train DataFrame and transform it
train['Target'] = le.fit_transform(train['Target'])

# Assign the transformed 'Target' column to the variable y
y = train['Target']

# Drop the 'Target' column from the train DataFrame and assign the result to the variable X
X = train.drop('Target', axis=1)

In [6]:
#### Column Transformer on Train set
# Initialize a OneHotEncoder
ohc = OneHotEncoder(
    # Output a dense array instead of a sparse matrix
    sparse_output=False,
    # Ignore unknown categories during transformation
    handle_unknown='ignore',
    # Drop the first category for each feature
    drop='first'
)

# Set the output format of the OneHotEncoder to pandas DataFrame
ohc.set_output(transform='pandas')

# Create a ColumnTransformer
ct = make_column_transformer(
    # Apply the OneHotEncoder to columns with object (categorical) data types
    (ohc, make_column_selector(dtype_include=object)),
    # Pass through columns with non-object (numerical) data types
    ("passthrough", make_column_selector(dtype_exclude=object)),
    # Suppress verbose output about feature names
    verbose_feature_names_out=False
)

In [8]:
# Initialize a Min-Max Scaler
mm_scaler = MinMaxScaler()

# Initialize a K-Nearest Neighbors Classifier
knn = KNeighborsClassifier()

# Create a Pipeline with the ColumnTransformer, Min-Max Scaler, and KNN Classifier
pipe = Pipeline([
    # Apply the ColumnTransformer to the data
    ('TRNS', ct),
    # Apply the Min-Max Scaler to the data
    ('SCL', mm_scaler),
    # Apply the KNN Classifier to the data
    ('KNN', knn)
])

# Define the grid of hyperparameters for the KNN Classifier
params = {
    # Number of neighbors for the KNN Classifier
    'KNN__n_neighbors': np.arange(1, 10)
}

# Create a Stratified K-Fold cross-validation object
kfold = StratifiedKFold(
    # Number of folds for the cross-validation
    n_splits=5,
    # Shuffle the data before splitting it into folds
    shuffle=True,
    # Random seed for shuffling
    random_state=24
)

# Create a GridSearchCV object
gcv = GridSearchCV(
    # Model to be tuned
    pipe,
    # Grid of hyperparameters
    param_grid=params,
    # Cross-validation object
    cv=kfold,
    # Verbosity level for the grid search
    verbose=3,
    # Metric to use for evaluating the model
    scoring='accuracy'
)

# Fit the grid search model to the data
gcv.fit(X, y)

# Print the best combination of hyperparameters found during the grid search
print(gcv.best_params_)

# Print the best score (i.e., the highest accuracy) found during the grid search
print(gcv.best_score_)

Fitting 5 folds for each of 9 candidates, totalling 45 fits




[CV 1/5] END ................KNN__n_neighbors=1;, score=0.663 total time=   5.0s




[CV 2/5] END ................KNN__n_neighbors=1;, score=0.665 total time=   4.9s




[CV 3/5] END ................KNN__n_neighbors=1;, score=0.671 total time=   5.1s




[CV 4/5] END ................KNN__n_neighbors=1;, score=0.661 total time=   5.0s




[CV 5/5] END ................KNN__n_neighbors=1;, score=0.661 total time=   5.0s




[CV 1/5] END ................KNN__n_neighbors=2;, score=0.637 total time=   4.9s




[CV 2/5] END ................KNN__n_neighbors=2;, score=0.640 total time=   5.3s




[CV 3/5] END ................KNN__n_neighbors=2;, score=0.650 total time=   4.9s




[CV 4/5] END ................KNN__n_neighbors=2;, score=0.634 total time=   5.2s




[CV 5/5] END ................KNN__n_neighbors=2;, score=0.634 total time=   5.2s




[CV 1/5] END ................KNN__n_neighbors=3;, score=0.693 total time=   4.9s




[CV 2/5] END ................KNN__n_neighbors=3;, score=0.689 total time=   5.2s




[CV 3/5] END ................KNN__n_neighbors=3;, score=0.699 total time=   5.5s




[CV 4/5] END ................KNN__n_neighbors=3;, score=0.693 total time=   5.9s




[CV 5/5] END ................KNN__n_neighbors=3;, score=0.692 total time=   5.3s




[CV 1/5] END ................KNN__n_neighbors=4;, score=0.698 total time=   5.0s




[CV 2/5] END ................KNN__n_neighbors=4;, score=0.696 total time=   5.2s




[CV 3/5] END ................KNN__n_neighbors=4;, score=0.704 total time=   5.4s




[CV 4/5] END ................KNN__n_neighbors=4;, score=0.695 total time=   5.2s




[CV 5/5] END ................KNN__n_neighbors=4;, score=0.693 total time=   5.3s




[CV 1/5] END ................KNN__n_neighbors=5;, score=0.709 total time=   5.1s




[CV 2/5] END ................KNN__n_neighbors=5;, score=0.708 total time=   5.1s




[CV 3/5] END ................KNN__n_neighbors=5;, score=0.716 total time=   5.0s




[CV 4/5] END ................KNN__n_neighbors=5;, score=0.710 total time=   5.1s




[CV 5/5] END ................KNN__n_neighbors=5;, score=0.708 total time=   5.1s




[CV 1/5] END ................KNN__n_neighbors=6;, score=0.711 total time=   4.8s




[CV 2/5] END ................KNN__n_neighbors=6;, score=0.712 total time=   5.0s




[CV 3/5] END ................KNN__n_neighbors=6;, score=0.719 total time=   5.1s




[CV 4/5] END ................KNN__n_neighbors=6;, score=0.711 total time=   5.0s




[CV 5/5] END ................KNN__n_neighbors=6;, score=0.710 total time=   5.3s




[CV 1/5] END ................KNN__n_neighbors=7;, score=0.720 total time=   5.3s




[CV 2/5] END ................KNN__n_neighbors=7;, score=0.717 total time=   5.1s




[CV 3/5] END ................KNN__n_neighbors=7;, score=0.724 total time=   5.0s




[CV 4/5] END ................KNN__n_neighbors=7;, score=0.718 total time=   5.2s




[CV 5/5] END ................KNN__n_neighbors=7;, score=0.716 total time=   5.0s




[CV 1/5] END ................KNN__n_neighbors=8;, score=0.720 total time=   4.9s




[CV 2/5] END ................KNN__n_neighbors=8;, score=0.720 total time=   5.3s




[CV 3/5] END ................KNN__n_neighbors=8;, score=0.730 total time=   4.9s




[CV 4/5] END ................KNN__n_neighbors=8;, score=0.722 total time=   4.9s




[CV 5/5] END ................KNN__n_neighbors=8;, score=0.717 total time=   5.5s




[CV 1/5] END ................KNN__n_neighbors=9;, score=0.726 total time=   5.1s




[CV 2/5] END ................KNN__n_neighbors=9;, score=0.723 total time=   5.3s




[CV 3/5] END ................KNN__n_neighbors=9;, score=0.726 total time=   5.3s




[CV 4/5] END ................KNN__n_neighbors=9;, score=0.725 total time=   5.0s




[CV 5/5] END ................KNN__n_neighbors=9;, score=0.722 total time=   5.1s
{'KNN__n_neighbors': 9}
0.7246007226596373


In [9]:
#Reading test dataset
test = pd.read_csv("test.csv", index_col=0)

test['Previous qualification'] = test['Previous qualification'].astype(object)
test["Mother's qualification"] = test["Mother's qualification"].astype(object)
test["Father's qualification"] = test["Father's qualification"].astype(object)
test["Mother's occupation"] = test["Mother's occupation"].astype(object)
test["Mother's occupation"] = test["Mother's occupation"].astype(object)
test['Application order'] = test['Application order'].astype(object)
test['Application mode'] = test['Application mode'].astype(object)
test['Marital status'] = test['Marital status'].astype(object)
test['Nacionality'] = test['Nacionality'].astype(object)
test['Course'] = test['Course'].astype(object)


# Get the best model from the grid search
best_model = gcv.best_estimator_

# Use the best model to make predictions on the test set
prediction = best_model.predict(test)

# Print the classes used by the LabelEncoder
print(le.classes_)

# Inverse transform the predictions to get the original labels
prediction = le.inverse_transform(prediction)

# Create a DataFrame to store the predictions
submit = pd.DataFrame({
    # Column for the id
    'id': list(test.index),
    # Column for the predicted target
    'Target': prediction
})

# Save the DataFrame to a CSV file
submit.to_csv('D:/kaggla_comp/New folder/sbt_22.csv', index=None)



['Dropout' 'Enrolled' 'Graduate']
