In [1]:

# IMPORTANT: RUN THIS CELL IN ORDER TO IMPORT YOUR KAGGLE DATA SOURCES
# TO THE CORRECT LOCATION (/kaggle/input) IN YOUR NOTEBOOK,
# THEN FEEL FREE TO DELETE THIS CELL.
# NOTE: THIS NOTEBOOK ENVIRONMENT DIFFERS FROM KAGGLE'S PYTHON
# ENVIRONMENT SO THERE MAY BE MISSING LIBRARIES USED BY YOUR
# NOTEBOOK.

import os
import sys
from tempfile import NamedTemporaryFile
from urllib.request import urlopen
from urllib.parse import unquote, urlparse
from urllib.error import HTTPError
from zipfile import ZipFile
import tarfile
import shutil

CHUNK_SIZE = 40960
DATA_SOURCE_MAPPING = 'playground-series-s4e6:https%3A%2F%2Fstorage.googleapis.com%2Fkaggle-competitions-data%2Fkaggle-v2%2F73290%2F8710574%2Fbundle%2Farchive.zip%3FX-Goog-Algorithm%3DGOOG4-RSA-SHA256%26X-Goog-Credential%3Dgcp-kaggle-com%2540kaggle-161607.iam.gserviceaccount.com%252F20240626%252Fauto%252Fstorage%252Fgoog4_request%26X-Goog-Date%3D20240626T061423Z%26X-Goog-Expires%3D259200%26X-Goog-SignedHeaders%3Dhost%26X-Goog-Signature%3D492d38a85c7b3d5d9e22898628f1b3cdf82b799da37674ddadfd47d27ccb5cb269b1d4a3c47d574ebfdbc6d77da2bfca381962d5d9cec40057721570bfb30fe34293d8f1b3d55556a06d00687d3a62648eb44155c7e04001cef8353d8c851bf5a7c30b558e9b5f54dd2e4b4ee950db03bb8c5ce6d080c3dcb5c32883972399f146ea894d2063275a2e1f321e788cf84f01f7fb9220c24990dd128e3f3697135d2ef3b8682593acfc3873fe80bf6bb7076018ed0ef996647678f965e6d588c42672b9db5513e16bc6c93d8561cbc197925882dcc21ff528f0110be634adc7215a23534a4e591058efdeb865f7183a1533d4e640c30b84b755152fafbe11c48a8d'

KAGGLE_INPUT_PATH='/kaggle/input'
KAGGLE_WORKING_PATH='/kaggle/working'
KAGGLE_SYMLINK='kaggle'

!umount /kaggle/input/ 2> /dev/null
shutil.rmtree('/kaggle/input', ignore_errors=True)
os.makedirs(KAGGLE_INPUT_PATH, 0o777, exist_ok=True)
os.makedirs(KAGGLE_WORKING_PATH, 0o777, exist_ok=True)

try:
  os.symlink(KAGGLE_INPUT_PATH, os.path.join("..", 'input'), target_is_directory=True)
except FileExistsError:
  pass
try:
  os.symlink(KAGGLE_WORKING_PATH, os.path.join("..", 'working'), target_is_directory=True)
except FileExistsError:
  pass

for data_source_mapping in DATA_SOURCE_MAPPING.split(','):
    directory, download_url_encoded = data_source_mapping.split(':')
    download_url = unquote(download_url_encoded)
    filename = urlparse(download_url).path
    destination_path = os.path.join(KAGGLE_INPUT_PATH, directory)
    try:
        with urlopen(download_url) as fileres, NamedTemporaryFile() as tfile:
            total_length = fileres.headers['content-length']
            print(f'Downloading {directory}, {total_length} bytes compressed')
            dl = 0
            data = fileres.read(CHUNK_SIZE)
            while len(data) > 0:
                dl += len(data)
                tfile.write(data)
                done = int(50 * dl / int(total_length))
                sys.stdout.write(f"\r[{'=' * done}{' ' * (50-done)}] {dl} bytes downloaded")
                sys.stdout.flush()
                data = fileres.read(CHUNK_SIZE)
            if filename.endswith('.zip'):
              with ZipFile(tfile) as zfile:
                zfile.extractall(destination_path)
            else:
              with tarfile.open(tfile.name) as tarfile:
                tarfile.extractall(destination_path)
            print(f'\nDownloaded and uncompressed: {directory}')
    except HTTPError as e:
        print(f'Failed to load (likely expired) {download_url} to path {destination_path}')
        continue
    except OSError as e:
        print(f'Failed to load {download_url} to path {destination_path}')
        continue

print('Data source import complete.')


Downloading playground-series-s4e6, 3217279 bytes compressed
Downloaded and uncompressed: playground-series-s4e6
Data source import complete.


# Classification with academic success datasets

## 1. Imports and Configs

In [5]:
import sklearn
import numpy as np
import os
import datetime
import pandas as pd
import matplotlib.pyplot as plt
import missingno as msno
from prettytable import PrettyTable
%matplotlib inline
import seaborn as sns
sns.set(style='darkgrid', font_scale=1.4)
from tqdm import tqdm
from tqdm.notebook import tqdm as tqdm_notebook
tqdm_notebook.get_lock().locks = []
# !pip install sweetviz
# import sweetviz as sv
import concurrent.futures
from copy import deepcopy
from functools import partial
from itertools import combinations
import random
from random import randint, uniform
import gc
from sklearn.feature_selection import f_classif
from sklearn.preprocessing import LabelEncoder, StandardScaler, MinMaxScaler,PowerTransformer, FunctionTransformer
from sklearn import metrics
from sklearn.model_selection import GridSearchCV
from sklearn.model_selection import RandomizedSearchCV
from itertools import combinations
from sklearn.impute import SimpleImputer
import xgboost as xg
from sklearn.model_selection import train_test_split,cross_val_score
from sklearn.metrics import mean_squared_error,mean_squared_log_error, roc_auc_score, accuracy_score, f1_score, precision_recall_curve, log_loss
from sklearn.cluster import KMeans
#!pip install yellowbrick
from yellowbrick.cluster import KElbowVisualizer
#!pip install gap-stat
#from gap_statistic.optimalK import OptimalK
from scipy import stats
import statsmodels.api as sm
from scipy.stats import ttest_ind
from scipy.stats import boxcox
import math
from statsmodels.stats.outliers_influence import variance_inflation_factor
from sklearn.base import BaseEstimator, TransformerMixin
!pip install optuna
import optuna
#!pip install cmaes
#import cmaes
import xgboost as xgb
!pip install catboost
!pip install lightgbm --install-option=--gpu --install-option="--boost-root=C:/local/boost_1_69_0" --install-option="--boost-librarydir=C:/local/boost_1_69_0/lib64-msvc-14.1"
import lightgbm as lgb
!pip install category_encoders
from category_encoders import OneHotEncoder, OrdinalEncoder, CountEncoder, CatBoostEncoder
from imblearn.under_sampling import RandomUnderSampler
from sklearn.model_selection import StratifiedKFold, KFold
from sklearn.ensemble import RandomForestClassifier, HistGradientBoostingClassifier, GradientBoostingClassifier,ExtraTreesClassifier, AdaBoostClassifier, HistGradientBoostingRegressor
#!pip install -U imbalanced-learn
#from imblearn.ensemble import BalancedRandomForestClassifier
from sklearn.datasets import make_classification
from sklearn.naive_bayes import GaussianNB
from sklearn.tree import DecisionTreeClassifier
from sklearn.linear_model import LogisticRegression
from catboost import CatBoost, CatBoostRegressor, CatBoostClassifier
from sklearn.svm import NuSVC, SVC
from sklearn.neighbors import KNeighborsClassifier
from sklearn.impute import KNNImputer
from sklearn.linear_model import LogisticRegression, RidgeClassifierCV, LogisticRegressionCV
from sklearn.calibration import CalibratedClassifierCV
from sklearn.neural_network import MLPClassifier
from catboost import Pool
import re
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.decomposition import PCA
from sklearn.decomposition import TruncatedSVD
from sklearn.decomposition import FactorAnalysis

# Suppress warnings
import warnings
warnings.filterwarnings("ignore")
pd.pandas.set_option('display.max_columns',None)

Collecting catboost
  Downloading catboost-1.2.5-cp310-cp310-manylinux2014_x86_64.whl (98.2 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m98.2/98.2 MB[0m [31m4.4 MB/s[0m eta [36m0:00:00[0m
Installing collected packages: catboost
Successfully installed catboost-1.2.5

Usage:   
  pip3 install [options] <requirement specifier> [package-index-options] ...
  pip3 install [options] -r <requirements file> [package-index-options] ...
  pip3 install [options] [-e] <vcs project url> ...
  pip3 install [options] [-e] <local project path> ...
  pip3 install [options] <archive url/path> ...

no such option: --install-option


In [6]:
SEED = 23
N_FOLDS = 5
target = 'Target'

## 2. Data Loading and Overview

In [7]:
global device
device = 'cpu'

train=pd.read_csv('/kaggle/input/playground-series-s4e6/train.csv')
test=pd.read_csv('/kaggle/input/playground-series-s4e6/test.csv')
submission=pd.read_csv("/kaggle/input/playground-series-s4e6/sample_submission.csv")

train.drop(columns=["id"],inplace=True)
test.drop(columns=["id"],inplace=True)
train_copy=train.copy()
test_copy=test.copy()

train.reset_index(inplace=True,drop=True)

initial_features = list(train.columns)

In [8]:
train

Unnamed: 0,Marital status,Application mode,Application order,Course,Daytime/evening attendance,Previous qualification,Previous qualification (grade),Nacionality,Mother's qualification,Father's qualification,Mother's occupation,Father's occupation,Admission grade,Displaced,Educational special needs,Debtor,Tuition fees up to date,Gender,Scholarship holder,Age at enrollment,International,Curricular units 1st sem (credited),Curricular units 1st sem (enrolled),Curricular units 1st sem (evaluations),Curricular units 1st sem (approved),Curricular units 1st sem (grade),Curricular units 1st sem (without evaluations),Curricular units 2nd sem (credited),Curricular units 2nd sem (enrolled),Curricular units 2nd sem (evaluations),Curricular units 2nd sem (approved),Curricular units 2nd sem (grade),Curricular units 2nd sem (without evaluations),Unemployment rate,Inflation rate,GDP,Target
0,1,1,1,9238,1,1,126.0,1,1,19,5,5,122.6,0,0,0,1,0,1,18,0,0,6,6,6,14.500000,0,0,6,7,6,12.428571,0,11.1,0.6,2.02,Graduate
1,1,17,1,9238,1,1,125.0,1,19,19,9,9,119.8,1,0,0,1,0,0,18,0,0,6,8,4,11.600000,0,0,6,9,0,0.000000,0,11.1,0.6,2.02,Dropout
2,1,17,2,9254,1,1,137.0,1,3,19,2,3,144.7,0,0,0,1,1,0,18,0,0,6,0,0,0.000000,0,0,6,0,0,0.000000,0,16.2,0.3,-0.92,Dropout
3,1,1,3,9500,1,1,131.0,1,19,3,3,2,126.1,1,0,0,1,0,1,18,0,0,7,9,7,12.591250,0,0,8,11,7,12.820000,0,11.1,0.6,2.02,Enrolled
4,1,1,2,9500,1,1,132.0,1,19,37,4,9,120.1,1,0,0,1,0,0,18,0,0,7,12,6,12.933333,0,0,7,12,6,12.933333,0,7.6,2.6,0.32,Graduate
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
76513,1,17,1,9254,1,1,121.0,1,19,1,7,5,116.5,1,0,0,1,0,1,18,0,0,6,9,6,10.666667,0,0,6,8,5,10.600000,0,13.9,-0.3,0.79,Graduate
76514,1,1,6,9254,1,1,125.0,1,1,38,4,9,131.6,1,0,0,1,0,0,19,0,0,6,22,4,13.000000,0,0,6,9,6,13.875000,0,9.4,-0.8,-3.12,Graduate
76515,5,17,1,9085,1,1,138.0,1,37,37,9,10,123.3,1,0,0,1,0,0,19,0,0,5,13,4,12.500000,2,0,5,8,5,11.400000,1,9.4,-0.8,-3.12,Enrolled
76516,1,1,3,9070,1,1,136.0,1,38,37,5,9,124.8,1,0,0,1,0,0,18,0,0,6,0,0,0.000000,0,0,6,0,0,0.000000,0,7.6,2.6,0.32,Dropout


In [9]:
test

Unnamed: 0,Marital status,Application mode,Application order,Course,Daytime/evening attendance,Previous qualification,Previous qualification (grade),Nacionality,Mother's qualification,Father's qualification,Mother's occupation,Father's occupation,Admission grade,Displaced,Educational special needs,Debtor,Tuition fees up to date,Gender,Scholarship holder,Age at enrollment,International,Curricular units 1st sem (credited),Curricular units 1st sem (enrolled),Curricular units 1st sem (evaluations),Curricular units 1st sem (approved),Curricular units 1st sem (grade),Curricular units 1st sem (without evaluations),Curricular units 2nd sem (credited),Curricular units 2nd sem (enrolled),Curricular units 2nd sem (evaluations),Curricular units 2nd sem (approved),Curricular units 2nd sem (grade),Curricular units 2nd sem (without evaluations),Unemployment rate,Inflation rate,GDP
0,1,1,1,9500,1,1,141.0,1,3,1,2,3,152.1,0,0,0,0,0,0,18,0,0,7,0,0,0.000000,0,0,8,0,0,0.000000,0,13.9,-0.3,0.79
1,1,1,1,9238,1,1,128.0,1,1,19,4,5,116.5,0,0,0,1,0,0,19,0,0,6,7,6,14.857143,0,0,6,6,6,13.500000,0,11.1,0.6,2.02
2,1,1,1,9238,1,1,118.0,1,1,19,4,9,114.2,0,0,0,1,0,1,18,0,0,6,11,6,12.000000,0,0,6,11,5,11.000000,0,15.5,2.8,-4.06
3,1,44,1,9147,1,39,130.0,1,1,19,3,3,130.0,0,0,0,1,0,1,23,0,2,6,15,5,11.500000,0,3,8,14,5,11.000000,0,8.9,1.4,3.51
4,1,39,1,9670,1,1,110.0,1,1,37,5,5,106.0,1,0,0,1,0,0,26,0,0,6,9,3,11.000000,0,0,6,9,4,10.666667,2,7.6,2.6,0.32
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
51007,1,1,2,171,1,1,128.0,1,38,37,7,10,124.7,1,0,0,1,0,0,19,0,0,0,0,0,0.000000,0,0,0,0,0,0.000000,0,15.5,2.8,-4.06
51008,2,39,1,9119,1,19,133.1,1,19,37,9,9,140.0,0,0,1,0,1,0,33,0,0,5,6,0,0.000000,0,0,5,5,0,0.000000,0,9.4,-0.8,-3.12
51009,1,1,1,171,1,1,127.0,1,1,1,4,10,120.4,0,0,1,0,0,0,20,0,0,0,0,0,0.000000,0,0,0,0,0,0.000000,0,15.5,2.8,-4.06
51010,1,1,3,9773,1,1,132.0,1,19,19,5,5,126.3,1,0,1,0,0,0,18,0,0,6,8,5,12.600000,0,0,6,9,3,13.000000,0,7.6,2.6,0.32


In [10]:
train.isna().sum().sort_values(ascending=False)

Marital status                                    0
Age at enrollment                                 0
Curricular units 1st sem (credited)               0
Curricular units 1st sem (enrolled)               0
Curricular units 1st sem (evaluations)            0
Curricular units 1st sem (approved)               0
Curricular units 1st sem (grade)                  0
Curricular units 1st sem (without evaluations)    0
Curricular units 2nd sem (credited)               0
Curricular units 2nd sem (enrolled)               0
Curricular units 2nd sem (evaluations)            0
Curricular units 2nd sem (approved)               0
Curricular units 2nd sem (grade)                  0
Curricular units 2nd sem (without evaluations)    0
Unemployment rate                                 0
Inflation rate                                    0
GDP                                               0
International                                     0
Scholarship holder                                0
Application 

In [11]:
train.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 76518 entries, 0 to 76517
Data columns (total 37 columns):
 #   Column                                          Non-Null Count  Dtype  
---  ------                                          --------------  -----  
 0   Marital status                                  76518 non-null  int64  
 1   Application mode                                76518 non-null  int64  
 2   Application order                               76518 non-null  int64  
 3   Course                                          76518 non-null  int64  
 4   Daytime/evening attendance                      76518 non-null  int64  
 5   Previous qualification                          76518 non-null  int64  
 6   Previous qualification (grade)                  76518 non-null  float64
 7   Nacionality                                     76518 non-null  int64  
 8   Mother's qualification                          76518 non-null  int64  
 9   Father's qualification                 

In [12]:
train.describe().T

Unnamed: 0,count,mean,std,min,25%,50%,75%,max
Marital status,76518.0,1.111934,0.441669,1.0,1.0,1.0,1.0,6.0
Application mode,76518.0,16.054419,16.682337,1.0,1.0,17.0,39.0,53.0
Application order,76518.0,1.64441,1.229645,0.0,1.0,1.0,2.0,9.0
Course,76518.0,9001.286377,1803.438531,33.0,9119.0,9254.0,9670.0,9991.0
Daytime/evening attendance,76518.0,0.915314,0.278416,0.0,1.0,1.0,1.0,1.0
Previous qualification,76518.0,3.65876,8.623774,1.0,1.0,1.0,1.0,43.0
Previous qualification (grade),76518.0,132.378766,10.995328,95.0,125.0,133.1,140.0,190.0
Nacionality,76518.0,1.2266,3.392183,1.0,1.0,1.0,1.0,109.0
Mother's qualification,76518.0,19.837633,15.399456,1.0,1.0,19.0,37.0,44.0
Father's qualification,76518.0,23.425076,14.921164,1.0,4.0,19.0,37.0,44.0


## 3.Feature Engineering

### Train Dataset

In [13]:
# Print the number of unique values for each column
for col in train.columns:
    print(f'{col} has {train[col].nunique()} values')

Marital status has 6 values
Application mode has 22 values
Application order has 8 values
Course has 19 values
Daytime/evening attendance has 2 values
Previous qualification has 21 values
Previous qualification (grade) has 110 values
Nacionality has 18 values
Mother's qualification has 35 values
Father's qualification has 39 values
Mother's occupation has 40 values
Father's occupation has 56 values
Admission grade has 668 values
Displaced has 2 values
Educational special needs has 2 values
Debtor has 2 values
Tuition fees up to date has 2 values
Gender has 2 values
Scholarship holder has 2 values
Age at enrollment has 46 values
International has 2 values
Curricular units 1st sem (credited) has 21 values
Curricular units 1st sem (enrolled) has 24 values
Curricular units 1st sem (evaluations) has 36 values
Curricular units 1st sem (approved) has 23 values
Curricular units 1st sem (grade) has 1206 values
Curricular units 1st sem (without evaluations) has 12 values
Curricular units 2nd sem

In [14]:
def check_unique_values(col_label):
    cond = (set(train[col_label])==set(test[col_label]))
    return cond

In [15]:
check_unique_values('Marital status')

True

In [16]:
print(train['Marital status'].unique())

[1 2 4 3 5 6]


In [17]:
print(test['Marital status'].unique())

[1 2 4 5 6 3]


In [18]:
def custom_one_hot_encoding(data, target_col_label, new_col_labels, conditions):
    for label in new_col_labels:
        data[label] = 0

    for idx, row in data.iterrows():
        value = row[target_col_label]
        matched = False
        for i, condition in enumerate(conditions):
            if value in condition:
                data.at[idx, new_col_labels[i]] = 1
                matched = True
                break
        if not matched:
            data.at[idx, new_col_labels[-1]] = 1

    return data


In [19]:
custom_one_hot_encoding(train, 'Marital status', ['Marital_single','Marital_family'],[[1,3]])

Unnamed: 0,Marital status,Application mode,Application order,Course,Daytime/evening attendance,Previous qualification,Previous qualification (grade),Nacionality,Mother's qualification,Father's qualification,Mother's occupation,Father's occupation,Admission grade,Displaced,Educational special needs,Debtor,Tuition fees up to date,Gender,Scholarship holder,Age at enrollment,International,Curricular units 1st sem (credited),Curricular units 1st sem (enrolled),Curricular units 1st sem (evaluations),Curricular units 1st sem (approved),Curricular units 1st sem (grade),Curricular units 1st sem (without evaluations),Curricular units 2nd sem (credited),Curricular units 2nd sem (enrolled),Curricular units 2nd sem (evaluations),Curricular units 2nd sem (approved),Curricular units 2nd sem (grade),Curricular units 2nd sem (without evaluations),Unemployment rate,Inflation rate,GDP,Target,Marital_single,Marital_family
0,1,1,1,9238,1,1,126.0,1,1,19,5,5,122.6,0,0,0,1,0,1,18,0,0,6,6,6,14.500000,0,0,6,7,6,12.428571,0,11.1,0.6,2.02,Graduate,1,0
1,1,17,1,9238,1,1,125.0,1,19,19,9,9,119.8,1,0,0,1,0,0,18,0,0,6,8,4,11.600000,0,0,6,9,0,0.000000,0,11.1,0.6,2.02,Dropout,1,0
2,1,17,2,9254,1,1,137.0,1,3,19,2,3,144.7,0,0,0,1,1,0,18,0,0,6,0,0,0.000000,0,0,6,0,0,0.000000,0,16.2,0.3,-0.92,Dropout,1,0
3,1,1,3,9500,1,1,131.0,1,19,3,3,2,126.1,1,0,0,1,0,1,18,0,0,7,9,7,12.591250,0,0,8,11,7,12.820000,0,11.1,0.6,2.02,Enrolled,1,0
4,1,1,2,9500,1,1,132.0,1,19,37,4,9,120.1,1,0,0,1,0,0,18,0,0,7,12,6,12.933333,0,0,7,12,6,12.933333,0,7.6,2.6,0.32,Graduate,1,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
76513,1,17,1,9254,1,1,121.0,1,19,1,7,5,116.5,1,0,0,1,0,1,18,0,0,6,9,6,10.666667,0,0,6,8,5,10.600000,0,13.9,-0.3,0.79,Graduate,1,0
76514,1,1,6,9254,1,1,125.0,1,1,38,4,9,131.6,1,0,0,1,0,0,19,0,0,6,22,4,13.000000,0,0,6,9,6,13.875000,0,9.4,-0.8,-3.12,Graduate,1,0
76515,5,17,1,9085,1,1,138.0,1,37,37,9,10,123.3,1,0,0,1,0,0,19,0,0,5,13,4,12.500000,2,0,5,8,5,11.400000,1,9.4,-0.8,-3.12,Enrolled,0,1
76516,1,1,3,9070,1,1,136.0,1,38,37,5,9,124.8,1,0,0,1,0,0,18,0,0,6,0,0,0.000000,0,0,6,0,0,0.000000,0,7.6,2.6,0.32,Dropout,1,0


In [20]:
check_unique_values('Previous qualification')

False

In [21]:
print(train['Previous qualification'].unique())

[ 1 19 39 40 12  3  2 42  4  9 11  6 15 38 10 43 14 37  5 36 17]


In [22]:
print(test['Previous qualification'].unique())

[ 1 39 19  9  3 12 40 42 10  2  6 43 38  4 15 17 14  5 11 16]


In [23]:
custom_one_hot_encoding(train, 'Previous qualification', ['PQ_default','PQ_higher','PQ_lower','PQ_much_lower','PQ_others'],[[1],[2,3,4,5,6],[9,10,12,14,15],[19,38]])

Unnamed: 0,Marital status,Application mode,Application order,Course,Daytime/evening attendance,Previous qualification,Previous qualification (grade),Nacionality,Mother's qualification,Father's qualification,Mother's occupation,Father's occupation,Admission grade,Displaced,Educational special needs,Debtor,Tuition fees up to date,Gender,Scholarship holder,Age at enrollment,International,Curricular units 1st sem (credited),Curricular units 1st sem (enrolled),Curricular units 1st sem (evaluations),Curricular units 1st sem (approved),Curricular units 1st sem (grade),Curricular units 1st sem (without evaluations),Curricular units 2nd sem (credited),Curricular units 2nd sem (enrolled),Curricular units 2nd sem (evaluations),Curricular units 2nd sem (approved),Curricular units 2nd sem (grade),Curricular units 2nd sem (without evaluations),Unemployment rate,Inflation rate,GDP,Target,Marital_single,Marital_family,PQ_default,PQ_higher,PQ_lower,PQ_much_lower,PQ_others
0,1,1,1,9238,1,1,126.0,1,1,19,5,5,122.6,0,0,0,1,0,1,18,0,0,6,6,6,14.500000,0,0,6,7,6,12.428571,0,11.1,0.6,2.02,Graduate,1,0,1,0,0,0,0
1,1,17,1,9238,1,1,125.0,1,19,19,9,9,119.8,1,0,0,1,0,0,18,0,0,6,8,4,11.600000,0,0,6,9,0,0.000000,0,11.1,0.6,2.02,Dropout,1,0,1,0,0,0,0
2,1,17,2,9254,1,1,137.0,1,3,19,2,3,144.7,0,0,0,1,1,0,18,0,0,6,0,0,0.000000,0,0,6,0,0,0.000000,0,16.2,0.3,-0.92,Dropout,1,0,1,0,0,0,0
3,1,1,3,9500,1,1,131.0,1,19,3,3,2,126.1,1,0,0,1,0,1,18,0,0,7,9,7,12.591250,0,0,8,11,7,12.820000,0,11.1,0.6,2.02,Enrolled,1,0,1,0,0,0,0
4,1,1,2,9500,1,1,132.0,1,19,37,4,9,120.1,1,0,0,1,0,0,18,0,0,7,12,6,12.933333,0,0,7,12,6,12.933333,0,7.6,2.6,0.32,Graduate,1,0,1,0,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
76513,1,17,1,9254,1,1,121.0,1,19,1,7,5,116.5,1,0,0,1,0,1,18,0,0,6,9,6,10.666667,0,0,6,8,5,10.600000,0,13.9,-0.3,0.79,Graduate,1,0,1,0,0,0,0
76514,1,1,6,9254,1,1,125.0,1,1,38,4,9,131.6,1,0,0,1,0,0,19,0,0,6,22,4,13.000000,0,0,6,9,6,13.875000,0,9.4,-0.8,-3.12,Graduate,1,0,1,0,0,0,0
76515,5,17,1,9085,1,1,138.0,1,37,37,9,10,123.3,1,0,0,1,0,0,19,0,0,5,13,4,12.500000,2,0,5,8,5,11.400000,1,9.4,-0.8,-3.12,Enrolled,0,1,1,0,0,0,0
76516,1,1,3,9070,1,1,136.0,1,38,37,5,9,124.8,1,0,0,1,0,0,18,0,0,6,0,0,0.000000,0,0,6,0,0,0.000000,0,7.6,2.6,0.32,Dropout,1,0,1,0,0,0,0


In [24]:
check_unique_values('Nacionality')

False

In [25]:
print(train['Nacionality'].unique())

[  1  26  41  24   6 100  21  22  11 101   2 103 105  25  17  62 109  32]


In [26]:
print(test['Nacionality'].unique())

[  1  41  22 109 100  21   6  26  11 105  62   2  32  24 103  25 101  14]


In [27]:
custom_one_hot_encoding(train, 'Nacionality', ['N_Portgal','N_Europe','N_Africa','N_South America','N_Central America','N_Asia','N_Others'],[[1],[2,6,11,13,14,17,62,100,103,105],[21,22,24,25,26],[41,109],[101,108],[32]])

Unnamed: 0,Marital status,Application mode,Application order,Course,Daytime/evening attendance,Previous qualification,Previous qualification (grade),Nacionality,Mother's qualification,Father's qualification,Mother's occupation,Father's occupation,Admission grade,Displaced,Educational special needs,Debtor,Tuition fees up to date,Gender,Scholarship holder,Age at enrollment,International,Curricular units 1st sem (credited),Curricular units 1st sem (enrolled),Curricular units 1st sem (evaluations),Curricular units 1st sem (approved),Curricular units 1st sem (grade),Curricular units 1st sem (without evaluations),Curricular units 2nd sem (credited),Curricular units 2nd sem (enrolled),Curricular units 2nd sem (evaluations),Curricular units 2nd sem (approved),Curricular units 2nd sem (grade),Curricular units 2nd sem (without evaluations),Unemployment rate,Inflation rate,GDP,Target,Marital_single,Marital_family,PQ_default,PQ_higher,PQ_lower,PQ_much_lower,PQ_others,N_Portgal,N_Europe,N_Africa,N_South America,N_Central America,N_Asia,N_Others
0,1,1,1,9238,1,1,126.0,1,1,19,5,5,122.6,0,0,0,1,0,1,18,0,0,6,6,6,14.500000,0,0,6,7,6,12.428571,0,11.1,0.6,2.02,Graduate,1,0,1,0,0,0,0,1,0,0,0,0,0,0
1,1,17,1,9238,1,1,125.0,1,19,19,9,9,119.8,1,0,0,1,0,0,18,0,0,6,8,4,11.600000,0,0,6,9,0,0.000000,0,11.1,0.6,2.02,Dropout,1,0,1,0,0,0,0,1,0,0,0,0,0,0
2,1,17,2,9254,1,1,137.0,1,3,19,2,3,144.7,0,0,0,1,1,0,18,0,0,6,0,0,0.000000,0,0,6,0,0,0.000000,0,16.2,0.3,-0.92,Dropout,1,0,1,0,0,0,0,1,0,0,0,0,0,0
3,1,1,3,9500,1,1,131.0,1,19,3,3,2,126.1,1,0,0,1,0,1,18,0,0,7,9,7,12.591250,0,0,8,11,7,12.820000,0,11.1,0.6,2.02,Enrolled,1,0,1,0,0,0,0,1,0,0,0,0,0,0
4,1,1,2,9500,1,1,132.0,1,19,37,4,9,120.1,1,0,0,1,0,0,18,0,0,7,12,6,12.933333,0,0,7,12,6,12.933333,0,7.6,2.6,0.32,Graduate,1,0,1,0,0,0,0,1,0,0,0,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
76513,1,17,1,9254,1,1,121.0,1,19,1,7,5,116.5,1,0,0,1,0,1,18,0,0,6,9,6,10.666667,0,0,6,8,5,10.600000,0,13.9,-0.3,0.79,Graduate,1,0,1,0,0,0,0,1,0,0,0,0,0,0
76514,1,1,6,9254,1,1,125.0,1,1,38,4,9,131.6,1,0,0,1,0,0,19,0,0,6,22,4,13.000000,0,0,6,9,6,13.875000,0,9.4,-0.8,-3.12,Graduate,1,0,1,0,0,0,0,1,0,0,0,0,0,0
76515,5,17,1,9085,1,1,138.0,1,37,37,9,10,123.3,1,0,0,1,0,0,19,0,0,5,13,4,12.500000,2,0,5,8,5,11.400000,1,9.4,-0.8,-3.12,Enrolled,0,1,1,0,0,0,0,1,0,0,0,0,0,0
76516,1,1,3,9070,1,1,136.0,1,38,37,5,9,124.8,1,0,0,1,0,0,18,0,0,6,0,0,0.000000,0,0,6,0,0,0.000000,0,7.6,2.6,0.32,Dropout,1,0,1,0,0,0,0,1,0,0,0,0,0,0


In [28]:
train['Curricular units 1st sem (credited/enrolled)'] = train['Curricular units 1st sem (credited)']/train['Curricular units 1st sem (enrolled)']
train['Curricular units 1st sem (credited/enrolled)'] = train['Curricular units 1st sem (credited/enrolled)'].fillna(0)

In [29]:
train['Curricular units 1st sem (approved/enrolled)'] = train['Curricular units 1st sem (approved)']/train['Curricular units 1st sem (enrolled)']
train['Curricular units 1st sem (approved/enrolled)'] = train['Curricular units 1st sem (approved/enrolled)'].fillna(0)

In [30]:
train.head(10)

Unnamed: 0,Marital status,Application mode,Application order,Course,Daytime/evening attendance,Previous qualification,Previous qualification (grade),Nacionality,Mother's qualification,Father's qualification,Mother's occupation,Father's occupation,Admission grade,Displaced,Educational special needs,Debtor,Tuition fees up to date,Gender,Scholarship holder,Age at enrollment,International,Curricular units 1st sem (credited),Curricular units 1st sem (enrolled),Curricular units 1st sem (evaluations),Curricular units 1st sem (approved),Curricular units 1st sem (grade),Curricular units 1st sem (without evaluations),Curricular units 2nd sem (credited),Curricular units 2nd sem (enrolled),Curricular units 2nd sem (evaluations),Curricular units 2nd sem (approved),Curricular units 2nd sem (grade),Curricular units 2nd sem (without evaluations),Unemployment rate,Inflation rate,GDP,Target,Marital_single,Marital_family,PQ_default,PQ_higher,PQ_lower,PQ_much_lower,PQ_others,N_Portgal,N_Europe,N_Africa,N_South America,N_Central America,N_Asia,N_Others,Curricular units 1st sem (credited/enrolled),Curricular units 1st sem (approved/enrolled)
0,1,1,1,9238,1,1,126.0,1,1,19,5,5,122.6,0,0,0,1,0,1,18,0,0,6,6,6,14.5,0,0,6,7,6,12.428571,0,11.1,0.6,2.02,Graduate,1,0,1,0,0,0,0,1,0,0,0,0,0,0,0.0,1.0
1,1,17,1,9238,1,1,125.0,1,19,19,9,9,119.8,1,0,0,1,0,0,18,0,0,6,8,4,11.6,0,0,6,9,0,0.0,0,11.1,0.6,2.02,Dropout,1,0,1,0,0,0,0,1,0,0,0,0,0,0,0.0,0.666667
2,1,17,2,9254,1,1,137.0,1,3,19,2,3,144.7,0,0,0,1,1,0,18,0,0,6,0,0,0.0,0,0,6,0,0,0.0,0,16.2,0.3,-0.92,Dropout,1,0,1,0,0,0,0,1,0,0,0,0,0,0,0.0,0.0
3,1,1,3,9500,1,1,131.0,1,19,3,3,2,126.1,1,0,0,1,0,1,18,0,0,7,9,7,12.59125,0,0,8,11,7,12.82,0,11.1,0.6,2.02,Enrolled,1,0,1,0,0,0,0,1,0,0,0,0,0,0,0.0,1.0
4,1,1,2,9500,1,1,132.0,1,19,37,4,9,120.1,1,0,0,1,0,0,18,0,0,7,12,6,12.933333,0,0,7,12,6,12.933333,0,7.6,2.6,0.32,Graduate,1,0,1,0,0,0,0,1,0,0,0,0,0,0,0.0,0.857143
5,1,39,1,171,1,19,133.1,1,19,19,1,1,100.0,0,0,0,1,1,0,24,0,0,0,0,0,0.0,0,0,0,0,0,0.0,0,13.9,-0.3,0.79,Dropout,1,0,0,0,0,1,0,1,0,0,0,0,0,0,0.0,0.0
6,1,44,1,9085,1,39,130.0,1,37,37,9,6,130.0,1,0,0,1,0,0,21,0,0,5,7,4,12.75,0,0,5,8,5,13.25,0,12.4,0.5,1.79,Graduate,1,0,0,0,0,0,1,1,0,0,0,0,0,0,0.0,0.8
7,1,1,1,9773,1,1,130.0,1,19,37,4,5,133.9,1,0,0,1,1,0,18,0,0,6,6,5,13.0,0,0,6,6,6,13.666667,0,10.8,1.4,1.74,Graduate,1,0,1,0,0,0,0,1,0,0,0,0,0,0,0.0,0.833333
8,1,39,1,9003,1,1,133.1,1,19,19,9,9,130.0,1,0,1,0,1,0,24,0,0,6,8,0,0.0,0,0,6,10,0,0.0,0,16.2,0.3,-0.92,Dropout,1,0,1,0,0,0,0,1,0,0,0,0,0,0,0.0,0.0
9,1,1,6,9500,1,1,135.0,1,37,37,4,9,128.0,1,0,0,1,0,1,18,0,0,7,7,6,12.966667,0,0,7,7,6,12.966667,0,7.6,2.6,0.32,Graduate,1,0,1,0,0,0,0,1,0,0,0,0,0,0,0.0,0.857143


In [31]:
scaler = MinMaxScaler()

train['Evaluations_scaled'] = scaler.fit_transform(train[['Curricular units 1st sem (evaluations)']])
train['Without_evaluations_scaled'] = scaler.fit_transform(train[['Curricular units 1st sem (without evaluations)']])

train['Curricular units 1st sem (evaluations-without evaluations)'] = train['Evaluations_scaled'] - train['Without_evaluations_scaled']

train.drop(['Evaluations_scaled', 'Without_evaluations_scaled'], axis=1, inplace=True)

In [32]:
train.head()

Unnamed: 0,Marital status,Application mode,Application order,Course,Daytime/evening attendance,Previous qualification,Previous qualification (grade),Nacionality,Mother's qualification,Father's qualification,Mother's occupation,Father's occupation,Admission grade,Displaced,Educational special needs,Debtor,Tuition fees up to date,Gender,Scholarship holder,Age at enrollment,International,Curricular units 1st sem (credited),Curricular units 1st sem (enrolled),Curricular units 1st sem (evaluations),Curricular units 1st sem (approved),Curricular units 1st sem (grade),Curricular units 1st sem (without evaluations),Curricular units 2nd sem (credited),Curricular units 2nd sem (enrolled),Curricular units 2nd sem (evaluations),Curricular units 2nd sem (approved),Curricular units 2nd sem (grade),Curricular units 2nd sem (without evaluations),Unemployment rate,Inflation rate,GDP,Target,Marital_single,Marital_family,PQ_default,PQ_higher,PQ_lower,PQ_much_lower,PQ_others,N_Portgal,N_Europe,N_Africa,N_South America,N_Central America,N_Asia,N_Others,Curricular units 1st sem (credited/enrolled),Curricular units 1st sem (approved/enrolled),Curricular units 1st sem (evaluations-without evaluations)
0,1,1,1,9238,1,1,126.0,1,1,19,5,5,122.6,0,0,0,1,0,1,18,0,0,6,6,6,14.5,0,0,6,7,6,12.428571,0,11.1,0.6,2.02,Graduate,1,0,1,0,0,0,0,1,0,0,0,0,0,0,0.0,1.0,0.133333
1,1,17,1,9238,1,1,125.0,1,19,19,9,9,119.8,1,0,0,1,0,0,18,0,0,6,8,4,11.6,0,0,6,9,0,0.0,0,11.1,0.6,2.02,Dropout,1,0,1,0,0,0,0,1,0,0,0,0,0,0,0.0,0.666667,0.177778
2,1,17,2,9254,1,1,137.0,1,3,19,2,3,144.7,0,0,0,1,1,0,18,0,0,6,0,0,0.0,0,0,6,0,0,0.0,0,16.2,0.3,-0.92,Dropout,1,0,1,0,0,0,0,1,0,0,0,0,0,0,0.0,0.0,0.0
3,1,1,3,9500,1,1,131.0,1,19,3,3,2,126.1,1,0,0,1,0,1,18,0,0,7,9,7,12.59125,0,0,8,11,7,12.82,0,11.1,0.6,2.02,Enrolled,1,0,1,0,0,0,0,1,0,0,0,0,0,0,0.0,1.0,0.2
4,1,1,2,9500,1,1,132.0,1,19,37,4,9,120.1,1,0,0,1,0,0,18,0,0,7,12,6,12.933333,0,0,7,12,6,12.933333,0,7.6,2.6,0.32,Graduate,1,0,1,0,0,0,0,1,0,0,0,0,0,0,0.0,0.857143,0.266667


In [33]:
train['Curricular units 2nd sem (credited/enrolled)'] = train['Curricular units 2nd sem (credited)']/train['Curricular units 2nd sem (enrolled)']
train['Curricular units 2nd sem (credited/enrolled)'] = train['Curricular units 2nd sem (credited/enrolled)'].fillna(0)

In [34]:
train['Curricular units 2nd sem (approved/enrolled)'] = train['Curricular units 2nd sem (approved)']/train['Curricular units 2nd sem (enrolled)']
train['Curricular units 2nd sem (approved/enrolled)'] = train['Curricular units 2nd sem (approved/enrolled)'].fillna(0)

In [35]:
scaler = MinMaxScaler()

train['Evaluations_scaled'] = scaler.fit_transform(train[['Curricular units 2nd sem (evaluations)']])
train['Without_evaluations_scaled'] = scaler.fit_transform(train[['Curricular units 2nd sem (without evaluations)']])

train['Curricular units 2nd sem (evaluations-without evaluations)'] = train['Evaluations_scaled'] - train['Without_evaluations_scaled']

train.drop(['Evaluations_scaled', 'Without_evaluations_scaled'], axis=1, inplace=True)

In [36]:
train.head()

Unnamed: 0,Marital status,Application mode,Application order,Course,Daytime/evening attendance,Previous qualification,Previous qualification (grade),Nacionality,Mother's qualification,Father's qualification,Mother's occupation,Father's occupation,Admission grade,Displaced,Educational special needs,Debtor,Tuition fees up to date,Gender,Scholarship holder,Age at enrollment,International,Curricular units 1st sem (credited),Curricular units 1st sem (enrolled),Curricular units 1st sem (evaluations),Curricular units 1st sem (approved),Curricular units 1st sem (grade),Curricular units 1st sem (without evaluations),Curricular units 2nd sem (credited),Curricular units 2nd sem (enrolled),Curricular units 2nd sem (evaluations),Curricular units 2nd sem (approved),Curricular units 2nd sem (grade),Curricular units 2nd sem (without evaluations),Unemployment rate,Inflation rate,GDP,Target,Marital_single,Marital_family,PQ_default,PQ_higher,PQ_lower,PQ_much_lower,PQ_others,N_Portgal,N_Europe,N_Africa,N_South America,N_Central America,N_Asia,N_Others,Curricular units 1st sem (credited/enrolled),Curricular units 1st sem (approved/enrolled),Curricular units 1st sem (evaluations-without evaluations),Curricular units 2nd sem (credited/enrolled),Curricular units 2nd sem (approved/enrolled),Curricular units 2nd sem (evaluations-without evaluations)
0,1,1,1,9238,1,1,126.0,1,1,19,5,5,122.6,0,0,0,1,0,1,18,0,0,6,6,6,14.5,0,0,6,7,6,12.428571,0,11.1,0.6,2.02,Graduate,1,0,1,0,0,0,0,1,0,0,0,0,0,0,0.0,1.0,0.133333,0.0,1.0,0.212121
1,1,17,1,9238,1,1,125.0,1,19,19,9,9,119.8,1,0,0,1,0,0,18,0,0,6,8,4,11.6,0,0,6,9,0,0.0,0,11.1,0.6,2.02,Dropout,1,0,1,0,0,0,0,1,0,0,0,0,0,0,0.0,0.666667,0.177778,0.0,0.0,0.272727
2,1,17,2,9254,1,1,137.0,1,3,19,2,3,144.7,0,0,0,1,1,0,18,0,0,6,0,0,0.0,0,0,6,0,0,0.0,0,16.2,0.3,-0.92,Dropout,1,0,1,0,0,0,0,1,0,0,0,0,0,0,0.0,0.0,0.0,0.0,0.0,0.0
3,1,1,3,9500,1,1,131.0,1,19,3,3,2,126.1,1,0,0,1,0,1,18,0,0,7,9,7,12.59125,0,0,8,11,7,12.82,0,11.1,0.6,2.02,Enrolled,1,0,1,0,0,0,0,1,0,0,0,0,0,0,0.0,1.0,0.2,0.0,0.875,0.333333
4,1,1,2,9500,1,1,132.0,1,19,37,4,9,120.1,1,0,0,1,0,0,18,0,0,7,12,6,12.933333,0,0,7,12,6,12.933333,0,7.6,2.6,0.32,Graduate,1,0,1,0,0,0,0,1,0,0,0,0,0,0,0.0,0.857143,0.266667,0.0,0.857143,0.363636


In [37]:
sc = StandardScaler()
columns_to_scale = ['Unemployment rate', 'Inflation rate', 'GDP']
train[columns_to_scale] = sc.fit_transform(train[columns_to_scale])
fa = FactorAnalysis(n_components=1, max_iter=5000)
fitted = fa.fit_transform(train[columns_to_scale])

print(fitted)
print(fitted.shape)

[[-0.85531678]
 [-0.85531678]
 [ 0.42478788]
 ...
 [ 1.15028179]
 [-0.22912022]
 [ 1.69083564]]
(76518, 1)


In [38]:
Factor_loading_matrix = fa.components_.T

pd.DataFrame(Factor_loading_matrix,
             columns=["第1因子"],
             index=[train[columns_to_scale].columns])

Unnamed: 0,第1因子
Unemployment rate,0.324861
Inflation rate,0.090574
GDP,-0.918227


In [39]:
train['Recession'] = fitted
train.head()

Unnamed: 0,Marital status,Application mode,Application order,Course,Daytime/evening attendance,Previous qualification,Previous qualification (grade),Nacionality,Mother's qualification,Father's qualification,Mother's occupation,Father's occupation,Admission grade,Displaced,Educational special needs,Debtor,Tuition fees up to date,Gender,Scholarship holder,Age at enrollment,International,Curricular units 1st sem (credited),Curricular units 1st sem (enrolled),Curricular units 1st sem (evaluations),Curricular units 1st sem (approved),Curricular units 1st sem (grade),Curricular units 1st sem (without evaluations),Curricular units 2nd sem (credited),Curricular units 2nd sem (enrolled),Curricular units 2nd sem (evaluations),Curricular units 2nd sem (approved),Curricular units 2nd sem (grade),Curricular units 2nd sem (without evaluations),Unemployment rate,Inflation rate,GDP,Target,Marital_single,Marital_family,PQ_default,PQ_higher,PQ_lower,PQ_much_lower,PQ_others,N_Portgal,N_Europe,N_Africa,N_South America,N_Central America,N_Asia,N_Others,Curricular units 1st sem (credited/enrolled),Curricular units 1st sem (approved/enrolled),Curricular units 1st sem (evaluations-without evaluations),Curricular units 2nd sem (credited/enrolled),Curricular units 2nd sem (approved/enrolled),Curricular units 2nd sem (evaluations-without evaluations),Recession
0,1,1,1,9238,1,1,126.0,1,1,19,5,5,122.6,0,0,0,1,0,1,18,0,0,6,6,6,14.5,0,0,6,7,6,12.428571,0,-0.158418,-0.44911,0.933176,Graduate,1,0,1,0,0,0,0,1,0,0,0,0,0,0,0.0,1.0,0.133333,0.0,1.0,0.212121,-0.855317
1,1,17,1,9238,1,1,125.0,1,19,19,9,9,119.8,1,0,0,1,0,0,18,0,0,6,8,4,11.6,0,0,6,9,0,0.0,0,-0.158418,-0.44911,0.933176,Dropout,1,0,1,0,0,0,0,1,0,0,0,0,0,0,0.0,0.666667,0.177778,0.0,0.0,0.272727,-0.855317
2,1,17,2,9254,1,1,137.0,1,3,19,2,3,144.7,0,0,0,1,1,0,18,0,0,6,0,0,0.0,0,0,6,0,0,0.0,0,1.763675,-0.663578,-0.372698,Dropout,1,0,1,0,0,0,0,1,0,0,0,0,0,0,0.0,0.0,0.0,0.0,0.0,0.0,0.424788
3,1,1,3,9500,1,1,131.0,1,19,3,3,2,126.1,1,0,0,1,0,1,18,0,0,7,9,7,12.59125,0,0,8,11,7,12.82,0,-0.158418,-0.44911,0.933176,Enrolled,1,0,1,0,0,0,0,1,0,0,0,0,0,0,0.0,1.0,0.2,0.0,0.875,0.333333,-0.855317
4,1,1,2,9500,1,1,132.0,1,19,37,4,9,120.1,1,0,0,1,0,0,18,0,0,7,12,6,12.933333,0,0,7,12,6,12.933333,0,-1.477502,0.98068,0.178079,Graduate,1,0,1,0,0,0,0,1,0,0,0,0,0,0,0.0,0.857143,0.266667,0.0,0.857143,0.363636,-0.22912


In [40]:
train['Debtor_Recession_interaction'] = train['Debtor'] * train['Recession']

In [41]:
train['Scholarship holder_Recession_interaction'] = train['Scholarship holder'] * train['Recession']

In [42]:
train

Unnamed: 0,Marital status,Application mode,Application order,Course,Daytime/evening attendance,Previous qualification,Previous qualification (grade),Nacionality,Mother's qualification,Father's qualification,Mother's occupation,Father's occupation,Admission grade,Displaced,Educational special needs,Debtor,Tuition fees up to date,Gender,Scholarship holder,Age at enrollment,International,Curricular units 1st sem (credited),Curricular units 1st sem (enrolled),Curricular units 1st sem (evaluations),Curricular units 1st sem (approved),Curricular units 1st sem (grade),Curricular units 1st sem (without evaluations),Curricular units 2nd sem (credited),Curricular units 2nd sem (enrolled),Curricular units 2nd sem (evaluations),Curricular units 2nd sem (approved),Curricular units 2nd sem (grade),Curricular units 2nd sem (without evaluations),Unemployment rate,Inflation rate,GDP,Target,Marital_single,Marital_family,PQ_default,PQ_higher,PQ_lower,PQ_much_lower,PQ_others,N_Portgal,N_Europe,N_Africa,N_South America,N_Central America,N_Asia,N_Others,Curricular units 1st sem (credited/enrolled),Curricular units 1st sem (approved/enrolled),Curricular units 1st sem (evaluations-without evaluations),Curricular units 2nd sem (credited/enrolled),Curricular units 2nd sem (approved/enrolled),Curricular units 2nd sem (evaluations-without evaluations),Recession,Debtor_Recession_interaction,Scholarship holder_Recession_interaction
0,1,1,1,9238,1,1,126.0,1,1,19,5,5,122.6,0,0,0,1,0,1,18,0,0,6,6,6,14.500000,0,0,6,7,6,12.428571,0,-0.158418,-0.449110,0.933176,Graduate,1,0,1,0,0,0,0,1,0,0,0,0,0,0,0.0,1.000000,0.133333,0.0,1.000000,0.212121,-0.855317,-0.0,-0.855317
1,1,17,1,9238,1,1,125.0,1,19,19,9,9,119.8,1,0,0,1,0,0,18,0,0,6,8,4,11.600000,0,0,6,9,0,0.000000,0,-0.158418,-0.449110,0.933176,Dropout,1,0,1,0,0,0,0,1,0,0,0,0,0,0,0.0,0.666667,0.177778,0.0,0.000000,0.272727,-0.855317,-0.0,-0.000000
2,1,17,2,9254,1,1,137.0,1,3,19,2,3,144.7,0,0,0,1,1,0,18,0,0,6,0,0,0.000000,0,0,6,0,0,0.000000,0,1.763675,-0.663578,-0.372698,Dropout,1,0,1,0,0,0,0,1,0,0,0,0,0,0,0.0,0.000000,0.000000,0.0,0.000000,0.000000,0.424788,0.0,0.000000
3,1,1,3,9500,1,1,131.0,1,19,3,3,2,126.1,1,0,0,1,0,1,18,0,0,7,9,7,12.591250,0,0,8,11,7,12.820000,0,-0.158418,-0.449110,0.933176,Enrolled,1,0,1,0,0,0,0,1,0,0,0,0,0,0,0.0,1.000000,0.200000,0.0,0.875000,0.333333,-0.855317,-0.0,-0.855317
4,1,1,2,9500,1,1,132.0,1,19,37,4,9,120.1,1,0,0,1,0,0,18,0,0,7,12,6,12.933333,0,0,7,12,6,12.933333,0,-1.477502,0.980680,0.178079,Graduate,1,0,1,0,0,0,0,1,0,0,0,0,0,0,0.0,0.857143,0.266667,0.0,0.857143,0.363636,-0.229120,-0.0,-0.000000
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
76513,1,17,1,9254,1,1,121.0,1,19,1,7,5,116.5,1,0,0,1,0,1,18,0,0,6,9,6,10.666667,0,0,6,8,5,10.600000,0,0.896848,-1.092515,0.386841,Graduate,1,0,1,0,0,0,0,1,0,0,0,0,0,0,0.0,1.000000,0.200000,0.0,0.833333,0.242424,-0.313509,-0.0,-0.313509
76514,1,1,6,9254,1,1,125.0,1,1,38,4,9,131.6,1,0,0,1,0,0,19,0,0,6,22,4,13.000000,0,0,6,9,6,13.875000,0,-0.799116,-1.449963,-1.349882,Graduate,1,0,1,0,0,0,0,1,0,0,0,0,0,0,0.0,0.666667,0.488889,0.0,1.000000,0.272727,1.150282,0.0,0.000000
76515,5,17,1,9085,1,1,138.0,1,37,37,9,10,123.3,1,0,0,1,0,0,19,0,0,5,13,4,12.500000,2,0,5,8,5,11.400000,1,-0.799116,-1.449963,-1.349882,Enrolled,0,1,1,0,0,0,0,1,0,0,0,0,0,0,0.0,0.800000,0.122222,0.0,1.000000,0.159091,1.150282,0.0,0.000000
76516,1,1,3,9070,1,1,136.0,1,38,37,5,9,124.8,1,0,0,1,0,0,18,0,0,6,0,0,0.000000,0,0,6,0,0,0.000000,0,-1.477502,0.980680,0.178079,Dropout,1,0,1,0,0,0,0,1,0,0,0,0,0,0,0.0,0.000000,0.000000,0.0,0.000000,0.000000,-0.229120,-0.0,-0.000000


### Test Dataset

In [43]:
# Print the number of unique values for each column
for col in test.columns:
    print(f'{col} has {train[col].nunique()} values')

Marital status has 6 values
Application mode has 22 values
Application order has 8 values
Course has 19 values
Daytime/evening attendance has 2 values
Previous qualification has 21 values
Previous qualification (grade) has 110 values
Nacionality has 18 values
Mother's qualification has 35 values
Father's qualification has 39 values
Mother's occupation has 40 values
Father's occupation has 56 values
Admission grade has 668 values
Displaced has 2 values
Educational special needs has 2 values
Debtor has 2 values
Tuition fees up to date has 2 values
Gender has 2 values
Scholarship holder has 2 values
Age at enrollment has 46 values
International has 2 values
Curricular units 1st sem (credited) has 21 values
Curricular units 1st sem (enrolled) has 24 values
Curricular units 1st sem (evaluations) has 36 values
Curricular units 1st sem (approved) has 23 values
Curricular units 1st sem (grade) has 1206 values
Curricular units 1st sem (without evaluations) has 12 values
Curricular units 2nd sem

In [44]:
custom_one_hot_encoding(test, 'Marital status', ['Marital_single','Marital_family'],[[1,3]])

Unnamed: 0,Marital status,Application mode,Application order,Course,Daytime/evening attendance,Previous qualification,Previous qualification (grade),Nacionality,Mother's qualification,Father's qualification,Mother's occupation,Father's occupation,Admission grade,Displaced,Educational special needs,Debtor,Tuition fees up to date,Gender,Scholarship holder,Age at enrollment,International,Curricular units 1st sem (credited),Curricular units 1st sem (enrolled),Curricular units 1st sem (evaluations),Curricular units 1st sem (approved),Curricular units 1st sem (grade),Curricular units 1st sem (without evaluations),Curricular units 2nd sem (credited),Curricular units 2nd sem (enrolled),Curricular units 2nd sem (evaluations),Curricular units 2nd sem (approved),Curricular units 2nd sem (grade),Curricular units 2nd sem (without evaluations),Unemployment rate,Inflation rate,GDP,Marital_single,Marital_family
0,1,1,1,9500,1,1,141.0,1,3,1,2,3,152.1,0,0,0,0,0,0,18,0,0,7,0,0,0.000000,0,0,8,0,0,0.000000,0,13.9,-0.3,0.79,1,0
1,1,1,1,9238,1,1,128.0,1,1,19,4,5,116.5,0,0,0,1,0,0,19,0,0,6,7,6,14.857143,0,0,6,6,6,13.500000,0,11.1,0.6,2.02,1,0
2,1,1,1,9238,1,1,118.0,1,1,19,4,9,114.2,0,0,0,1,0,1,18,0,0,6,11,6,12.000000,0,0,6,11,5,11.000000,0,15.5,2.8,-4.06,1,0
3,1,44,1,9147,1,39,130.0,1,1,19,3,3,130.0,0,0,0,1,0,1,23,0,2,6,15,5,11.500000,0,3,8,14,5,11.000000,0,8.9,1.4,3.51,1,0
4,1,39,1,9670,1,1,110.0,1,1,37,5,5,106.0,1,0,0,1,0,0,26,0,0,6,9,3,11.000000,0,0,6,9,4,10.666667,2,7.6,2.6,0.32,1,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
51007,1,1,2,171,1,1,128.0,1,38,37,7,10,124.7,1,0,0,1,0,0,19,0,0,0,0,0,0.000000,0,0,0,0,0,0.000000,0,15.5,2.8,-4.06,1,0
51008,2,39,1,9119,1,19,133.1,1,19,37,9,9,140.0,0,0,1,0,1,0,33,0,0,5,6,0,0.000000,0,0,5,5,0,0.000000,0,9.4,-0.8,-3.12,0,1
51009,1,1,1,171,1,1,127.0,1,1,1,4,10,120.4,0,0,1,0,0,0,20,0,0,0,0,0,0.000000,0,0,0,0,0,0.000000,0,15.5,2.8,-4.06,1,0
51010,1,1,3,9773,1,1,132.0,1,19,19,5,5,126.3,1,0,1,0,0,0,18,0,0,6,8,5,12.600000,0,0,6,9,3,13.000000,0,7.6,2.6,0.32,1,0


In [45]:
custom_one_hot_encoding(test, 'Previous qualification', ['PQ_default','PQ_higher','PQ_lower','PQ_much_lower','PQ_others'],[[1],[2,3,4,5,6],[9,10,12,14,15],[19,38]])

Unnamed: 0,Marital status,Application mode,Application order,Course,Daytime/evening attendance,Previous qualification,Previous qualification (grade),Nacionality,Mother's qualification,Father's qualification,Mother's occupation,Father's occupation,Admission grade,Displaced,Educational special needs,Debtor,Tuition fees up to date,Gender,Scholarship holder,Age at enrollment,International,Curricular units 1st sem (credited),Curricular units 1st sem (enrolled),Curricular units 1st sem (evaluations),Curricular units 1st sem (approved),Curricular units 1st sem (grade),Curricular units 1st sem (without evaluations),Curricular units 2nd sem (credited),Curricular units 2nd sem (enrolled),Curricular units 2nd sem (evaluations),Curricular units 2nd sem (approved),Curricular units 2nd sem (grade),Curricular units 2nd sem (without evaluations),Unemployment rate,Inflation rate,GDP,Marital_single,Marital_family,PQ_default,PQ_higher,PQ_lower,PQ_much_lower,PQ_others
0,1,1,1,9500,1,1,141.0,1,3,1,2,3,152.1,0,0,0,0,0,0,18,0,0,7,0,0,0.000000,0,0,8,0,0,0.000000,0,13.9,-0.3,0.79,1,0,1,0,0,0,0
1,1,1,1,9238,1,1,128.0,1,1,19,4,5,116.5,0,0,0,1,0,0,19,0,0,6,7,6,14.857143,0,0,6,6,6,13.500000,0,11.1,0.6,2.02,1,0,1,0,0,0,0
2,1,1,1,9238,1,1,118.0,1,1,19,4,9,114.2,0,0,0,1,0,1,18,0,0,6,11,6,12.000000,0,0,6,11,5,11.000000,0,15.5,2.8,-4.06,1,0,1,0,0,0,0
3,1,44,1,9147,1,39,130.0,1,1,19,3,3,130.0,0,0,0,1,0,1,23,0,2,6,15,5,11.500000,0,3,8,14,5,11.000000,0,8.9,1.4,3.51,1,0,0,0,0,0,1
4,1,39,1,9670,1,1,110.0,1,1,37,5,5,106.0,1,0,0,1,0,0,26,0,0,6,9,3,11.000000,0,0,6,9,4,10.666667,2,7.6,2.6,0.32,1,0,1,0,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
51007,1,1,2,171,1,1,128.0,1,38,37,7,10,124.7,1,0,0,1,0,0,19,0,0,0,0,0,0.000000,0,0,0,0,0,0.000000,0,15.5,2.8,-4.06,1,0,1,0,0,0,0
51008,2,39,1,9119,1,19,133.1,1,19,37,9,9,140.0,0,0,1,0,1,0,33,0,0,5,6,0,0.000000,0,0,5,5,0,0.000000,0,9.4,-0.8,-3.12,0,1,0,0,0,1,0
51009,1,1,1,171,1,1,127.0,1,1,1,4,10,120.4,0,0,1,0,0,0,20,0,0,0,0,0,0.000000,0,0,0,0,0,0.000000,0,15.5,2.8,-4.06,1,0,1,0,0,0,0
51010,1,1,3,9773,1,1,132.0,1,19,19,5,5,126.3,1,0,1,0,0,0,18,0,0,6,8,5,12.600000,0,0,6,9,3,13.000000,0,7.6,2.6,0.32,1,0,1,0,0,0,0


In [46]:
custom_one_hot_encoding(test, 'Nacionality', ['N_Portgal','N_Europe','N_Africa','N_South America','N_Central America','N_Asia','N_Others'],[[1],[2,6,11,13,14,17,62,100,103,105],[21,22,24,25,26],[41,109],[101,108],[32]])

Unnamed: 0,Marital status,Application mode,Application order,Course,Daytime/evening attendance,Previous qualification,Previous qualification (grade),Nacionality,Mother's qualification,Father's qualification,Mother's occupation,Father's occupation,Admission grade,Displaced,Educational special needs,Debtor,Tuition fees up to date,Gender,Scholarship holder,Age at enrollment,International,Curricular units 1st sem (credited),Curricular units 1st sem (enrolled),Curricular units 1st sem (evaluations),Curricular units 1st sem (approved),Curricular units 1st sem (grade),Curricular units 1st sem (without evaluations),Curricular units 2nd sem (credited),Curricular units 2nd sem (enrolled),Curricular units 2nd sem (evaluations),Curricular units 2nd sem (approved),Curricular units 2nd sem (grade),Curricular units 2nd sem (without evaluations),Unemployment rate,Inflation rate,GDP,Marital_single,Marital_family,PQ_default,PQ_higher,PQ_lower,PQ_much_lower,PQ_others,N_Portgal,N_Europe,N_Africa,N_South America,N_Central America,N_Asia,N_Others
0,1,1,1,9500,1,1,141.0,1,3,1,2,3,152.1,0,0,0,0,0,0,18,0,0,7,0,0,0.000000,0,0,8,0,0,0.000000,0,13.9,-0.3,0.79,1,0,1,0,0,0,0,1,0,0,0,0,0,0
1,1,1,1,9238,1,1,128.0,1,1,19,4,5,116.5,0,0,0,1,0,0,19,0,0,6,7,6,14.857143,0,0,6,6,6,13.500000,0,11.1,0.6,2.02,1,0,1,0,0,0,0,1,0,0,0,0,0,0
2,1,1,1,9238,1,1,118.0,1,1,19,4,9,114.2,0,0,0,1,0,1,18,0,0,6,11,6,12.000000,0,0,6,11,5,11.000000,0,15.5,2.8,-4.06,1,0,1,0,0,0,0,1,0,0,0,0,0,0
3,1,44,1,9147,1,39,130.0,1,1,19,3,3,130.0,0,0,0,1,0,1,23,0,2,6,15,5,11.500000,0,3,8,14,5,11.000000,0,8.9,1.4,3.51,1,0,0,0,0,0,1,1,0,0,0,0,0,0
4,1,39,1,9670,1,1,110.0,1,1,37,5,5,106.0,1,0,0,1,0,0,26,0,0,6,9,3,11.000000,0,0,6,9,4,10.666667,2,7.6,2.6,0.32,1,0,1,0,0,0,0,1,0,0,0,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
51007,1,1,2,171,1,1,128.0,1,38,37,7,10,124.7,1,0,0,1,0,0,19,0,0,0,0,0,0.000000,0,0,0,0,0,0.000000,0,15.5,2.8,-4.06,1,0,1,0,0,0,0,1,0,0,0,0,0,0
51008,2,39,1,9119,1,19,133.1,1,19,37,9,9,140.0,0,0,1,0,1,0,33,0,0,5,6,0,0.000000,0,0,5,5,0,0.000000,0,9.4,-0.8,-3.12,0,1,0,0,0,1,0,1,0,0,0,0,0,0
51009,1,1,1,171,1,1,127.0,1,1,1,4,10,120.4,0,0,1,0,0,0,20,0,0,0,0,0,0.000000,0,0,0,0,0,0.000000,0,15.5,2.8,-4.06,1,0,1,0,0,0,0,1,0,0,0,0,0,0
51010,1,1,3,9773,1,1,132.0,1,19,19,5,5,126.3,1,0,1,0,0,0,18,0,0,6,8,5,12.600000,0,0,6,9,3,13.000000,0,7.6,2.6,0.32,1,0,1,0,0,0,0,1,0,0,0,0,0,0


In [47]:
test['Curricular units 1st sem (credited/enrolled)'] = test['Curricular units 1st sem (credited)']/test['Curricular units 1st sem (enrolled)']
test['Curricular units 1st sem (credited/enrolled)'] = test['Curricular units 1st sem (credited/enrolled)'].fillna(0)

In [48]:
test['Curricular units 1st sem (approved/enrolled)'] = test['Curricular units 1st sem (approved)']/test['Curricular units 1st sem (enrolled)']
test['Curricular units 1st sem (approved/enrolled)'] = test['Curricular units 1st sem (approved/enrolled)'].fillna(0)

In [49]:
test.head()

Unnamed: 0,Marital status,Application mode,Application order,Course,Daytime/evening attendance,Previous qualification,Previous qualification (grade),Nacionality,Mother's qualification,Father's qualification,Mother's occupation,Father's occupation,Admission grade,Displaced,Educational special needs,Debtor,Tuition fees up to date,Gender,Scholarship holder,Age at enrollment,International,Curricular units 1st sem (credited),Curricular units 1st sem (enrolled),Curricular units 1st sem (evaluations),Curricular units 1st sem (approved),Curricular units 1st sem (grade),Curricular units 1st sem (without evaluations),Curricular units 2nd sem (credited),Curricular units 2nd sem (enrolled),Curricular units 2nd sem (evaluations),Curricular units 2nd sem (approved),Curricular units 2nd sem (grade),Curricular units 2nd sem (without evaluations),Unemployment rate,Inflation rate,GDP,Marital_single,Marital_family,PQ_default,PQ_higher,PQ_lower,PQ_much_lower,PQ_others,N_Portgal,N_Europe,N_Africa,N_South America,N_Central America,N_Asia,N_Others,Curricular units 1st sem (credited/enrolled),Curricular units 1st sem (approved/enrolled)
0,1,1,1,9500,1,1,141.0,1,3,1,2,3,152.1,0,0,0,0,0,0,18,0,0,7,0,0,0.0,0,0,8,0,0,0.0,0,13.9,-0.3,0.79,1,0,1,0,0,0,0,1,0,0,0,0,0,0,0.0,0.0
1,1,1,1,9238,1,1,128.0,1,1,19,4,5,116.5,0,0,0,1,0,0,19,0,0,6,7,6,14.857143,0,0,6,6,6,13.5,0,11.1,0.6,2.02,1,0,1,0,0,0,0,1,0,0,0,0,0,0,0.0,1.0
2,1,1,1,9238,1,1,118.0,1,1,19,4,9,114.2,0,0,0,1,0,1,18,0,0,6,11,6,12.0,0,0,6,11,5,11.0,0,15.5,2.8,-4.06,1,0,1,0,0,0,0,1,0,0,0,0,0,0,0.0,1.0
3,1,44,1,9147,1,39,130.0,1,1,19,3,3,130.0,0,0,0,1,0,1,23,0,2,6,15,5,11.5,0,3,8,14,5,11.0,0,8.9,1.4,3.51,1,0,0,0,0,0,1,1,0,0,0,0,0,0,0.333333,0.833333
4,1,39,1,9670,1,1,110.0,1,1,37,5,5,106.0,1,0,0,1,0,0,26,0,0,6,9,3,11.0,0,0,6,9,4,10.666667,2,7.6,2.6,0.32,1,0,1,0,0,0,0,1,0,0,0,0,0,0,0.0,0.5


In [50]:
scaler = MinMaxScaler()

test['Evaluations_scaled'] = scaler.fit_transform(test[['Curricular units 1st sem (evaluations)']])
test['Without_evaluations_scaled'] = scaler.fit_transform(test[['Curricular units 1st sem (without evaluations)']])

test['Curricular units 1st sem (evaluations-without evaluations)'] = test['Evaluations_scaled'] - test['Without_evaluations_scaled']

test.drop(['Evaluations_scaled', 'Without_evaluations_scaled'], axis=1, inplace=True)

In [51]:
test['Curricular units 2nd sem (credited/enrolled)'] = test['Curricular units 2nd sem (credited)']/test['Curricular units 2nd sem (enrolled)']
test['Curricular units 2nd sem (credited/enrolled)'] = test['Curricular units 2nd sem (credited/enrolled)'].fillna(0)

In [52]:
test['Curricular units 2nd sem (approved/enrolled)'] = test['Curricular units 2nd sem (approved)']/test['Curricular units 2nd sem (enrolled)']
test['Curricular units 2nd sem (approved/enrolled)'] = test['Curricular units 2nd sem (approved/enrolled)'].fillna(0)

In [53]:
scaler = MinMaxScaler()

test['Evaluations_scaled'] = scaler.fit_transform(test[['Curricular units 2nd sem (evaluations)']])
test['Without_evaluations_scaled'] = scaler.fit_transform(test[['Curricular units 2nd sem (without evaluations)']])

test['Curricular units 2nd sem (evaluations-without evaluations)'] = test['Evaluations_scaled'] - test['Without_evaluations_scaled']

test.drop(['Evaluations_scaled', 'Without_evaluations_scaled'], axis=1, inplace=True)

In [54]:
test.head()

Unnamed: 0,Marital status,Application mode,Application order,Course,Daytime/evening attendance,Previous qualification,Previous qualification (grade),Nacionality,Mother's qualification,Father's qualification,Mother's occupation,Father's occupation,Admission grade,Displaced,Educational special needs,Debtor,Tuition fees up to date,Gender,Scholarship holder,Age at enrollment,International,Curricular units 1st sem (credited),Curricular units 1st sem (enrolled),Curricular units 1st sem (evaluations),Curricular units 1st sem (approved),Curricular units 1st sem (grade),Curricular units 1st sem (without evaluations),Curricular units 2nd sem (credited),Curricular units 2nd sem (enrolled),Curricular units 2nd sem (evaluations),Curricular units 2nd sem (approved),Curricular units 2nd sem (grade),Curricular units 2nd sem (without evaluations),Unemployment rate,Inflation rate,GDP,Marital_single,Marital_family,PQ_default,PQ_higher,PQ_lower,PQ_much_lower,PQ_others,N_Portgal,N_Europe,N_Africa,N_South America,N_Central America,N_Asia,N_Others,Curricular units 1st sem (credited/enrolled),Curricular units 1st sem (approved/enrolled),Curricular units 1st sem (evaluations-without evaluations),Curricular units 2nd sem (credited/enrolled),Curricular units 2nd sem (approved/enrolled),Curricular units 2nd sem (evaluations-without evaluations)
0,1,1,1,9500,1,1,141.0,1,3,1,2,3,152.1,0,0,0,0,0,0,18,0,0,7,0,0,0.0,0,0,8,0,0,0.0,0,13.9,-0.3,0.79,1,0,1,0,0,0,0,1,0,0,0,0,0,0,0.0,0.0,0.0,0.0,0.0,0.0
1,1,1,1,9238,1,1,128.0,1,1,19,4,5,116.5,0,0,0,1,0,0,19,0,0,6,7,6,14.857143,0,0,6,6,6,13.5,0,11.1,0.6,2.02,1,0,1,0,0,0,0,1,0,0,0,0,0,0,0.0,1.0,0.155556,0.0,1.0,0.181818
2,1,1,1,9238,1,1,118.0,1,1,19,4,9,114.2,0,0,0,1,0,1,18,0,0,6,11,6,12.0,0,0,6,11,5,11.0,0,15.5,2.8,-4.06,1,0,1,0,0,0,0,1,0,0,0,0,0,0,0.0,1.0,0.244444,0.0,0.833333,0.333333
3,1,44,1,9147,1,39,130.0,1,1,19,3,3,130.0,0,0,0,1,0,1,23,0,2,6,15,5,11.5,0,3,8,14,5,11.0,0,8.9,1.4,3.51,1,0,0,0,0,0,1,1,0,0,0,0,0,0,0.333333,0.833333,0.333333,0.375,0.625,0.424242
4,1,39,1,9670,1,1,110.0,1,1,37,5,5,106.0,1,0,0,1,0,0,26,0,0,6,9,3,11.0,0,0,6,9,4,10.666667,2,7.6,2.6,0.32,1,0,1,0,0,0,0,1,0,0,0,0,0,0,0.0,0.5,0.2,0.0,0.666667,0.072727


In [55]:
sc = StandardScaler()
columns_to_scale = ['Unemployment rate', 'Inflation rate', 'GDP']
test[columns_to_scale] = sc.fit_transform(test[columns_to_scale])
fa = FactorAnalysis(n_components=1, max_iter=5000)
fitted = fa.fit_transform(test[columns_to_scale])

print(fitted)
print(fitted.shape)

[[-0.29605949]
 [-0.83752195]
 [ 1.66915403]
 ...
 [ 1.66915403]
 [-0.24542084]
 [-0.24542084]]
(51012, 1)


In [56]:
scaler = MinMaxScaler()

train['Evaluations_scaled'] = scaler.fit_transform(train[['Curricular units 1st sem (evaluations)']])
train['Without_evaluations_scaled'] = scaler.fit_transform(train[['Curricular units 1st sem (without evaluations)']])

train['Curricular units 1st sem (evaluations-without evaluations)'] = train['Evaluations_scaled'] - train['Without_evaluations_scaled']

train.drop(['Evaluations_scaled', 'Without_evaluations_scaled'], axis=1, inplace=True)

In [57]:
Factor_loading_matrix = fa.components_.T

pd.DataFrame(Factor_loading_matrix,
             columns=["第1因子"],
             index=[test[columns_to_scale].columns])

Unnamed: 0,第1因子
Unemployment rate,0.330421
Inflation rate,0.089217
GDP,-0.897467


In [58]:
test['Recession'] = fitted
test.head()

Unnamed: 0,Marital status,Application mode,Application order,Course,Daytime/evening attendance,Previous qualification,Previous qualification (grade),Nacionality,Mother's qualification,Father's qualification,Mother's occupation,Father's occupation,Admission grade,Displaced,Educational special needs,Debtor,Tuition fees up to date,Gender,Scholarship holder,Age at enrollment,International,Curricular units 1st sem (credited),Curricular units 1st sem (enrolled),Curricular units 1st sem (evaluations),Curricular units 1st sem (approved),Curricular units 1st sem (grade),Curricular units 1st sem (without evaluations),Curricular units 2nd sem (credited),Curricular units 2nd sem (enrolled),Curricular units 2nd sem (evaluations),Curricular units 2nd sem (approved),Curricular units 2nd sem (grade),Curricular units 2nd sem (without evaluations),Unemployment rate,Inflation rate,GDP,Marital_single,Marital_family,PQ_default,PQ_higher,PQ_lower,PQ_much_lower,PQ_others,N_Portgal,N_Europe,N_Africa,N_South America,N_Central America,N_Asia,N_Others,Curricular units 1st sem (credited/enrolled),Curricular units 1st sem (approved/enrolled),Curricular units 1st sem (evaluations-without evaluations),Curricular units 2nd sem (credited/enrolled),Curricular units 2nd sem (approved/enrolled),Curricular units 2nd sem (evaluations-without evaluations),Recession
0,1,1,1,9500,1,1,141.0,1,3,1,2,3,152.1,0,0,0,0,0,0,18,0,0,7,0,0,0.0,0,0,8,0,0,0.0,0,0.897514,-1.089794,0.389264,1,0,1,0,0,0,0,1,0,0,0,0,0,0,0.0,0.0,0.0,0.0,0.0,0.0,-0.296059
1,1,1,1,9238,1,1,128.0,1,1,19,4,5,116.5,0,0,0,1,0,0,19,0,0,6,7,6,14.857143,0,0,6,6,6,13.5,0,-0.158656,-0.448201,0.935535,1,0,1,0,0,0,0,1,0,0,0,0,0,0,0.0,1.0,0.155556,0.0,1.0,0.181818,-0.837522
2,1,1,1,9238,1,1,118.0,1,1,19,4,9,114.2,0,0,0,1,0,1,18,0,0,6,11,6,12.0,0,0,6,11,5,11.0,0,1.50104,1.120136,-1.764733,1,0,1,0,0,0,0,1,0,0,0,0,0,0,0.0,1.0,0.244444,0.0,0.833333,0.333333,1.669154
3,1,44,1,9147,1,39,130.0,1,1,19,3,3,130.0,0,0,0,1,0,1,23,0,2,6,15,5,11.5,0,3,8,14,5,11.0,0,-0.988505,0.122103,1.597278,1,0,0,0,0,0,1,1,0,0,0,0,0,0,0.333333,0.833333,0.333333,0.375,0.625,0.424242,-1.465317
4,1,39,1,9670,1,1,110.0,1,1,37,5,5,106.0,1,0,0,1,0,0,26,0,0,6,9,3,11.0,0,0,6,9,4,10.666667,2,-1.47887,0.97756,0.180526,1,0,1,0,0,0,0,1,0,0,0,0,0,0,0.0,0.5,0.2,0.0,0.666667,0.072727,-0.245421


In [59]:
test['Debtor_Recession_interaction'] = test['Debtor'] * test['Recession']

In [60]:
test['Scholarship holder_Recession_interaction'] = test['Scholarship holder'] * test['Recession']

In [61]:
test

Unnamed: 0,Marital status,Application mode,Application order,Course,Daytime/evening attendance,Previous qualification,Previous qualification (grade),Nacionality,Mother's qualification,Father's qualification,Mother's occupation,Father's occupation,Admission grade,Displaced,Educational special needs,Debtor,Tuition fees up to date,Gender,Scholarship holder,Age at enrollment,International,Curricular units 1st sem (credited),Curricular units 1st sem (enrolled),Curricular units 1st sem (evaluations),Curricular units 1st sem (approved),Curricular units 1st sem (grade),Curricular units 1st sem (without evaluations),Curricular units 2nd sem (credited),Curricular units 2nd sem (enrolled),Curricular units 2nd sem (evaluations),Curricular units 2nd sem (approved),Curricular units 2nd sem (grade),Curricular units 2nd sem (without evaluations),Unemployment rate,Inflation rate,GDP,Marital_single,Marital_family,PQ_default,PQ_higher,PQ_lower,PQ_much_lower,PQ_others,N_Portgal,N_Europe,N_Africa,N_South America,N_Central America,N_Asia,N_Others,Curricular units 1st sem (credited/enrolled),Curricular units 1st sem (approved/enrolled),Curricular units 1st sem (evaluations-without evaluations),Curricular units 2nd sem (credited/enrolled),Curricular units 2nd sem (approved/enrolled),Curricular units 2nd sem (evaluations-without evaluations),Recession,Debtor_Recession_interaction,Scholarship holder_Recession_interaction
0,1,1,1,9500,1,1,141.0,1,3,1,2,3,152.1,0,0,0,0,0,0,18,0,0,7,0,0,0.000000,0,0,8,0,0,0.000000,0,0.897514,-1.089794,0.389264,1,0,1,0,0,0,0,1,0,0,0,0,0,0,0.000000,0.000000,0.000000,0.000,0.000000,0.000000,-0.296059,-0.000000,-0.000000
1,1,1,1,9238,1,1,128.0,1,1,19,4,5,116.5,0,0,0,1,0,0,19,0,0,6,7,6,14.857143,0,0,6,6,6,13.500000,0,-0.158656,-0.448201,0.935535,1,0,1,0,0,0,0,1,0,0,0,0,0,0,0.000000,1.000000,0.155556,0.000,1.000000,0.181818,-0.837522,-0.000000,-0.000000
2,1,1,1,9238,1,1,118.0,1,1,19,4,9,114.2,0,0,0,1,0,1,18,0,0,6,11,6,12.000000,0,0,6,11,5,11.000000,0,1.501040,1.120136,-1.764733,1,0,1,0,0,0,0,1,0,0,0,0,0,0,0.000000,1.000000,0.244444,0.000,0.833333,0.333333,1.669154,0.000000,1.669154
3,1,44,1,9147,1,39,130.0,1,1,19,3,3,130.0,0,0,0,1,0,1,23,0,2,6,15,5,11.500000,0,3,8,14,5,11.000000,0,-0.988505,0.122103,1.597278,1,0,0,0,0,0,1,1,0,0,0,0,0,0,0.333333,0.833333,0.333333,0.375,0.625000,0.424242,-1.465317,-0.000000,-1.465317
4,1,39,1,9670,1,1,110.0,1,1,37,5,5,106.0,1,0,0,1,0,0,26,0,0,6,9,3,11.000000,0,0,6,9,4,10.666667,2,-1.478870,0.977560,0.180526,1,0,1,0,0,0,0,1,0,0,0,0,0,0,0.000000,0.500000,0.200000,0.000,0.666667,0.072727,-0.245421,-0.000000,-0.000000
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
51007,1,1,2,171,1,1,128.0,1,38,37,7,10,124.7,1,0,0,1,0,0,19,0,0,0,0,0,0.000000,0,0,0,0,0,0.000000,0,1.501040,1.120136,-1.764733,1,0,1,0,0,0,0,1,0,0,0,0,0,0,0.000000,0.000000,0.000000,0.000,0.000000,0.000000,1.669154,0.000000,0.000000
51008,2,39,1,9119,1,19,133.1,1,19,37,9,9,140.0,0,0,1,0,1,0,33,0,0,5,6,0,0.000000,0,0,5,5,0,0.000000,0,-0.799903,-1.446235,-1.347257,0,1,0,0,0,1,0,1,0,0,0,0,0,0,0.000000,0.000000,0.133333,0.000,0.000000,0.151515,1.097992,1.097992,0.000000
51009,1,1,1,171,1,1,127.0,1,1,1,4,10,120.4,0,0,1,0,0,0,20,0,0,0,0,0,0.000000,0,0,0,0,0,0.000000,0,1.501040,1.120136,-1.764733,1,0,1,0,0,0,0,1,0,0,0,0,0,0,0.000000,0.000000,0.000000,0.000,0.000000,0.000000,1.669154,1.669154,0.000000
51010,1,1,3,9773,1,1,132.0,1,19,19,5,5,126.3,1,0,1,0,0,0,18,0,0,6,8,5,12.600000,0,0,6,9,3,13.000000,0,-1.478870,0.977560,0.180526,1,0,1,0,0,0,0,1,0,0,0,0,0,0,0.000000,0.833333,0.177778,0.000,0.500000,0.272727,-0.245421,-0.245421,-0.000000


## 3.Modeling

In [62]:
label_encoder = LabelEncoder()
train['Target'] = label_encoder.fit_transform(train['Target'])

In [63]:
X_train = train.drop(columns=['Target'])
y_train = train['Target']
X_test = test

In [64]:
def cross_validate_model(model, X_train, y_train, params, n_splits=10):
    """
    Performs K-Fold cross-validation for a given model, returns the last model and average validation accuracy.

    Parameters:
        model: Machine learning model class (e.g., RandomForestClassifier)
        X_train: Training feature dataset
        y_train: Training target dataset
        params: Dictionary of parameters to initialize the model
        n_splits: Number of folds for cross-validation (default: 10)

    Returns:
        last_model: The last trained model instance
        average_val_accuracy: Average validation accuracy over all folds
    """
    # Initialize variables
    cv = KFold(n_splits=n_splits, shuffle=True, random_state=0)
    val_scores = []

    # Cross-validation loop
    for fold, (train_ind, valid_ind) in enumerate(cv.split(X_train)):
        # Data splitting
        X_fold_train = X_train.iloc[train_ind]
        y_fold_train = y_train.iloc[train_ind]
        X_val = X_train.iloc[valid_ind]
        y_val = y_train.iloc[valid_ind]

        # Model initialization and training
        clf = model(**params)
        clf.fit(X_fold_train, y_fold_train)

        # Predict and evaluate
        y_pred_trn = clf.predict(X_fold_train)
        y_pred_val = clf.predict(X_val)
        train_acc = accuracy_score(y_fold_train, y_pred_trn)
        val_acc = accuracy_score(y_val, y_pred_val)
        print(f"Fold: {fold}, Train Accuracy: {train_acc:.5f}, Val Accuracy: {val_acc:.5f}")
        print("-" * 50)

        # Accumulate validation scores
        val_scores.append(val_acc)

    # Calculate the average validation score
    average_val_accuracy = np.mean(val_scores)
    print("Average Validation Accuracy:", average_val_accuracy)

    return clf, average_val_accuracy

In [65]:
from catboost import CatBoostClassifier

cb_params = {
    'depth': 9,
    'verbose': False,
    'thread_count': -1,
    'iterations': 1830,
    'border_count': 200,
    'random_state': SEED,
    'min_child_samples': 4,
    'bootstrap_type': 'MVS',
    'grow_policy': 'Depthwise',
    'l2_leaf_reg': 4.349614487163372,
    'random_strength': 0.16489500172653238,
    'colsample_bylevel': 0.45740812601887504,
}

print('CatBoost Cross-Validation Results:\n')
cat_model, cat_mean_accuracy = cross_validate_model(CatBoostClassifier, X_train, y_train, cb_params)

CatBoost Cross-Validation Results:

Fold: 0, Train Accuracy: 0.90291, Val Accuracy: 0.83547
--------------------------------------------------
Fold: 1, Train Accuracy: 0.90524, Val Accuracy: 0.82880
--------------------------------------------------
Fold: 2, Train Accuracy: 0.90268, Val Accuracy: 0.83351
--------------------------------------------------
Fold: 3, Train Accuracy: 0.90268, Val Accuracy: 0.82658
--------------------------------------------------
Fold: 4, Train Accuracy: 0.90426, Val Accuracy: 0.83011
--------------------------------------------------
Fold: 5, Train Accuracy: 0.90586, Val Accuracy: 0.83873
--------------------------------------------------
Fold: 6, Train Accuracy: 0.90500, Val Accuracy: 0.83325
--------------------------------------------------
Fold: 7, Train Accuracy: 0.90249, Val Accuracy: 0.83939
--------------------------------------------------
Fold: 8, Train Accuracy: 0.90428, Val Accuracy: 0.83139
--------------------------------------------------
F

In [66]:
cat_preds = cat_model.predict(X_test)
cat_preds_labels = label_encoder.inverse_transform(cat_preds)

cat_result = pd.DataFrame(X_test.index)
cat_result['Target'] = cat_preds_labels
cat_result.to_csv('submission_cat.csv', index=False)

from google.colab import files
files.download('submission_cat.csv')

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

## 4. Feature Importance

In [67]:
def plot_feature_importances(model, model_name, color_scale='Picnic', dataframe=None):
    """
    Plots feature importances of a fitted random forest model.

    Parameters:
    model (RandomForest model): The trained random forest model.
    color_scale (str): Color scale for the plot.
    dataframe (pd.DataFrame): DataFrame used to train the model. Must not be None.

    Returns:
    Plotly Figure: A plot showing feature importances.
    """
    if dataframe is None:
        raise ValueError("Dataframe cannot be None and must contain the feature names.")

    # Extracting feature importances and sorting them
    importances = model.feature_importances_
    indices = np.argsort(importances)[::-1]
    feature_names = dataframe.columns

    # Creating a DataFrame for the importances
    feature_importances = pd.DataFrame({
        'Feature': feature_names[indices],
        'Importance': importances[indices]
    })

    # Plotting the feature importances
    fig = px.bar(feature_importances.sort_values('Importance', ascending=True),
                 x='Importance',
                 y='Feature',
                 title=f"Feature Importances in {model_name}",
                 labels={'Importance': 'Importance', 'Feature': 'Feature'},
                 height=1400,
                 color='Importance',
                 color_continuous_scale=color_scale)

    fig.update_layout(xaxis_title='Importance', yaxis_title='Feature')

    return fig

In [68]:
import plotly.express as px

model_name = 'CatBoost'
fig = plot_feature_importances(cat_model, model_name, 'Picnic', X_train)
fig.show()