In [1]:
from scipy.stats import randint

import pandas as pd
from sklearn.preprocessing import LabelEncoder
from sklearn.model_selection import train_test_split, StratifiedKFold
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score

In [2]:
# 데이터 로드

train = pd.read_csv('dataset/학업위험 예측(다중분류)/train.csv')
test = pd.read_csv('dataset/학업위험 예측(다중분류)/test.csv')

In [3]:
train.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 76518 entries, 0 to 76517
Data columns (total 38 columns):
 #   Column                                          Non-Null Count  Dtype  
---  ------                                          --------------  -----  
 0   id                                              76518 non-null  int64  
 1   Marital status                                  76518 non-null  int64  
 2   Application mode                                76518 non-null  int64  
 3   Application order                               76518 non-null  int64  
 4   Course                                          76518 non-null  int64  
 5   Daytime/evening attendance                      76518 non-null  int64  
 6   Previous qualification                          76518 non-null  int64  
 7   Previous qualification (grade)                  76518 non-null  float64
 8   Nacionality                                     76518 non-null  int64  
 9   Mother's qualification                 

In [4]:
test.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 51012 entries, 0 to 51011
Data columns (total 37 columns):
 #   Column                                          Non-Null Count  Dtype  
---  ------                                          --------------  -----  
 0   id                                              51012 non-null  int64  
 1   Marital status                                  51012 non-null  int64  
 2   Application mode                                51012 non-null  int64  
 3   Application order                               51012 non-null  int64  
 4   Course                                          51012 non-null  int64  
 5   Daytime/evening attendance                      51012 non-null  int64  
 6   Previous qualification                          51012 non-null  int64  
 7   Previous qualification (grade)                  51012 non-null  float64
 8   Nacionality                                     51012 non-null  int64  
 9   Mother's qualification                 

In [5]:
train.head(2)

Unnamed: 0,id,Marital status,Application mode,Application order,Course,Daytime/evening attendance,Previous qualification,Previous qualification (grade),Nacionality,Mother's qualification,...,Curricular units 2nd sem (credited),Curricular units 2nd sem (enrolled),Curricular units 2nd sem (evaluations),Curricular units 2nd sem (approved),Curricular units 2nd sem (grade),Curricular units 2nd sem (without evaluations),Unemployment rate,Inflation rate,GDP,Target
0,0,1,1,1,9238,1,1,126.0,1,1,...,0,6,7,6,12.428571,0,11.1,0.6,2.02,Graduate
1,1,1,17,1,9238,1,1,125.0,1,19,...,0,6,9,0,0.0,0,11.1,0.6,2.02,Dropout


In [6]:
test.head(2)

Unnamed: 0,id,Marital status,Application mode,Application order,Course,Daytime/evening attendance,Previous qualification,Previous qualification (grade),Nacionality,Mother's qualification,...,Curricular units 1st sem (without evaluations),Curricular units 2nd sem (credited),Curricular units 2nd sem (enrolled),Curricular units 2nd sem (evaluations),Curricular units 2nd sem (approved),Curricular units 2nd sem (grade),Curricular units 2nd sem (without evaluations),Unemployment rate,Inflation rate,GDP
0,76518,1,1,1,9500,1,1,141.0,1,3,...,0,0,8,0,0,0.0,0,13.9,-0.3,0.79
1,76519,1,1,1,9238,1,1,128.0,1,1,...,0,0,6,6,6,13.5,0,11.1,0.6,2.02


In [7]:
train.isnull().sum().sum()

0

In [8]:
test.isnull().sum().sum()

0

In [9]:
train['Target'].unique()

array(['Graduate', 'Dropout', 'Enrolled'], dtype=object)

In [10]:
# 종속변수 레이블인코딩 - 굳이 안 해도 됨. 모델 피팅 시 자동처리.

# label_encoder = LabelEncoder()
# train['Target'] = label_encoder.fit_transform(train['Target'])

In [11]:
train.head()

Unnamed: 0,id,Marital status,Application mode,Application order,Course,Daytime/evening attendance,Previous qualification,Previous qualification (grade),Nacionality,Mother's qualification,...,Curricular units 2nd sem (credited),Curricular units 2nd sem (enrolled),Curricular units 2nd sem (evaluations),Curricular units 2nd sem (approved),Curricular units 2nd sem (grade),Curricular units 2nd sem (without evaluations),Unemployment rate,Inflation rate,GDP,Target
0,0,1,1,1,9238,1,1,126.0,1,1,...,0,6,7,6,12.428571,0,11.1,0.6,2.02,Graduate
1,1,1,17,1,9238,1,1,125.0,1,19,...,0,6,9,0,0.0,0,11.1,0.6,2.02,Dropout
2,2,1,17,2,9254,1,1,137.0,1,3,...,0,6,0,0,0.0,0,16.2,0.3,-0.92,Dropout
3,3,1,1,3,9500,1,1,131.0,1,19,...,0,8,11,7,12.82,0,11.1,0.6,2.02,Enrolled
4,4,1,1,2,9500,1,1,132.0,1,19,...,0,7,12,6,12.933333,0,7.6,2.6,0.32,Graduate


In [12]:
# 데이터 분리
# kfold 수행 시 DataFrame 객체를 numpy 객체로 변환 필수
X = train.drop(['id', 'Target'], axis=1).to_numpy()
y = train['Target'].to_numpy()
test_id = test.pop('id').to_numpy()

In [13]:
# # 종속변수 레이블인코딩
# label_encoder = LabelEncoder()
# y = label_encoder.fit_transform(y)

In [14]:
# 학습 데이터 피팅
random_state = 42
model = RandomForestClassifier(random_state=random_state)

In [15]:
skf = StratifiedKFold(n_splits=5, shuffle=True, random_state=random_state)

In [16]:
best_model = ''
best_acc = 0

In [17]:
for train_idx, val_idx in skf.split(X, y):  # kfold 수행 시 DataFrame 객체를 numpy 객체로 변환 필수
    print(train_idx, val_idx)
    X_train, X_val = X[train_idx], X[val_idx]
    y_train, y_val = y[train_idx], y[val_idx]
    
    model.fit(X_train, y_train)
    
    preds_val = model.predict(X_val)
    
    acc_val = accuracy_score(y_val, preds_val)
    print(acc_val)
    if best_acc < acc_val:
        best_acc = acc_val
        best_model = model
        
best_acc

[    0     2     3 ... 76515 76516 76517] [    1     4     5 ... 76508 76512 76514]
0.8280841610036592
[    1     2     4 ... 76515 76516 76517] [    0     3     7 ... 76478 76500 76513]
0.8268426555148981
[    0     1     2 ... 76514 76515 76516] [    9    11    14 ... 76509 76511 76517]
0.8274307370622059
[    0     1     3 ... 76514 76515 76517] [    2     6    36 ... 76505 76510 76516]
0.8225838070966477
[    0     1     2 ... 76514 76516 76517] [   17    18    21 ... 76493 76507 76515]
0.8250016336666013


0.8280841610036592

In [18]:
# 테스트 데이터 추론
preds_test = best_model.predict(test)
# preds_test = label_encoder.inverse_transform(preds_test)  # 정수 값으로 레이블인코딩 된 종속변수를 카테코리 데이터로 변환

  f"X has feature names, but {self.__class__.__name__} was fitted without"


In [19]:
# 제출
submission = pd.DataFrame({'id': test_id, 'Target': preds_test})
submission.to_csv('submission.csv', index=False)

In [20]:
# no                 : 0.828541
# minmaxscaler       : 0.827365
# standardscaler     : 0.827365
# robustscaler       : 0.828084
# skf                : 0.828084