# 1st PCEO AI CLUB competition - Official Baseline
## 1회 PCEO AI CLUB 대회 - 공식 베이스라인

안녕하세요! 이번 대회 Host 김동규입니다. 이번 대회는 8월에 있었던 AI 집중 교육의 연장선상으로 classical ML 데이터를 이용한 대회입니다. 집중교육 때는 숫자로 이루어진 데이터만 사용했다면, 이번 대회는 대부분 문자로 된 데이터가 중심이 됩니다. categorical variable을 어떻게 잘 변형하고 새로운 feature로 만들어내는지가 중요한 대회입니다.

아래는 간단한 베이스라인입니다. 대회에 submission 할 수 있는 최소한의 코드로 구성되어 있습니다. 아래 코드를 기반으로 하여 점수를 향상하고 리더보드의 상단에 위치하여 보세요!

**리더보드 1등에게 소정의 상품이 주어집니다**

### 데이터 및 라이브러리 로드

In [1]:
import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)
import os
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestRegressor
from sklearn.preprocessing import OrdinalEncoder
from sklearn.metrics import mean_absolute_error
from lightgbm import LGBMRegressor

In [2]:
# Every table is indexed by student_id, the identifier column that is unique
# to each student. Always keep index_col="student_id" when loading these files.
train = pd.read_csv(
    '../input/pceo-ai-club1/train.csv',
    index_col="student_id",
)
test = pd.read_csv(
    '../input/pceo-ai-club1/test.csv',
    index_col="student_id",
)
sample_submission = pd.read_csv(
    '../input/pceo-ai-club1/sample_submission.csv',
    index_col="student_id",
)

### Super Simple EDA

In [3]:
# Preview the first five rows of the training table.
train.head(n=5)

Unnamed: 0_level_0,school,school_setting,school_type,classroom,teaching_method,n_student,gender,lunch,pretest
student_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1
W20C6,GOOBU,Urban,Public,HKF,Standard,28.0,Female,Qualifies for reduced/free lunch,34.0
A3JJP,GOOBU,Urban,Public,W8A,Experimental,26.0,Female,Qualifies for reduced/free lunch,36.0
UBDCF,DNQDD,Suburban,Public,PW5,Experimental,20.0,Male,Qualifies for reduced/free lunch,42.0
QQJEC,GOOBU,Urban,Public,W8A,Experimental,26.0,Female,Qualifies for reduced/free lunch,36.0
3SWL5,UUUQX,Suburban,Non-public,SSP,Standard,15.0,Female,Does not qualify,66.0


pretest 값이 우리가 예측해야 하는 시험 점수입니다.

In [4]:
# Preview the first five rows of the test table.
test.head(n=5)

Unnamed: 0_level_0,school,school_setting,school_type,classroom,teaching_method,n_student,gender,lunch
student_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1
MNVG5,VHDHF,Rural,Non-public,KR1,Experimental,15.0,Female,Qualifies for reduced/free lunch
LB2TH,GJJHK,Suburban,Public,ZDT,Standard,27.0,Male,Qualifies for reduced/free lunch
ETDRG,CIMBB,Urban,Non-public,PC6,Standard,17.0,Female,Does not qualify
TME5H,GOKXL,Rural,Public,VA6,Standard,19.0,Female,Qualifies for reduced/free lunch
G8C0U,QOQTS,Urban,Public,3XJ,Standard,24.0,Female,Does not qualify


In [5]:
# For each column, report whether it holds at least one missing value.
train.isnull().any(axis=0)

school             False
school_setting     False
school_type        False
classroom          False
teaching_method    False
n_student          False
gender             False
lunch              False
pretest            False
dtype: bool

위에서 보다시피 데이터에 null 값은 없습니다.

### Data Preprocessing

In [6]:
# Separate the prediction target (pretest) from the feature columns.
X = train.drop(columns=["pretest"])
y = train["pretest"]

In [7]:
# Names of the feature columns whose dtype is 'object'; these are the
# columns that get ordinal-encoded further down.
object_cols = [
    column for column, dtype in X.dtypes.items() if dtype == 'object'
]
object_cols

['school',
 'school_setting',
 'school_type',
 'classroom',
 'teaching_method',
 'gender',
 'lunch']

n_student 이외의 feature들은 모두 categorical입니다. (categorical => 문자 형태)

In [8]:
# Learn the category -> number mapping from the training features only, then
# apply that same mapping to both the training and the test features.
# NOTE(review): with the encoder's default settings, a category that appears
# only in the test set is expected to raise during transform -- confirm.
ordinal_encoder = OrdinalEncoder().fit(X[object_cols])

# Work on copies so the raw X / test frames keep their original text values.
label_X = X.copy()
label_X[object_cols] = ordinal_encoder.transform(X[object_cols])

label_test = test.copy()
label_test[object_cols] = ordinal_encoder.transform(test[object_cols])

In [9]:
# Preview the first five rows of the encoded training features.
label_X.head(n=5)

Unnamed: 0_level_0,school,school_setting,school_type,classroom,teaching_method,n_student,gender,lunch
student_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1
W20C6,8.0,2.0,1.0,45.0,1.0,28.0,0.0,1.0
A3JJP,8.0,2.0,1.0,83.0,0.0,26.0,0.0,1.0
UBDCF,4.0,1.0,1.0,65.0,0.0,20.0,1.0,1.0
QQJEC,8.0,2.0,1.0,83.0,0.0,26.0,0.0,1.0
3SWL5,17.0,1.0,0.0,73.0,1.0,15.0,0.0,0.0


In [10]:
# Preview the first five rows of the encoded test features.
label_test.head(n=5)

Unnamed: 0_level_0,school,school_setting,school_type,classroom,teaching_method,n_student,gender,lunch
student_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1
MNVG5,18.0,0.0,0.0,53.0,0.0,15.0,0.0,1.0
LB2TH,6.0,1.0,1.0,95.0,1.0,27.0,1.0,1.0
ETDRG,2.0,2.0,0.0,62.0,1.0,17.0,0.0,0.0
TME5H,7.0,0.0,1.0,82.0,1.0,19.0,0.0,1.0
G8C0U,14.0,2.0,1.0,15.0,1.0,24.0,0.0,0.0


categorical variable을 숫자로 잘 바꾸어 주었습니다. Ordinal Encoder에 대해 잊어버린 분들은 [kaggle course](https://www.kaggle.com/alexisbcook/categorical-variables)를 참고하세요.

In [11]:
# Hold out 20% of the encoded training rows for validation; random_state is
# pinned so the split can be repeated.
X_train, X_val, y_train, y_val = train_test_split(
    label_X,
    y,
    test_size=0.2,
    random_state=3,
)

### Model train

In [12]:
# Fit two baseline regressors on the training split.

# RandomForestRegressor: random_state is pinned (same value as the
# train/validation split) so repeated runs build the same forest and report
# the same validation score instead of a slightly different one each time.
RFG_model = RandomForestRegressor(random_state=3)
RFG_model.fit(X_train, y_train)

# LightGBM, left at its default settings.
LGBM_model = LGBMRegressor()
LGBM_model.fit(X_train, y_train)

LGBMRegressor()

In [13]:
# Validation mean absolute error of each model (lower is better).
# mean_absolute_error is declared as (y_true, y_pred), so the ground-truth
# values are passed first and the predictions second.
RFG_pred = RFG_model.predict(X_val)
print(mean_absolute_error(y_val, RFG_pred))

LGBM_pred = LGBM_model.predict(X_val)
print(mean_absolute_error(y_val, LGBM_pred))

2.7164724595843515
2.7360391804497777


### Model predict & submission

In [14]:
# Predict on the encoded test set with both fitted models, random forest
# first and LightGBM second.
RFG_result, LGBM_result = (
    fitted.predict(label_test) for fitted in (RFG_model, LGBM_model)
)

In [15]:
# Single "pretest" column holding the LightGBM predictions, sharing the
# index of the encoded test frame.
submission = pd.DataFrame({"pretest": LGBM_result}, index=label_test.index)

In [16]:
# Write the predictions to disk, keeping the index as the first CSV column.
submission.to_csv("submission.csv", index=True)