## 패키지

In [1]:
import pandas as pd
import numpy as np
from sklearn.ensemble import RandomForestRegressor
from tqdm import tqdm

## 데이터

In [2]:
train = pd.read_csv('./data/train.csv')
test = pd.read_csv('./data/test.csv')
submission = pd.read_csv('./data/sample_submission.csv')

In [3]:
train.shape, test.shape

((2952, 15), (1022, 14))

In [4]:
train.head()

Unnamed: 0,단지코드,총세대수,임대건물구분,지역,공급유형,전용면적,전용면적별세대수,공가수,자격유형,임대보증금,임대료,도보 10분거리 내 지하철역 수(환승노선 수 반영),도보 10분거리 내 버스정류장 수,단지내주차면수,등록차량수
0,C2483,900,아파트,경상북도,국민임대,39.72,134,38.0,A,15667000,103680,0.0,3.0,1425.0,1015.0
1,C2483,900,아파트,경상북도,국민임대,39.72,15,38.0,A,15667000,103680,0.0,3.0,1425.0,1015.0
2,C2483,900,아파트,경상북도,국민임대,51.93,385,38.0,A,27304000,184330,0.0,3.0,1425.0,1015.0
3,C2483,900,아파트,경상북도,국민임대,51.93,15,38.0,A,27304000,184330,0.0,3.0,1425.0,1015.0
4,C2483,900,아파트,경상북도,국민임대,51.93,41,38.0,A,27304000,184330,0.0,3.0,1425.0,1015.0


In [5]:
test.head()

Unnamed: 0,단지코드,총세대수,임대건물구분,지역,공급유형,전용면적,전용면적별세대수,공가수,자격유형,임대보증금,임대료,도보 10분거리 내 지하철역 수(환승노선 수 반영),도보 10분거리 내 버스정류장 수,단지내주차면수
0,C1072,754,아파트,경기도,국민임대,39.79,116,14.0,H,22830000,189840,0.0,2.0,683.0
1,C1072,754,아파트,경기도,국민임대,46.81,30,14.0,A,36048000,249930,0.0,2.0,683.0
2,C1072,754,아파트,경기도,국민임대,46.9,112,14.0,H,36048000,249930,0.0,2.0,683.0
3,C1072,754,아파트,경기도,국민임대,46.9,120,14.0,H,36048000,249930,0.0,2.0,683.0
4,C1072,754,아파트,경기도,국민임대,51.46,60,14.0,H,43497000,296780,0.0,2.0,683.0


## 결측치 확인

In [6]:
train.isna().sum()

단지코드                              0
총세대수                              0
임대건물구분                            0
지역                                0
공급유형                              0
전용면적                              0
전용면적별세대수                          0
공가수                               0
자격유형                              0
임대보증금                           569
임대료                             569
도보 10분거리 내 지하철역 수(환승노선 수 반영)    211
도보 10분거리 내 버스정류장 수                4
단지내주차면수                           0
등록차량수                             0
dtype: int64

In [7]:
test.isna().sum()

단지코드                              0
총세대수                              0
임대건물구분                            0
지역                                0
공급유형                              0
전용면적                              0
전용면적별세대수                          0
공가수                               0
자격유형                              2
임대보증금                           180
임대료                             180
도보 10분거리 내 지하철역 수(환승노선 수 반영)     42
도보 10분거리 내 버스정류장 수                0
단지내주차면수                           0
dtype: int64

## 컬럼명 변경

지하쳘역 수와 버스 정류장 수의 컬럼명을 지하철, 버스로 변경하였습니다.

In [8]:
train.columns

Index(['단지코드', '총세대수', '임대건물구분', '지역', '공급유형', '전용면적', '전용면적별세대수', '공가수',
       '자격유형', '임대보증금', '임대료', '도보 10분거리 내 지하철역 수(환승노선 수 반영)',
       '도보 10분거리 내 버스정류장 수', '단지내주차면수', '등록차량수'],
      dtype='object')

In [9]:
train.columns = [
    '단지코드', '총세대수', '임대건물구분', '지역', '공급유형', '전용면적', '전용면적별세대수', '공가수', '신분',
    '임대보증금', '임대료', '지하철', '버스',
    '단지내주차면수', '등록차량수'
]

test.columns = [
    '단지코드', '총세대수', '임대건물구분', '지역', '공급유형', '전용면적', '전용면적별세대수', '공가수', '신분',
    '임대보증금', '임대료', '지하철', '버스',
    '단지내주차면수'
]

## 지역명 숫자로 매핑

In [10]:
local_map = {}
for i, loc in enumerate(train['지역'].unique()):
    local_map[loc] = i

In [11]:
train['지역'] = train['지역'].map(local_map)
test['지역'] = test['지역'].map(local_map)

## 전용면적을 5의 배수로 변경

In [12]:
train['전용면적'] = train['전용면적']//5*5
test['전용면적'] = test['전용면적']//5*5

## 전용면적 상/하한 적용

상한100, 하한 15

In [13]:
idx = train[train['전용면적']>100].index
train.loc[idx, '전용면적'] = 100
idx = test[test['전용면적']>100].index
test.loc[idx, '전용면적'] = 100

idx = train[train['전용면적']<15].index
train.loc[idx, '전용면적'] = 15
idx = test[test['전용면적']<15].index
test.loc[idx, '전용면적'] = 15

In [14]:
test['전용면적'].unique()

array([ 35.,  45.,  50.,  30.,  55.,  25.,  75., 100.,  15.,  20.,  40.,
        60.,  80.,  70.])

## 단지별 데이터 1차원으로 취합

In [15]:
columns = ['단지코드', '총세대수', '공가수', '지역', '단지내주차면수', '지하철', '버스']
target = '등록차량수'
area_columns = []
for area in train['전용면적'].unique():
    area_columns.append(f'면적_{area}')

In [16]:
new_train = pd.DataFrame()
new_test = pd.DataFrame()

In [17]:
for i, code in tqdm(enumerate(train['단지코드'].unique())):
    temp = train[train['단지코드']==code]
    temp.index = range(temp.shape[0])
    for col in columns:
        new_train.loc[i, col] = temp.loc[0, col]
    
    for col in area_columns:
        area = float(col.split('_')[-1])
        new_train.loc[i, col] = temp[temp['전용면적']==area]['전용면적별세대수'].sum()
    
    new_train.loc[i, '등록차량수'] = temp.loc[0, '등록차량수']
    
for i, code in tqdm(enumerate(test['단지코드'].unique())):
    temp = test[test['단지코드']==code]
    temp.index = range(temp.shape[0])
    for col in columns:
        new_test.loc[i, col] = temp.loc[0, col]
    
    for col in area_columns:
        area = float(col.split('_')[-1])
        new_test.loc[i, col] = temp[temp['전용면적']==area]['전용면적별세대수'].sum()

423it [00:04, 89.76it/s]
150it [00:01, 89.62it/s]


In [18]:
new_train

Unnamed: 0,단지코드,총세대수,공가수,지역,단지내주차면수,지하철,버스,면적_35.0,면적_50.0,면적_55.0,...,면적_25.0,면적_70.0,면적_15.0,면적_20.0,면적_100.0,면적_60.0,면적_75.0,면적_80.0,면적_65.0,등록차량수
0,C2483,900.0,38.0,0.0,1425.0,0.0,3.0,149.0,665.0,86.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1015.0
1,C2515,545.0,17.0,1.0,624.0,0.0,3.0,80.0,132.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,205.0
2,C1407,1216.0,13.0,2.0,1285.0,1.0,1.0,0.0,124.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1064.0
3,C1945,755.0,6.0,3.0,734.0,1.0,3.0,240.0,303.0,212.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,730.0
4,C1470,696.0,14.0,4.0,645.0,0.0,2.0,254.0,246.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,553.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
418,C2586,90.0,7.0,9.0,66.0,0.0,3.0,36.0,0.0,0.0,...,42.0,0.0,12.0,0.0,0.0,0.0,0.0,0.0,0.0,57.0
419,C2035,492.0,24.0,5.0,521.0,0.0,1.0,156.0,0.0,0.0,...,156.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,246.0
420,C2020,40.0,7.0,8.0,25.0,1.0,2.0,15.0,0.0,0.0,...,5.0,0.0,20.0,0.0,0.0,0.0,0.0,0.0,0.0,19.0
421,C2437,90.0,12.0,11.0,30.0,0.0,1.0,0.0,0.0,0.0,...,0.0,0.0,0.0,90.0,0.0,0.0,0.0,0.0,0.0,16.0


## 결측치 처리

In [19]:
new_train = new_train.fillna(-1)
new_test = new_test.fillna(-1)

## 학습

In [20]:
x_train = new_train.iloc[:, 1:-1]
y_train = new_train.iloc[:,-1]
x_test = new_test.iloc[:,1:]
y_test = new_test.iloc[:,-1]

In [149]:
forest = RandomForestRegressor(n_jobs=-1, random_state=42)
forest.fit(x_train, y_train)

pred = forest.predict(x_test)

from sklearn.metrics import mean_absolute_error
mean_absolute_error(y_test, pred)

537.6293333333333

In [148]:
from sklearn.linear_model import LogisticRegression

logreg = LogisticRegression()
logreg.fit(x_train,y_train)

pred = logreg.predict(x_test)

from sklearn.metrics import mean_absolute_error
mean_absolute_error(y_test, pred)

STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(


618.6266666666667

In [23]:
from sklearn.tree import DecisionTreeRegressor

tree = DecisionTreeRegressor()
tree.fit(x_train,y_train).score(x_test, y_test)

pred = tree.predict(x_test)

from sklearn.metrics import mean_absolute_error
mean_absolute_error(y_test, pred)

533.74

In [55]:
from sklearn.neighbors import KNeighborsRegressor

knn = KNeighborsRegressor()

knn.fit(x_train,y_train)

pred = knn.predict(x_test)

from sklearn.metrics import mean_absolute_error
mean_absolute_error(y_test, pred)

538.5759999999999

In [147]:
from sklearn.ensemble import AdaBoostRegressor

ada = AdaBoostRegressor(n_estimators=100, random_state=0)
ada.fit(x_train, y_train)

pred = ada.predict(x_test)

from sklearn.metrics import mean_absolute_error
mean_absolute_error(y_test, pred)

547.3092580823964

In [142]:
from sklearn.ensemble import GradientBoostingRegressor

gbm = GradientBoostingRegressor(n_estimators=100, random_state=0)
gbm.fit(x_train, y_train)

pred = gbm.predict(x_test)

from sklearn.metrics import mean_absolute_error
mean_absolute_error(y_test, pred)

537.4693426025226

In [146]:
from sklearn.linear_model import Ridge

ridge = Ridge().fit(x_train, y_train)

pred = ridge.predict(x_test)

from sklearn.metrics import mean_absolute_error
mean_absolute_error(y_test, pred)

536.3012031617642

In [144]:
from sklearn.linear_model import Lasso

lasso = Lasso().fit(x_train, y_train)

pred = lasso.predict(x_test)

from sklearn.metrics import mean_absolute_error
mean_absolute_error(y_test, pred)

  model = cd_fast.enet_coordinate_descent(


535.9997893770874

In [226]:
import lightgbm as lgb

train_ds = lgb.Dataset(x_train, label = y_train) 
test_ds = lgb.Dataset(x_test, label = y_test) 

params = {'learning_rate': 0.08, 
          'max_depth': 1, 
          'boosting': 'gbdt', 
          'objective': 'regression', 
          'metric': 'mae', 
          'min_data': 2,
          'feature_fraction': 0.6, 
          'bagging_fraction': 0.6, 
          'bagging_freq': 20,
          'seed':10}


lgb = lgb.train(params, train_ds, 2000, test_ds, verbose_eval=100, early_stopping_rounds=1000)

You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 749
[LightGBM] [Info] Number of data points in the train set: 423, number of used features: 21
[LightGBM] [Info] Start training from score 572.335697
Training until validation scores don't improve for 1000 rounds
[100]	valid_0's l1: 527.304
[200]	valid_0's l1: 531.685
[300]	valid_0's l1: 532.382
[400]	valid_0's l1: 524.774
[500]	valid_0's l1: 524.137
[600]	valid_0's l1: 532.903
[700]	valid_0's l1: 522.582
[800]	valid_0's l1: 522.088
[900]	valid_0's l1: 512.352
[1000]	valid_0's l1: 515.565
[1100]	valid_0's l1: 518.468
[1200]	valid_0's l1: 524.804
[1300]	valid_0's l1: 522.071
[1400]	valid_0's l1: 524.986
[1500]	valid_0's l1: 517.017
[1600]	valid_0's l1: 528.345
[1700]	valid_0's l1: 528.996
[1800]	valid_0's l1: 518.242
[1900]	valid_0's l1: 530.605
[2000]	valid_0's l1: 521.722
Did not meet early stopping. Best iteration is:
[1020]	valid_0's l1: 510.293


## 추론 및 제출

In [58]:
pred = lgb.predict(x_test)

In [59]:
submission['num'] = pred

In [60]:
submission.to_csv('lgb.csv', index=False)