In [1]:
import pandas as pd
import numpy as np
from sklearn.ensemble import RandomForestRegressor
from tqdm import tqdm
pd.options.display.float_format = '{:.6f}'.format

# 데이터 불러오기

In [50]:
train = pd.read_csv('data/train.csv')
test = pd.read_csv('data/test.csv')
car_2020 = pd.read_csv('data/car_2020.csv',encoding='cp949')
submission = pd.read_csv('data/sample_submission.csv')

In [51]:
car_2020 = car_2020.drop(['시군구(1)'], axis=1)
car_2020.columns = ['지역','월','항목','분류','자동차수']
car_2020['월'] = pd.to_datetime(car_2020['월']).dt.month
car_2020.head()

Unnamed: 0,지역,월,항목,분류,자동차수
0,서울,1,관용,승용,4576
1,서울,1,관용,승합,3475
2,서울,1,관용,화물,3968
3,서울,1,관용,특수,354
4,서울,1,자가용,승용,2542280


# 컬렴명 바꿔주기

In [57]:
train.columns = [
    '단지코드', '총세대수', '임대건물구분', '지역', '공급유형', '전용면적', '전용면적별세대수', '공가수', '신분',
    '임대보증금', '임대료', '지하철', '버스',
    '단지내주차면수', '등록차량수'
]


# 데이터 전처리

In [4]:
# 지하철이 없으면 0, 있으면 1 로 데이터 레이블 작업 진행

train['지하철'] = train['지하철'].fillna(0)
train['지하철'] = np.where(train['지하철'] == 0, 0, 1)
train['지하철'].value_counts()

In [10]:
train['지하철'].value_counts()

0    2513
1     439
Name: 지하철, dtype: int64

In [11]:
# 원핫 인코딩
#train = pd.get_dummies(data = train, columns = ['지하철'])

# 결측치 처리

In [4]:
train.loc[train.임대보증금=='-', '임대보증금'] = np.nan
train.loc[train.임대료=='-', '임대료'] = np.nan
train['임대보증금'] = train['임대보증금'].astype(float)
train['임대료'] = train['임대료'].astype(float)
train['임대보증금'] = train['임대보증금'].fillna(0)
train['임대료'] = train['임대료'].fillna(0)

In [5]:
train['버스'] = train['버스'].fillna(train['버스'].mean())

# 지역명 숫자로 매핑

In [6]:
local_map = {}
for i, loc in enumerate(train['지역'].unique()):
    local_map[loc] = i
    
train['지역'] = train['지역'].map(local_map)

# 전용면적을 5의 배수로 변경

In [7]:
train['전용면적'] = train['전용면적']//5*5

# 전용면적 상/하한 적용

In [8]:
idx = train[train['전용면적']>100].index
train.loc[idx, '전용면적'] = 100

idx = train[train['전용면적']<15].index
train.loc[idx, '전용면적'] = 15

In [9]:
train.info().

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 2952 entries, 0 to 2951
Data columns (total 15 columns):
 #   Column    Non-Null Count  Dtype  
---  ------    --------------  -----  
 0   단지코드      2952 non-null   object 
 1   총세대수      2952 non-null   int64  
 2   임대건물구분    2952 non-null   object 
 3   지역        2952 non-null   int64  
 4   공급유형      2952 non-null   object 
 5   전용면적      2952 non-null   float64
 6   전용면적별세대수  2952 non-null   int64  
 7   공가수       2952 non-null   float64
 8   신분        2952 non-null   object 
 9   임대보증금     2952 non-null   float64
 10  임대료       2952 non-null   float64
 11  지하철       2952 non-null   float64
 12  버스        2952 non-null   float64
 13  단지내주차면수   2952 non-null   float64
 14  등록차량수     2952 non-null   float64
dtypes: float64(8), int64(3), object(4)
memory usage: 346.1+ KB


In [10]:
from sklearn.model_selection import train_test_split

x = train.drop(['등록차량수','단지코드','임대건물구분','공급유형','신분'], axis=1) 
y = train['등록차량수']

x_train, x_test, y_train, y_test = train_test_split(x, y, test_size = 0.25, random_state = 0)

display(x_train.shape, x_test.shape)

(2214, 10)

(738, 10)

In [11]:
forest = RandomForestRegressor(n_jobs=-1, random_state=42)
forest.fit(x_train, y_train).score(x_test, y_test)

0.9891439162587817

# test 불러오기

In [12]:
test.columns = [
    '단지코드', '총세대수', '임대건물구분', '지역', '공급유형', '전용면적', '전용면적별세대수', '공가수', '신분',
    '임대보증금', '임대료', '지하철', '버스',
    '단지내주차면수'
]

In [13]:
test.isnull().sum()

단지코드          0
총세대수          0
임대건물구분        0
지역            0
공급유형          0
전용면적          0
전용면적별세대수      0
공가수           0
신분            2
임대보증금       180
임대료         180
지하철          42
버스            0
단지내주차면수       0
dtype: int64

In [14]:
test.loc[test.임대보증금=='-', '임대보증금'] = np.nan
test.loc[test.임대료=='-', '임대료'] = np.nan
test['임대보증금'] = test['임대보증금'].astype(float)
test['임대료'] = test['임대료'].astype(float)
test['임대보증금'] = test['임대보증금'].fillna(0)
test['임대료'] = test['임대료'].fillna(0)

In [15]:
test['지하철'] = test['지하철'].fillna(test['지하철'].mean())

In [16]:
test.loc[test.단지코드.isin(['C2411']) & test.신분.isnull(), '신분'] = 'A'
test.loc[test.단지코드.isin(['C2253']) & test.신분.isnull(), '신분'] = 'C'

In [17]:
test.isnull().sum()

단지코드        0
총세대수        0
임대건물구분      0
지역          0
공급유형        0
전용면적        0
전용면적별세대수    0
공가수         0
신분          0
임대보증금       0
임대료         0
지하철         0
버스          0
단지내주차면수     0
dtype: int64

In [18]:
local_map = {}
for i, loc in enumerate(test['지역'].unique()):
    local_map[loc] = i
    
test['지역'] = test['지역'].map(local_map)

In [19]:
x_train

Unnamed: 0,총세대수,지역,전용면적,전용면적별세대수,공가수,임대보증금,임대료,지하철,버스,단지내주차면수
303,809,5,45.0,149,1.0,13101000.0,221030.0,0.000000,6.0,264.0
132,560,1,30.0,192,0.0,3631000.0,80800.0,0.000000,5.0,154.0
1851,1668,2,35.0,11,17.0,24775000.0,172190.0,1.000000,10.0,1756.0
1791,1035,3,50.0,108,21.0,23367000.0,237570.0,0.000000,16.0,911.0
526,2428,8,15.0,1,1.0,0.0,0.0,0.000000,1.0,756.0
...,...,...,...,...,...,...,...,...,...,...
763,1144,7,55.0,240,16.0,18770000.0,246770.0,0.176578,8.0,950.0
835,481,1,30.0,1,5.0,0.0,0.0,0.000000,1.0,65.0
1653,511,9,50.0,29,3.0,18968000.0,207380.0,0.000000,4.0,413.0
2607,902,6,40.0,48,32.0,41120000.0,233010.0,0.000000,2.0,635.0


In [20]:
columns = ['총세대수', '지역',' 전용면적','전용면적별세대수','공가수','임대보증금','임대료','지하철','버스','단지내주차면수']
new_test = pd.DataFrame()
area_columns = []

In [21]:
for i, code in tqdm(enumerate(test['단지코드'].unique())):
    temp = test[test['단지코드']==code]
    temp.index = range(temp.shape[0])
    for col in columns:
        new_test.loc[i, col] = temp.loc[0, col]
    
    for col in area_columns:
        area = float(col.split('_')[-1])
        new_test.loc[i, col] = temp[temp['전용면적']==area]['전용면적별세대수'].sum()

0it [00:00, ?it/s]


KeyError: ' 전용면적'

In [34]:
new_test.head

<bound method NDFrame.head of       단지코드    총세대수   공가수    지역  단지내주차면수  지하철    버스
0    C1072   754.0  14.0   0.0    683.0  0.0   2.0
1    C1128  1354.0   9.0   0.0   1216.0  0.0   3.0
2    C1456   619.0  18.0   1.0    547.0  0.0  16.0
3    C1840   593.0   7.0   2.0    543.0  0.0   3.0
4    C1332  1297.0  11.0   0.0   1112.0  0.0   2.0
..     ...     ...   ...   ...      ...  ...   ...
145  C2456   349.0  17.0   6.0    270.0  0.0   4.0
146  C1266   596.0  35.0  10.0    593.0  0.0   1.0
147  C2152   120.0   9.0   7.0     40.0  0.0   1.0
148  C1267   675.0  38.0   3.0    467.0  0.0   1.0
149  C2189   382.0  45.0   2.0    300.0  0.0   2.0

[150 rows x 7 columns]>

In [24]:
pred = forest.predict(x_test)
#pred

In [25]:
submission['num'] = pred

ValueError: Length of values (738) does not match length of index (150)

In [None]:
submission.to_csv('baseline.csv', index=False)