In [1]:
import pandas as pd 
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.linear_model import LinearRegression

In [2]:
data_1 = pd.read_csv("age_gender_info.csv")
data_2 = pd.read_csv("sample_submission.csv")
data_3 = pd.read_csv("test.csv")
data_4 = pd.read_csv("train.csv")

In [3]:
print(data_1.shape)
print(data_2.shape)
print(data_3.shape)
print(data_4.shape)

(16, 23)
(150, 2)
(1022, 14)
(2952, 15)


# train.csv 파일 분석하기

In [4]:
data_4.head(3)

Unnamed: 0,단지코드,총세대수,임대건물구분,지역,공급유형,전용면적,전용면적별세대수,공가수,자격유형,임대보증금,임대료,도보 10분거리 내 지하철역 수(환승노선 수 반영),도보 10분거리 내 버스정류장 수,단지내주차면수,등록차량수
0,C2483,900,아파트,경상북도,국민임대,39.72,134,38.0,A,15667000,103680,0.0,3.0,1425.0,1015.0
1,C2483,900,아파트,경상북도,국민임대,39.72,15,38.0,A,15667000,103680,0.0,3.0,1425.0,1015.0
2,C2483,900,아파트,경상북도,국민임대,51.93,385,38.0,A,27304000,184330,0.0,3.0,1425.0,1015.0


In [5]:
data_4.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 2952 entries, 0 to 2951
Data columns (total 15 columns):
 #   Column                        Non-Null Count  Dtype  
---  ------                        --------------  -----  
 0   단지코드                          2952 non-null   object 
 1   총세대수                          2952 non-null   int64  
 2   임대건물구분                        2952 non-null   object 
 3   지역                            2952 non-null   object 
 4   공급유형                          2952 non-null   object 
 5   전용면적                          2952 non-null   float64
 6   전용면적별세대수                      2952 non-null   int64  
 7   공가수                           2952 non-null   float64
 8   자격유형                          2952 non-null   object 
 9   임대보증금                         2383 non-null   object 
 10  임대료                           2383 non-null   object 
 11  도보 10분거리 내 지하철역 수(환승노선 수 반영)  2741 non-null   float64
 12  도보 10분거리 내 버스정류장 수            2948 non-null   float64
 13  단지내

In [6]:
data_4.isnull().sum()

단지코드                              0
총세대수                              0
임대건물구분                            0
지역                                0
공급유형                              0
전용면적                              0
전용면적별세대수                          0
공가수                               0
자격유형                              0
임대보증금                           569
임대료                             569
도보 10분거리 내 지하철역 수(환승노선 수 반영)    211
도보 10분거리 내 버스정류장 수                4
단지내주차면수                           0
등록차량수                             0
dtype: int64

## 데이터 전처리
 - NAN 데이터 처리 
 - '-' 로 처리된 부분 해결
 - '임대보증금, 임대료' 부분을 object -> int 로 바꿈

In [7]:
data_4.dropna(axis = 0,inplace = True)

z = data_4[data_4["임대보증금"] == '-'].index
data_4.drop(z, axis = 0, inplace =True)
data_4["임대보증금"] = data_4["임대보증금"].astype("int64")

z = data_4[data_4["임대료"] == '-'].index
data_4.drop(z, axis = 0, inplace =True)
data_4["임대료"] = data_4["임대료"].astype("int64")

In [8]:
print("지역 : ", data_3["지역"].unique())
print()
print("임대건물구분 : ", data_3["임대건물구분"].unique())
print()
print("공급유형 : ", data_3["공급유형"].unique())
print()
print("자격유형 : ", data_3["자격유형"].unique())

지역 :  ['경기도' '부산광역시' '전라북도' '경상남도' '충청남도' '대전광역시' '제주특별자치도' '강원도' '울산광역시' '경상북도'
 '충청북도' '광주광역시' '전라남도' '대구광역시' '세종특별자치시']

임대건물구분 :  ['아파트' '상가']

공급유형 :  ['국민임대' '영구임대' '임대상가' '공공임대(50년)' '공공임대(10년)' '공공임대(분납)' '행복주택']

자격유형 :  ['H' 'A' 'E' 'C' 'D' nan 'G' 'I' 'J' 'K' 'L' 'M' 'N']


In [9]:
y = data_4.iloc[:,-1]
X = data_4.iloc[:,:-1]

In [10]:
instance = pd.get_dummies(data = X, columns = ['공급유형'], prefix = '공급유형',)
instance = pd.get_dummies(data = instance, columns = ['임대건물구분'], prefix = '임대건물구분')
instance = pd.get_dummies(data = instance, columns = ['지역'], prefix = '지역')
instance = pd.get_dummies(data = instance, columns = ['자격유형'], prefix = '자격유형')


In [11]:
reg = LinearRegression().fit(instance.iloc[:,1:], y)
reg.score(instance.iloc[:,1:], y)

0.8085585301755562

# test set 에도 적용해보기

In [12]:
imme = data_3.dropna(axis = 0)

z = imme[imme["임대보증금"] == '-'].index
imme.drop(z, axis = 0, inplace =True)
imme["임대보증금"] = imme["임대보증금"].astype("int64")

z = imme[imme["임대료"] == '-'].index
imme.drop(z, axis = 0, inplace =True)
imme["임대료"] = imme["임대료"].astype("int64")

instance_test = pd.get_dummies(data = imme, columns = ['공급유형'], prefix = '공급유형',)
instance_test = pd.get_dummies(data = instance_test, columns = ['임대건물구분'], prefix = '임대건물구분')
instance_test = pd.get_dummies(data = instance_test, columns = ['지역'], prefix = '지역')
instance_test = pd.get_dummies(data = instance_test, columns = ['자격유형'], prefix = '자격유형')


A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  return super().drop(
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  imme["임대보증금"] = imme["임대보증금"].astype("int64")
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  imme["임대료"] = imme["임대료"].astype("int64")


In [13]:
# independent 수가 맞지 않는다

In [14]:
print(instance.shape)
print(instance_test.shape)

(2285, 48)
(814, 43)


In [15]:
print(len(data_3["임대건물구분"].unique()))
print(len(data_4["임대건물구분"].unique()))

2
1


In [16]:
instance.columns

Index(['단지코드', '총세대수', '전용면적', '전용면적별세대수', '공가수', '임대보증금', '임대료',
       '도보 10분거리 내 지하철역 수(환승노선 수 반영)', '도보 10분거리 내 버스정류장 수', '단지내주차면수',
       '공급유형_공공임대(10년)', '공급유형_공공임대(50년)', '공급유형_공공임대(5년)', '공급유형_공공임대(분납)',
       '공급유형_국민임대', '공급유형_영구임대', '공급유형_행복주택', '임대건물구분_아파트', '지역_강원도', '지역_경기도',
       '지역_경상남도', '지역_경상북도', '지역_광주광역시', '지역_대구광역시', '지역_대전광역시', '지역_부산광역시',
       '지역_서울특별시', '지역_세종특별자치시', '지역_울산광역시', '지역_전라남도', '지역_전라북도',
       '지역_제주특별자치도', '지역_충청남도', '지역_충청북도', '자격유형_A', '자격유형_B', '자격유형_C',
       '자격유형_E', '자격유형_F', '자격유형_G', '자격유형_H', '자격유형_I', '자격유형_J', '자격유형_K',
       '자격유형_L', '자격유형_M', '자격유형_N', '자격유형_O'],
      dtype='object')

In [17]:
instance_test.columns

Index(['단지코드', '총세대수', '전용면적', '전용면적별세대수', '공가수', '임대보증금', '임대료',
       '도보 10분거리 내 지하철역 수(환승노선 수 반영)', '도보 10분거리 내 버스정류장 수', '단지내주차면수',
       '공급유형_공공임대(10년)', '공급유형_공공임대(50년)', '공급유형_공공임대(분납)', '공급유형_국민임대',
       '공급유형_영구임대', '공급유형_행복주택', '임대건물구분_아파트', '지역_강원도', '지역_경기도', '지역_경상남도',
       '지역_경상북도', '지역_광주광역시', '지역_대구광역시', '지역_대전광역시', '지역_부산광역시', '지역_세종특별자치시',
       '지역_울산광역시', '지역_전라남도', '지역_전라북도', '지역_제주특별자치도', '지역_충청남도', '지역_충청북도',
       '자격유형_A', '자격유형_C', '자격유형_E', '자격유형_G', '자격유형_H', '자격유형_I', '자격유형_J',
       '자격유형_K', '자격유형_L', '자격유형_M', '자격유형_N'],
      dtype='object')

In [18]:
A = []
for i in instance.columns:
    if i not in instance_test.columns:
        A.append(i)
for i in A:
    instance_test[i] = 0

In [19]:
for i in A:
    instance_test[i] = 0

In [21]:
Z = reg.predict(instance_test.iloc[:,1:])

In [22]:
instance_test

Unnamed: 0,단지코드,총세대수,전용면적,전용면적별세대수,공가수,임대보증금,임대료,도보 10분거리 내 지하철역 수(환승노선 수 반영),도보 10분거리 내 버스정류장 수,단지내주차면수,...,자격유형_J,자격유형_K,자격유형_L,자격유형_M,자격유형_N,공급유형_공공임대(5년),지역_서울특별시,자격유형_B,자격유형_F,자격유형_O
0,C1072,754,39.79,116,14.0,22830000,189840,0.0,2.0,683.0,...,0,0,0,0,0,0,0,0,0,0
1,C1072,754,46.81,30,14.0,36048000,249930,0.0,2.0,683.0,...,0,0,0,0,0,0,0,0,0,0
2,C1072,754,46.90,112,14.0,36048000,249930,0.0,2.0,683.0,...,0,0,0,0,0,0,0,0,0,0
3,C1072,754,46.90,120,14.0,36048000,249930,0.0,2.0,683.0,...,0,0,0,0,0,0,0,0,0,0
4,C1072,754,51.46,60,14.0,43497000,296780,0.0,2.0,683.0,...,0,0,0,0,0,0,0,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1013,C1267,675,22.86,14,38.0,10876000,89230,0.0,1.0,467.0,...,0,0,0,0,0,0,0,0,0,0
1018,C2189,382,29.19,96,45.0,6872000,106400,0.0,2.0,300.0,...,0,0,0,0,0,0,0,0,0,0
1019,C2189,382,29.19,20,45.0,6872000,106400,0.0,2.0,300.0,...,0,0,0,0,0,0,0,0,0,0
1020,C2189,382,39.45,202,45.0,13410000,144600,0.0,2.0,300.0,...,0,0,0,0,0,0,0,0,0,0


In [23]:
code = instance_test["단지코드"]
num =  reg.predict(instance_test.iloc[:,1:])

In [24]:
answer = pd.DataFrame(data =code)
answer["num"] = 0
for i in range(len(answer)):
    answer.iloc[i,1] = round(num[i]) 

In [25]:
answer.rename(columns = {"단지코드": "code"}, inplace = True)
answer= answer.reset_index(drop = True)

In [26]:
data_3.head(4)

Unnamed: 0,단지코드,총세대수,임대건물구분,지역,공급유형,전용면적,전용면적별세대수,공가수,자격유형,임대보증금,임대료,도보 10분거리 내 지하철역 수(환승노선 수 반영),도보 10분거리 내 버스정류장 수,단지내주차면수
0,C1072,754,아파트,경기도,국민임대,39.79,116,14.0,H,22830000,189840,0.0,2.0,683.0
1,C1072,754,아파트,경기도,국민임대,46.81,30,14.0,A,36048000,249930,0.0,2.0,683.0
2,C1072,754,아파트,경기도,국민임대,46.9,112,14.0,H,36048000,249930,0.0,2.0,683.0
3,C1072,754,아파트,경기도,국민임대,46.9,120,14.0,H,36048000,249930,0.0,2.0,683.0


In [28]:
answer

Unnamed: 0,code,num
0,C1072,587.0
1,C1072,587.0
2,C1072,599.0
3,C1072,599.0
4,C1072,605.0
...,...,...
809,C1267,81.0
810,C2189,-62.0
811,C2189,-64.0
812,C2189,-37.0


In [29]:
A = list(set(answer["code"]))
B = []
for i in A:
    z = answer[answer["code"] == i]["num"].mean()
    B.append(z)

# 2nd) day

### 결과값 table 만들기

In [30]:
Z = list(set(data_3["단지코드"]))

answer_1 = pd.DataFrame(data =Z)
answer_1["num"] = 0

answer_1.rename(columns = {0: "code"}, inplace = True)
answer_1= answer_1.reset_index(drop = True)

answer_1

Unnamed: 0,code,num
0,C2676,0
1,C1840,0
2,C2600,0
3,C1349,0
4,C1318,0
...,...,...
145,C1849,0
146,C1629,0
147,C2691,0
148,C2131,0


In [31]:
A = list(set(answer["code"]))

for i in A:
    z = answer[answer["code"] == i]["num"].mean()
    index = answer_1[answer_1["code"] == i].index
    answer_1.iloc[index[0],1] = round(z)
answer_1   

Unnamed: 0,code,num
0,C2676,353
1,C1840,486
2,C2600,416
3,C1349,202
4,C1318,0
...,...,...
145,C1849,802
146,C1629,228
147,C2691,1047
148,C2131,670


In [32]:
# num을 예상하지 못한 code (dropna 를 통해서)

instanced = answer_1[answer_1["num"] ==0]
instanced["code"].values

array(['C1318', 'C1472', 'C1327', 'C1083', 'C2152', 'C2177'], dtype=object)

In [33]:
print(data_3[data_3["단지코드"] =="C1327"].isnull().sum())
print()
print()
print(data_3[data_3["단지코드"] =="C1083"].isnull().sum())
print()
print()
print(data_3[data_3["단지코드"] =="C1472"].isnull().sum())
print()
print()
print(data_3[data_3["단지코드"] =="C1318"].isnull().sum())

단지코드                            0
총세대수                            0
임대건물구분                          0
지역                              0
공급유형                            0
전용면적                            0
전용면적별세대수                        0
공가수                             0
자격유형                            0
임대보증금                           0
임대료                             0
도보 10분거리 내 지하철역 수(환승노선 수 반영)    4
도보 10분거리 내 버스정류장 수              0
단지내주차면수                         0
dtype: int64


단지코드                            0
총세대수                            0
임대건물구분                          0
지역                              0
공급유형                            0
전용면적                            0
전용면적별세대수                        0
공가수                             0
자격유형                            0
임대보증금                           0
임대료                             0
도보 10분거리 내 지하철역 수(환승노선 수 반영)    5
도보 10분거리 내 버스정류장 수              0
단지내주차면수                         0
dtype: int64


단지코드              

In [34]:
print(data_3["도보 10분거리 내 지하철역 수(환승노선 수 반영)"].describe())
print()
print(data_3["도보 10분거리 내 지하철역 수(환승노선 수 반영)"].value_counts())

count    980.000000
mean       0.136735
std        0.435500
min        0.000000
25%        0.000000
50%        0.000000
75%        0.000000
max        2.000000
Name: 도보 10분거리 내 지하철역 수(환승노선 수 반영), dtype: float64

0.0    881
1.0     64
2.0     35
Name: 도보 10분거리 내 지하철역 수(환승노선 수 반영), dtype: int64


In [35]:
# C1327의 경우, 대다수 0.0에 속함

In [36]:
data_3[data_3["단지코드"] =="C2177"].isnull().sum()

단지코드                             0
총세대수                             0
임대건물구분                           0
지역                               0
공급유형                             0
전용면적                             0
전용면적별세대수                         0
공가수                              0
자격유형                             0
임대보증금                           22
임대료                             22
도보 10분거리 내 지하철역 수(환승노선 수 반영)    25
도보 10분거리 내 버스정류장 수               0
단지내주차면수                          0
dtype: int64

In [37]:
data_3[data_3["단지코드"] =="C2152"]

Unnamed: 0,단지코드,총세대수,임대건물구분,지역,공급유형,전용면적,전용면적별세대수,공가수,자격유형,임대보증금,임대료,도보 10분거리 내 지하철역 수(환승노선 수 반영),도보 10분거리 내 버스정류장 수,단지내주차면수
1005,C2152,120,아파트,강원도,영구임대,24.83,66,9.0,C,-,-,0.0,1.0,40.0
1006,C2152,120,아파트,강원도,영구임대,33.84,54,9.0,C,-,-,0.0,1.0,40.0


In [38]:
instance_test.columns

Index(['단지코드', '총세대수', '전용면적', '전용면적별세대수', '공가수', '임대보증금', '임대료',
       '도보 10분거리 내 지하철역 수(환승노선 수 반영)', '도보 10분거리 내 버스정류장 수', '단지내주차면수',
       '공급유형_공공임대(10년)', '공급유형_공공임대(50년)', '공급유형_공공임대(분납)', '공급유형_국민임대',
       '공급유형_영구임대', '공급유형_행복주택', '임대건물구분_아파트', '지역_강원도', '지역_경기도', '지역_경상남도',
       '지역_경상북도', '지역_광주광역시', '지역_대구광역시', '지역_대전광역시', '지역_부산광역시', '지역_세종특별자치시',
       '지역_울산광역시', '지역_전라남도', '지역_전라북도', '지역_제주특별자치도', '지역_충청남도', '지역_충청북도',
       '자격유형_A', '자격유형_C', '자격유형_E', '자격유형_G', '자격유형_H', '자격유형_I', '자격유형_J',
       '자격유형_K', '자격유형_L', '자격유형_M', '자격유형_N', '공급유형_공공임대(5년)', '지역_서울특별시',
       '자격유형_B', '자격유형_F', '자격유형_O'],
      dtype='object')

In [39]:
reg.coef_

array([ 1.68134844e-01,  2.26510331e+00,  3.32349050e-02, -7.95333566e+00,
       -1.45716197e-07, -3.23931498e-05, -3.19283861e+01,  7.81120589e+00,
        7.30566389e-01,  1.59607064e+02, -1.08359158e+02,  2.29695282e+02,
       -2.36833506e+01, -5.60276578e+01, -1.87318117e+02, -1.39140624e+01,
       -6.40255848e-10,  1.24191457e+00,  1.62773711e+01, -4.28113324e+01,
        1.20863934e+02,  1.21362000e+02,  1.22513784e+02, -1.45373800e+02,
       -5.04472736e+01, -2.30469191e+01, -2.83915504e+01, -3.90710252e+01,
       -5.13484973e+00, -2.53925470e+01, -3.97314077e+01,  4.53843901e+01,
       -2.82426891e+01,  1.03230813e+02,  4.82090371e+01,  5.44102753e+01,
        9.24735436e+01, -4.02389927e+02, -2.48060295e+02,  1.44728236e+02,
        2.21312379e+02,  4.40364821e+01,  1.59980054e+02,  3.03618651e+01,
       -6.01605881e+01, -7.60776993e+01, -1.12054176e+02])

**C2177, C2152 의 경우,**  
 - 도보 10분거리 내 지하철역 수(환승노선 수 반영) : 대다수 0.0으로 환산
 - model의 coefficient 를 봤을떄, 임대료, 임대보증금이 생각보다 낮은 중요도를 갖고 있다. $10^{-7}, 10^{-5}$

# null값을 고려한 trial

In [40]:
instanced["code"].values

array(['C1318', 'C1472', 'C1327', 'C1083', 'C2152', 'C2177'], dtype=object)

In [41]:
# 도보 10분거리 내 지하철역 수(환승노선 수 반영) 으로 대체하기 
data_3[data_3["단지코드"] =="C1327"]
data_3.loc[:,"도보 10분거리 내 지하철역 수(환승노선 수 반영)"].fillna(0,inplace = True)

In [42]:
data_3.iloc[76]

단지코드                            C1006
총세대수                             1505
임대건물구분                            아파트
지역                              대전광역시
공급유형                             영구임대
전용면적                            26.37
전용면적별세대수                           10
공가수                                27
자격유형                                D
임대보증금                             NaN
임대료                               NaN
도보 10분거리 내 지하철역 수(환승노선 수 반영)        2
도보 10분거리 내 버스정류장 수                  5
단지내주차면수                           428
Name: 76, dtype: object

In [43]:
data_3.isnull().sum()

단지코드                              0
총세대수                              0
임대건물구분                            0
지역                                0
공급유형                              0
전용면적                              0
전용면적별세대수                          0
공가수                               0
자격유형                              2
임대보증금                           180
임대료                             180
도보 10분거리 내 지하철역 수(환승노선 수 반영)      0
도보 10분거리 내 버스정류장 수                0
단지내주차면수                           0
dtype: int64

In [44]:
# 중앙값으로 대체
e = imme["임대보증금"].median()
r = imme["임대료"].median()
data_3.loc[:,"임대보증금"].fillna(e,inplace = True)
data_3.loc[:,"임대료"].fillna(r,inplace = True)

In [45]:
data_3[data_3["자격유형"].isnull()]

Unnamed: 0,단지코드,총세대수,임대건물구분,지역,공급유형,전용면적,전용면적별세대수,공가수,자격유형,임대보증금,임대료,도보 10분거리 내 지하철역 수(환승노선 수 반영),도보 10분거리 내 버스정류장 수,단지내주차면수
196,C2411,962,아파트,경상남도,국민임대,46.9,240,25.0,,71950000,37470,0.0,2.0,840.0
258,C2253,1161,아파트,강원도,영구임대,26.37,745,0.0,,2249000,44770,0.0,2.0,173.0


In [46]:
data_3[data_3["단지코드"] =="C2411"]

Unnamed: 0,단지코드,총세대수,임대건물구분,지역,공급유형,전용면적,전용면적별세대수,공가수,자격유형,임대보증금,임대료,도보 10분거리 내 지하철역 수(환승노선 수 반영),도보 10분거리 내 버스정류장 수,단지내주차면수
193,C2411,962,아파트,경상남도,국민임대,39.43,56,25.0,A,11992000,100720,0.0,2.0,840.0
194,C2411,962,아파트,경상남도,국민임대,39.72,336,25.0,A,11992000,100720,0.0,2.0,840.0
195,C2411,962,아파트,경상남도,국민임대,39.82,179,25.0,A,11992000,100720,0.0,2.0,840.0
196,C2411,962,아파트,경상남도,국민임대,46.9,240,25.0,,71950000,37470,0.0,2.0,840.0
197,C2411,962,아파트,경상남도,국민임대,51.93,150,25.0,A,21586000,171480,0.0,2.0,840.0


In [47]:
data_3.loc[196,"자격유형"] = 'A'

In [48]:
data_3[data_3["단지코드"] =="C2253"]

Unnamed: 0,단지코드,총세대수,임대건물구분,지역,공급유형,전용면적,전용면적별세대수,공가수,자격유형,임대보증금,임대료,도보 10분거리 내 지하철역 수(환승노선 수 반영),도보 10분거리 내 버스정류장 수,단지내주차면수
258,C2253,1161,아파트,강원도,영구임대,26.37,745,0.0,,2249000.0,44770,0.0,2.0,173.0
259,C2253,1161,아파트,강원도,영구임대,31.32,239,0.0,C,3731000.0,83020,0.0,2.0,173.0
260,C2253,1161,아파트,강원도,영구임대,31.32,149,0.0,C,3731000.0,83020,0.0,2.0,173.0
261,C2253,1161,상가,강원도,임대상가,13.77,1,0.0,D,21540000.0,153180,0.0,2.0,173.0
262,C2253,1161,상가,강원도,임대상가,22.89,1,0.0,D,21540000.0,153180,0.0,2.0,173.0
263,C2253,1161,상가,강원도,임대상가,22.91,1,0.0,D,21540000.0,153180,0.0,2.0,173.0
264,C2253,1161,상가,강원도,임대상가,23.79,1,0.0,D,21540000.0,153180,0.0,2.0,173.0
265,C2253,1161,상가,강원도,임대상가,23.79,1,0.0,D,21540000.0,153180,0.0,2.0,173.0
266,C2253,1161,상가,강원도,임대상가,23.86,1,0.0,D,21540000.0,153180,0.0,2.0,173.0
267,C2253,1161,상가,강원도,임대상가,23.86,1,0.0,D,21540000.0,153180,0.0,2.0,173.0


In [49]:
data_3.loc[258,"자격유형"] = 'C'

In [50]:
index_1 = data_3[data_3["임대보증금"] == '-'].index
data_3.loc[index_1,"임대보증금"] = e
index_2 = data_3[data_3["임대료"] == '-'].index
data_3.loc[index_2,"임대료"] = e

data_3["임대보증금"] = data_3["임대보증금"].astype("int64")
data_3["임대료"] = data_3["임대료"].astype("int64")

In [51]:
## 모든 변수 처리 완료, 이제 one-hot encoding 을 통해, 다시 한번 더 시작

In [52]:
instance_test_1 = pd.get_dummies(data = data_3, columns = ['공급유형'], prefix = '공급유형',)
instance_test_1 = pd.get_dummies(data = instance_test_1, columns = ['임대건물구분'], prefix = '임대건물구분')
instance_test_1 = pd.get_dummies(data = instance_test_1, columns = ['지역'], prefix = '지역')
instance_test_1 = pd.get_dummies(data = instance_test_1, columns = ['자격유형'], prefix = '자격유형')


In [53]:
print(instance_test_1.shape)
print(instance.shape)

(1022, 46)
(2285, 48)


In [54]:
instance.columns

Index(['단지코드', '총세대수', '전용면적', '전용면적별세대수', '공가수', '임대보증금', '임대료',
       '도보 10분거리 내 지하철역 수(환승노선 수 반영)', '도보 10분거리 내 버스정류장 수', '단지내주차면수',
       '공급유형_공공임대(10년)', '공급유형_공공임대(50년)', '공급유형_공공임대(5년)', '공급유형_공공임대(분납)',
       '공급유형_국민임대', '공급유형_영구임대', '공급유형_행복주택', '임대건물구분_아파트', '지역_강원도', '지역_경기도',
       '지역_경상남도', '지역_경상북도', '지역_광주광역시', '지역_대구광역시', '지역_대전광역시', '지역_부산광역시',
       '지역_서울특별시', '지역_세종특별자치시', '지역_울산광역시', '지역_전라남도', '지역_전라북도',
       '지역_제주특별자치도', '지역_충청남도', '지역_충청북도', '자격유형_A', '자격유형_B', '자격유형_C',
       '자격유형_E', '자격유형_F', '자격유형_G', '자격유형_H', '자격유형_I', '자격유형_J', '자격유형_K',
       '자격유형_L', '자격유형_M', '자격유형_N', '자격유형_O'],
      dtype='object')

In [55]:
instance_test_1.columns

Index(['단지코드', '총세대수', '전용면적', '전용면적별세대수', '공가수', '임대보증금', '임대료',
       '도보 10분거리 내 지하철역 수(환승노선 수 반영)', '도보 10분거리 내 버스정류장 수', '단지내주차면수',
       '공급유형_공공임대(10년)', '공급유형_공공임대(50년)', '공급유형_공공임대(분납)', '공급유형_국민임대',
       '공급유형_영구임대', '공급유형_임대상가', '공급유형_행복주택', '임대건물구분_상가', '임대건물구분_아파트',
       '지역_강원도', '지역_경기도', '지역_경상남도', '지역_경상북도', '지역_광주광역시', '지역_대구광역시',
       '지역_대전광역시', '지역_부산광역시', '지역_세종특별자치시', '지역_울산광역시', '지역_전라남도', '지역_전라북도',
       '지역_제주특별자치도', '지역_충청남도', '지역_충청북도', '자격유형_A', '자격유형_C', '자격유형_D',
       '자격유형_E', '자격유형_G', '자격유형_H', '자격유형_I', '자격유형_J', '자격유형_K', '자격유형_L',
       '자격유형_M', '자격유형_N'],
      dtype='object')

In [56]:
aa = list(instance.columns)
bb = list(instance_test_1.columns)

In [57]:
data_3_1 = instance_test_1.loc[:,"단지코드":"임대료"]

In [58]:
for i in aa[7:]:
    if i in bb[7:]:
        data_3_1[i] = instance_test_1[i]
    else:
        data_3_1[i] = 0


In [59]:
ZZ = reg.predict(data_3_1.iloc[:,1:])

In [60]:
aH = pd.DataFrame(data = data_3_1["단지코드"])
aH["num"] = ZZ
aH.rename(columns = {"단지코드": "code"}, inplace = True)


In [62]:
Z = list(set(data_3["단지코드"]))

answer_2 = pd.DataFrame(data =Z)
answer_2["num"] = 0

answer_2.rename(columns = {0: "code"}, inplace = True)
answer_2= answer_2.reset_index(drop = True)

answer_2

Unnamed: 0,code,num
0,C2676,0
1,C1840,0
2,C2600,0
3,C1349,0
4,C1318,0
...,...,...
145,C1849,0
146,C1629,0
147,C2691,0
148,C2131,0


In [63]:
A = list(set(aH["code"]))

for i in A:
    z = aH[aH["code"] == i]["num"].mean()
    index = answer_2[answer_2["code"] == i].index
    answer_2.iloc[index[0],1] = round(z)
answer_2   

Unnamed: 0,code,num
0,C2676,566
1,C1840,539
2,C2600,469
3,C1349,224
4,C1318,382
...,...,...
145,C1849,851
146,C1629,269
147,C2691,1134
148,C2131,686


# 3일차
- Lasso Algorithm
- Min Max scalar 해보기

In [71]:
from sklearn import linear_model
model = linear_model.Lasso(alpha = 0.1)
model.fit(instance.iloc[:,1:], y)
model.score(instance.iloc[:,1:],y)

0.808258790013064

In [72]:
ZZ = reg.predict(data_3_1.iloc[:,1:])
ZZ

array([1.66338906e+15, 1.66338728e+15, 1.66338728e+15, ...,
       1.66339121e+15, 1.66339033e+15, 1.66338962e+15])

In [73]:
ZZ = model.predict(data_3_1.iloc[:,1:])
ZZ

array([672.49885993, 643.29234728, 686.42084332, ...,  24.82715968,
        51.80804689,  61.70169179])

In [74]:
aH = pd.DataFrame(data = data_3_1["단지코드"])
aH["num"] = ZZ
aH.rename(columns = {"단지코드": "code"}, inplace = True)

Z = list(set(data_3["단지코드"]))

answer_2 = pd.DataFrame(data =Z)
answer_2["num"] = 0

answer_2.rename(columns = {0: "code"}, inplace = True)
answer_2= answer_2.reset_index(drop = True)

A = list(set(aH["code"]))

for i in A:
    z = aH[aH["code"] == i]["num"].mean()
    index = answer_2[answer_2["code"] == i].index
    answer_2.iloc[index[0],1] = z
answer_2   

Unnamed: 0,code,num
0,C1782,427.027290
1,C1456,529.384699
2,C2608,985.412978
3,C2397,563.381658
4,C2523,450.960885
...,...,...
145,C1721,511.418894
146,C1887,634.610362
147,C1392,697.495331
148,C1563,1617.158571


In [75]:
from sklearn.preprocessing import MinMaxScaler
scaler = MinMaxScaler()
X_MinMax_train = scaler.fit_transform(instance.iloc[:,1:])
X_MinMax_train

array([[0.34382376, 0.34362766, 0.07085346, ..., 0.        , 0.        ,
        0.        ],
       [0.34382376, 0.34362766, 0.00697799, ..., 0.        , 0.        ,
        0.        ],
       [0.34382376, 0.52066116, 0.20558239, ..., 0.        , 0.        ,
        0.        ],
       ...,
       [0.08379229, 0.51776135, 0.0171766 , ..., 0.        , 0.        ,
        0.        ],
       [0.08379229, 0.52109613, 0.06011809, ..., 0.        , 0.        ,
        0.        ],
       [0.08379229, 0.56444831, 0.00912507, ..., 0.        , 0.        ,
        0.        ]])

In [76]:
reg = LinearRegression().fit(X_MinMax_train, y)
reg.score(X_MinMax_train, y)

0.8069564368500926

In [77]:
scaler_1 = MinMaxScaler()
X_MinMax_train_test = scaler_1.fit_transform(data_3_1.iloc[:,1:])
X_MinMax_train_test

array([[0.27192631, 0.05201939, 0.0858209 , ..., 0.        , 0.        ,
        0.        ],
       [0.27192631, 0.0642613 , 0.02164179, ..., 0.        , 0.        ,
        0.        ],
       [0.27192631, 0.06441825, 0.08283582, ..., 0.        , 0.        ,
        0.        ],
       ...,
       [0.12294754, 0.03353446, 0.0141791 , ..., 0.        , 0.        ,
        0.        ],
       [0.12294754, 0.05142648, 0.15      , ..., 0.        , 0.        ,
        0.        ],
       [0.12294754, 0.06324986, 0.04402985, ..., 0.        , 0.        ,
        0.        ]])

In [78]:
aH = pd.DataFrame(data = data_3_1["단지코드"])
aH["num"] = ZZ
aH.rename(columns = {"단지코드": "code"}, inplace = True)

Z = list(set(data_3["단지코드"]))

answer_2 = pd.DataFrame(data =Z)
answer_2["num"] = 0

answer_2.rename(columns = {0: "code"}, inplace = True)
answer_2= answer_2.reset_index(drop = True)

A = list(set(aH["code"]))

for i in A:
    z = aH[aH["code"] == i]["num"].mean()
    index = answer_2[answer_2["code"] == i].index
    answer_2.iloc[index[0],1] = z
answer_2   

Unnamed: 0,code,num
0,C1782,427.027290
1,C1456,529.384699
2,C2608,985.412978
3,C2397,563.381658
4,C2523,450.960885
...,...,...
145,C1721,511.418894
146,C1887,634.610362
147,C1392,697.495331
148,C1563,1617.158571


# 4일차

In [79]:
from sklearn.ensemble import RandomForestRegressor
from sklearn.model_selection import KFold

In [80]:
rf = RandomForestRegressor(n_estimators=50, oob_score=True, random_state=0)
rf.fit(instance.iloc[:,1:], y)

RandomForestRegressor(n_estimators=50, oob_score=True, random_state=0)

In [87]:
predicted = rf.predict(data_3_1.iloc[:,1:])
predicted
rf.score(instance.iloc[:,1:], y)

0.9987100393855954

In [88]:
aH = pd.DataFrame(data = data_3_1["단지코드"])
aH["num"] = predicted
aH.rename(columns = {"단지코드": "code"}, inplace = True)

Z = list(set(data_3["단지코드"]))

answer_2 = pd.DataFrame(data =Z)
answer_2["num"] = 0

answer_2.rename(columns = {0: "code"}, inplace = True)
answer_2= answer_2.reset_index(drop = True)

A = list(set(aH["code"]))

for i in A:
    z = aH[aH["code"] == i]["num"].mean()
    index = answer_2[answer_2["code"] == i].index
    answer_2.iloc[index[0],1] = z
answer_2   

Unnamed: 0,code,num
0,C1782,422.108000
1,C1456,577.148889
2,C2608,969.520000
3,C2397,561.240000
4,C2523,379.253333
...,...,...
145,C1721,556.783333
146,C1887,454.101667
147,C1392,593.976667
148,C1563,1348.098182


In [92]:
instance_1 = instance.iloc[:,:]
instance_1["등록차량수"] = y
lists = instance_1.corr().sort_values(by = "등록차량수" , ascending = False)["등록차량수"]

A = (lists[lists>0.1]) 
B = lists[lists < - 0.1]
C = []
for i in A.index:
    C.append(i)
    
for i in B.index:
    C.append(i)
    

In [93]:
instance[C[1:]]

Unnamed: 0,단지내주차면수,총세대수,임대료,공급유형_공공임대(10년),전용면적,임대보증금,지역_세종특별자치시,지역_경기도,자격유형_H,자격유형_A,전용면적별세대수,도보 10분거리 내 버스정류장 수,지역_대구광역시,자격유형_L,지역_경상남도,지역_강원도,공급유형_영구임대,자격유형_J,자격유형_C,공급유형_행복주택
0,1425.0,900,103680,0,39.72,15667000,0,0,0,1,134,3.0,0,0,0,0,0,0,0,0
1,1425.0,900,103680,0,39.72,15667000,0,0,0,1,15,3.0,0,0,0,0,0,0,0,0
2,1425.0,900,184330,0,51.93,27304000,0,0,0,1,385,3.0,0,0,0,0,0,0,0,0
3,1425.0,900,184330,0,51.93,27304000,0,0,0,1,15,3.0,0,0,0,0,0,0,0,0
4,1425.0,900,184330,0,51.93,27304000,0,0,0,1,41,3.0,0,0,0,0,0,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
2947,166.0,239,116090,0,49.20,11346000,0,0,0,1,19,1.0,0,0,0,1,0,0,0,0
2948,166.0,239,142310,0,51.08,14005000,0,0,0,1,34,1.0,0,0,0,1,0,0,0,0
2949,166.0,239,142310,0,51.73,14005000,0,0,0,1,34,1.0,0,0,0,1,0,0,0,0
2950,166.0,239,142310,0,51.96,14005000,0,0,0,1,114,1.0,0,0,0,1,0,0,0,0


In [94]:
data_3_1[C[1:]]

Unnamed: 0,단지내주차면수,총세대수,임대료,공급유형_공공임대(10년),전용면적,임대보증금,지역_세종특별자치시,지역_경기도,자격유형_H,자격유형_A,전용면적별세대수,도보 10분거리 내 버스정류장 수,지역_대구광역시,자격유형_L,지역_경상남도,지역_강원도,공급유형_영구임대,자격유형_J,자격유형_C,공급유형_행복주택
0,683.0,754,189840,0,39.79,22830000,0,1,1,0,116,2.0,0,0,0,0,0,0,0,0
1,683.0,754,249930,0,46.81,36048000,0,1,0,1,30,2.0,0,0,0,0,0,0,0,0
2,683.0,754,249930,0,46.90,36048000,0,1,1,0,112,2.0,0,0,0,0,0,0,0,0
3,683.0,754,249930,0,46.90,36048000,0,1,1,0,120,2.0,0,0,0,0,0,0,0,0
4,683.0,754,296780,0,51.46,43497000,0,1,1,0,60,2.0,0,0,0,0,0,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1017,467.0,675,21540000,0,36.77,21540000,0,0,0,0,126,1.0,0,1,1,0,0,0,0,1
1018,300.0,382,106400,0,29.19,6872000,0,0,1,0,96,2.0,0,0,0,0,0,0,0,0
1019,300.0,382,106400,0,29.19,6872000,0,0,1,0,20,2.0,0,0,0,0,0,0,0,0
1020,300.0,382,144600,0,39.45,13410000,0,0,1,0,202,2.0,0,0,0,0,0,0,0,0


In [96]:
reg = LinearRegression().fit(instance[C[1:]], y)
reg.score(instance[C[1:]], y)

0.7677873153947834

In [97]:
reg.predict(data_3_1[C[1:]])

array([702.12487884, 672.71330837, 717.93859654, ..., 328.02027247,
       344.11883893, 361.51594589])

In [98]:
aH = pd.DataFrame(data = data_3_1["단지코드"])
aH["num"] = reg.predict(data_3_1[C[1:]])
aH.rename(columns = {"단지코드": "code"}, inplace = True)

Z = list(set(data_3["단지코드"]))

answer_2 = pd.DataFrame(data =Z)
answer_2["num"] = 0

answer_2.rename(columns = {0: "code"}, inplace = True)
answer_2= answer_2.reset_index(drop = True)

A = list(set(aH["code"]))

for i in A:
    z = aH[aH["code"] == i]["num"].mean()
    index = answer_2[answer_2["code"] == i].index
    answer_2.iloc[index[0],1] = z
answer_2   

Unnamed: 0,code,num
0,C1782,408.313982
1,C1456,580.921005
2,C2608,1040.092001
3,C2397,510.492816
4,C2523,377.385562
...,...,...
145,C1721,591.894825
146,C1887,702.735726
147,C1392,590.410498
148,C1563,1526.565208


In [99]:
from sklearn.preprocessing import PolynomialFeatures

poly = PolynomialFeatures(degree = 2)
poly.fit(instance[C[1:]])

PolynomialFeatures()

In [100]:
poly_ftr = poly.transform(instance[C[1:]])
poly_ftr

array([[1.000e+00, 1.425e+03, 9.000e+02, ..., 0.000e+00, 0.000e+00,
        0.000e+00],
       [1.000e+00, 1.425e+03, 9.000e+02, ..., 0.000e+00, 0.000e+00,
        0.000e+00],
       [1.000e+00, 1.425e+03, 9.000e+02, ..., 0.000e+00, 0.000e+00,
        0.000e+00],
       ...,
       [1.000e+00, 1.660e+02, 2.390e+02, ..., 0.000e+00, 0.000e+00,
        0.000e+00],
       [1.000e+00, 1.660e+02, 2.390e+02, ..., 0.000e+00, 0.000e+00,
        0.000e+00],
       [1.000e+00, 1.660e+02, 2.390e+02, ..., 0.000e+00, 0.000e+00,
        0.000e+00]])

In [101]:
reg = LinearRegression().fit(poly_ftr, y)
reg.score(poly_ftr, y)

0.8634570595171337

In [102]:
poly = PolynomialFeatures(degree = 2)
poly.fit(data_3_1[C[1:]])
poly_ftr_test = poly.transform(data_3_1[C[1:]])
poly_ftr_test

array([[  1., 683., 754., ...,   0.,   0.,   0.],
       [  1., 683., 754., ...,   0.,   0.,   0.],
       [  1., 683., 754., ...,   0.,   0.,   0.],
       ...,
       [  1., 300., 382., ...,   0.,   0.,   0.],
       [  1., 300., 382., ...,   0.,   0.,   0.],
       [  1., 300., 382., ...,   0.,   0.,   0.]])

In [103]:
reg.predict(poly_ftr_test)

array([581.57705556, 693.81503565, 608.58979937, ..., 194.15826921,
       150.61930341, 162.89187473])

In [105]:
aH = pd.DataFrame(data = data_3_1["단지코드"])
aH["num"] = reg.predict(poly_ftr_test)
aH.rename(columns = {"단지코드": "code"}, inplace = True)

Z = list(set(data_3["단지코드"]))

answer_2 = pd.DataFrame(data =Z)
answer_2["num"] = 0

answer_2.rename(columns = {0: "code"}, inplace = True)
answer_2= answer_2.reset_index(drop = True)

A = list(set(aH["code"]))

for i in A:
    z = aH[aH["code"] == i]["num"].mean()
    index = answer_2[answer_2["code"] == i].index
    answer_2.iloc[index[0],1] = z
answer_2   

Unnamed: 0,code,num
0,C1782,417.667767
1,C1456,422.555354
2,C2608,1042.790427
3,C2397,528.121661
4,C2523,135.595954
...,...,...
145,C1721,625.545386
146,C1887,774.194094
147,C1392,608.992872
148,C1563,1634.169972


In [None]:
!pip install dacon_submit_api-0.0.4-py3-none-any.whl

**제출용 파일 따로 만들기**

In [64]:
answer_2.to_csv('submission_result.csv', index=False)
data_5 = pd.read_csv("submission_result.csv")
data_5

Unnamed: 0,code,num
0,C2676,566
1,C1840,539
2,C2600,469
3,C1349,224
4,C1318,382
...,...,...
145,C1849,851
146,C1629,269
147,C2691,1134
148,C2131,686


**code 제출하기**

 1. **파일**
 2. **내 코드**
 3. **대회 코드**
 4. **내 팅명**
 5. **메모**

In [65]:
from dacon_submit_api import dacon_submit_api 

result = dacon_submit_api.post_submission_file(
'submission_result.csv', 
'413d1d88dc4258f5e6bb5afc8dfe750b01d8e78f3130493d25f55c2a16db03b7', 
'235745', 
'유니서울대', 
'Regrsssion model_1' )

{'isSubmitted': False, 'detail': 'Over max submission count of Daily. 일일 제출 가능한 최대 횟수가 초과 되었습니다.'}
