In [269]:
import statsmodels.api as sm
from scipy import stats

In [270]:
import pandas as pd
pd.options.display.float_format = '{:,.4f}'.format
pd.options.display.max_rows = 100
pd.options.display.max_columns = 40
import numpy as np
import os,random, math
from tqdm import tqdm
from copy import deepcopy
from collections import Counter

# Visualization
import seaborn as sns
import matplotlib
import matplotlib.pyplot as plt
%matplotlib inline
from matplotlib import font_manager, rc
import platform

# Choose a Hangul-capable font per platform so Korean labels render in plots.
# Previously every non-Windows system was forced to AppleGothic (a macOS font),
# which also silently overrode the NanumGothic default on Linux.
_system = platform.system()
if _system == 'Windows':
    font_name = font_manager.FontProperties(fname="c:/Windows/Fonts/malgun.ttf").get_name()
    rc('font', family=font_name)
elif _system == 'Darwin':
    rc('font', family='AppleGothic')
else:
    rc('font', family='NanumGothic')

# Render the minus sign with ASCII '-' so it does not depend on the font's glyphs.
matplotlib.rcParams['axes.unicode_minus'] = False

# from dataprep.eda import plot, plot_correlation, plot_missing

# Warnings
import warnings
warnings.filterwarnings('always')
warnings.filterwarnings('ignore')

In [271]:
import os
# The merged CSVs are read from "<parent of the working directory>/ProcessedData".
path = os.path.join(os.path.dirname(os.getcwd()), "ProcessedData")
train, test = (
    pd.read_csv(os.path.join(path, file_name))
    for file_name in ("merged_train.csv", "merged_test.csv")
)

In [272]:
# Keep separate views of the raw rows per building type ('아파트' / '상가');
# they are taken before any feature engineering below.
apt_tr, store_tr = (train[train['임대건물구분'] == kind] for kind in ('아파트', '상가'))
apt_test, store_test = (test[test['임대건물구분'] == kind] for kind in ('아파트', '상가'))

In [273]:
# Row/column counts of the raw train and test frames, one per line.
print(train.shape, test.shape, sep="\n")

(2896, 34)
(1008, 33)


In [274]:
# Ratios relative to the '총세대수' column, computed column-wise
# (vectorised) instead of with a row-by-row apply(axis=1).
train['공가수_비율'] = train['공가수'] / train['총세대수']
train['세대당_가능주차면수'] = train['단지내주차면수'] / train['총세대수']
test['공가수_비율'] = test['공가수'] / test['총세대수']
test['세대당_가능주차면수'] = test['단지내주차면수'] / test['총세대수']

In [275]:
# Scale each age-band ratio by '총세대수'.
# (new column, source ratio column) pairs, in the order the columns are created.
_age_band_columns = [
    ('0~19 인구수', '0~19세_비율'),
    ('20~39 인구수', '20~39세_비율'),
    ('40~69 인구수', '40~69세_비율'),
    ('70세이상 인구수', '70세이상_비율'),
]
for _frame in (train, test):
    for _count_col, _ratio_col in _age_band_columns:
        _frame[_count_col] = _frame[_ratio_col] * _frame['총세대수']

In [276]:
# Regions for which the transfer-station count is forced to 0.
area = ['경상남도', '전라북도', '강원도', '광주광역시', '충청남도', '제주특별자치도', '울산광역시', '충청북도', '전라남도', '경상북도', '세종특별자치시']
# Boolean-mask assignment replaces the former per-row loop, which also relied
# on the frame having a default 0..n-1 index.
train.loc[train['지역'].isin(area), '환승역 수'] = 0
# One-hot encode the count; .iloc[:, :-1] drops the last column of the joined
# frame, i.e. the last dummy column.
train = train.join(pd.get_dummies(train['환승역 수'],prefix='sub')).iloc[:,:-1]
train = train.drop('환승역 수',axis=1)

In [277]:
# Same treatment as the train frame: zero the transfer-station count for the
# regions listed in `area` (defined in the previous cell), using a boolean mask
# instead of a per-row loop that assumed a default 0..n-1 index.
test.loc[test['지역'].isin(area), '환승역 수'] = 0
# One-hot encode; .iloc[:, :-1] drops the last (dummy) column of the joined frame.
test = test.join(pd.get_dummies(test['환승역 수'],prefix='sub')).iloc[:,:-1]
test = test.drop('환승역 수',axis=1)

In [278]:
def func2(x):
    """Bucket a numeric value into an ordinal category 0-3.

    Buckets are half-open, so every finite value lands in exactly one:
        x < 0.005          -> 0
        0.005 <= x < 0.02  -> 1
        0.02  <= x < 1     -> 2
        x >= 1             -> 3

    The previous strict comparisons left the boundary values 0.005, 0.02
    and 1 unassigned (the function returned None for them).

    Returns:
        int bucket id, or None when x compares false to everything (NaN).
    """
    if x < 0.005:
        return 0
    if x < 0.02:
        return 1
    if x < 1:
        return 2
    if x >= 1:
        return 3
    # Only reachable for NaN-like values; keep the historical None result.
    return None

In [279]:
# Bucket the subway distance of every row with func2.
train['sub_cat'] = train['subway_dist'].apply(func2)
test['sub_cat'] = test['subway_dist'].apply(func2)

In [280]:
# One-hot encode the distance bucket, then drop the last column of the joined
# frame (the last dummy) and the two source columns.
train = train.join(pd.get_dummies(train['sub_cat'], prefix='sub_dist'))
train = train.iloc[:, :-1]
train = train.drop(columns=['subway_dist', 'sub_cat'])

In [281]:
# One-hot encode the distance bucket, then drop the last column of the joined
# frame (the last dummy) and the two source columns.
test = test.join(pd.get_dummies(test['sub_cat'], prefix='sub_dist'))
test = test.iloc[:, :-1]
test = test.drop(columns=['subway_dist', 'sub_cat'])

## 공급유형

In [282]:
def func3(x,dic):
    """Return the key of the first entry in `dic` whose value equals `x`.

    Equality is element-by-element and order-sensitive: ['a', 'b'] does not
    match ['b', 'a'].

    Args:
        x: sequence of labels (list, tuple or 1-D array).
        dic: mapping of key -> sequence of labels.

    Returns:
        The first matching key in dict iteration order, or None if no entry
        matches.

    Both sides are normalised to lists before comparing, so plain lists work
    as well as NumPy arrays (the former `all(dic[i] == x)` raised TypeError
    when both sides were lists, because `==` then yields a single bool).
    """
    target = list(x)
    for key, members in dic.items():
        if len(members) == len(target) and list(members) == target:
            return key
    return None

In [283]:
# Option 2: encode each complex by the combination of supply types it contains.

# Collapse related supply types into a single label in both frames.
train.loc[train['공급유형'].isin(['영구임대', '행복주택']),'공급유형'] = '영구/행복'
test.loc[test['공급유형'].isin(['영구임대', '행복주택']),'공급유형'] = '영구/행복'

train.loc[train['공급유형'].isin(['공공임대(10년)', '공공임대(5년)']),'공급유형'] = '공공임대(단기)'
test.loc[test['공급유형'].isin(['공공임대(10년)', '공공임대(5년)']),'공급유형'] = '공공임대(단기)'

# Distinct supply types in train, excluding '임대상가'.
# The (7,1) reshape only succeeds when there are exactly 7 distinct values.
no_shop = train[train['공급유형']!='임대상가']
uniq = np.reshape(no_shop['공급유형'].unique(),(7,1))

# Cluster ids 0-6: one single-type cluster per distinct value (each value is a
# 1-element array). Ids 8-14 are hand-written multi-type combinations; id 7 is
# not assigned. func3 compares element-by-element, so the order of the labels
# inside each list matters.
dic = dict(enumerate(uniq))
dic[8] = ['공공임대(단기)', '국민임대']
dic[9] = ['장기전세', '국민임대', '영구/행복']
dic[10] = ['국민임대', '영구/행복']
dic[11] = ['공공임대(단기)', '공공임대(분납)']
dic[12] = ['공공분양', '공공임대(단기)', '공공임대(분납)']
dic[13] = ['공공임대(50년)', '영구/행복']
dic[14] = ['장기전세', '공공임대(단기)', '공공임대(분납)']

# train
# Per complex code: the array of its distinct supply types -> cluster id.
no_shop = train[train['공급유형']!='임대상가']
sample = pd.DataFrame(no_shop.groupby('단지코드')['공급유형'].unique()).reset_index()
sample['cluster'] = sample['공급유형'].apply(lambda x : func3(x,dic))
# int(x) raises TypeError if func3 found no matching combination (returned None).
sample['cluster'] = sample['cluster'].apply(lambda x: int(x))
train = pd.merge(train,sample[['단지코드','cluster']],on='단지코드',how='left')
# One-hot encode the cluster id; .iloc[:, :-1] drops the last column of the
# joined frame, i.e. the last dummy column.
train = train.join(pd.get_dummies(train['cluster'],prefix='type')).iloc[:,:-1]
train = train.drop(['공급유형','cluster'],axis=1)

# test
# Same steps as train, but without the int() conversion, so an unmatched
# combination stays as a missing cluster instead of raising.
# NOTE(review): dummy column names depend on the cluster dtype; verify they
# line up with the train 'type_*' columns.
no_shop = test[test['공급유형']!='임대상가']
sample = pd.DataFrame(no_shop.groupby('단지코드')['공급유형'].unique()).reset_index()
sample['cluster'] = sample['공급유형'].apply(lambda x : func3(x,dic))
test = pd.merge(test,sample[['단지코드','cluster']],on='단지코드',how='left')
test = test.join(pd.get_dummies(test['cluster'],prefix='type')).iloc[:,:-1]
test = test.drop(['공급유형','cluster'],axis=1)
# Add all-zero columns for these cluster ids (an existing column with the same
# name would be overwritten with 0).
# NOTE(review): presumably these clusters never occur in test - confirm.
test['type_8'] = 0
test['type_9'] = 0
test['type_11'] = 0
test['type_12'] = 0
test['type_13'] = 0

# Exploration helper kept for reference: lists complexes with more than one supply type.
# sample = pd.DataFrame(no_shop.groupby('단지코드')['공급유형'].nunique()).reset_index()
# ind = sample[sample['공급유형']>1]['단지코드'].values
# no_shop.loc[no_shop['단지코드'].isin(ind),:].groupby('단지코드')['공급유형'].unique()

In [284]:
# Columns removed in the next cell: `drop` holds the sex/age ratio columns,
# `drop1` holds raw identifier/descriptor columns (several were already used
# above to derive ratio features).
drop = ['남/여비율','남/여_0~19세','남/여_20~39세','남/여_40~69세','남/여_70세이상','0~19세_비율','20~39세_비율','40~69세_비율','70세이상_비율']
drop1 = ['총세대수','임대건물구분','지역','공가수','자격유형','단지내주차면수','단지명','도로명주소','subway_name']

In [285]:
# Remove both column lists from each frame in one pass.
train = train.drop(columns=drop + drop1)
test = test.drop(columns=drop + drop1)

In [286]:
# Drop the per-unit columns, then collapse the remaining duplicate rows.
non_unique = ['전용면적','전용면적별세대수','임대보증금','임대료']
train_one, test_one = (
    frame.drop(columns=non_unique).drop_duplicates().reset_index(drop=True)
    for frame in (train, test)
)

In [287]:
def merge(col,standard,oldDF,newDF,kind):
    """Attach per-complex summary statistics of `col` to `newDF` (in place).

    For every statistic name in `standard` (a label produced by
    `Series.describe()`, e.g. 'min', '25%', 'mean', 'std'), the column
    `kind + col + stat` is written on `newDF`, looked up through '단지코드'.

    Args:
        col: column of `oldDF` to summarise.
        standard: iterable of describe() statistic names.
        oldDF: frame with one row per unit; needs '단지코드' and `col`.
        newDF: frame to receive the statistics; needs '단지코드'.
        kind: prefix for the generated column names (e.g. 'apt', 'store').

    Returns:
        `newDF` itself, mutated.

    Rows whose code is absent from `oldDF` keep their existing value, or NaN
    if the column is new. Compared with the previous implementation this
    describes only `col` (not every numeric column) and replaces the nested
    Python loop, which scanned `newDF` once per cell, with one vectorised
    lookup per statistic. Target columns are now always created, even when
    `oldDF` is empty, so train and test end up with the same schema.
    """
    # describe() is skipped for an empty frame; the columns are still created.
    stats = None if oldDF.empty else oldDF.groupby('단지코드')[col].describe()
    codes = newDF['단지코드']
    for j in standard:
        name = kind + col + j
        if name not in newDF.columns:
            newDF[name] = float('nan')
        if stats is None:
            continue
        # Only touch rows whose code has statistics, as the original did.
        matched = codes.isin(stats.index)
        newDF.loc[matched, name] = codes[matched].map(stats[j])
    return newDF

In [288]:
# Per-complex summary statistics for the train frame.
# The plan is ordered, because merge() appends columns in call order.
_quartile_stats = ['min','25%','50%','75%','max']
_train_stat_plan = [
    ('임대료', _quartile_stats, apt_tr, 'apt'),
    ('임대보증금', _quartile_stats, apt_tr, 'apt'),
    ('임대료', _quartile_stats, store_tr, 'store'),
    ('임대보증금', _quartile_stats, store_tr, 'store'),
    ('전용면적', _quartile_stats, apt_tr, 'apt'),
    ('전용면적', ['mean','std'], store_tr, 'store'),
]
for _col, _stats, _source, _prefix in _train_stat_plan:
    train_one = merge(_col, _stats, _source, train_one, _prefix)

# Complexes without a matching statistic get 0 instead of NaN.
train_one = train_one.replace(np.nan, 0)

In [289]:
# Per-complex summary statistics for the test frame.
# The plan is ordered, because merge() appends columns in call order.
_quartile_stats = ['min','25%','50%','75%','max']
_test_stat_plan = [
    ('임대료', _quartile_stats, apt_test, 'apt'),
    ('임대보증금', _quartile_stats, apt_test, 'apt'),
    ('임대료', _quartile_stats, store_test, 'store'),
    ('임대보증금', _quartile_stats, store_test, 'store'),
    ('전용면적', _quartile_stats, apt_test, 'apt'),
    ('전용면적', ['mean','std'], store_test, 'store'),
]
for _col, _stats, _source, _prefix in _test_stat_plan:
    test_one = merge(_col, _stats, _source, test_one, _prefix)

# Complexes without a matching statistic get 0 instead of NaN.
test_one = test_one.replace(np.nan, 0)

In [290]:
# Shapes of the deduplicated train and test frames, one per line.
print(train_one.shape, test_one.shape, sep="\n")

(414, 57)
(147, 56)


## 유민이 코드추가 

In [291]:
# Fitting
from sklearn.cross_decomposition import PLSRegression
from sklearn.svm import SVR
from statsmodels.api import OLS
from sklearn.linear_model import LinearRegression, Lasso, ElasticNet, Ridge
from sklearn.ensemble import RandomForestRegressor, GradientBoostingRegressor
# from xgboost import XGBRegressor
# import lightgbm as LGB
from catboost import CatBoostRegressor
from sklearn.metrics import mean_absolute_error 

# Validation
from sklearn.model_selection import train_test_split
from sklearn.model_selection import cross_val_score, KFold, RepeatedKFold

In [None]:
def fit(X_train, X_test, y_train, y_test, model):
    """Fit `model` on the training split and score it on the test split.

    NOTE: a later cell defines `fit` again with an extra `cv` flag; whichever
    cell ran last is the active definition.

    Args:
        X_train, y_train: training features / target.
        X_test, y_test: hold-out features / target.
        model: estimator exposing fit(X, y[, verbose=...]) and predict(X).

    Returns:
        (fitted model, predictions for X_test, mean absolute error on the
        test split).
    """
    try:
        # Estimators that accept it log every 500 iterations.
        model.fit(X_train, y_train, verbose=500)
    except TypeError:
        # fit() has no `verbose` keyword. Only this case is retried; the
        # former bare `except:` also hid real fitting errors and refitted.
        model.fit(X_train, y_train)
    tst_pred = model.predict(X_test)
    error = mean_absolute_error(y_test,tst_pred)
    return model, tst_pred, error

def fit_models(X_train1, X_test1, y_train1, y_test1, models):
    """Fit every candidate model and keep the one with the lowest test MAE.

    NOTE: a later cell defines `fit_models` again with a different signature;
    whichever cell ran last is the active definition.

    Args:
        X_train1, y_train1: training features / target.
        X_test1, y_test1: hold-out features / target.
        models: iterable of estimators accepted by `fit`.

    Returns:
        (best model, its test predictions, its MAE). With an empty `models`
        the placeholders ("", 0, inf) are returned.
    """
    # Start from +inf so the best candidate is always selected. The former
    # threshold of 400 silently rejected every model whose MAE was >= 400.
    best_error = float('inf')
    best_tst_pred = 0
    best_model = ""
    for m in models:
        model, tst_pred, error = fit(X_train1, X_test1, y_train1, y_test1, m)
        if error < best_error:
            best_error = error
            best_tst_pred = tst_pred
            best_model = model
    return best_model, best_tst_pred, best_error

In [292]:
def fit(X_train, X_test, y_train, y_test, model, cv=False):
    """Fit `model` on the training split and compute a mean absolute error.

    Args:
        X_train, y_train: training features / target.
        X_test, y_test: hold-out features / target.
        model: estimator exposing fit(X, y[, verbose=...]) and predict(X).
        cv (bool): selects what is predicted and scored (see Returns).

    Returns:
        (fitted model, predictions, MAE).
        cv falsy: predictions and MAE are for the test split only.
        cv truthy: predictions and MAE cover the training rows followed by
        the test rows (both splits concatenated in that order).
        NOTE(review): in cv mode the error therefore includes in-sample rows;
        confirm this is intended before reading it as an out-of-fold score.
    """
    try:
        model.fit(X_train, y_train, verbose=False)
    except TypeError:
        # fit() has no `verbose` keyword. Only this case is retried; the
        # former bare `except:` also hid real fitting errors and refitted.
        model.fit(X_train, y_train)
    if cv:
        X_all = np.concatenate((X_train, X_test),axis=0)
        y_all = np.concatenate((y_train, y_test),axis=0)
        tst_pred = model.predict(X_all)
        error = mean_absolute_error(y_all,tst_pred)
    else:
        tst_pred = model.predict(X_test)
        error = mean_absolute_error(y_test,tst_pred)
    return model, tst_pred, error


def fit_models(X, y, models, cv=False):
    """Fit every candidate model and keep the one with the lowest MAE.

    Args:
        X: feature matrix (DataFrame, array or nested list).
        y: target vector (Series, array or list).
        models: iterable of estimators accepted by `fit`.
        cv (bool): False -> a single 70/30 shuffled split (random_state=2021).
            True -> 5-fold shuffled KFold (random_state=2021); a model's score
            is the mean of the per-fold errors returned by `fit(..., cv=True)`
            and is printed per model.

    Returns:
        (best model, predictions, best error).
        cv=False: predictions are for the hold-out split.
        cv=True: predictions are a list with one array per fold, and the
        returned model was last fitted on the final fold's training rows.
        With an empty `models` the placeholders ("", 0, inf) are returned.
    """
    # asarray accepts pandas objects as well as arrays/lists; the former
    # `.reset_index(drop=True)` worked for pandas input only.
    X = np.asarray(X)
    y = np.asarray(y)
    # Start from +inf so the best candidate is always selected. The former
    # threshold of 10000 rejected every model whose error was >= 10000.
    best_error = float('inf')
    best_tst_pred = 0
    best_model = ""
    if not cv:
        X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, shuffle=True,random_state=2021)
        for m in models:
            model, tst_pred, error = fit(X_train, X_test, y_train, y_test, m)
            if error < best_error:
                best_error = error
                best_tst_pred = tst_pred
                best_model = model
    else:
        kf = KFold(n_splits = 5, shuffle=True,random_state = 2021)
        for m in models:
            mae = []
            pred= []
            for train_index, test_index in kf.split(X, y):
                X_train, X_test = X[train_index], X[test_index]
                y_train, y_test = y[train_index], y[test_index]
                model, tst_pred, error = fit(X_train, X_test, y_train, y_test, m, cv=True)
                mae.append(error)
                pred.append(tst_pred)
            cv_error = np.mean(mae)
            print(f"model:{str(m).split('(')[0]}\nMAE:{cv_error}")
            if cv_error < best_error:
                best_error = cv_error
                best_tst_pred = pred
                best_model = model
    return best_model, best_tst_pred, best_error

In [293]:
# Drop the 25% / 75% statistics from both frames.
_inner_quartile_cols = ["apt임대료25%","apt임대료75%","apt전용면적25%","apt전용면적75%","store임대료25%","store임대료75%","store임대보증금25%","store임대보증금75%","apt임대보증금25%","apt임대보증금75%"]
train_one = train_one.drop(columns=_inner_quartile_cols)
test_one = test_one.drop(columns=_inner_quartile_cols)


In [294]:
# Split both frames at a fixed parking-spaces-per-household threshold.
# NOTE(review): the origin of this cut-off is not shown in the notebook.
_PARKING_RATIO_SPLIT = 0.8304195804195804

tr1 = train_one[train_one['세대당_가능주차면수'] < _PARKING_RATIO_SPLIT]
tr2 = train_one[train_one['세대당_가능주차면수'] >= _PARKING_RATIO_SPLIT]

tst1 = test_one[test_one['세대당_가능주차면수'] < _PARKING_RATIO_SPLIT]
tst2 = test_one[test_one['세대당_가능주차면수'] >= _PARKING_RATIO_SPLIT]

In [295]:
# Identical 70/30 shuffled split settings for both subsets.
_split_kwargs = dict(test_size=0.3, random_state=2021, shuffle=True)

# Subset 1 (below the parking-ratio threshold).
# A log-transformed target (np.log of '등록차량수') was tried and left disabled.
X1 = tr1.drop(columns=['등록차량수','단지코드'])
y1 = tr1['등록차량수']
X_train1, X_test1, y_train1, y_test1 = train_test_split(X1, y1, **_split_kwargs)

# Subset 2 (at or above the threshold); the log target is disabled here too.
X2 = tr2.drop(columns=['등록차량수','단지코드'])
y2 = tr2['등록차량수']
X_train2, X_test2, y_train2, y_test2 = train_test_split(X2, y2, **_split_kwargs)

In [296]:
# Export the per-complex test features without the index.
# NOTE(review): absolute, user-specific path - this cell fails on any other machine.
test_one.to_csv("/Users/seungji/Desktop/Dacon/dataforR/test.csv",index =False)

In [297]:
# Load predictions produced outside this notebook.
# NOTE(review): presumably written by an R script from the exported test.csv - confirm.
# NOTE(review): absolute, user-specific path.
tst_pred = pd.read_csv("/Users/seungji/Desktop/Dacon/dataforR/tst_pred.csv")

In [299]:
# Number of loaded predictions; should equal the number of rows in test_one.
len(tst_pred)

147

In [301]:
# Pair each complex code with its prediction. The concat aligns on the index,
# so both inputs must share the same 0..n-1 index.
po_res = pd.concat(
    [test_one["단지코드"], pd.DataFrame(tst_pred)],
    axis="columns",
)

In [303]:
# Display the code / prediction table.
po_res

Unnamed: 0,단지코드,tst_pred
0,C1072,1259.2512
1,C1128,170.2334
2,C1456,282.9598
3,C1840,529.7911
4,C1332,348.6357
...,...,...
142,C2456,885.4875
143,C1266,300.7561
144,C2152,508.5030
145,C1267,505.8816


In [302]:
def myfunc(path="/Users/seungji/Desktop/Dacon/Daesamanlap/EDA_CYR/0713_cb.csv"):
    """Load a reference prediction file and drop three excluded complexes.

    Args:
        path: CSV file with at least a 'code' column. Defaults to the
            original hard-coded location, so existing `myfunc()` calls keep
            working. NOTE(review): the default is an absolute, user-specific
            path.

    Returns:
        DataFrame without the rows whose 'code' is C2675, C2335 or C1327,
        re-indexed from 0.
    """
    cb = pd.read_csv(path)
    excluded = ['C2675', 'C2335', 'C1327']
    # `~` is the boolean negation operator for masks; unary `-` on a boolean
    # mask is not reliably supported.
    keep = ~cb['code'].isin(excluded)
    return cb.loc[keep, :].reset_index(drop=True)

In [308]:
 # Load the reference table here: `res` was otherwise only defined in a later
 # cell, so this cell failed on a fresh Restart & Run All.
 res = myfunc()
 mean_absolute_error(po_res["tst_pred"],res['num'])

345.93704335391044

In [304]:
# NOTE(review): despite the name, this appears to hold the loaded predictions
# (the 'tst_pred' column), not observed target values.
real_y = po_res["tst_pred"]

In [309]:
# MAE between the loaded predictions and the reference file's 'num' column.
res = myfunc()
mean_absolute_error(y_true=real_y, y_pred=res['num'])

345.93704335391044

In [310]:
# Build the submission file from the code / prediction table.
# NOTE: renaming po_res' columns in place means earlier cells that read
# po_res["tst_pred"] cannot be re-run afterwards.
po_res.columns = ["code","num"]
sample = pd.read_csv("/Users/seungji/Desktop/Dacon/parking_data/sample_submission.csv")
# Left-join on the sample's codes; the prediction column comes back as 'num_y'
# because the sample file has its own 'num' column.
sub = (
    sample.merge(po_res, on="code", how="left")[["code","num_y"]]
    .rename(columns={"num_y": "num"})
)
# Replace negative predictions with the smallest non-negative prediction.
sub.loc[sub["num"] < 0, "num"] = sub.loc[sub["num"] >= 0, "num"].min()
sub.to_csv("glm718.csv",index=False)

In [None]:
# Columns present in the training split but missing from the test split (expected: empty set).
set(X_train1.columns) - set(X_test1.columns)

In [None]:

# NOTE: the candidate list is currently empty (its only entry is commented
# out), so fit_models returns its placeholder values until a model is added.
models = [
#           XGBRegressor(random_state=2021)
         ]

# The active fit_models(X, y, models, cv=False) performs its own 70/30 split
# with random_state=2021, i.e. the same settings used to build X_train*/X_test*.
# The former 5-argument call matched only the superseded definition and raised
# TypeError against the current one.
best_model1, best_tst_pred1, best_error1 = fit_models(X1, y1, models)
print(best_model1, best_error1)

best_model2, best_tst_pred2, best_error2 = fit_models(X2, y2, models)
print(best_model2, best_error2)

In [None]:
# Columns of the first training split whose values sum to zero.
list(X_train1.columns[X_train1.sum() == 0])

In [None]:
# Columns of the second training split whose values sum to zero.
# (The mask previously came from X_train1, reporting the wrong columns.)
list(X_train2.columns[X_train2.sum() == 0])

In [None]:
# Inspect the feature columns of the first training split.
X_train1.columns

In [None]:
# Inspect the feature columns of the first hold-out split.
X_test1.columns

In [None]:
# Columns whose training values sum to zero are removed from both the training
# and the hold-out split of the same subset. The lists are computed from the
# training frames before either frame is modified.
_zero_cols1 = list(X_train1.columns[X_train1.sum() == 0])
_zero_cols2 = list(X_train2.columns[X_train2.sum() == 0])
X_test1, X_train1 = X_test1.drop(columns=_zero_cols1), X_train1.drop(columns=_zero_cols1)
X_test2, X_train2 = X_test2.drop(columns=_zero_cols2), X_train2.drop(columns=_zero_cols2)

# Export every split without the index.
# NOTE(review): absolute, user-specific paths.
_exports = [
    (X_test1, "/Users/seungji/Desktop/Dacon/dataforR/X_test1.csv"),
    (X_train1, "/Users/seungji/Desktop/Dacon/dataforR/X_train1.csv"),
    (X_test2, "/Users/seungji/Desktop/Dacon/dataforR/X_test2.csv"),
    (X_train2, "/Users/seungji/Desktop/Dacon/dataforR/X_train2.csv"),
    (y_test1, "/Users/seungji/Desktop/Dacon/dataforR/y_test1.csv"),
    (y_train1, "/Users/seungji/Desktop/Dacon/dataforR/y_train1.csv"),
    (y_test2, "/Users/seungji/Desktop/Dacon/dataforR/y_test2.csv"),
    (y_train2, "/Users/seungji/Desktop/Dacon/dataforR/y_train2.csv"),
]
for _frame, _target in _exports:
    _frame.to_csv(_target, index=False)


In [None]:
# Shapes of the first subset's splits (sanity check).
X_train1.shape, y_train1.shape, X_test1.shape

In [None]:
# Shapes of the second subset's splits (sanity check).
X_train2.shape, y_train2.shape, X_test2.shape

In [None]:
# Training target of the first subset as a plain NumPy array.
np.array(y_train1)

In [None]:
from statsmodels.genmod.generalized_linear_model import GLM
from statsmodels.genmod.families import Poisson

In [None]:
# Training target of the first subset (Series, with its original row labels).
y_train1

In [None]:
# Poisson GLM on the first subset, scored by MAE on its hold-out split.
# NOTE(review): no constant column is added here, so the model appears to be
# fitted without an intercept - confirm this is intended.
model = GLM(y_train1, X_train1, family=Poisson()).fit()
mean_absolute_error(list(model.predict(X_test1)), y_test1)

In [None]:
# Poisson GLM on the second subset, scored by MAE on its hold-out split.
# NOTE(review): no constant column is added here, so the model appears to be
# fitted without an intercept - confirm this is intended.
model = GLM(y_train2, X_train2, family=Poisson()).fit()
mean_absolute_error(list(model.predict(X_test2)), y_test2)

In [None]:
from sklearn import linear_model
# scikit-learn Poisson regression on the first subset; fit() returns the estimator itself.
model = linear_model.PoissonRegressor().fit(X_train1, y_train1)
model.predict(X_test1)

In [None]:
from sklearn import linear_model
# Ordinary least squares baseline on the first subset; fit() returns the estimator itself.
model = LinearRegression().fit(X_train1, y_train1)
model.predict(X_test1)

In [None]:
from sklearn import linear_model
# Refit the scikit-learn Poisson regressor so `model` refers to it again for
# the coefficient inspection in the next cell.
model = linear_model.PoissonRegressor()
_ = model.fit(X_train1, y_train1)
model.predict(X_test1)

In [None]:
# Fitted coefficients of whichever estimator was last bound to `model`.
model.coef_

In [None]:
# `data` is an alias of X_train1, not a copy.
data = X_train1
# Stores the constant-augmented design matrix as a plain attribute named
# `exog` on the DataFrame (no column is created).
# NOTE(review): a separate variable would be clearer than an attribute on the
# frame; the next cell reads `data.exog`, so change both together.
data.exog = sm.add_constant(data,prepend=False)

In [None]:
import statsmodels.api as sm
from scipy import stats
# GLM takes (endog, exog): the response first, then the design matrix.
# The former call passed the training design matrix as the response and the
# hold-out features as regressors, so it did not fit the intended model.
model = sm.GLM(
    y_train1,
    sm.add_constant(X_train1, prepend=False),
    family=sm.families.Poisson(),
)
model_rs = model.fit()
print(model_rs.summary())

In [None]:
# Inspect the rows of complex C1176 in the engineered train frame.
train[train.단지코드.isin(["C1176"])]

In [None]:
# Inspect the rows of complex C1363 in the engineered train frame.
train[train.단지코드.isin(["C1363"])]

In [None]:
# Summary statistics of the numeric columns of the engineered train frame.
train.describe()

In [None]:
# Inspect the rows of complex C2612 in the engineered train frame.
train[train.단지코드.isin(["C2612"])]

In [None]:
from sklearn.model_selection import KFold

In [None]:
from sklearn.discriminant_analysis import LinearDiscriminantAnalysis

# 5-fold cross-validated MAE.
# shuffle=True is required for random_state to be accepted by KFold, and it
# matches the KFold settings used in fit_models.
# NOTE(review): X_tr and y are not defined anywhere in the visible notebook;
# the positional indexing below assumes they are NumPy arrays - confirm.
# NOTE(review): LinearDiscriminantAnalysis is a classifier, so each distinct
# target value is treated as a class; confirm this is intended for an MAE score.
kf = KFold(n_splits = 5, shuffle=True, random_state = 2021)
mae = []
for train_index, test_index in kf.split(X_tr, y):
    X_train, X_test = X_tr[train_index], X_tr[test_index]
    y_train, y_test = y[train_index], y[test_index]
    model = LinearDiscriminantAnalysis()
    model.fit(X_train, y_train)
    mae.append(mean_absolute_error(y_test,model.predict(X_test)))
np.mean(mae)