In [None]:
import pandas as pd
import numpy as np

from sklearn.impute import SimpleImputer
from sklearn.preprocessing import LabelEncoder, OneHotEncoder
from sklearn.ensemble import RandomForestClassifier

In [None]:
# Load the training split from the working directory.
train = pd.read_csv(filepath_or_buffer='./train.csv')

In [None]:
# Columns whose missing values are replaced by the column mean
columns_fill_mean = ['해당층', '총층', '총주차대수', '방수', '욕실수']

# Mean-strategy imputer: learn the column means from the training data first,
# then write the imputed values back into the same columns
mean_imputer = SimpleImputer(strategy='mean')
mean_imputer.fit(train[columns_fill_mean])
train[columns_fill_mean] = mean_imputer.transform(train[columns_fill_mean])

In [None]:
# Remaining missing values per column (isna is the same check as isnull)
train.isna().sum()

Unnamed: 0,0
ID,0
매물확인방식,0
보증금,0
월세,0
전용면적,787
해당층,0
총층,0
방향,0
방수,0
욕실수,0


In [None]:
import pandas as pd
import numpy as np
from scipy.stats import zscore

In [None]:
# Fill missing room / bathroom counts with 1.
# The previous mask `train.columns != ('방수', '욕실수')` compared each column
# label with the whole tuple, so it was True for every column and fillna(1)
# overwrote the gaps of *all* columns (including '전용면적', which is meant to be
# imputed from group means in a later cell). Select the two columns explicitly.
room_count_columns = ['방수', '욕실수']

train[room_count_columns] = train[room_count_columns].fillna(1)


In [None]:
# Number of listings for every (rooms, bathrooms) combination
grouped_df = (
    train
    .groupby(['방수', '욕실수'])
    .size()
    .reset_index(name='총_개수')
)

# Show the table
print(grouped_df)


         방수       욕실수  총_개수
0  1.000000  1.000000  1514
1  1.000000  1.034511     1
2  1.378079  1.034511    16
3  2.000000  1.000000   836
4  2.000000  1.034511     1
5  2.000000  2.000000    84


In [None]:
# Mean floor area for every (rooms, bathrooms) combination
area_mean_df = (
    train
    .groupby(['방수', '욕실수'])['전용면적']
    .mean()
    .reset_index(name='전용면적_평균')
)

# Attach the means to the per-group counts computed earlier
merged_df = grouped_df.merge(area_mean_df, how='left', on=['방수', '욕실수'])

# Show the table
print(merged_df)


         방수       욕실수  총_개수    전용면적_평균
0  1.000000  1.000000  1514  25.078175
1  1.000000  1.034511     1        NaN
2  1.378079  1.034511    16        NaN
3  2.000000  1.000000   836  32.019459
4  2.000000  1.034511     1  39.660000
5  2.000000  2.000000    84  26.570125


In [None]:
# Mean floor area per (rooms, bathrooms) combination, kept as a Series for inspection
area_mean_df = train.groupby(['방수', '욕실수'])['전용면적'].mean()

# Fill missing floor area with the mean of the row's own group.
# transform('mean') broadcasts each group mean back onto the rows, which
# replaces the row-by-row apply/lookup with one vectorized fillna. Rows whose
# group has no known area stay NaN, exactly as before.
group_area_mean = train.groupby(['방수', '욕실수'])['전용면적'].transform('mean')
train['전용면적'] = train['전용면적'].fillna(group_area_mean)

In [None]:
# Target: fake-listing flag; features: everything except the ID and the target
y1 = train['허위매물여부']
x1 = train.drop(columns=['ID', '허위매물여부'])

LGBM

In [None]:
import lightgbm as lgb
import pandas as pd
import numpy as np

Dask dataframe query planning is disabled because dask-expr is not installed.

You can install it with `pip install dask[dataframe]` or `conda install dask`.
This will raise in a future version.



In [None]:
from sklearn.preprocessing import LabelEncoder

# Categorical feature columns
categorical_features = ['제공플랫폼', '방향', '매물확인방식', '주차가능여부', '중개사무소', '게재일']

# Cast them to pandas 'category' dtype so LightGBM treats them as categorical
for col in categorical_features:
    x1[col] = x1[col].astype('category')

# Build the LightGBM Dataset
train_data = lgb.Dataset(x1, label=y1, categorical_feature=categorical_features)

# Training parameters
params = {
    'objective': 'binary',
    # 'f1' is not a built-in LightGBM metric name and was silently skipped;
    # use log-loss here (F1 would need a custom `feval` plus a validation set).
    'metric': 'binary_logloss',
    'learning_rate': 0.01,
    # `class_weight` exists only on the scikit-learn wrapper (LGBMClassifier).
    # The native lgb.train API ignores it, so the ~12% positive class was never
    # re-weighted; `is_unbalance` is the native switch for that.
    'is_unbalance': True,
    'num_leaves': 31,
    'max_depth': 5
}

num_round = 100
model = lgb.train(
    params,
    train_data,
    num_boost_round=num_round,
    callbacks=[lgb.log_evaluation(10)]
)


[LightGBM] [Info] Number of positive: 298, number of negative: 2154
[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.000140 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 1249
[LightGBM] [Info] Number of data points in the train set: 2452, number of used features: 15
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.121533 -> initscore=-1.977988
[LightGBM] [Info] Start training from score -1.977988


In [None]:
# Load the test split from the working directory.
test = pd.read_csv(filepath_or_buffer='./test.csv')

In [None]:
# Test-set preprocessing

# Reuse the imputer that was fitted on the training data earlier in the
# notebook (transform only). Re-fitting a fresh imputer on the test set filled
# its gaps with test-set means, i.e. different values from those the model saw
# for the same columns during training.
columns_fill_mean = ['해당층', '총층', '총주차대수', '방수', '욕실수']
test[columns_fill_mean] = mean_imputer.transform(test[columns_fill_mean])

# Fill missing floor area with the mean area of the row's (rooms, bathrooms)
# group; transform('mean') does this in one vectorized step instead of a
# row-by-row apply. Rows whose group has no known area stay NaN.
room_group_keys = ['방수', '욕실수']
area_mean_tst = test.groupby(room_group_keys)['전용면적'].mean()
test['전용면적'] = test['전용면적'].fillna(
    test.groupby(room_group_keys)['전용면적'].transform('mean')
)

# Cast categorical features to 'category' dtype
for col in categorical_features:
    test[col] = test[col].astype('category')

# Drop the identifier; errors='ignore' keeps the cell re-runnable
test = test.drop(columns=['ID'], errors='ignore')


# Predict
y_pred = model.predict(test)
y_pred_binary = (y_pred > 0.5).astype(int)

print("확률값:", y_pred)
print("이진값:", y_pred_binary)


확률값: [0.07298791 0.50696467 0.11298481 0.06972402 0.0975234  0.06972402
 0.06913534 0.20035894 0.17129887 0.0975234  0.10138393 0.11298481
 0.50538402 0.11272045 0.10141986 0.14544108 0.10962088 0.06972402
 0.10141986 0.12914346 0.10138393 0.50148334 0.11191918 0.12152577
 0.14584741 0.52051437 0.16201309 0.13552422 0.20182271 0.52436532
 0.51965765 0.10000476 0.11373081 0.12893256 0.09918768 0.50036427
 0.20675323 0.11250731 0.11405429 0.0975234  0.39678371 0.07237382
 0.17129887 0.06913534 0.11685403 0.11191918 0.1097369  0.11627615
 0.07438494 0.5214407  0.41448209 0.0975234  0.12425555 0.51381526
 0.06972402 0.19947474 0.09918768 0.11298481 0.10548353 0.06972402
 0.11373081 0.12949718 0.12732556 0.52710016 0.06972402 0.06972402
 0.06972402 0.06972402 0.50415878 0.11191918 0.11191918 0.11281564
 0.10799663 0.11250731 0.11814892 0.13284403 0.52436532 0.521565
 0.06972402 0.12732556 0.1198852  0.11298481 0.52374345 0.10138393
 0.52005087 0.10799663 0.07501471 0.13509071 0.12893256 0.1

In [None]:
# Start from the sample submission and overwrite the label column with the
# thresholded predictions
submit = pd.read_csv(filepath_or_buffer='./sample_submission.csv')
submit = submit.assign(**{'허위매물여부': y_pred_binary})
submit.head()

Unnamed: 0,ID,허위매물여부
0,TEST_000,0
1,TEST_001,1
2,TEST_002,0
3,TEST_003,0
4,TEST_004,0


In [None]:
# Write the submission file without the index column
submit.to_csv(path_or_buf='./lgbm5.csv', index=False)

# lgbm6 — 제일 높음 (제출 점수가 가장 높았던 버전)

In [None]:
import pandas as pd
import numpy as np

from sklearn.impute import SimpleImputer
from sklearn.preprocessing import LabelEncoder, OneHotEncoder
from sklearn.ensemble import RandomForestClassifier

train = pd.read_csv(filepath_or_buffer='/content/drive/MyDrive/DF/train.csv')


# Columns whose missing values are replaced by the column mean
columns_fill_mean = ['해당층', '총층', '총주차대수']

# Mean-strategy imputer: learn the means from the training data, then apply them
mean_imputer = SimpleImputer(strategy='mean')
mean_imputer.fit(train[columns_fill_mean])
train[columns_fill_mean] = mean_imputer.transform(train[columns_fill_mean])

# Remaining missing values per column
train.isna().sum()

Unnamed: 0,0
ID,0
매물확인방식,0
보증금,0
월세,0
전용면적,787
해당층,0
총층,0
방향,0
방수,16
욕실수,18


In [None]:
# Fill missing room / bathroom counts with 1.
# The previous mask `train.columns != ('방수', '욕실수')` compared each column
# label with the whole tuple, so it was True for every column and fillna(1)
# overwrote the gaps of *all* columns. That included '전용면적', which left the
# group-mean imputation further down with nothing to fill. Select the two
# columns explicitly. The test data must be filled with the same rule.
room_count_columns = ['방수', '욕실수']

train[room_count_columns] = train[room_count_columns].fillna(1)


# Number of listings for every (rooms, bathrooms) combination
grouped_df = train.groupby(room_count_columns).size().reset_index(name='총_개수')

# Show the table
print(grouped_df)



    방수  욕실수  총_개수
0  1.0  1.0  1531
1  2.0  1.0   837
2  2.0  2.0    84


In [None]:
# Mean floor area for every (rooms, bathrooms) combination
area_mean_df = (
    train
    .groupby(['방수', '욕실수'])['전용면적']
    .mean()
    .reset_index(name='전용면적_평균')
)

# Attach the means to the per-group counts computed earlier
merged_df = grouped_df.merge(area_mean_df, how='left', on=['방수', '욕실수'])

# Show the table
print(merged_df)


    방수  욕실수  총_개수    전용면적_평균
0  1.0  1.0  1531  18.928883
1  2.0  1.0   837  17.500956
2  2.0  2.0    84  25.352500


In [None]:
# Mean floor area per (rooms, bathrooms) combination, kept as a Series for inspection
area_mean_df = train.groupby(['방수', '욕실수'])['전용면적'].mean()

# Fill missing floor area with the mean of the row's own group.
# transform('mean') broadcasts each group mean back onto the rows, which
# replaces the row-by-row apply/lookup with one vectorized fillna. Rows whose
# group has no known area stay NaN, exactly as before.
group_area_mean = train.groupby(['방수', '욕실수'])['전용면적'].transform('mean')
train['전용면적'] = train['전용면적'].fillna(group_area_mean)

In [None]:
import pandas as pd
from datetime import datetime

# Parse the posting-date column into datetimes
posting_dates = pd.to_datetime(train['게재일'])

# Reference point: the earliest posting date found in the data
reference_date = posting_dates.min()

# Add the day offset from the reference date and drop the raw date column
train = (
    train
    .assign(**{'게재일_일수': (posting_dates - reference_date).dt.days})
    .drop(columns=['게재일'])
)

In [None]:
# Remaining missing values per column (isna is the same check as isnull)
train.isna().sum()

Unnamed: 0,0
ID,0
매물확인방식,0
보증금,0
월세,0
전용면적,0
해당층,0
총층,0
방향,0
방수,0
욕실수,0


In [None]:
# Target: fake-listing flag; features: everything except the ID and the target
y1 = train['허위매물여부']
x1 = train.drop(columns=['ID', '허위매물여부'])

In [None]:
from sklearn.preprocessing import LabelEncoder
from lightgbm import LGBMClassifier
import lightgbm as lgb

# Categorical feature columns
categorical_features = ['제공플랫폼', '방향', '매물확인방식', '주차가능여부', '중개사무소']

# Cast them to pandas 'category' dtype so LightGBM treats them as categorical
for col in categorical_features:
    x1[col] = x1[col].astype('category')

# Build the LightGBM Dataset
train_data = lgb.Dataset(x1, label=y1, categorical_feature=categorical_features)

# Training parameters
params = {
    'objective': 'binary',
    # 'f1' is not a built-in LightGBM metric name and was silently skipped;
    # use log-loss here (F1 would need a custom `feval` plus a validation set).
    'metric': 'binary_logloss',
    'learning_rate': 0.01,
    # `class_weight` exists only on the scikit-learn wrapper (LGBMClassifier).
    # The native lgb.train API ignores it, so the ~12% positive class was never
    # re-weighted; `is_unbalance` is the native switch for that.
    'is_unbalance': True,
    'num_leaves': 31,
    'max_depth': 5
}

num_round = 100
model = lgb.train(
    params,
    train_data,
    num_boost_round=num_round,
    callbacks=[lgb.log_evaluation(10)]
)


[LightGBM] [Info] Number of positive: 298, number of negative: 2154
[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.000994 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 1153
[LightGBM] [Info] Number of data points in the train set: 2452, number of used features: 15
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.121533 -> initscore=-1.977988
[LightGBM] [Info] Start training from score -1.977988


In [None]:
test = pd.read_csv('/content/drive/MyDrive/DF/test.csv')
# Test-set preprocessing

# Mean-impute with the imputer fitted on the training data earlier in the
# notebook. Previously a new imputer was created here but never applied, so
# these gaps ended up filled with 1 by the catch-all fillna below instead of
# the training means.
columns_fill_mean = ['해당층', '총층', '총주차대수']
test[columns_fill_mean] = mean_imputer.transform(test[columns_fill_mean])

# Fill missing room / bathroom counts with 1.
# The old mask `test.columns != ('방수', '욕실수')` was True for every column
# (each label was compared with the whole tuple), so fillna(1) hit all columns,
# including '전용면적'. Select the two columns explicitly. The training data
# must be filled with the same rule.
room_count_columns = ['방수', '욕실수']
test[room_count_columns] = test[room_count_columns].fillna(1)

# Fill missing floor area with the mean area of the row's (rooms, bathrooms)
# group; transform('mean') does this in one vectorized step instead of a
# row-by-row apply. Rows whose group has no known area stay NaN.
test['전용면적'] = test['전용면적'].fillna(
    test.groupby(room_count_columns)['전용면적'].transform('mean')
)


from datetime import datetime

# Day offset of the posting date. `reference_date` is the one computed from the
# training data in the earlier date-feature cell; recomputing it from the test
# set shifted the feature so the same offset meant different calendar days in
# train and test.
test['게재일'] = pd.to_datetime(test['게재일'])
test['게재일_일수'] = (test['게재일'] - reference_date).dt.days
test = test.drop(columns=['게재일'])
test = test.drop(columns=['ID'], errors='ignore')

categorical_features = ['제공플랫폼', '방향', '매물확인방식', '주차가능여부', '중개사무소']

# Cast categorical features to 'category' dtype
for col in categorical_features:
    test[col] = test[col].astype('category')


# Predict
y_pred = model.predict(test)
y_pred_binary = (y_pred > 0.5).astype(int)


In [None]:
# Start from the sample submission and overwrite the label column with the
# thresholded predictions
submit = pd.read_csv(filepath_or_buffer='/content/drive/MyDrive/DF/sample_submission.csv')
submit = submit.assign(**{'허위매물여부': y_pred_binary})
submit.head()

Unnamed: 0,ID,허위매물여부
0,TEST_000,0
1,TEST_001,0
2,TEST_002,1
3,TEST_003,0
4,TEST_004,0


In [None]:
# Write the submission file without the index column
submit.to_csv(path_or_buf='./lgbm6.csv', index=False)

함수로 만들어 전처리

In [None]:
# lgbm6

import pandas as pd
import numpy as np
from sklearn.impute import SimpleImputer


def preprocess_data(df, reference=None):
    """Return a model-ready copy of a raw listings frame.

    Steps, in order:
      1. Mean-impute '해당층', '총층' and '총주차대수'.
      2. Fill missing '방수' / '욕실수' with 1.
      3. Fill missing '전용면적' with the mean area of the row's
         ('방수', '욕실수') group; rows whose group has no known area stay NaN.
      4. Replace '게재일' with '게재일_일수', the day offset from the earliest
         posting date.
      5. Drop 'ID' if present and cast the categorical columns that are present
         to pandas 'category' dtype.

    Changes from the earlier version:
      * Step 2 used the mask ``~df.columns.isin(['방수', '욕실수'])``, which
        filled every column *except* the two room columns with 1. That
        overwrote the missing '전용면적' values before step 3 could impute them.
      * The input frame is no longer modified, so the function can be called
        again on the same raw frame.
      * Step 1 uses pandas column means instead of SimpleImputer; columns
        without missing values keep their dtype.

    Parameters
    ----------
    df : pandas.DataFrame
        Raw frame containing at least the columns named in steps 1-4.
    reference : pandas.DataFrame, optional
        Raw frame from which the column means, the per-group area means and the
        earliest posting date are taken. Defaults to ``df`` itself. Pass the raw
        training frame when preprocessing the test frame so both are imputed
        with the same values and share one day-offset origin.

    Returns
    -------
    pandas.DataFrame
        A new, preprocessed frame.
    """
    mean_columns = ['해당층', '총층', '총주차대수']
    room_columns = ['방수', '욕실수']
    categorical_features = ['제공플랫폼', '방향', '매물확인방식', '주차가능여부', '중개사무소']
    room_defaults = {col: 1 for col in room_columns}

    # Every statistic is computed up front from the (unmodified) source frame.
    source = df if reference is None else reference
    column_means = source[mean_columns].mean()
    area_by_rooms = (
        source[room_columns + ['전용면적']]
        .fillna(room_defaults)
        .groupby(room_columns)['전용면적']
        .mean()
        .rename('_area_mean')
        .reset_index()
    )
    reference_date = pd.to_datetime(source['게재일']).min()

    # Steps 1-2: a dict-valued fillna touches only the listed columns.
    out = df.copy()
    out = out.fillna(column_means.to_dict())
    out = out.fillna(room_defaults)

    # Step 3: look up each row's group mean with a left merge, which keeps the
    # row order of `out`, then use it only where the area is missing.
    group_mean = out[room_columns].merge(
        area_by_rooms, on=room_columns, how='left'
    )['_area_mean']
    out['전용면적'] = out['전용면적'].fillna(
        pd.Series(group_mean.to_numpy(), index=out.index)
    )

    # Step 4: posting date -> integer day offset.
    out['게재일_일수'] = (pd.to_datetime(out['게재일']) - reference_date).dt.days
    out = out.drop(columns=['게재일'])

    # Step 5: drop the identifier (if any) and mark categorical features.
    out = out.drop(columns=['ID'], errors='ignore')
    present = [col for col in categorical_features if col in out.columns]
    out = out.astype({col: 'category' for col in present})

    return out


In [None]:
# Reload both raw splits from Drive
test = pd.read_csv(filepath_or_buffer='/content/drive/MyDrive/DF/test.csv')
train = pd.read_csv(filepath_or_buffer='/content/drive/MyDrive/DF/train.csv')

In [None]:
# Run the shared preprocessing on the training frame first, then the test frame
train_preprocessed, test_preprocessed = map(preprocess_data, (train, test))

In [None]:
from sklearn.preprocessing import LabelEncoder
from lightgbm import LGBMClassifier
import lightgbm as lgb

# Features: everything except the fake-listing flag; target: the flag itself
x1 = train_preprocessed.drop(['허위매물여부'],axis=1)
y1 = train_preprocessed['허위매물여부']

# Build the LightGBM Dataset
categorical_features = ['제공플랫폼', '방향', '매물확인방식', '주차가능여부', '중개사무소']
train_data = lgb.Dataset(x1, label=y1, categorical_feature=categorical_features)

# Training parameters
params = {
    'objective': 'binary',
    # 'f1' is not a built-in LightGBM metric name and was silently skipped;
    # use log-loss here (F1 would need a custom `feval` plus a validation set).
    'metric': 'binary_logloss',
    'learning_rate': 0.01,
    # `class_weight` exists only on the scikit-learn wrapper (LGBMClassifier).
    # The native lgb.train API ignores it, so the ~12% positive class was never
    # re-weighted; `is_unbalance` is the native switch for that.
    'is_unbalance': True,
    'num_leaves': 31,
    'max_depth': 5
}

num_round = 100
model = lgb.train(
    params,
    train_data,
    num_boost_round=num_round,
    callbacks=[lgb.log_evaluation(10)]
)

# Predict probabilities for the test set and threshold them at 0.5
y_pred = model.predict(test_preprocessed)
y_pred_binary = (y_pred > 0.5).astype(int)




[LightGBM] [Info] Number of positive: 298, number of negative: 2154
[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.000672 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 1155
[LightGBM] [Info] Number of data points in the train set: 2452, number of used features: 15
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.121533 -> initscore=-1.977988
[LightGBM] [Info] Start training from score -1.977988


In [None]:
# Start from the sample submission and overwrite the label column with the
# thresholded predictions
submit = pd.read_csv(filepath_or_buffer='/content/drive/MyDrive/DF/sample_submission.csv')
submit = submit.assign(**{'허위매물여부': y_pred_binary})
submit.head()

Unnamed: 0,ID,허위매물여부
0,TEST_000,0
1,TEST_001,0
2,TEST_002,1
3,TEST_003,0
4,TEST_004,0


In [None]:
# Write the submission file without the index column
submit.to_csv(path_or_buf='./lgbm6.csv', index=False)