In [None]:
# Mount Google Drive at /content/drive so that files stored under it can be read by path.
from google.colab import drive
drive.mount('/content/drive')

In [None]:
import pandas as pd
import numpy as np
from tabulate import tabulate

# Load the yearly datasets (2020, 2021, 2022) from the mounted Drive folder
data_2020 = pd.read_csv(r'/content/drive/MyDrive/Acorn/Project/real_final_2020.csv')
data_2021 = pd.read_csv(r'/content/drive/MyDrive/Acorn/Project/real_final_2021.csv')
data_2022 = pd.read_csv(r'/content/drive/MyDrive/Acorn/Project/real_final_2022.csv')

# '대여소ID' is handled as text here: drop its first three characters and cast the
# remainder to int (presumably a fixed station-code prefix -- TODO confirm against the CSVs).
for yearly_frame in (data_2020, data_2021, data_2022):
    yearly_frame['대여소ID'] = yearly_frame['대여소ID'].str[3:].astype(int)

print('합쳐진 데이터')
# ignore_index=True renumbers the combined rows 0..n-1. Without it each yearly frame
# keeps its own 0-based index, so the combined frame carries duplicate labels, which
# makes label-based selection ambiguous and can break index-aligned assignment.
df = pd.concat([data_2020, data_2021, data_2022], axis=0, ignore_index=True)
print(tabulate(df.head(3), headers='keys', tablefmt='psql', showindex=True))

In [None]:
# Report the maximum and minimum of the 'Pm2.5', '평균기온(°C)' (mean temperature)
# and '유동인구(명)' (floating population) columns of the combined frame.
max_pm25, min_pm25 = df['Pm2.5'].max(), df['Pm2.5'].min()
print(f"Pm2.5 열의 최댓값: {max_pm25}", f"Pm2.5 열의 최솟값: {min_pm25}", sep="\n")

max_tem, min_tem = df['평균기온(°C)'].max(), df['평균기온(°C)'].min()
print(f"평균기온(°C) 열의 최댓값: {max_tem}", f"평균기온(°C) 열의 최솟값: {min_tem}", sep="\n")

max_people, min_people = df['유동인구(명)'].max(), df['유동인구(명)'].min()
print(f"유동인구(명) 열의 최댓값: {max_people}", f"유동인구(명) 열의 최솟값: {min_people}", sep="\n")


In [None]:
def replace_outliers_with_mean_iqr(column):
    """Replace IQR outliers in a numeric Series with the Series median.

    A value counts as an outlier when it lies below ``Q1 - 1.5 * IQR`` or above
    ``Q3 + 1.5 * IQR``. Despite the "mean" in the function name (kept so that
    existing callers keep working), the replacement value is the median.

    The input Series is not modified; a new Series is returned. The median is
    computed once, from the original values, so low and high outliers receive
    the same replacement value.

    Parameters
    ----------
    column : pandas.Series
        Numeric data to clean. NaN entries are never flagged, because
        comparisons against the bounds evaluate to False for NaN.

    Returns
    -------
    tuple
        ``(cleaned, num_replaced)``: the cleaned Series (same index as
        ``column``) and the number of values that were replaced.
    """
    q1 = column.quantile(0.25)
    q3 = column.quantile(0.75)
    iqr = q3 - q1

    lower_bound = q1 - 1.5 * iqr
    upper_bound = q3 + 1.5 * iqr

    is_outlier = (column < lower_bound) | (column > upper_bound)

    # Take the replacement value before anything is replaced; recomputing it after
    # the low outliers were overwritten would shift the median used for the high ones.
    replacement = column.median()

    # Series.mask returns a new object (the caller's Series stays intact) and upcasts
    # the dtype when needed, e.g. an integer column receiving a fractional median.
    cleaned = column.mask(is_outlier, replacement)

    return cleaned, int(is_outlier.sum())

# Run the IQR outlier replacement on the '평균기온(°C)', 'Pm2.5' and '유동인구(명)'
# columns, write each cleaned column back into df and report how many values changed.
columns_to_process = ['평균기온(°C)', 'Pm2.5', '유동인구(명)']
for col in columns_to_process:
    outcome = replace_outliers_with_mean_iqr(df[col])
    df[col] = outcome[0]
    num_replaced = outcome[1]
    print(f"열 '{col}'에서 이상치로 대체된 행의 개수: {num_replaced}")


# LGBM

In [None]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.metrics import r2_score, mean_squared_error, explained_variance_score, mean_absolute_error
import lightgbm as lgb
import xgboost as xgb
from sklearn.metrics import accuracy_score
from sklearn.metrics import confusion_matrix

# Targets: rental count ('대여건수') and return count ('반납건수').
# Features: every remaining column of df, in its original order.
train_y1 = df['대여건수']
train_y2 = df['반납건수']
columns_to_keep = [name for name in df.columns if name not in ('대여건수', '반납건수')]
train_x = df[columns_to_keep]

# One call splits the features and both targets with the same seeded shuffle
# (80 % train / 20 % test), so every array is partitioned on the same rows.
(X_train, X_test,
 y1_train, y1_test,
 y2_train, y2_test) = train_test_split(
    train_x, train_y1, train_y2, test_size=0.2, random_state=42
)

# One default-parameter LightGBM regressor per target
lgbm_model_rent = lgb.LGBMRegressor()
lgbm_model_rent.fit(X_train, y1_train)

lgbm_model_return = lgb.LGBMRegressor()
lgbm_model_return.fit(X_train, y2_train)

# Evaluation helper
def evaluate_model(model, X_test, y_test):
    """Score a fitted model on held-out data.

    Calls ``model.predict(X_test)`` and returns the tuple
    ``(r2, mse, mae, y_pred)``: R-squared, mean squared error, mean absolute
    error, and the predictions themselves.
    """
    predictions = model.predict(X_test)
    return (
        r2_score(y_test, predictions),
        mean_squared_error(y_test, predictions),
        mean_absolute_error(y_test, predictions),
        predictions,
    )

# Evaluate both LightGBM models on the held-out split
lgbm_rent_metrics = evaluate_model(lgbm_model_rent, X_test, y1_test)
lgbm_return_metrics = evaluate_model(lgbm_model_return, X_test, y2_test)

# Print R2 / MSE / MAE (the first three entries of each metrics tuple) per model
for heading, metrics in (
    ("LGBM Rent Metrics:", lgbm_rent_metrics),
    ("LGBM Return Metrics:", lgbm_return_metrics),
):
    print(heading)
    print("R2 Score:", metrics[0])
    print("Mean Squared Error:", metrics[1])
    print("Mean Absolute Error:", metrics[2])

# OLS

In [None]:
import pandas as pd
import numpy as np
import statsmodels.api as sm
from sklearn.preprocessing import PolynomialFeatures
from statsmodels.stats.outliers_influence import variance_inflation_factor
from scipy import stats
import matplotlib.pyplot as plt
from sklearn.model_selection import train_test_split

# Targets: rental count ('대여건수') and return count ('반납건수').
# Features: every remaining column of df, in its original order.
train_y1 = df['대여건수']
train_y2 = df['반납건수']
columns_to_keep = [name for name in df.columns if name not in ('대여건수', '반납건수')]
train_x = df[columns_to_keep]

# One call splits the features and both targets with the same seeded shuffle
# (80 % train / 20 % test), so every array is partitioned on the same rows.
(X_train, X_test,
 y1_train, y1_test,
 y2_train, y2_test) = train_test_split(
    train_x, train_y1, train_y2, test_size=0.2, random_state=42
)

def train_and_evaluate_ols(X_train, y_train, X_test, y_test, degree=2):
    """Fit a statsmodels OLS model on interaction features and print its test error.

    The features are expanded with ``PolynomialFeatures`` (interaction terms only,
    no bias column), an explicit intercept column is added, and the model is fit
    on the training split. The model summary, the test-set mean squared error and
    its square root are printed.

    Parameters
    ----------
    X_train, X_test : array-like
        Training and test feature matrices with the same columns.
    y_train, y_test : array-like
        Target values matching ``X_train`` and ``X_test`` row for row.
    degree : int, default 2
        Maximum order of the interaction terms.

    Returns
    -------
    None
        All results are printed.
    """
    # Interaction-only polynomial terms, without the bias column
    poly = PolynomialFeatures(degree=degree, include_bias=False, interaction_only=True)
    X_train_poly = poly.fit_transform(X_train)
    X_test_poly = poly.transform(X_test)

    # has_constant='add' always prepends the intercept column. With the default
    # ('skip'), add_constant returns its input unchanged whenever the matrix already
    # holds a constant column; if that happens for only one of the two splits, the
    # train and test design matrices get different widths and predict() fails.
    X_train_poly_with_constant = sm.add_constant(X_train_poly, has_constant='add')

    # Fit the OLS model and show the fitted summary
    model = sm.OLS(y_train, X_train_poly_with_constant).fit()
    print(model.summary())

    # Predict on the test split and compute the error metrics
    X_test_poly_with_constant = sm.add_constant(X_test_poly, has_constant='add')
    y_pred = model.predict(X_test_poly_with_constant)

    # Compare position by position; converting both sides to arrays keeps pandas
    # from aligning on index labels should either side be a Series.
    residuals = np.asarray(y_test) - np.asarray(y_pred)
    mse = np.mean(residuals ** 2)
    rmse = np.sqrt(mse)

    print("Mean Squared Error:", mse)
    print("Root Mean Squared Error:", rmse)

# Fit and evaluate one OLS model per target (rental count first, then return count).
# degree=3 is only an example setting; adjust it as needed.
for label, y_fit, y_eval in (
    ("대여 모델 평가:", y1_train, y1_test),
    ("반납 모델 평가:", y2_train, y2_test),
):
    print(label)
    train_and_evaluate_ols(X_train, y_fit, X_test, y_eval, degree=3)


# LinearRegression

In [None]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LinearRegression
from sklearn.metrics import r2_score, mean_squared_error, mean_absolute_error
from sklearn.preprocessing import PolynomialFeatures
from statsmodels.stats.outliers_influence import variance_inflation_factor
from scipy import stats
import matplotlib.pyplot as plt
import seaborn as sns

# Targets: rental count ('대여건수') and return count ('반납건수').
# Features: every remaining column of df, in its original order.
train_y1 = df['대여건수']
train_y2 = df['반납건수']
columns_to_keep = [name for name in df.columns if name not in ('대여건수', '반납건수')]
train_x = df[columns_to_keep]

# One call splits the features and both targets with the same seeded shuffle
# (80 % train / 20 % test), so every array is partitioned on the same rows.
(X_train, X_test,
 y1_train, y1_test,
 y2_train, y2_test) = train_test_split(
    train_x, train_y1, train_y2, test_size=0.2, random_state=42
)

# Evaluation helper: polynomial expansion, fit, and metric report
def evaluate_model(model, X_train, y_train, X_test, y_test, poly_degree):
    """Fit ``model`` on polynomial features and print its test-set metrics.

    Both feature matrices are expanded with ``PolynomialFeatures(degree=poly_degree)``
    (fitted on the training split), ``model`` is fit in place on the expanded
    training data, and R-squared, MSE, RMSE and MAE on the test split are
    printed. Nothing is returned.
    """
    expander = PolynomialFeatures(degree=poly_degree)
    train_features = expander.fit_transform(X_train)
    test_features = expander.transform(X_test)

    model.fit(train_features, y_train)
    predictions = model.predict(test_features)

    # Compute every metric first, then print them in a fixed order
    mse = mean_squared_error(y_test, predictions)
    report = (
        ("R-squared (결정 계수):", r2_score(y_test, predictions)),
        ("평균 제곱 오차:", mse),
        ("평균 제곱근 오차:", np.sqrt(mse)),
        ("평균 절대 오차:", mean_absolute_error(y_test, predictions)),
    )
    for label, value in report:
        print(label, value)

model = LinearRegression()

# The same estimator instance is refit for each target in turn:
# rental count first, then return count.
for heading, y_fit, y_eval in (
    ("대여건수에 대한 모델 평가:", y1_train, y1_test),
    ("\n반납건수에 대한 모델 평가:", y2_train, y2_test),
):
    print(heading)
    evaluate_model(model, X_train, y_fit, X_test, y_eval, poly_degree=3)

# XGBOOST

In [None]:
import pandas as pd
import xgboost as xgb
from sklearn.model_selection import train_test_split
from sklearn.metrics import r2_score, mean_squared_error, explained_variance_score, mean_absolute_error
import matplotlib.pyplot as plt


# Targets: rental count ('대여건수') and return count ('반납건수').
# Features: every remaining column of df, in its original order.
train_y1 = df['대여건수']
train_y2 = df['반납건수']
columns_to_keep = [name for name in df.columns if name not in ('대여건수', '반납건수')]
train_x = df[columns_to_keep]

# One call splits the features and both targets with the same seeded shuffle
# (80 % train / 20 % test), so every array is partitioned on the same rows.
(X_train, X_test,
 y1_train, y1_test,
 y2_train, y2_test) = train_test_split(
    train_x, train_y1, train_y2, test_size=0.2, random_state=42
)

# One default-parameter XGBoost regressor per target
xgb_model_rent = xgb.XGBRegressor()
xgb_model_rent.fit(X_train, y1_train)

xgb_model_return = xgb.XGBRegressor()
xgb_model_return.fit(X_train, y2_train)

# Evaluation helper for an already fitted model
def evaluate_model(model, X_test, y_test):
    """Score a fitted model on held-out data.

    Calls ``model.predict(X_test)`` and returns the tuple ``(r2, mse, mae)``:
    R-squared, mean squared error and mean absolute error.
    """
    predictions = model.predict(X_test)
    return (
        r2_score(y_test, predictions),
        mean_squared_error(y_test, predictions),
        mean_absolute_error(y_test, predictions),
    )

# Evaluate both XGBoost models on the held-out split
xgb_rent_metrics = evaluate_model(xgb_model_rent, X_test, y1_test)
xgb_return_metrics = evaluate_model(xgb_model_return, X_test, y2_test)

# Print R2 / MSE / MAE (the first three entries of each metrics tuple) per model
for heading, metrics in (
    ("XGBoost Rent Metrics:", xgb_rent_metrics),
    ("XGBoost Return Metrics:", xgb_return_metrics),
):
    print(heading)
    print("R2 Score:", metrics[0])
    print("Mean Squared Error:", metrics[1])
    print("Mean Absolute Error:", metrics[2])
