In [60]:
import pandas as pd

# Excel 파일 읽기
df = pd.read_csv('C:/Users/82106/Desktop/디지털화의 경제 영향도/데이터/processed_digitalization_data.csv')

df.head()

Unnamed: 0,Country,id,Year,INCOME,LGDP,FPS,MPS,BBS,IU,DDI,GFCF,TO,Labor,LCPI,LPOP,CONSUM,RD
0,Pays Bas,1,2000,1,10.607994,0.620927,0.675303,0.016325,0.439844,0.39172,0.225702,1.255219,1.27381,4.406078,16.583433,0.364773,0.017898
1,Pays Bas,1,2001,1,10.623448,0.509298,0.685303,0.026325,0.449844,0.26795,0.235702,1.275219,1.28668,4.446796,16.590981,0.369977,0.017961
2,Pays Bas,1,2002,1,10.619236,0.498189,0.695303,0.036325,0.459844,0.29133,0.245702,1.295219,1.30062,4.479143,16.597364,0.372896,0.017454
3,Pays Bas,1,2003,1,10.616073,0.484293,0.705303,0.046325,0.469844,0.31063,0.255702,1.315219,1.29774,4.499847,16.602082,0.382046,0.017839
4,Pays Bas,1,2004,1,10.632253,0.482649,0.715303,0.056325,0.479844,0.34787,0.265702,1.335219,1.3006,4.512404,16.605557,0.374042,0.01789


In [37]:
import numpy as np
from sklearn.linear_model import LinearRegression
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler

### LGDP를 종속변수로 예측모델링

In [61]:
# 필요한 특성 선택
features = ['FPS', 'MPS', 'BBS', 'IU', 'DDI', 'GFCF', 'TO', 'Labor', 'LCPI', 'LPOP', 'CONSUM', 'RD']
X = df[features]
y = df['LGDP']

# 학습용과 테스트용 데이터 분리
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

In [62]:
from sklearn.svm import SVR
from sklearn.ensemble import RandomForestRegressor
from sklearn.ensemble import GradientBoostingRegressor
from xgboost import XGBRegressor

# 여러 모델 정의
models = {
    'Linear Regression': LinearRegression(),
    'SVR': SVR(kernel='rbf'),
    'Random Forest': RandomForestRegressor(n_estimators=100, random_state=42),
    'Gradient Boosting': GradientBoostingRegressor(n_estimators=100, random_state=42),
    'XGBoost': XGBRegressor(n_estimators=100, random_state=42)
}

# 각 모델 학습 및 성능 평가
for name, model in models.items():
    model.fit(X_train, y_train)
    train_score = model.score(X_train, y_train)
    test_score = model.score(X_test, y_test)
    print(f'\n{name}:')
    print(f'Train R²: {train_score:.4f}')
    print(f'Test R²: {test_score:.4f}')


Linear Regression:
Train R²: 0.8013
Test R²: 0.8148

SVR:
Train R²: 0.7482
Test R²: 0.7692

Random Forest:
Train R²: 0.9965
Test R²: 0.9837

Gradient Boosting:
Train R²: 0.9553
Test R²: 0.9412

XGBoost:
Train R²: 0.9997
Test R²: 0.9828


#### 가장 성능이 좋은 RandomForestRegressor 모델 선택

In [63]:
# RandomForestRegressor 으로 모델 학습
model = RandomForestRegressor(n_estimators=100, random_state=42)
model.fit(X_train, y_train)

# 모델 평가
train_score = model.score(X_train, y_train)
test_score = model.score(X_test, y_test)

print('훈련 데이터 R² 점수:', train_score)
print('테스트 데이터 R² 점수:', test_score)

훈련 데이터 R² 점수: 0.9964892656892977
테스트 데이터 R² 점수: 0.9836877038607016


### 각 변수값에 따른 LGDP 변화 분석

In [73]:
def analyze_variable_impact(variable_name, change_percentage):
    # DataFrame으로 변환
    X_test_df = pd.DataFrame(X_test, columns=features)
    
    # 테스트 데이터에 대해 분석 진행
    X_test_modified = X_test_df.copy()
    
    # 특정 변수의 값을 변경
    X_test_modified[variable_name] *= (1 + change_percentage/100)
    
    # 새로운 예측값과 원래 예측값 계산
    original_predictions = model.predict(X_test_df)
    new_predictions = model.predict(X_test_modified)
    
    # 변화율 계산
    avg_change = np.mean(new_predictions - original_predictions)
    percent_change = (avg_change / np.mean(original_predictions)) * 100

    return avg_change, percent_change

In [74]:
def analyze_multiple_variables(variables, changes):
    results = {}
    for var, change in zip(variables, changes):
        abs_change, pct_change = analyze_variable_impact(var, change)
        results[var] = {
            '변화율': change, 
            'LGDP 절대 변화': abs_change, 
            'LGDP 상대 변화(%)': pct_change
        }
    
    # 결과를 데이터프레임으로 변환
    results_df = pd.DataFrame.from_dict(results, orient='index')
    return results_df

In [96]:
# 테스트
variables = ['FPS', 'MPS', 'BBS', 'IU', 'DDI', 'GFCF', 'TO', 'Labor', 'LCPI', 'LPOP', 'CONSUM', 'RD']
changes = [10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10]
results = analyze_multiple_variables(variables, changes)
print("\n종합 결과:")
print(results)


종합 결과:
        변화율  LGDP 절대 변화  LGDP 상대 변화(%)
FPS      10    0.037637       0.431771
MPS      10    0.006425       0.073706
BBS      10    0.006962       0.079871
IU       10    0.010041       0.115196
DDI      10   -0.001136      -0.013029
GFCF     10    0.001040       0.011930
TO       10    0.004612       0.052913
Labor    10    0.068371       0.784360
LCPI     10    0.000033       0.000380
LPOP     10   -0.032755      -0.375768
CONSUM   10    0.006002       0.068858
RD       10    0.007758       0.089004


### 변수들을 동시에 변화시켜 LGDP 종합적인 변화

In [64]:
def analyze_combined_impact(variables, changes):
    # DataFrame으로 변환
    X_test_df = pd.DataFrame(X_test, columns=features)
    
    # 테스트 데이터에 대해 분석 진행
    X_test_modified = X_test_df.copy()
    
    # 모든 변수의 값을 동시에 변경
    for var, change in zip(variables, changes):
        X_test_modified[var] *= (1 + change/100)
    
    # 새로운 예측값과 원래 예측값 계산
    original_predictions = model.predict(X_test_df)
    new_predictions = model.predict(X_test_modified)
    
    # 변화율 계산
    avg_change = np.mean(new_predictions - original_predictions)
    percent_change = (avg_change / np.mean(original_predictions)) * 100
    
    print("변수들의 동시 변화:")
    for var, change in zip(variables, changes):
        print(f"{var}: {change}% 변화")
    print(f"\nLGDP 종합 변화:")
    print(f"절대 변화량: {avg_change:.4f}")
    print(f"상대 변화율: {percent_change:.4f}%")
    
    return avg_change, percent_change

### FPS를 10% 증가시켰을 때의 LGDP 변화

In [65]:
variables = ['FPS', 'MPS', 'IU', 'GFCF']
changes = [50, 10, 10, 10]
results = analyze_combined_impact(variables, changes)

변수들의 동시 변화:
FPS: 10% 변화
MPS: 10% 변화
IU: 10% 변화
GFCF: 10% 변화

LGDP 종합 변화:
절대 변화량: 0.0543
상대 변화율: 0.6229%


In [97]:
variables = ['FPS', 'MPS', 'BBS', 'IU', 'DDI', 'GFCF', 'TO', 'Labor', 'LCPI', 'LPOP', 'CONSUM', 'RD']
changes = [50,10,10,50,10,10,10,10,10,10,10,50]
results = analyze_combined_impact(variables, changes)

변수들의 동시 변화:
FPS: 50% 변화
MPS: 10% 변화
BBS: 10% 변화
IU: 50% 변화
DDI: 10% 변화
GFCF: 10% 변화
TO: 10% 변화
Labor: 10% 변화
LCPI: 10% 변화
LPOP: 10% 변화
CONSUM: 10% 변화
RD: 50% 변화

LGDP 종합 변화:
절대 변화량: 0.3040
상대 변화율: 3.4873%
