In [19]:
import pandas as pd

# Relative paths of the McDonald's and Starbucks CSV files.
mcdonalds_file_path, starbucks_file_path = (
    'data/맥도날드_상권정보.csv',
    'data/스타벅스_상권정보.csv',
)

# Parse each CSV into its own DataFrame with pandas' default reader settings.
mcdonalds_df, starbucks_df = map(
    pd.read_csv, (mcdonalds_file_path, starbucks_file_path)
)

In [20]:
def clean_numeric_column(column, units=('만원', '명')):
    """Convert a column of unit-suffixed number strings to floats.

    Every substring listed in ``units`` and every thousands-separator comma
    is removed from each value, and the result is cast to ``float``.

    Parameters
    ----------
    column : pandas.Series
        String values such as ``'1,234만원'``, or a column that is already
        numeric (returned as float without string processing).
    units : iterable of str, optional
        Literal substrings to strip before conversion.

    Returns
    -------
    pandas.Series
        Float-typed series; missing values remain NaN.

    Raises
    ------
    ValueError
        If a value still cannot be parsed as a number after stripping.
    """
    # A re-run of the notebook cell passes in an already-converted column;
    # the ``.str`` accessor would raise AttributeError on it, so numeric
    # input is simply normalised to float.
    if pd.api.types.is_numeric_dtype(column):
        return column.astype(float)

    cleaned = column
    for token in (*units, ','):
        # regex=False: match the token literally, independent of the
        # pandas-version-specific default for ``regex``.
        cleaned = cleaned.str.replace(token, '', regex=False)
    return cleaned.astype(float)

# Convert the unit-suffixed text columns of both frames to floats.
# 'SalesAmt' plus the four feature columns are handled in one pass so the
# same rule is applied to every column of both data sets.
for frame in (mcdonalds_df, starbucks_df):
    for column in ['SalesAmt', 'FlowPop', 'EmpEop', 'AbodePop', 'EmpAvgCo']:
        if pd.api.types.is_numeric_dtype(frame[column]):
            # Already converted (e.g. this cell was run before): calling
            # ``.str`` on a numeric column would raise, so only normalise
            # the dtype and move on.
            frame[column] = frame[column].astype(float)
            continue
        # Drop the '명' suffix here, then let the helper remove '만원' and
        # the thousands separators and perform the float cast.
        frame[column] = clean_numeric_column(
            frame[column].str.replace('명', '', regex=False)
        )

In [21]:
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestRegressor
from sklearn.metrics import mean_absolute_error, mean_squared_error, r2_score

# Predictor columns and the column to be predicted.
features = ['FlowPop', 'EmpEop', 'AbodePop', 'EmpAvgCo']
target = 'SalesAmt'

# Design matrix and response, taken from the McDonald's frame only.
X, y = mcdonalds_df[features], mcdonalds_df[target]

# Hold out 20% of the rows for evaluation; the fixed seed keeps the split repeatable.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=42
)

# Random forest with default hyperparameters, fitted on the training rows
# (fit() hands back the estimator itself, so it can be chained).
model = RandomForestRegressor(random_state=42).fit(X_train, y_train)

# Predicted sales for the held-out rows.
y_pred = model.predict(X_test)

# Score the predictions: MAE, MSE and R^2, in that order.
mae, mse, r2 = (
    metric(y_test, y_pred)
    for metric in (mean_absolute_error, mean_squared_error, r2_score)
)

# NOTE(review): the output recorded for this cell has R^2 below zero, i.e. the
# predictions score worse than always predicting the mean of y_test.
mae, mse, r2

(5072.571960784313, 62008460.10236667, -0.18474637606859212)

: 