In [None]:
!pip install pykalman
!pip install linearmodels



## 더미변수 포함

In [None]:
import numpy as np
import pandas as pd
import statsmodels.formula.api as smf
from statsmodels.tools.tools import add_constant
from sklearn.preprocessing import StandardScaler
from statsmodels.stats.outliers_influence import variance_inflation_factor
from pykalman import KalmanFilter


ModuleNotFoundError: No module named 'pykalman'

In [None]:
data = pd.read_csv('/content/drive/MyDrive/코코비_R&D팀/사용데이터/final_data_24102012.csv')

In [None]:
data.info()

In [None]:
# Convert Date to datetime format for proper sorting
data['Date'] = pd.to_datetime(data['Date'], errors='coerce')

# Sort the data by Date, AppName, and Country
data = data.sort_values(by=['AppName', 'Country', 'Date'])

# Group by AppName and Country, then calculate the cumulative mean for Daily Average Rating
data['Cumulative_Avg_Rating'] = data.groupby(['AppName', 'Country'])['Daily Average Rating'].expanding().mean().reset_index(level=[0,1], drop=True)

# Replace missing values in Total Average Rating with the cumulative average
data['Total Average Rating'].fillna(data['Cumulative_Avg_Rating'], inplace=True)

# Drop the helper column 'Cumulative_Avg_Rating' after imputation
data.drop(columns=['Cumulative_Avg_Rating'], inplace=True)

In [None]:
data.drop(columns = 'Daily Average Rating',axis = 1, inplace = True)

In [None]:
import numpy as np
import pandas as pd
from linearmodels.panel import PanelOLS
from statsmodels.tools.tools import add_constant
from sklearn.preprocessing import StandardScaler
from sklearn.utils import resample  # resample 함수는 sklearn.utils에서 가져옵니다
from statsmodels.stats.outliers_influence import variance_inflation_factor
from pykalman import KalmanFilter

# 데이터의 결측값을 처리 (예: 특정 열에서 결측값을 제거)
data = data.dropna(subset=['User acquisition'])

# 새로운 YouTube_Flag 생성: _youtube가 포함되지만 unsimilar_youtube 제외
youtube_columns = data.filter(like='_youtube').columns
youtube_columns = youtube_columns[~youtube_columns.str.contains('unsimilar')]
data['YouTube_Flag'] = data[youtube_columns].apply(lambda row: 1 if (row != 0).any() else 0, axis=1)

# 새로운 Games_Flag 생성: _games가 포함되지만 previous_games 제외
games_columns = data.filter(like='_games').columns
games_columns = games_columns[~games_columns.str.contains('previous')]
data['Games_Flag'] = data[games_columns].apply(lambda row: 1 if (row != 0).any() else 0, axis=1)

# AppName과 Country를 더미 변환 전에 따로 저장
data['AppName_original'] = data['AppName']
data['Country_original'] = data['Country']
data['AD_LAN_original'] = data['AD_LAN']

# 변수 이름의 공백을 밑줄로 대체
data.columns = data.columns.str.replace(' ', '_')

# 범주형 변수를 더미 변수로 변환 (Country와 다른 범주형 변수들)
data = pd.get_dummies(data, columns=['Country', 'AppName', 'CONTINENT', 'AD_LAN'], drop_first=True)

# Language_Game_Flag 적용
def create_language_game_flag(row):
    is_korean_or_english = (row['AD_LAN_original'] == '한국어' or row['AD_LAN_original'] == '영어')
    if is_korean_or_english and row['Category_ID_games'] > 0:
        return 1
    else:
        return 0

data['Language_Game_Flag'] = data.apply(create_language_game_flag, axis=1)

# 샘플링 단계
data_flag_0 = data[data['Language_Game_Flag'] == 0]
data_flag_1 = data[data['Language_Game_Flag'] == 1]

# 10% 샘플링 비율 (비율 조정 가능)
sampling_ratio = 0.1

# Language_Game_Flag == 1 데이터를 복제해 샘플링
data_flag_1_resampled = resample(data_flag_1,
                                 replace=True,
                                 n_samples=int(len(data_flag_0) * sampling_ratio),
                                 random_state=42)

# Language_Game_Flag == 0 데이터를 동일 비율로 샘플링
data_flag_0_resampled = resample(data_flag_0,
                                 replace=False,
                                 n_samples=len(data_flag_1_resampled),
                                 random_state=42)

# 샘플링된 데이터 병합
data_balanced = pd.concat([data_flag_0_resampled, data_flag_1_resampled])

# 독립변수 리스트 (수치형 데이터)
independent_vars = ['Store_Visit', 'View_Count_games', 'similarity_score', 'View_Count_youtube',
                    'Like_Count_youtube', 'Comment_Count_youtube', 'video_potentiality_youtube',
                    'video_potentiality_games', 'day__youtube', 'day__games', 'Category_ID_youtube',
                    'Category_ID_games', 'similarity_score_games', 'Total_Average_Rating',
                    'View_Count_previous_games', 'video_potentiality_previous_games',
                    'day__unsimilar_youtube', 'video_potentiality_unsimilar_youtube',
                    'View_Count_unsimilar_youtube',
                    'YouTube_Flag', 'Games_Flag', 'Language_Game_Flag']

# 변수명에서 '+' 기호를 '_'로 교체하여 에러 방지
independent_vars = [var.replace('+', '_') for var in independent_vars]
data_balanced.columns = data_balanced.columns.str.replace('+', '_')

# VIF 계산을 위한 데이터 준비
X = data_balanced[independent_vars].select_dtypes(include=[np.number]).dropna()

# 상수항 추가 (회귀에 필요)
X = add_constant(X)

# 스케일링 적용
scaler = StandardScaler()
X_scaled = pd.DataFrame(scaler.fit_transform(X), columns=X.columns)

# VIF 계산
vif_data = pd.DataFrame()
vif_data["Feature"] = X_scaled.columns
vif_data["VIF"] = [variance_inflation_factor(X_scaled.values, i) for i in range(X_scaled.shape[1])]

# VIF가 10 이하인 변수들만 선택
low_vif_features = vif_data[vif_data["VIF"] <= 10]["Feature"].tolist()

# Kalman Filter 설정
user_acquisition = data_balanced['User_acquisition'].values

n_timesteps = len(user_acquisition)
n_dim_state = 2  # 두 개의 숨겨진 상태: permanent와 temporary

# 전이 행렬 (transition matrix)
transition_matrix = np.array([[1, 0.1],  # Permanent component: 약간의 변화를 허용
                              [0, 0.9]])  # Temporary component: 시간이 지남에 따라 감소

# 관찰 행렬 (observation matrix)
observation_matrix = np.array([[1, 1]])  # Permanent와 Temporary의 합을 관찰

# 초기 상태값 설정
initial_state_mean = [user_acquisition[0], 0]  # 첫 관측 값으로 시작

# Kalman Filter 초기화
kf = KalmanFilter(
    transition_matrices=transition_matrix,
    observation_matrices=observation_matrix,
    initial_state_mean=initial_state_mean
)

# Kalman Filter를 사용해 상태 추정
state_means, state_covariances = kf.filter(user_acquisition)

# Permanent 및 Temporary Component 추출
data_balanced['Permanent_Component'] = state_means[:, 0]
data_balanced['Temporary_Component'] = state_means[:, 1]

# 상수항을 추가 (Intercept)
data_balanced = add_constant(data_balanced)

# 날짜, AppName, Country를 패널 분석용 식별자로 사용 (2개의 계층만 사용)
data_balanced['Date'] = pd.to_datetime(data_balanced['Date'])

# MultiIndex 설정 (AppName과 Date 또는 Country와 Date)
panel_data = data_balanced.set_index(['AppName_original', 'Date'])

# 고정효과 패널 회귀 분석 (Permanent Component 분석)
model_perm = PanelOLS.from_formula('Permanent_Component ~ ' + ' + '.join(low_vif_features) + ' + EntityEffects', data=panel_data).fit()

# 결과 출력
print("고정효과 패널 분석 결과 (Permanent Component):")
print(model_perm)

# 고정효과 패널 회귀 분석 (Temporary Component 분석)
model_temp = PanelOLS.from_formula('Temporary_Component ~ ' + ' + '.join(low_vif_features) + ' + EntityEffects', data=panel_data).fit()

# 결과 출력
print("\n고정효과 패널 분석 결과 (Temporary Component):")
print(model_temp)

  return 1 - self.ssr/self.uncentered_tss
Inputs contain missing values. Dropping rows with missing observations.
  super().__init__(dependent, exog, weights=weights, check_rank=check_rank)


고정효과 패널 분석 결과 (Permanent Component):
                           PanelOLS Estimation Summary                           
Dep. Variable:     Permanent_Component   R-squared:                        0.7793
Estimator:                    PanelOLS   R-squared (Between):              0.5719
No. Observations:                43865   R-squared (Within):               0.7793
Date:                 Wed, Oct 23 2024   R-squared (Overall):              0.6965
Time:                         15:26:48   Log-likelihood                -2.503e+05
Cov. Estimator:             Unadjusted                                           
                                         F-statistic:                      9103.2
Entities:                           13   P-value                           0.0000
Avg Obs:                        3374.2   Distribution:                F(17,43835)
Min Obs:                        1050.0                                           
Max Obs:                        8626.0   F-statistic (robust)

Inputs contain missing values. Dropping rows with missing observations.
  super().__init__(dependent, exog, weights=weights, check_rank=check_rank)



고정효과 패널 분석 결과 (Temporary Component):
                           PanelOLS Estimation Summary                           
Dep. Variable:     Temporary_Component   R-squared:                        0.8580
Estimator:                    PanelOLS   R-squared (Between):             -0.3850
No. Observations:                43865   R-squared (Within):               0.8580
Date:                 Wed, Oct 23 2024   R-squared (Overall):              0.7654
Time:                         15:26:48   Log-likelihood                -2.351e+05
Cov. Estimator:             Unadjusted                                           
                                         F-statistic:                   1.558e+04
Entities:                           13   P-value                           0.0000
Avg Obs:                        3374.2   Distribution:                F(17,43835)
Min Obs:                        1050.0                                           
Max Obs:                        8626.0   F-statistic (robust

### 더미변수 제외

In [None]:
data = pd.read_csv('/content/drive/MyDrive/코코비_R&D팀/사용데이터/final_data_24102012.csv')

# 국가와 앱 이름별로 그룹화하여 그룹별 평균값으로 결측치 채우기
grouped = data.groupby(['Country', 'AppName'])

# 그룹의 평균값으로 결측치 채우기
data['Daily Average Rating'] = grouped['Daily Average Rating'].transform(lambda x: x.fillna(x.mean()))
data['Total Average Rating'] = grouped['Total Average Rating'].transform(lambda x: x.fillna(x.mean()))

# 만약 그룹에 평균값도 없다면, 전체 평균으로 한 번 더 채워주는 코드 추가
data['Daily Average Rating'].fillna(data['Daily Average Rating'].mean(), inplace=True)
data['Total Average Rating'].fillna(data['Total Average Rating'].mean(), inplace=True)

# 결과를 확인하기 위해 데이터 출력
data.head()
data.drop(columns = 'Daily Average Rating',axis = 1, inplace = True)

  data = pd.read_csv('/content/drive/MyDrive/코코비_R&D팀/사용데이터/final_data_24102012.csv')
The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values always behaves as a copy.

For example, when doing 'df[col].method(value, inplace=True)', try using 'df.method({col: value}, inplace=True)' or df[col] = df[col].method(value) instead, to perform the operation inplace on the original object.


  data['Daily Average Rating'].fillna(data['Daily Average Rating'].mean(), inplace=True)
The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values always behaves as a copy.

For example, when doing 'df[col].method(value, inplace=True)', try using 'df.method({col: value}, inplace=True)' or df[col] = df[col].method(value) instead, to perform the operation inplace on the original object.


  data['Total Average Rating'].fillna(data['Total 

In [None]:
import numpy as np
import pandas as pd
import statsmodels.formula.api as smf
from statsmodels.tools.tools import add_constant
from sklearn.preprocessing import StandardScaler
from statsmodels.stats.outliers_influence import variance_inflation_factor
from pykalman import KalmanFilter

# 데이터의 결측값을 처리 (예: 특정 열에서 결측값을 제거)
data = data.dropna(subset=['User acquisition'])

# 새로운 YouTube_Flag 생성: _youtube가 포함되지만 unsimilar_youtube 제외
youtube_columns = data.filter(like='_youtube').columns
youtube_columns = youtube_columns[~youtube_columns.str.contains('unsimilar')]
data['YouTube_Flag'] = data[youtube_columns].apply(lambda row: 1 if (row != 0).any() else 0, axis=1)

# 새로운 Games_Flag 생성: _games가 포함되지만 previous_games 제외
games_columns = data.filter(like='_games').columns
games_columns = games_columns[~games_columns.str.contains('previous')]
data['Games_Flag'] = data[games_columns].apply(lambda row: 1 if (row != 0).any() else 0, axis=1)

# AppName과 Country를 더미 변환 전에 따로 저장
data['AppName_original'] = data['AppName']
data['Country_original'] = data['Country']
data['AD_LAN_original'] = data['AD_LAN']

# 변수 이름의 공백을 밑줄로 대체
data.columns = data.columns.str.replace(' ', '_')

# 범주형 변수를 더미 변수로 변환 (Country와 다른 범주형 변수들)
data = pd.get_dummies(data, columns=['Country', 'AppName', 'CONTINENT', 'AD_LAN'], drop_first=True)

# Language_Game_Flag 적용
def create_language_game_flag(row):
    is_korean_or_english = (row['AD_LAN_original'] == '한국어' or row['AD_LAN_original'] == '영어')
    if is_korean_or_english and row['Category_ID_games'] > 0:
        return 1
    else:
        return 0

data['Language_Game_Flag'] = data.apply(create_language_game_flag, axis=1)

# 독립변수 리스트 (수치형 데이터)
independent_vars = ['Store_Visit',
       'View_Count_games',
       'similarity_score', 'View_Count_youtube', 'Like_Count_youtube',
       'Comment_Count_youtube', 'video_potentiality_youtube',
       'video_potentiality_games', 'day+_youtube', 'day+_games',
       'Category_ID_youtube', 'Category_ID_games', 'similarity_score_games',
       'Total_Average_Rating', 'View_Count_previous_games', 'video_potentiality_previous_games',
       'day+_unsimilar_youtube', 'video_potentiality_unsimilar_youtube',
       'Comment_Count_Count_unsimilar_youtube', 'View_Count_unsimilar_youtube',
       'Like_Count_unsimilar_youtube', 'YouTube_Flag', 'Games_Flag', 'Language_Game_Flag']

# VIF 계산을 위한 데이터 준비
X = data[independent_vars]

# 수치형 데이터만 선택하여 NaN값 처리
X = X.select_dtypes(include=[np.number]).dropna()

# 상수항 추가 (회귀에 필요)
X = add_constant(X)

# 스케일링 적용
scaler = StandardScaler()
X_scaled = pd.DataFrame(scaler.fit_transform(X), columns=X.columns)

# VIF 계산
vif_data = pd.DataFrame()
vif_data["Feature"] = X_scaled.columns
vif_data["VIF"] = [variance_inflation_factor(X_scaled.values, i) for i in range(X_scaled.shape[1])]

# VIF 결과 출력
print("VIF 결과:\n", vif_data)

# VIF가 10 이하인 변수들만 선택
low_vif_features = vif_data[vif_data["VIF"] <= 10]["Feature"].tolist()
print("VIF가 낮은 독립변수들:", low_vif_features)

# Kalman Filter 설정
user_acquisition = data['User_acquisition'].values

n_timesteps = len(user_acquisition)
n_dim_state = 2  # 두 개의 숨겨진 상태: permanent와 temporary

# 전이 행렬 (transition matrix)
transition_matrix = np.array([[1, 0.1],  # Permanent component: 약간의 변화를 허용
                              [0, 0.9]])  # Temporary component: 시간이 지남에 따라 감소

# 관찰 행렬 (observation matrix)
observation_matrix = np.array([[1, 1]])  # Permanent와 Temporary의 합을 관찰

# 초기 상태값 설정
initial_state_mean = [user_acquisition[0], 0]  # 첫 관측 값으로 시작

# Kalman Filter 초기화
kf = KalmanFilter(
    transition_matrices=transition_matrix,
    observation_matrices=observation_matrix,
    initial_state_mean=initial_state_mean
)

# Kalman Filter를 사용해 상태 추정
state_means, state_covariances = kf.filter(user_acquisition)

# Permanent 및 Temporary Component 추출
data['Permanent_Component'] = state_means[:, 0]
data['Temporary_Component'] = state_means[:, 1]

# 상수항을 추가 (Intercept)
data = add_constant(data)

# 패널 데이터를 위한 필요한 독립 변수만 선택 (low_vif_features 및 원본 AppName, Country, AD_LAN 사용)
panel_data = data[low_vif_features + ['Permanent_Component', 'Temporary_Component']]

# 결측값 제거
panel_data = panel_data.dropna()

# 변수명에서 +를 _로 교체
panel_data.columns = panel_data.columns.str.replace('+', '_')
low_vif_features = [feature.replace('+', '_') for feature in low_vif_features]

# 고정효과 패널 회귀 분석 (Permanent Component 분석)
model_perm = smf.ols('Permanent_Component ~ ' + ' + '.join(low_vif_features), data=panel_data).fit()

# 결과 출력
print("고정효과 패널 분석 결과 (Permanent Component):")
print(model_perm.summary())

# 고정효과 패널 회귀 분석 (Temporary Component 분석)
model_temp = smf.ols('Temporary_Component ~ ' + ' + '.join(low_vif_features), data=panel_data).fit()

# 결과 출력
print("\n고정효과 패널 분석 결과 (Temporary Component):")
print(model_temp.summary())

  return 1 - self.ssr/self.uncentered_tss


VIF 결과:
                                   Feature         VIF
0                                   const         NaN
1                             Store_Visit    1.013292
2                        View_Count_games   12.527275
3                        similarity_score    6.891993
4                      View_Count_youtube    1.817350
5                      Like_Count_youtube    1.080320
6                   Comment_Count_youtube         NaN
7              video_potentiality_youtube    1.773818
8                video_potentiality_games   13.497815
9                            day+_youtube    4.235413
10                             day+_games   11.187053
11                    Category_ID_youtube    2.748766
12                      Category_ID_games   92.749093
13                 similarity_score_games   11.752945
14                   Total_Average_Rating    1.007163
15              View_Count_previous_games    7.237026
16      video_potentiality_previous_games    7.238720
17                 

### VIF 무시
#### independent_vars의 내용물을 수정해서 관찰해보세요~~~

In [None]:
data = pd.read_csv('/content/drive/MyDrive/코코비_R&D팀/사용데이터/final_data_24102012.csv')

# 국가와 앱 이름별로 그룹화하여 그룹별 평균값으로 결측치 채우기
grouped = data.groupby(['Country', 'AppName'])

# 그룹의 평균값으로 결측치 채우기
data['Daily Average Rating'] = grouped['Daily Average Rating'].transform(lambda x: x.fillna(x.mean()))
data['Total Average Rating'] = grouped['Total Average Rating'].transform(lambda x: x.fillna(x.mean()))

# 만약 그룹에 평균값도 없다면, 전체 평균으로 한 번 더 채워주는 코드 추가
data['Daily Average Rating'].fillna(data['Daily Average Rating'].mean(), inplace=True)
data['Total Average Rating'].fillna(data['Total Average Rating'].mean(), inplace=True)

# 결과를 확인하기 위해 데이터 출력
data.head()
data.drop(columns = 'Daily Average Rating',axis = 1, inplace = True)

  data = pd.read_csv('/content/drive/MyDrive/코코비_R&D팀/사용데이터/final_data_24102012.csv')
The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values always behaves as a copy.

For example, when doing 'df[col].method(value, inplace=True)', try using 'df.method({col: value}, inplace=True)' or df[col] = df[col].method(value) instead, to perform the operation inplace on the original object.


  data['Daily Average Rating'].fillna(data['Daily Average Rating'].mean(), inplace=True)
The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values always behaves as a copy.

For example, when doing 'df[col].method(value, inplace=True)', try using 'df.method({col: value}, inplace=True)' or df[col] = df[col].method(value) instead, to perform the operation inplace on the original object.


  data['Total Average Rating'].fillna(data['Total 

In [None]:
import numpy as np
import pandas as pd
import statsmodels.formula.api as smf
from statsmodels.tools.tools import add_constant
from sklearn.preprocessing import StandardScaler
from statsmodels.stats.outliers_influence import variance_inflation_factor
from pykalman import KalmanFilter

# 데이터의 결측값을 처리 (예: 특정 열에서 결측값을 제거)
data = data.dropna(subset=['User acquisition'])

# 새로운 YouTube_Flag 생성: _youtube가 포함되지만 unsimilar_youtube 제외
youtube_columns = data.filter(like='_youtube').columns
youtube_columns = youtube_columns[~youtube_columns.str.contains('unsimilar')]
data['YouTube_Flag'] = data[youtube_columns].apply(lambda row: 1 if (row != 0).any() else 0, axis=1)

# 새로운 Games_Flag 생성: _games가 포함되지만 previous_games 제외
games_columns = data.filter(like='_games').columns
games_columns = games_columns[~games_columns.str.contains('previous')]
data['Games_Flag'] = data[games_columns].apply(lambda row: 1 if (row != 0).any() else 0, axis=1)

# AppName과 Country를 더미 변환 전에 따로 저장
data['AppName_original'] = data['AppName']
data['Country_original'] = data['Country']
data['AD_LAN_original'] = data['AD_LAN']

# 변수 이름의 공백을 밑줄로 대체
data.columns = data.columns.str.replace(' ', '_')

# 범주형 변수를 더미 변수로 변환 (Country와 다른 범주형 변수들)
data = pd.get_dummies(data, columns=['Country', 'AppName', 'CONTINENT', 'AD_LAN'], drop_first=True)

# Language_Game_Flag 적용
def create_language_game_flag(row):
    is_korean_or_english = (row['AD_LAN_original'] == '한국어' or row['AD_LAN_original'] == '영어')
    if is_korean_or_english and row['Category_ID_games'] > 0:
        return 1
    else:
        return 0

data['Language_Game_Flag'] = data.apply(create_language_game_flag, axis=1)

# 독립변수 리스트 (수치형 데이터)
independent_vars = ['Store_Visit','Store_Acquisition',
       'View_Count_games',
       'similarity_score', 'View_Count_youtube', 'Like_Count_youtube',
       'Comment_Count_youtube', 'video_potentiality_youtube',
       'video_potentiality_games', 'day+_youtube', 'day+_games',
       'Category_ID_youtube', 'Category_ID_games', 'similarity_score_games',
       'Total_Average_Rating', 'View_Count_previous_games', 'video_potentiality_previous_games',
       'day+_unsimilar_youtube', 'video_potentiality_unsimilar_youtube',
       'Comment_Count_Count_unsimilar_youtube', 'View_Count_unsimilar_youtube',
       'Like_Count_unsimilar_youtube', 'YouTube_Flag', 'Games_Flag', 'Language_Game_Flag']

##위의 independent_vars의 내용물을 수정해서 관찰해보세요~~~
# VIF 계산을 위한 데이터 준비
X = data[independent_vars]

# 수치형 데이터만 선택하여 NaN값 처리
X = X.select_dtypes(include=[np.number]).dropna()

# 상수항 추가 (회귀에 필요)
X = add_constant(X)

# 스케일링 적용
scaler = StandardScaler()
X_scaled = pd.DataFrame(scaler.fit_transform(X), columns=X.columns)

# VIF 계산
vif_data = pd.DataFrame()
vif_data["Feature"] = X_scaled.columns
vif_data["VIF"] = [variance_inflation_factor(X_scaled.values, i) for i in range(X_scaled.shape[1])]

# VIF 결과 출력
print("VIF 결과:\n", vif_data)

# VIF가 10 이하인 변수들만 선택
low_vif_features = vif_data[vif_data["VIF"] <= 10]["Feature"].tolist()
print("VIF가 낮은 독립변수들:", low_vif_features)

# Kalman Filter 설정
user_acquisition = data['User_acquisition'].values

n_timesteps = len(user_acquisition)
n_dim_state = 2  # 두 개의 숨겨진 상태: permanent와 temporary

# 전이 행렬 (transition matrix)
transition_matrix = np.array([[1, 0.1],  # Permanent component: 약간의 변화를 허용
                              [0, 0.9]])  # Temporary component: 시간이 지남에 따라 감소

# 관찰 행렬 (observation matrix)
observation_matrix = np.array([[1, 1]])  # Permanent와 Temporary의 합을 관찰

# 초기 상태값 설정
initial_state_mean = [user_acquisition[0], 0]  # 첫 관측 값으로 시작

# Kalman Filter 초기화
kf = KalmanFilter(
    transition_matrices=transition_matrix,
    observation_matrices=observation_matrix,
    initial_state_mean=initial_state_mean
)

# Kalman Filter를 사용해 상태 추정
state_means, state_covariances = kf.filter(user_acquisition)

# Permanent 및 Temporary Component 추출
data['Permanent_Component'] = state_means[:, 0]
data['Temporary_Component'] = state_means[:, 1]

# 상수항을 추가 (Intercept)
data = add_constant(data)

# 패널 데이터를 위한 필요한 독립 변수만 선택 (low_vif_features 및 원본 AppName, Country, AD_LAN 사용)
panel_data = data[independent_vars + ['Permanent_Component', 'Temporary_Component']]

# 결측값 제거
panel_data = panel_data.dropna()

# 변수명에서 +를 _로 교체 (특수문자 교체)
panel_data.columns = panel_data.columns.str.replace('+', '_')
independent_vars = [feature.replace('+', '_') for feature in independent_vars]

# 고정효과 패널 회귀 분석 (Permanent Component 분석)
model_perm = smf.ols('Permanent_Component ~ ' + ' + '.join(independent_vars), data=panel_data).fit()

# 결과 출력
print("고정효과 패널 분석 결과 (Permanent Component):")
print(model_perm.summary())

# 고정효과 패널 회귀 분석 (Temporary Component 분석)
model_temp = smf.ols('Temporary_Component ~ ' + ' + '.join(independent_vars), data=panel_data).fit()

# 결과 출력
print("\n고정효과 패널 분석 결과 (Temporary Component):")
print(model_temp.summary())

  return 1 - self.ssr/self.uncentered_tss


VIF 결과:
                                   Feature         VIF
0                                   const         NaN
1                             Store_Visit   19.768703
2                       Store_Acquisition   19.720494
3                        View_Count_games   12.527587
4                        similarity_score    6.898393
5                      View_Count_youtube    1.817435
6                      Like_Count_youtube    1.080324
7                   Comment_Count_youtube         NaN
8              video_potentiality_youtube    1.773985
9                video_potentiality_games   13.498064
10                           day+_youtube    4.235497
11                             day+_games   11.187361
12                    Category_ID_youtube    2.750797
13                      Category_ID_games   92.754928
14                 similarity_score_games   11.759599
15                   Total_Average_Rating    1.008208
16              View_Count_previous_games    7.237027
17      video_poten

### Segment 요소 추가

In [None]:
data = pd.read_csv('/content/drive/MyDrive/코코비_R&D팀/사용데이터/final_data_24102012.csv')

# 국가와 앱 이름별로 그룹화하여 그룹별 평균값으로 결측치 채우기
grouped = data.groupby(['Country', 'AppName'])

# 그룹의 평균값으로 결측치 채우기
data['Daily Average Rating'] = grouped['Daily Average Rating'].transform(lambda x: x.fillna(x.mean()))
data['Total Average Rating'] = grouped['Total Average Rating'].transform(lambda x: x.fillna(x.mean()))

# 만약 그룹에 평균값도 없다면, 전체 평균으로 한 번 더 채워주는 코드 추가
data['Daily Average Rating'].fillna(data['Daily Average Rating'].mean(), inplace=True)
data['Total Average Rating'].fillna(data['Total Average Rating'].mean(), inplace=True)

# 결과를 확인하기 위해 데이터 출력
data.head()
data.drop(columns = 'Daily Average Rating',axis = 1, inplace = True)

  data = pd.read_csv('/content/drive/MyDrive/코코비_R&D팀/사용데이터/final_data_24102012.csv')
The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values always behaves as a copy.

For example, when doing 'df[col].method(value, inplace=True)', try using 'df.method({col: value}, inplace=True)' or df[col] = df[col].method(value) instead, to perform the operation inplace on the original object.


  data['Daily Average Rating'].fillna(data['Daily Average Rating'].mean(), inplace=True)
The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values always behaves as a copy.

For example, when doing 'df[col].method(value, inplace=True)', try using 'df.method({col: value}, inplace=True)' or df[col] = df[col].method(value) instead, to perform the operation inplace on the original object.


  data['Total Average Rating'].fillna(data['Total 

In [None]:
segment = pd.read_csv('/content/drive/MyDrive/코코비_R&D팀/사용데이터/segment.csv')

In [None]:
segment_cocobi = pd.merge(data, segment, on = 'Country', how = 'left')

In [None]:
data = segment_cocobi.copy()

segment와

In [None]:
data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 291809 entries, 0 to 291808
Data columns (total 39 columns):
 #   Column                                 Non-Null Count   Dtype  
---  ------                                 --------------   -----  
 0   Date                                   291809 non-null  object 
 1   AppName                                291809 non-null  object 
 2   Country                                291809 non-null  object 
 3   CONTINENT                              291809 non-null  object 
 4   AD_LAN                                 291809 non-null  object 
 5   Store_Visit                            291809 non-null  int64  
 6   Store_Acquisition                      291809 non-null  int64  
 7   DAU                                    291809 non-null  float64
 8   MAU                                    291809 non-null  float64
 9   Bound                                  291809 non-null  float64
 10  User acquisition                       291809 non-null  

### segment 1을 기준으로 segment를 더미변수로 지정한 다음 다중회귀분석

In [None]:
import numpy as np
import pandas as pd
import statsmodels.formula.api as smf
from statsmodels.tools.tools import add_constant
from sklearn.preprocessing import StandardScaler
from statsmodels.stats.outliers_influence import variance_inflation_factor
from pykalman import KalmanFilter
from patsy.contrasts import Treatment

# 데이터의 결측값을 처리 (예: 특정 열에서 결측값을 제거)
data = data.dropna(subset=['User acquisition'])

# 새로운 YouTube_Flag 생성: _youtube가 포함되지만 unsimilar_youtube 제외
youtube_columns = data.filter(like='_youtube').columns
youtube_columns = youtube_columns[~youtube_columns.str.contains('unsimilar')]
data['YouTube_Flag'] = data[youtube_columns].apply(lambda row: 1 if (row != 0).any() else 0, axis=1)

# 새로운 Games_Flag 생성: _games가 포함되지만 previous_games 제외
games_columns = data.filter(like='_games').columns
games_columns = games_columns[~games_columns.str.contains('previous')]
data['Games_Flag'] = data[games_columns].apply(lambda row: 1 if (row != 0).any() else 0, axis=1)

# AppName과 Country를 더미 변환 전에 따로 저장
data['AD_LAN_original'] = data['AD_LAN']
data['segment_original'] = data['segment']

# 변수 이름의 공백을 밑줄로 대체
data.columns = data.columns.str.replace(' ', '_')

#Check if 'segment' column exists before converting to dummy variables
if 'segment' in data.columns:
    data = pd.get_dummies(data, columns=['segment'], drop_first=False)
else:
    print("'segment' column not found, skipping dummy conversion for 'segment'.")

# Language_Game_Flag 적용 함수
def create_language_game_flag(row):
    is_korean_or_english = (row['AD_LAN_original'] == '한국어' or row['AD_LAN_original'] == '영어')
    if is_korean_or_english and row['video_potentiality_games'] > 0:
        return 1
    else:
        return 0

# Language_Game_Flag 적용
data['Language_Game_Flag'] = data.apply(create_language_game_flag, axis=1)

# 독립변수 리스트 (수치형 데이터)
independent_vars = ['Store_Visit','View_Count_games',
       'similarity_score', 'View_Count_youtube', 'Like_Count_youtube',
       'Comment_Count_youtube', 'video_potentiality_youtube',
       'video_potentiality_games', 'day+_youtube', 'day+_games',
       'Category_ID_youtube', 'Category_ID_games', 'similarity_score_games',
      'Total_Average_Rating','View_Count_previous_games', 'video_potentiality_previous_games',
       'day+_unsimilar_youtube', 'video_potentiality_unsimilar_youtube',
       'Comment_Count_Count_unsimilar_youtube', 'View_Count_unsimilar_youtube',
       'Like_Count_unsimilar_youtube','YouTube_Flag','Games_Flag','Language_Game_Flag']

# VIF 계산을 위한 데이터 준비
X = data[independent_vars +
         list(data.filter(like='segment').columns)] if 'segment' in data.columns else data[independent_vars]

# 수치형 데이터만 선택하여 NaN값 처리
X = X.select_dtypes(include=[np.number]).dropna()

# 상수항 추가 (회귀에 필요)
X = add_constant(X)

# 스케일링 적용
scaler = StandardScaler()
X_scaled = pd.DataFrame(scaler.fit_transform(X), columns=X.columns)

# VIF 계산
vif_data = pd.DataFrame()
vif_data["Feature"] = X_scaled.columns
vif_data["VIF"] = [variance_inflation_factor(X_scaled.values, i) for i in range(X_scaled.shape[1])]

# VIF 결과 출력
print("VIF 결과:\n", vif_data)

# VIF가 10 이하인 변수들만 선택
low_vif_features = vif_data[vif_data["VIF"] <= 10]["Feature"].tolist()
print("VIF가 낮은 독립변수들:", low_vif_features)

# Kalman Filter 설정
user_acquisition = data['User_acquisition'].values

n_timesteps = len(user_acquisition)
n_dim_state = 2  # 두 개의 숨겨진 상태: permanent와 temporary

# 전이 행렬 (transition matrix)
transition_matrix = np.array([[1, 0.1],  # Permanent component: 약간의 변화를 허용
                              [0, 0.9]])  # Temporary component: 시간이 지남에 따라 감소

# 관찰 행렬 (observation matrix)
observation_matrix = np.array([[1, 1]])  # Permanent와 Temporary의 합을 관찰

# 초기 상태값 설정
initial_state_mean = [user_acquisition[0], 0]  # 첫 관측 값으로 시작

# Kalman Filter 초기화
kf = KalmanFilter(
    transition_matrices=transition_matrix,
    observation_matrices=observation_matrix,
    initial_state_mean=initial_state_mean
)

# Kalman Filter를 사용해 상태 추정
state_means, state_covariances = kf.filter(user_acquisition)

# Permanent 및 Temporary Component 추출
data['Permanent_Component'] = state_means[:, 0]
data['Temporary_Component'] = state_means[:, 1]

# 상수항을 추가 (Intercept)
data = add_constant(data)

# 패널 데이터를 위한 필요한 독립 변수만 선택 (low_vif_features 및 원본 AppName, Country, AD_LAN 사용)
panel_data = data[['Date','segment_original', 'Permanent_Component', 'Temporary_Component'] + low_vif_features]

# 결측값 제거
panel_data = panel_data.dropna()

# 변수명에서 +를 _로 교체
panel_data.columns = panel_data.columns.str.replace('+', '_')
low_vif_features = [feature.replace('+', '_') for feature in low_vif_features]


# 고정효과 패널 회귀 분석 (Permanent Component 분석) - segment와 Language_Game_Flag 상호작용 추가
model_perm = smf.ols('Permanent_Component ~ ' + ' + '.join(low_vif_features) +
                     ' + C(segment_original) * Language_Game_Flag', data=panel_data).fit()

# 고정효과 패널 회귀 분석 (Temporary Component 분석) - segment와 Language_Game_Flag 상호작용 추가
model_temp = smf.ols('Temporary_Component ~ ' + ' + '.join(low_vif_features) +
                     ' + C(segment_original) * Language_Game_Flag', data=panel_data).fit()

# 결과 출력
print("고정효과 패널 분석 결과 (Permanent Component):")
print(model_perm.summary())

print("\n고정효과 패널 분석 결과 (Temporary Component):")
print(model_temp.summary())

  return 1 - self.ssr/self.uncentered_tss


VIF 결과:
                                   Feature         VIF
0                                   const         NaN
1                             Store_Visit    1.013292
2                        View_Count_games   12.527275
3                        similarity_score    6.891993
4                      View_Count_youtube    1.817350
5                      Like_Count_youtube    1.080320
6                   Comment_Count_youtube         NaN
7              video_potentiality_youtube    1.773818
8                video_potentiality_games   13.497815
9                            day+_youtube    4.235413
10                             day+_games   11.187053
11                    Category_ID_youtube    2.748766
12                      Category_ID_games   92.749093
13                 similarity_score_games   11.752945
14                   Total_Average_Rating    1.007163
15              View_Count_previous_games    7.237026
16      video_potentiality_previous_games    7.238720
17                 

  return np.sqrt(eigvals[0]/eigvals[-1])
  return np.sqrt(eigvals[0]/eigvals[-1])


### 아래부터는 2,3,4를 각각 참조변수로 하여 진행한 분석

In [None]:
# segment와 Language_Game_Flag기준 데이터 수(여기 나타나지 않으면 아예 데이터가 없는것)
data.groupby(['segment_original','Language_Game_Flag'])['const'].count()

Unnamed: 0_level_0,Unnamed: 1_level_0,const
segment_original,Language_Game_Flag,Unnamed: 2_level_1
1,0,9441
1,1,286
2,0,19454
3,0,106997
4,0,152669
4,1,2962


In [None]:
data = segment_cocobi.copy()

In [None]:
import numpy as np
import pandas as pd
import statsmodels.formula.api as smf
from statsmodels.tools.tools import add_constant
from sklearn.preprocessing import StandardScaler
from statsmodels.stats.outliers_influence import variance_inflation_factor
from pykalman import KalmanFilter
from patsy.contrasts import Treatment

# 데이터의 결측값을 처리 (예: 특정 열에서 결측값을 제거)
data = data.dropna(subset=['User acquisition'])

# 새로운 YouTube_Flag 생성: _youtube가 포함되지만 unsimilar_youtube 제외
youtube_columns = data.filter(like='_youtube').columns
youtube_columns = youtube_columns[~youtube_columns.str.contains('unsimilar')]
data['YouTube_Flag'] = data[youtube_columns].apply(lambda row: 1 if (row != 0).any() else 0, axis=1)

# 새로운 Games_Flag 생성: _games가 포함되지만 previous_games 제외
games_columns = data.filter(like='_games').columns
games_columns = games_columns[~games_columns.str.contains('previous')]
data['Games_Flag'] = data[games_columns].apply(lambda row: 1 if (row != 0).any() else 0, axis=1)

# AppName과 Country를 더미 변환 전에 따로 저장
data['AD_LAN_original'] = data['AD_LAN']
data['segment_original'] = data['segment']

# 변수 이름의 공백을 밑줄로 대체
data.columns = data.columns.str.replace(' ', '_')

# Check if 'segment' column exists before converting to dummy variables
if 'segment' in data.columns:
    data = pd.get_dummies(data, columns=['segment'], drop_first=False)
else:
    print("'segment' column not found, skipping dummy conversion for 'segment'.")

# Language_Game_Flag 적용 함수
def create_language_game_flag(row):
    is_korean_or_english = (row['AD_LAN_original'] == '한국어' or row['AD_LAN_original'] == '영어')
    if is_korean_or_english and row['video_potentiality_games'] > 0:
        return 1
    else:
        return 0

# Language_Game_Flag 적용
data['Language_Game_Flag'] = data.apply(create_language_game_flag, axis=1)

# 독립변수 리스트 (수치형 데이터)
independent_vars = ['Store_Visit','View_Count_games',
       'similarity_score', 'View_Count_youtube', 'Like_Count_youtube',
       'Comment_Count_youtube', 'video_potentiality_youtube',
       'video_potentiality_games', 'day+_youtube', 'day+_games',
       'Category_ID_youtube', 'Category_ID_games', 'similarity_score_games',
      'Total_Average_Rating','View_Count_previous_games', 'video_potentiality_previous_games',
       'day+_unsimilar_youtube', 'video_potentiality_unsimilar_youtube',
       'Comment_Count_Count_unsimilar_youtube', 'View_Count_unsimilar_youtube',
       'Like_Count_unsimilar_youtube','YouTube_Flag','Games_Flag','Language_Game_Flag']

# VIF 계산을 위한 데이터 준비
X = data[independent_vars +
         list(data.filter(like='segment').columns)] if 'segment' in data.columns else data[independent_vars]

# 수치형 데이터만 선택하여 NaN값 처리
X = X.select_dtypes(include=[np.number]).dropna()

# 상수항 추가 (회귀에 필요)
X = add_constant(X)

# 스케일링 적용
scaler = StandardScaler()
X_scaled = pd.DataFrame(scaler.fit_transform(X), columns=X.columns)

# VIF 계산
vif_data = pd.DataFrame()
vif_data["Feature"] = X_scaled.columns
vif_data["VIF"] = [variance_inflation_factor(X_scaled.values, i) for i in range(X_scaled.shape[1])]

# VIF 결과 출력
print("VIF 결과:\n", vif_data)

# VIF가 10 이하인 변수들만 선택
low_vif_features = vif_data[vif_data["VIF"] <= 10]["Feature"].tolist()
print("VIF가 낮은 독립변수들:", low_vif_features)

# Kalman Filter 설정
user_acquisition = data['User_acquisition'].values

n_timesteps = len(user_acquisition)
n_dim_state = 2  # 두 개의 숨겨진 상태: permanent와 temporary

# 전이 행렬 (transition matrix)
transition_matrix = np.array([[1, 0.1],  # Permanent component: 약간의 변화를 허용
                              [0, 0.9]])  # Temporary component: 시간이 지남에 따라 감소

# 관찰 행렬 (observation matrix)
observation_matrix = np.array([[1, 1]])  # Permanent와 Temporary의 합을 관찰

# 초기 상태값 설정
initial_state_mean = [user_acquisition[0], 0]  # 첫 관측 값으로 시작

# Kalman Filter 초기화
kf = KalmanFilter(
    transition_matrices=transition_matrix,
    observation_matrices=observation_matrix,
    initial_state_mean=initial_state_mean
)

# Kalman Filter를 사용해 상태 추정
state_means, state_covariances = kf.filter(user_acquisition)

# Permanent 및 Temporary Component 추출
data['Permanent_Component'] = state_means[:, 0]
data['Temporary_Component'] = state_means[:, 1]

# 상수항을 추가 (Intercept)
data = add_constant(data)

# 패널 데이터를 위한 필요한 독립 변수만 선택 (low_vif_features 및 원본 AppName, Country, AD_LAN 사용)
panel_data = data[['Date','segment_original', 'Permanent_Component', 'Temporary_Component'] + low_vif_features]

# 결측값 제거
panel_data = panel_data.dropna()

# 변수명에서 +를 _로 교체
panel_data.columns = panel_data.columns.str.replace('+', '_')
low_vif_features = [feature.replace('+', '_') for feature in low_vif_features]


# 고정효과 패널 회귀 분석 (Permanent Component 분석) - segment와 Language_Game_Flag 상호작용 추가
model_perm = smf.ols('Permanent_Component ~ ' + ' + '.join(low_vif_features) +
                     ' + C(segment_original, Treatment(reference=2)) * Language_Game_Flag', data=panel_data).fit()

# 고정효과 패널 회귀 분석 (Temporary Component 분석) - segment와 Language_Game_Flag 상호작용 추가
model_temp = smf.ols('Temporary_Component ~ ' + ' + '.join(low_vif_features) +
                     ' + C(segment_original, Treatment(reference=2)) * Language_Game_Flag', data=panel_data).fit()

# 결과 출력
print("고정효과 패널 분석 결과 (Permanent Component):")
print(model_perm.summary())

print("\n고정효과 패널 분석 결과 (Temporary Component):")
print(model_temp.summary())

  return 1 - self.ssr/self.uncentered_tss


VIF 결과:
                                   Feature         VIF
0                                   const         NaN
1                             Store_Visit    1.013292
2                        View_Count_games   12.527275
3                        similarity_score    6.891993
4                      View_Count_youtube    1.817350
5                      Like_Count_youtube    1.080320
6                   Comment_Count_youtube         NaN
7              video_potentiality_youtube    1.773818
8                video_potentiality_games   13.497815
9                            day+_youtube    4.235413
10                             day+_games   11.187053
11                    Category_ID_youtube    2.748766
12                      Category_ID_games   92.749093
13                 similarity_score_games   11.752945
14                   Total_Average_Rating    1.007163
15              View_Count_previous_games    7.237026
16      video_potentiality_previous_games    7.238720
17                 

  return np.sqrt(eigvals[0]/eigvals[-1])
  return np.sqrt(eigvals[0]/eigvals[-1])


In [None]:
data = segment_cocobi.copy()

In [None]:
import numpy as np
import pandas as pd
import statsmodels.formula.api as smf
from statsmodels.tools.tools import add_constant
from sklearn.preprocessing import StandardScaler
from statsmodels.stats.outliers_influence import variance_inflation_factor
from pykalman import KalmanFilter
from patsy.contrasts import Treatment

# 데이터의 결측값을 처리 (예: 특정 열에서 결측값을 제거)
data = data.dropna(subset=['User acquisition'])

# 새로운 YouTube_Flag 생성: _youtube가 포함되지만 unsimilar_youtube 제외
youtube_columns = data.filter(like='_youtube').columns
youtube_columns = youtube_columns[~youtube_columns.str.contains('unsimilar')]
data['YouTube_Flag'] = data[youtube_columns].apply(lambda row: 1 if (row != 0).any() else 0, axis=1)

# 새로운 Games_Flag 생성: _games가 포함되지만 previous_games 제외
games_columns = data.filter(like='_games').columns
games_columns = games_columns[~games_columns.str.contains('previous')]
data['Games_Flag'] = data[games_columns].apply(lambda row: 1 if (row != 0).any() else 0, axis=1)

# AppName과 Country를 더미 변환 전에 따로 저장
data['AD_LAN_original'] = data['AD_LAN']
data['segment_original'] = data['segment']

# 변수 이름의 공백을 밑줄로 대체
data.columns = data.columns.str.replace(' ', '_')

# Check if 'segment' column exists before converting to dummy variables
if 'segment' in data.columns:
    data = pd.get_dummies(data, columns=['segment'], drop_first=False)
else:
    print("'segment' column not found, skipping dummy conversion for 'segment'.")

# Language_Game_Flag 적용 함수
def create_language_game_flag(row):
    is_korean_or_english = (row['AD_LAN_original'] == '한국어' or row['AD_LAN_original'] == '영어')
    if is_korean_or_english and row['video_potentiality_games'] > 0:
        return 1
    else:
        return 0

# Language_Game_Flag 적용
data['Language_Game_Flag'] = data.apply(create_language_game_flag, axis=1)

# 독립변수 리스트 (수치형 데이터)
independent_vars = ['Store_Visit','View_Count_games',
       'similarity_score', 'View_Count_youtube', 'Like_Count_youtube',
       'Comment_Count_youtube', 'video_potentiality_youtube',
       'video_potentiality_games', 'day+_youtube', 'day+_games',
       'Category_ID_youtube', 'Category_ID_games', 'similarity_score_games',
      'Total_Average_Rating','View_Count_previous_games', 'video_potentiality_previous_games',
       'day+_unsimilar_youtube', 'video_potentiality_unsimilar_youtube',
       'Comment_Count_Count_unsimilar_youtube', 'View_Count_unsimilar_youtube',
       'Like_Count_unsimilar_youtube','YouTube_Flag','Games_Flag','Language_Game_Flag']

# VIF 계산을 위한 데이터 준비
X = data[independent_vars +
         list(data.filter(like='segment').columns)] if 'segment' in data.columns else data[independent_vars]

# 수치형 데이터만 선택하여 NaN값 처리
X = X.select_dtypes(include=[np.number]).dropna()

# 상수항 추가 (회귀에 필요)
X = add_constant(X)

# 스케일링 적용
scaler = StandardScaler()
X_scaled = pd.DataFrame(scaler.fit_transform(X), columns=X.columns)

# VIF 계산
vif_data = pd.DataFrame()
vif_data["Feature"] = X_scaled.columns
vif_data["VIF"] = [variance_inflation_factor(X_scaled.values, i) for i in range(X_scaled.shape[1])]

# VIF 결과 출력
print("VIF 결과:\n", vif_data)

# VIF가 10 이하인 변수들만 선택
low_vif_features = vif_data[vif_data["VIF"] <= 10]["Feature"].tolist()
print("VIF가 낮은 독립변수들:", low_vif_features)

# Kalman Filter 설정
user_acquisition = data['User_acquisition'].values

n_timesteps = len(user_acquisition)
n_dim_state = 2  # 두 개의 숨겨진 상태: permanent와 temporary

# 전이 행렬 (transition matrix)
transition_matrix = np.array([[1, 0.1],  # Permanent component: 약간의 변화를 허용
                              [0, 0.9]])  # Temporary component: 시간이 지남에 따라 감소

# 관찰 행렬 (observation matrix)
observation_matrix = np.array([[1, 1]])  # Permanent와 Temporary의 합을 관찰

# 초기 상태값 설정
initial_state_mean = [user_acquisition[0], 0]  # 첫 관측 값으로 시작

# Kalman Filter 초기화
kf = KalmanFilter(
    transition_matrices=transition_matrix,
    observation_matrices=observation_matrix,
    initial_state_mean=initial_state_mean
)

# Kalman Filter를 사용해 상태 추정
state_means, state_covariances = kf.filter(user_acquisition)

# Permanent 및 Temporary Component 추출
data['Permanent_Component'] = state_means[:, 0]
data['Temporary_Component'] = state_means[:, 1]

# 상수항을 추가 (Intercept)
data = add_constant(data)

# 패널 데이터를 위한 필요한 독립 변수만 선택 (low_vif_features 및 원본 AppName, Country, AD_LAN 사용)
panel_data = data[['Date','segment_original', 'Permanent_Component', 'Temporary_Component'] + low_vif_features]

# 결측값 제거
panel_data = panel_data.dropna()

# 변수명에서 +를 _로 교체
panel_data.columns = panel_data.columns.str.replace('+', '_')
low_vif_features = [feature.replace('+', '_') for feature in low_vif_features]


# 고정효과 패널 회귀 분석 (Permanent Component 분석) - segment와 Language_Game_Flag 상호작용 추가
model_perm = smf.ols('Permanent_Component ~ ' + ' + '.join(low_vif_features) +
                     ' + C(segment_original, Treatment(reference=3)) * Language_Game_Flag', data=panel_data).fit()

# 고정효과 패널 회귀 분석 (Temporary Component 분석) - segment와 Language_Game_Flag 상호작용 추가
model_temp = smf.ols('Temporary_Component ~ ' + ' + '.join(low_vif_features) +
                     ' + C(segment_original, Treatment(reference=3)) * Language_Game_Flag', data=panel_data).fit()

# 결과 출력
print("고정효과 패널 분석 결과 (Permanent Component):")
print(model_perm.summary())

print("\n고정효과 패널 분석 결과 (Temporary Component):")
print(model_temp.summary())

  return 1 - self.ssr/self.uncentered_tss


VIF 결과:
                                   Feature         VIF
0                                   const         NaN
1                             Store_Visit    1.013292
2                        View_Count_games   12.527275
3                        similarity_score    6.891993
4                      View_Count_youtube    1.817350
5                      Like_Count_youtube    1.080320
6                   Comment_Count_youtube         NaN
7              video_potentiality_youtube    1.773818
8                video_potentiality_games   13.497815
9                            day+_youtube    4.235413
10                             day+_games   11.187053
11                    Category_ID_youtube    2.748766
12                      Category_ID_games   92.749093
13                 similarity_score_games   11.752945
14                   Total_Average_Rating    1.007163
15              View_Count_previous_games    7.237026
16      video_potentiality_previous_games    7.238720
17                 

  return np.sqrt(eigvals[0]/eigvals[-1])
  return np.sqrt(eigvals[0]/eigvals[-1])


In [None]:
data = segment_cocobi.copy()

In [None]:
import numpy as np
import pandas as pd
import statsmodels.formula.api as smf
from statsmodels.tools.tools import add_constant
from sklearn.preprocessing import StandardScaler
from statsmodels.stats.outliers_influence import variance_inflation_factor
from pykalman import KalmanFilter
from patsy.contrasts import Treatment

# 데이터의 결측값을 처리 (예: 특정 열에서 결측값을 제거)
data = data.dropna(subset=['User acquisition'])

# 새로운 YouTube_Flag 생성: _youtube가 포함되지만 unsimilar_youtube 제외
youtube_columns = data.filter(like='_youtube').columns
youtube_columns = youtube_columns[~youtube_columns.str.contains('unsimilar')]
data['YouTube_Flag'] = data[youtube_columns].apply(lambda row: 1 if (row != 0).any() else 0, axis=1)

# 새로운 Games_Flag 생성: _games가 포함되지만 previous_games 제외
games_columns = data.filter(like='_games').columns
games_columns = games_columns[~games_columns.str.contains('previous')]
data['Games_Flag'] = data[games_columns].apply(lambda row: 1 if (row != 0).any() else 0, axis=1)

# AppName과 Country를 더미 변환 전에 따로 저장
data['AD_LAN_original'] = data['AD_LAN']
data['segment_original'] = data['segment']

# 변수 이름의 공백을 밑줄로 대체
data.columns = data.columns.str.replace(' ', '_')

# Check if 'segment' column exists before converting to dummy variables
if 'segment' in data.columns:
    data = pd.get_dummies(data, columns=['segment'], drop_first=False)
else:
    print("'segment' column not found, skipping dummy conversion for 'segment'.")

# Language_Game_Flag 적용 함수
def create_language_game_flag(row):
    is_korean_or_english = (row['AD_LAN_original'] == '한국어' or row['AD_LAN_original'] == '영어')
    if is_korean_or_english and row['video_potentiality_games'] > 0:
        return 1
    else:
        return 0

# Language_Game_Flag 적용
data['Language_Game_Flag'] = data.apply(create_language_game_flag, axis=1)

# 독립변수 리스트 (수치형 데이터)
independent_vars = ['Store_Visit','View_Count_games',
       'similarity_score', 'View_Count_youtube', 'Like_Count_youtube',
       'Comment_Count_youtube', 'video_potentiality_youtube',
       'video_potentiality_games', 'day+_youtube', 'day+_games',
       'Category_ID_youtube', 'Category_ID_games', 'similarity_score_games',
      'Total_Average_Rating','View_Count_previous_games', 'video_potentiality_previous_games',
       'day+_unsimilar_youtube', 'video_potentiality_unsimilar_youtube',
       'Comment_Count_Count_unsimilar_youtube', 'View_Count_unsimilar_youtube',
       'Like_Count_unsimilar_youtube','YouTube_Flag','Games_Flag','Language_Game_Flag']

# VIF 계산을 위한 데이터 준비
X = data[independent_vars +
         list(data.filter(like='segment').columns)] if 'segment' in data.columns else data[independent_vars]

# 수치형 데이터만 선택하여 NaN값 처리
X = X.select_dtypes(include=[np.number]).dropna()

# 상수항 추가 (회귀에 필요)
X = add_constant(X)

# 스케일링 적용
scaler = StandardScaler()
X_scaled = pd.DataFrame(scaler.fit_transform(X), columns=X.columns)

# VIF 계산
vif_data = pd.DataFrame()
vif_data["Feature"] = X_scaled.columns
vif_data["VIF"] = [variance_inflation_factor(X_scaled.values, i) for i in range(X_scaled.shape[1])]

# VIF 결과 출력
print("VIF 결과:\n", vif_data)

# VIF가 10 이하인 변수들만 선택
low_vif_features = vif_data[vif_data["VIF"] <= 10]["Feature"].tolist()
print("VIF가 낮은 독립변수들:", low_vif_features)

# Kalman Filter 설정
user_acquisition = data['User_acquisition'].values

n_timesteps = len(user_acquisition)
n_dim_state = 2  # 두 개의 숨겨진 상태: permanent와 temporary

# 전이 행렬 (transition matrix)
transition_matrix = np.array([[1, 0.1],  # Permanent component: 약간의 변화를 허용
                              [0, 0.9]])  # Temporary component: 시간이 지남에 따라 감소

# 관찰 행렬 (observation matrix)
observation_matrix = np.array([[1, 1]])  # Permanent와 Temporary의 합을 관찰

# 초기 상태값 설정
initial_state_mean = [user_acquisition[0], 0]  # 첫 관측 값으로 시작

# Kalman Filter 초기화
kf = KalmanFilter(
    transition_matrices=transition_matrix,
    observation_matrices=observation_matrix,
    initial_state_mean=initial_state_mean
)

# Kalman Filter를 사용해 상태 추정
state_means, state_covariances = kf.filter(user_acquisition)

# Permanent 및 Temporary Component 추출
data['Permanent_Component'] = state_means[:, 0]
data['Temporary_Component'] = state_means[:, 1]

# 상수항을 추가 (Intercept)
data = add_constant(data)

# 패널 데이터를 위한 필요한 독립 변수만 선택 (low_vif_features 및 원본 AppName, Country, AD_LAN 사용)
panel_data = data[['Date','segment_original', 'Permanent_Component', 'Temporary_Component'] + low_vif_features]

# 결측값 제거
panel_data = panel_data.dropna()

# 변수명에서 +를 _로 교체
panel_data.columns = panel_data.columns.str.replace('+', '_')
low_vif_features = [feature.replace('+', '_') for feature in low_vif_features]


# 고정효과 패널 회귀 분석 (Permanent Component 분석) - segment와 Language_Game_Flag 상호작용 추가
model_perm = smf.ols('Permanent_Component ~ ' + ' + '.join(low_vif_features) +
                     ' + C(segment_original, Treatment(reference=4)) * Language_Game_Flag', data=panel_data).fit()

# 고정효과 패널 회귀 분석 (Temporary Component 분석) - segment와 Language_Game_Flag 상호작용 추가
model_temp = smf.ols('Temporary_Component ~ ' + ' + '.join(low_vif_features) +
                     ' + C(segment_original, Treatment(reference=4)) * Language_Game_Flag', data=panel_data).fit()

# 결과 출력
print("고정효과 패널 분석 결과 (Permanent Component):")
print(model_perm.summary())

print("\n고정효과 패널 분석 결과 (Temporary Component):")
print(model_temp.summary())

  return 1 - self.ssr/self.uncentered_tss


VIF 결과:
                                   Feature         VIF
0                                   const         NaN
1                             Store_Visit    1.013292
2                        View_Count_games   12.527275
3                        similarity_score    6.891993
4                      View_Count_youtube    1.817350
5                      Like_Count_youtube    1.080320
6                   Comment_Count_youtube         NaN
7              video_potentiality_youtube    1.773818
8                video_potentiality_games   13.497815
9                            day+_youtube    4.235413
10                             day+_games   11.187053
11                    Category_ID_youtube    2.748766
12                      Category_ID_games   92.749093
13                 similarity_score_games   11.752945
14                   Total_Average_Rating    1.007163
15              View_Count_previous_games    7.237026
16      video_potentiality_previous_games    7.238720
17                 

  return np.sqrt(eigvals[0]/eigvals[-1])
  return np.sqrt(eigvals[0]/eigvals[-1])
