In [45]:
# import re
import sys
import random
import pandas as pd
import numpy as np
import warnings 
warnings.filterwarnings('ignore')
import statsmodels.api as sm
from statsmodels.formula.api import ols
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import cross_validate
from scipy import optimize
from gekko import GEKKO

def input_sisul():
    sisul = input('알아보고 싶은 시설명을 입력하세요:')
    temp = stage[stage['시설명'].str.contains(sisul)].copy()
    temp.reset_index(inplace=True, drop=True)
    
    if len(temp)==1:
        print(temp['시설명'])
        return temp.iloc[0, :]
    elif len(temp)==0:
        print('찾으시는 시설이 없습니다. 다른 시설을 입력하세요.')
        sys.exit()
    elif len(temp)>1:
        print(f'{len(temp)}개의 검색 결과가 있습니다.')
        print('원하는 공연장에 해당하는 숫자를 입력하세요:')
        for i in range(len(temp)):
            print(f"{i+1}. {temp['시설명'][i]} {temp['공연홀'][i]} ({temp['지역'][i]} {temp['시군구'][i]})")
        inp = input()
        try:
            hall = int(inp)
            return temp.iloc[hall-1, :]
        except:
            print('잘못된 입력입니다.')
            sys.exit()
            
def age_cat(x):
    try:
        x=int(x)
        if x<10: x='less than 10'
        elif x<20: x='10s'
        elif x<30: x='20s'
        elif x<40: x='30s'
        elif x<50: x='40s'
        elif x<60: x='50s'
        elif x<70: x='60s'
        elif x<80: x='70s'
        else: x='over 80'
        return x
    except:
        return x

def make_xy(genre: str):
    obj = 'satisfy_' + genre
    temp = df[df[obj].isnull()==False].copy()
    temp = temp[temp['time']!='기타']
    # convert to binary classification
    temp['satisfaction'] = temp[obj].apply(lambda x: 1 if x>=5 else 0)
    
    temp = temp[['time', 'location', 'sex', 'age', 'satisfaction']]
    #temp['age'] = temp['age'].apply(lambda x: age_cat(x))
    temp = pd.get_dummies(temp, columns=['location'])
    temp = pd.get_dummies(temp, columns=['time', 'age', 'sex'], drop_first=True)

    X = temp.drop(columns=['satisfaction'])
    y = temp['satisfaction']
    return X, y

    
def get_coef(X, y):  
    # commit logsitic regression
    reg = LogisticRegression()
    reg.fit(X, y)
    cv_results = cross_validate(reg, X, y, cv=5, scoring='f1')
    #print('logistic regression f1score -', genre, ':')
    #print(cv_results['test_score'].mean())
    return reg.coef_, reg.intercept_

# Optimization model of location and time with logistic regression model
def optimize(genre: str, loc, sigungu, coef, intercept, X, k=-1):    
    m = GEKKO(remote=False)
    m.options.SOLVER=1  # APOPT is an MINLP solver

    mod = sys.modules[__name__]
    for i in range(20):
        setattr(mod, f'x{i}', m.Var(value=0,lb=0,ub=1,integer=True))
    for i in range(20, 28):
        setattr(mod, f'x{i}', m.Var(value=0,lb=0,ub=1))

    v1 = np.array([x0, x1, x2, x3, x4, x5, x6, x7, x8, x9, x10, x11, x12, x13, x14, x15, x16, #location
                   x17, x18, x19, #time
                   x20, x21, x22, x23, x24, x25, x26, #age
                   x27]) #sex
    loc_list = ['강원', '경기', '경남', '경북', '광주', '대구', '대전', '부산', '서울',
                '세종', '울산', '인천', '전남', '전북', '제주', '충남', '충북']
    
    temp = df_pop[(df_pop['지역']==loc) & (df_pop['시군구']==sigungu)]
    total = temp.iloc[:, 3:-1].sum().sum()
    sex_f = temp.iloc[1, 3:-1].sum() / total
    age_10 = temp.iloc[:, 4].sum() / total
    age_20 = temp.iloc[:, 5].sum() / total
    age_30 = temp.iloc[:, 6].sum() / total
    age_40 = temp.iloc[:, 7].sum() / total
    age_50 = temp.iloc[:, 8].sum() / total
    age_60 = temp.iloc[:, 9].sum() / total
    age_70 = temp.iloc[:, 10].sum() / total
    age_80 = temp.iloc[:, 11].sum() / total
    
    a = intercept[0]
    # set constraints
    m.Equation(x0+x1+x2+x3+x4+x5+x6+x7+x8+x9+x10+x11+x12+x13+x14+x15+x16==1) # location
    m.Equation(x17+x18+x19<=1) # time: weekday daytime / weekday dinner / weekend daytime / weekend dinner
    m.Equation(eval('x'+str(loc_list.index(loc)))==1)
    m.Equation(x20==age_20)
    m.Equation(x21==age_30)
    m.Equation(x22==age_40)
    m.Equation(x23==age_50)
    m.Equation(x24==age_60)
    m.Equation(x25==age_70)
    m.Equation(x26==age_80)
    m.Equation(x27==sex_f)
    # get next optimized value
    m.Equation((-m.exp(a+(np.dot(v1, coef[0]))) / (1+m.exp(a+(np.dot(v1, coef[0])))))>k)
    m.Obj(-(m.exp(a+(np.dot(v1, coef[0])))) / (1+m.exp(a+(np.dot(v1, coef[0])))))
    m.solve(disp=False)
    
    obj = (m.options.objfcnval)*-1
    time = '주말 낮 시간'
    for i in range(17, 19):
        if list(v1)[i][0] == 1:
            time = X.columns[i][5:]

    return time, obj

# select the optimal stage under the conditions            
def show_select(genre: str, loc: str, time: str, pos):
    
    # find optimal age for given conditions
    tmp = df[(df['location']==loc) 
             & ~(df['satisfy_'+genre].isnull()) 
             & (df['time']==time)].groupby(['age', 'sex'])['location'].count()
    try:
        age, sex = tmp.idxmax()
    except:
        age, sex = '30대', '여자'
    # convert age label to Korean
    if age == '80s':
        age = '80대 이상'
    else:
        age = age.replace('s', '대')
    
    # convert genre label to Korean
    if genre == 'western':
        genre = '서양음악'
    elif genre == 'tradition':
        genre = '전통공연'
    elif genre == 'acting':
        genre = '연극'
    elif genre == 'musical':
        genre = '뮤지컬'
    elif genre == 'dancing':
        genre = '무용'
    
    output = pd.DataFrame([[sisul['시설명'], sisul['공연홀'], sisul['좌석수'], genre, time, age + ' ' + sex, pos]], 
                          columns=['시설명', '공연홀', '좌석수', '장르', '시간대', '타겟층', '만족도 점수'])
    return output


def genre_check(sisul, data):
    genres = ['서양음악', '전통공연', '연극', '뮤지컬', '무용']
    drop_index = []
    for genre in genres:
        if sisul[genre]==0:
            drop_index += list(data[data['장르']==genre].index)
    data = data.drop(index=drop_index)
    return data
            

df = pd.read_csv('data/data_preprocessed.csv', index_col=0)
stage = pd.read_csv('data/stage_info.csv', index_col=0)
df_pop = pd.read_csv('data/data_population.csv', index_col=0)

sisul = input_sisul()
loc = sisul['지역']
sigungu = sisul['시군구']
genres = ['western', 'tradition', 'acting', 'musical', 'dancing']
dic = {}
first = True
for genre in genres:
    X, y = make_xy(genre)
    coef, intercept = get_coef(X, y)
    k = -1
    for i in range(3):
        time, pos = optimize(genre, loc, sigungu, coef, intercept, X, k)
        if k==-1:
            temp_output = show_select(genre, loc, time, pos)
        else:
            temp_output = pd.concat([temp_output, show_select(genre, loc, time, pos)])
        k = pos*-1 + 0.001
        if first == True:
            output = temp_output
        else:
            output = pd.concat([output, temp_output])
        first = False
    
result = output.sort_values('만족도 점수', ascending=False).drop_duplicates()
result.reset_index(inplace=True, drop=True)
result = genre_check(sisul, result)
result.reset_index(inplace=True, drop=True)
result.index.name = '순위'
result.index += 1

result

알아보고 싶은 시설명을 입력하세요: 전주


4개의 검색 결과가 있습니다.
원하는 공연장에 해당하는 숫자를 입력하세요:
1. 전주소리문화관 놀이마당(야외마당) (전북 전주시)
2. 전주한벽문화관 화명원 (전북 전주시)
3. 전주한벽문화관 한벽공연장 (전북 전주시)
4. 전주한벽문화관 혼례마당 (전북 전주시)


 1


Unnamed: 0_level_0,시설명,공연홀,좌석수,장르,시간대,타겟층,만족도 점수
순위,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1
1,전주소리문화관,놀이마당(야외마당),200,연극,주말 낮 시간,30대 남자,0.933732
2,전주소리문화관,놀이마당(야외마당),200,연극,주말 저녁 시간,20대 여자,0.923173
3,전주소리문화관,놀이마당(야외마당),200,전통공연,주중 낮 시간,70대 남자,0.922178
4,전주소리문화관,놀이마당(야외마당),200,전통공연,주말 낮 시간,30대 남자,0.914423
5,전주소리문화관,놀이마당(야외마당),200,연극,주중 낮 시간,40대 여자,0.913036
6,전주소리문화관,놀이마당(야외마당),200,전통공연,주말 낮 시간,30대 남자,0.909433
