In [1]:
from google.cloud import bigquery
import pandas as pd
import os
import numpy as np
import shap
from sklearn.metrics import mean_squared_error, mean_absolute_error
import evaluation
# Point the Google client libraries at the service-account key file
# 'tcloud-ga.json' (a relative path), then create the BigQuery client that the
# queries below are issued through.
os.environ['GOOGLE_APPLICATION_CREDENTIALS'] =  'tcloud-ga.json'
bq_client = bigquery.Client()

In [2]:
# Training table: order totals aggregated per (sme_ban, solution_uuid) pair,
# joined to the SME numeric features, to the rows of ga3.solution_pv (matched on
# clean_path2 = solution_uuid) and to each SME's ind_large code.
# Every join is a plain (inner) join, so a pair missing from any of the joined
# tables does not appear in the result.
sql_query1 ="""
with orderinfo as(
SELECT sme_ban, sum(sol_price) as total_pay, sum(sol_point) as pointsused , sum(sol_selfpay) as selfpay, sum(solution_duration) servicelen, count(order_num) as ordernums, solution_uuid FROM `tcloud-data-analysis.highly_use_data.orders` 
group by sme_ban,solution_uuid 
),
sme as (
  select * from tcloud-data-analysis.ml_data.sme_basic_numeric
),
page as (
  select * from `tcloud-data-analysis.ga3.solution_pv`
),
ind as (
  select sme_ban, ind_large from `tcloud-data-analysis.tcloud_analytics_iii.sme_basic`
)

select orderinfo.* , sme.* EXCEPT(sme_ban), page.* EXCEPT(clean_path2), ind_large
from orderinfo
join sme on orderinfo.sme_ban = sme.sme_ban
join page on orderinfo.solution_uuid = page.clean_path2
join ind on orderinfo.sme_ban= ind.sme_ban
"""
# Run the query and load the full result into a DataFrame.
query_job1 = bq_client.query(sql_query1)
recommend = query_job1.to_dataframe()


# Lookup table whose ind_large column supplies the category list used for the
# one-hot encoding below.
query_indnm = """
SELECT * FROM `tcloud-data-analysis.tcloud_analytics_iii.industry_large`
"""

# Run the query and store the result as a DataFrame
query_job = bq_client.query(query_indnm)
industry_df = query_job.to_dataframe()

# Collect every distinct ind_large value present in the lookup table
ind_large_values = industry_df['ind_large'].unique()


# Per-solution table that is left-joined onto `recommend` by solution_uuid below.
query_subcate = """
SELECT * FROM `tcloud-data-analysis.tcloud_analytics_iii.solution_subcategory_encoding`
"""

# Run the query and store the result as a DataFrame
query_job = bq_client.query(query_subcate)
solution_sub = query_job.to_dataframe()
import pandas as pd
from pandas.api.types import CategoricalDtype

# Cast ind_large to a categorical that lists every known industry code, so the
# one-hot step emits one column per code even when a code is absent from the data.
ind_large_type = CategoricalDtype(categories=ind_large_values, ordered=False)
recommend['ind_large'] = recommend['ind_large'].astype(ind_large_type)

# One-hot encode and store the indicator columns as nullable 'Int64'.
ind_large_dummies = (
    pd.get_dummies(recommend['ind_large'], prefix='ind_large')
    .astype('Int64')
)

# Swap the raw column for its indicator columns, then left-join the
# per-solution subcategory table on solution_uuid.
recommend = (
    pd.concat([recommend.drop(columns='ind_large'), ind_large_dummies], axis=1)
    .merge(solution_sub, on='solution_uuid', how='left')
)
import pandas as pd
from sklearn.model_selection import train_test_split

# ...其他程式碼(資料讀取等)...

def create_mappings(df, user_col, item_col):
    """Build value -> integer-id lookups for the user and item columns.

    Ids are assigned in order of first appearance in the column, starting at 0.

    Args:
        df: DataFrame holding both columns.
        user_col: Name of the user identifier column.
        item_col: Name of the item identifier column.

    Returns:
        A ``(user_mapping, item_mapping)`` tuple of dicts.
    """
    def _index_distinct(column):
        # unique() keeps first-seen order, so ids follow that order.
        distinct = df[column].unique()
        return dict(zip(distinct, range(len(distinct))))

    return _index_distinct(user_col), _index_distinct(item_col)

def encode_data(df, user_col, item_col, user_mapping, item_mapping):
    """Replace the user and item columns of ``df`` with their mapped ids.

    The frame is modified in place and also returned; pass a copy to keep the
    original values. Values absent from a mapping become NaN.

    Args:
        df: DataFrame to encode.
        user_col: Name of the user identifier column.
        item_col: Name of the item identifier column.
        user_mapping: Dict of original user value -> id.
        item_mapping: Dict of original item value -> id.

    Returns:
        The same ``df`` object, with both columns encoded.
    """
    for column, lookup in ((user_col, user_mapping), (item_col, item_mapping)):
        df[column] = df[column].map(lookup)
    return df

def reverse_mappings(mapping):
    """Return the inverse of ``mapping``: a dict from each value to its key.

    If several keys share a value, the key that appears last in ``mapping`` wins.
    """
    return dict(zip(mapping.values(), mapping.keys()))

def save_mappings(user_reverse_mapping, item_reverse_mapping, user_mapping_filename, item_mapping_filename):
    """Write the two id -> original-value lookups to CSV files.

    Each file has the header ``encoded,original`` and one row per dict entry;
    no index column is written.

    Args:
        user_reverse_mapping: Dict of user id -> original user value.
        item_reverse_mapping: Dict of item id -> original item value.
        user_mapping_filename: Destination path for the user lookup.
        item_mapping_filename: Destination path for the item lookup.
    """
    header = ['encoded', 'original']
    # Build both frames before writing anything.
    user_frame, item_frame = (
        pd.DataFrame(list(lookup.items()), columns=header)
        for lookup in (user_reverse_mapping, item_reverse_mapping)
    )
    user_frame.to_csv(user_mapping_filename, index=False)
    item_frame.to_csv(item_mapping_filename, index=False)

# Assign an integer id to every distinct SME and solution in the full table.
sme_ban_mapping, solution_uuid_mapping = create_mappings(
    df=recommend, user_col='sme_ban', item_col='solution_uuid'
)

# Encode a copy so `recommend` keeps the original identifiers.
recommend_encoded = encode_data(
    recommend.copy(),
    user_col='sme_ban',
    item_col='solution_uuid',
    user_mapping=sme_ban_mapping,
    item_mapping=solution_uuid_mapping,
)

# Persist the id -> original-value lookups as CSV files.
sme_ban_reverse_mapping = reverse_mappings(sme_ban_mapping)
solution_uuid_reverse_mapping = reverse_mappings(solution_uuid_mapping)
save_mappings(
    sme_ban_reverse_mapping,
    solution_uuid_reverse_mapping,
    user_mapping_filename='sme_ban_reverse_mapping.csv',
    item_mapping_filename='solution_uuid_reverse_mapping.csv',
)

# Drop every row that contains a missing value.
recommend_encoded = recommend_encoded.dropna()
# Train/test split: 20% held out, fixed seed.
train_data, test_data = train_test_split(recommend_encoded, test_size=0.2, random_state=42)


# Column groups
# SME-side (user) feature columns.
# NOTE(review): 'ind_large_O' is not in this list although the other letters
# from A to S are — confirm whether the omission is intentional.
sme_ban_columns = [
    'q_organizationsize_level', 'q_planningtime_level', 'q_budget_level',
    'opscore1', 'opscore2', 'marscore1', 'marscore2', 'salescore1', 'salescore2',
    'securscore1', 'securscore2', 'remotescore1', 'remotescore2', 'schedscore1',
    'schedscore2', 'sme_age', 'capital', 'employee_count',
    'ind_large_A', 'ind_large_B', 'ind_large_C', 'ind_large_D',
    'ind_large_E', 'ind_large_F', 'ind_large_G', 'ind_large_H',
    'ind_large_I', 'ind_large_J', 'ind_large_K', 'ind_large_L',
    'ind_large_M', 'ind_large_N', 'ind_large_P', 'ind_large_Q',
    'ind_large_R', 'ind_large_S'
]

# Solution-side (item) feature columns.
solution_uuid_columns = [
    'pageview', 'bound', 'in_site', 'crm_system', 'erp_system', 'pos_integration', 'seo',
    'hr_management', 'credit_card_ticketing', 'survey_analysis',
    'big_data_analysis', 'customer_interaction', 'market_research',
    'digital_advertising', 'document_processing_software',
    'membership_point_system', 'production_logistics_management',
    'carbon_emission_calculation_analysis',
    'community_content_management_operation', 'sms_system',
    'online_customer_service', 'online_meeting', 'online_reservation',
    'energy_management_system', 'mobile_payment',
    'marketing_matchmaking_kol', 'financial_management',
    'information_security', 'public_opinion_analysis',
    'inventory_management_system', 'remote_collaboration',
    'antivirus_software', 'ecommerce_online_shopping_platform',
    'enewsletter_edm', 'electronic_invoice'
]
# Column(s) used as the interaction target.
interaction_columns = ['total_pay']

# Split the train and test sets into user ids, item ids and the interaction
# target, each cast to int32.
train_sme_ban, test_sme_ban = (
    part['sme_ban'].astype('int32') for part in (train_data, test_data)
)
train_solution_uuid, test_solution_uuid = (
    part['solution_uuid'].astype('int32') for part in (train_data, test_data)
)
train_interactions, test_interactions = (
    part[interaction_columns].astype('int32') for part in (train_data, test_data)
)

# Disabled alternative with a wider interaction column set:
#interaction_columns = ['sme_ban', 'solution_uuid', 'total_pay', 'pointsused', 'selfpay', 'servicelen', 'ordernums']

# User-side and item-side feature matrices for the train and test sets, as int32.
train_sme_ban_features, test_sme_ban_features = (
    part[sme_ban_columns].astype('int32') for part in (train_data, test_data)
)
train_solution_uuid_features, test_solution_uuid_features = (
    part[solution_uuid_columns].astype('int32') for part in (train_data, test_data)
)








In [11]:
# SMEs to request recommendations for: every row of ml_data.new_sme_test.
newsme_id_query = """
SELECT *
FROM `tcloud-data-analysis.ml_data.new_sme_test` 
"""


# Run the query and store the result as a DataFrame
query_job = bq_client.query(newsme_id_query)
newsme = query_job.to_dataframe()
# Plain Python list of the sme_ban identifiers in the result.
newsme_id= newsme['sme_ban'].tolist()


In [15]:
# Show the column names of newsme (the captured output below already contains
# the one-hot ind_large_* columns).
print(newsme.columns)


Index(['sme_ban', 'q_organizationsize_level', 'q_planningtime_level',
       'q_budget_level', 'opscore1', 'opscore2', 'marscore1', 'marscore2',
       'salescore1', 'salescore2', 'securscore1', 'securscore2',
       'remotescore1', 'remotescore2', 'schedscore1', 'schedscore2', 'sme_age',
       'capital', 'employee_count', 'ind_large_A', 'ind_large_B',
       'ind_large_C', 'ind_large_D', 'ind_large_E', 'ind_large_F',
       'ind_large_G', 'ind_large_H', 'ind_large_I', 'ind_large_J',
       'ind_large_K', 'ind_large_L', 'ind_large_M', 'ind_large_N',
       'ind_large_O', 'ind_large_P', 'ind_large_Q', 'ind_large_R',
       'ind_large_S'],
      dtype='object')


In [13]:
# Categorical dtype listing every known industry code, so the one-hot step
# emits one column per code even when a code is absent from newsme.
ind_large_type = CategoricalDtype(categories=ind_large_values, ordered=False)

# Guard on the raw column: once it has been replaced by its indicator columns,
# re-running this cell would otherwise fail with a KeyError on 'ind_large'.
if 'ind_large' in newsme.columns:
    # One-hot encode and store the indicator columns as nullable 'Int64'.
    ind_large_dummies = pd.get_dummies(
        newsme['ind_large'].astype(ind_large_type), prefix='ind_large'
    ).astype('Int64')

    # Replace the raw column with its indicator columns.
    newsme = pd.concat([newsme.drop('ind_large', axis=1), ind_large_dummies], axis=1)


In [17]:
# Cast the SME feature columns to int32. 'ind_large_O' is not listed in
# sme_ban_columns, so that column is not cast here.
newsme[sme_ban_columns]= newsme[sme_ban_columns].astype('int32')

In [18]:
def row_to_dict(row):
    """Turn one row into the payload dict that is posted to the predict URL.

    Args:
        row: A pandas Series that has an 'sme_ban' entry.

    Returns:
        ``{"sme_ban": <row's sme_ban>, "features": {<every other entry>}}``.
    """
    identifier = row['sme_ban']
    feature_values = row.drop('sme_ban').to_dict()
    return {"sme_ban": identifier, "features": feature_values}

# Convert the fifth row of the DataFrame (position 4) into a payload dict
fifth_row_dict = row_to_dict(newsme.iloc[4])
print(fifth_row_dict)


{'sme_ban': '92050012', 'features': {'q_organizationsize_level': 1, 'q_planningtime_level': 1, 'q_budget_level': 1, 'opscore1': 0, 'opscore2': 0, 'marscore1': 0, 'marscore2': 0, 'salescore1': 0, 'salescore2': 0, 'securscore1': 0, 'securscore2': 0, 'remotescore1': 0, 'remotescore2': 0, 'schedscore1': 0, 'schedscore2': 0, 'sme_age': 53, 'capital': 100000, 'employee_count': 0, 'ind_large_A': 0, 'ind_large_B': 0, 'ind_large_C': 0, 'ind_large_D': 0, 'ind_large_E': 0, 'ind_large_F': 0, 'ind_large_G': 0, 'ind_large_H': 0, 'ind_large_I': 1, 'ind_large_J': 0, 'ind_large_K': 0, 'ind_large_L': 0, 'ind_large_M': 0, 'ind_large_N': 0, 'ind_large_O': 0, 'ind_large_P': 0, 'ind_large_Q': 0, 'ind_large_R': 0, 'ind_large_S': 0}}


In [19]:
import requests
import json

url = "http://10.140.0.35:5000/predict"  # Update with your server's IP address and port


# Send the single-row payload. The timeout bounds the wait so an unreachable
# server raises an error instead of hanging the kernel indefinitely.
response = requests.post(url, json=fifth_row_dict, timeout=30)
# Fail here on a 4xx/5xx reply rather than on the JSON decode below.
response.raise_for_status()

print(response.json())

{'top_5_item_ids': ['F03FE16033A30DAEE0531512620AC1A1', 'F03FE160366B0DAEE0531512620AC1A1', 'F39E93C29D052923E0531512620AECF8', 'F03FE160363F0DAEE0531512620AC1A1', 'F03FE16039270DAEE0531512620AC1A1']}


In [20]:
import requests
import pandas as pd

# Columns of the result table: the SME id followed by its five recommendations.
result_columns = ['sme_ban', 'top1', 'top2', 'top3', 'top4', 'top5']

url = "http://10.140.0.35:5000/predict"  # Update with your server's IP address and port

# Collect one record per SME and build the DataFrame once at the end.
# DataFrame.append was removed in pandas 2.0 and re-copied the whole frame on
# every iteration.
records = []
# One Session reuses the HTTP connection across all requests.
with requests.Session() as session:
    # For every row of newsme:
    for _idx, row in newsme.iterrows():
        # Convert the row into the request payload
        row_dict = row_to_dict(row)

        # Send the request to the API; the timeout keeps an unreachable server
        # from hanging the loop, and HTTP errors are raised immediately.
        response = session.post(url, json=row_dict, timeout=30)
        response.raise_for_status()

        # Read 'top_5_item_ids' from the response
        top_5_item_ids = response.json()['top_5_item_ids']

        # Pad with None so a reply with fewer than five ids leaves empty cells
        # instead of raising IndexError.
        padded_ids = (list(top_5_item_ids) + [None] * 5)[:5]
        records.append([row_dict['sme_ban'], *padded_ids])

recommend_result = pd.DataFrame(records, columns=result_columns)

# Show the first rows of the result
recommend_result.head(5)


Unnamed: 0,sme_ban,top1,top2,top3,top4,top5
0,53275421,F39E93C29E4D2923E0531512620AECF8,F03FE16033A30DAEE0531512620AC1A1,F03FE160363F0DAEE0531512620AC1A1,F39E93C29EAD2923E0531512620AECF8,F03FE160382B0DAEE0531512620AC1A1
1,50986480,F39E93C29E4D2923E0531512620AECF8,F03FE16033A30DAEE0531512620AC1A1,F03FE160363F0DAEE0531512620AC1A1,F39E93C29EAD2923E0531512620AECF8,F03FE160382B0DAEE0531512620AC1A1
2,11014591,F39E93C29E4D2923E0531512620AECF8,F03FE16033A30DAEE0531512620AC1A1,F03FE160363F0DAEE0531512620AC1A1,F39E93C29EAD2923E0531512620AECF8,F03FE160382B0DAEE0531512620AC1A1
3,53436329,F39E93C29E4D2923E0531512620AECF8,F03FE16033A30DAEE0531512620AC1A1,F03FE160363F0DAEE0531512620AC1A1,F39E93C29EAD2923E0531512620AECF8,F03FE160382B0DAEE0531512620AC1A1
4,92050012,F03FE16033A30DAEE0531512620AC1A1,F03FE160366B0DAEE0531512620AC1A1,F39E93C29D052923E0531512620AECF8,F03FE160363F0DAEE0531512620AC1A1,F03FE16039270DAEE0531512620AC1A1
