# MongoDB에서 바로 DataFrame으로 가져오기

In [107]:
# 방법 1: Binary UUID를 문자열로 변환하여 DataFrame에서 조회
from pymongo import MongoClient
import pandas as pd
import os
from dotenv import load_dotenv
from bson.binary import Binary
import uuid

load_dotenv(override=True)

# MongoDB 연결 정보
uri = os.getenv("MONGO_URI")
db_name = os.getenv("MONGO_DB_NAME")
collection_name = os.getenv("COLLECTION_NAME")

client = MongoClient(uri)
db = client[db_name]
collection = db[collection_name]

# MongoDB에서 데이터 불러오기
data = list(collection.find())

# DataFrame으로 변환
df = pd.DataFrame(data)

# Binary UUID를 문자열로 변환하는 함수
def binary_to_uuid_string(binary_uuid):
    if isinstance(binary_uuid, Binary):
        return str(uuid.UUID(bytes=binary_uuid))
    return binary_uuid

# childId(또는 userId) 컬럼의 Binary를 문자열 UUID로 변환
if 'childId' in df.columns:
    df['childId'] = df['childId'].apply(binary_to_uuid_string)
    df.rename(columns={'childId':'userId'}, inplace=True)
elif 'userId' in df.columns:
    df['userId'] = df['userId'].apply(binary_to_uuid_string)

print("변환 후 데이터:")
df.head()



변환 후 데이터:


Unnamed: 0,_id,investSessionId,chapterId,userId,turn,riskLevel,currentPoint,beforeValue,currentValue,initialValue,numberOfShares,income,transactionType,plusClick,minusClick,newsTag,startedAt,endedAt,_class
0,"b""\tJS+\xad\xe3\xb5\x07\xcce'\x91j\xd3\xa6\xb9""",b'\x07J\x88\xb0fH\x00\xe5\xe2\xb1\x91\x03\x18\...,1111,124aa0d6-a851-6f95-79c8-ffcda39d2ba2,1,고위험 고수익,2000,0,100,100,0,0,KEEP,0,0,all,2025-06-17 10:53:12,2025-06-17 10:53:16,com.popoworld.backend.invest.entity.InvestHistory
1,b'\x01M\xe1\xa9\x96\x17\x1e\xcb\x92k\xf3\x9d\x...,b'\x07J\x88\xb0fH\x00\xe5\xe2\xb1\x91\x03\x18\...,1111,124aa0d6-a851-6f95-79c8-ffcda39d2ba2,1,저위험 저수익,2000,0,100,100,0,0,KEEP,0,0,all,2025-06-17 10:53:12,2025-06-17 10:53:16,com.popoworld.backend.invest.entity.InvestHistory
2,b'oF\xefb$\xf6\xe0\x02T\x91\x1e3\xd6\xe9S\x89',b'\x07J\x88\xb0fH\x00\xe5\xe2\xb1\x91\x03\x18\...,1111,124aa0d6-a851-6f95-79c8-ffcda39d2ba2,1,중위험 균형형,2000,0,100,100,0,0,KEEP,0,0,all,2025-06-17 10:53:12,2025-06-17 10:53:16,com.popoworld.backend.invest.entity.InvestHistory
3,b'QI\x18\xc24_\n~?\xc2\x96\x9a\t\x9b\xed\x98',b'\x07J\x88\xb0fH\x00\xe5\xe2\xb1\x91\x03\x18\...,1111,124aa0d6-a851-6f95-79c8-ffcda39d2ba2,2,중위험 균형형,866,100,96,100,3,0,BUY,3,0,high,2025-06-17 10:53:16,2025-06-17 10:53:30,com.popoworld.backend.invest.entity.InvestHistory
4,b'\xb6D\x00F\xf2\xca}S\xaeN\x9a\x81\xb5\xee\x9...,b'\x07J\x88\xb0fH\x00\xe5\xe2\xb1\x91\x03\x18\...,1111,124aa0d6-a851-6f95-79c8-ffcda39d2ba2,2,저위험 저수익,866,100,98,100,3,0,BUY,3,0,high,2025-06-17 10:53:16,2025-06-17 10:53:30,com.popoworld.backend.invest.entity.InvestHistory


In [108]:
df["userId"].value_counts()

userId
124aa0d6-a851-6f95-79c8-ffcda39d2ba2    36
8a4e5eed-f77f-9da7-830d-232c0b5b4ebe    36
956f51a8-d6a0-4a12-a22b-9da3cdffc879    18
f0220d43-513a-4619-973d-4ed84a42bf6a    18
Name: count, dtype: int64

In [109]:
def mongo_preprocess(df):

    df['riskLevel'] = df['riskLevel'].replace({
        '고위험 고수익': 'high',
        '중위험 균형형': 'mid',
        '저위험 저수익': 'low'
    })

    df.sort_values(by=['investSessionId', 'riskLevel', 'turn'], inplace=True)
    df['deltaShares'] = df.groupby(['investSessionId', 'riskLevel'])['numberOfShares'].diff()
    df['deltaShares'] = df['deltaShares'].fillna(df['numberOfShares'])
    df['deltaShares'] = df['deltaShares'].astype(int)

    return df

df = mongo_preprocess(df)
df.head()

Unnamed: 0,_id,investSessionId,chapterId,userId,turn,riskLevel,currentPoint,beforeValue,currentValue,initialValue,numberOfShares,income,transactionType,plusClick,minusClick,newsTag,startedAt,endedAt,_class,deltaShares
0,"b""\tJS+\xad\xe3\xb5\x07\xcce'\x91j\xd3\xa6\xb9""",b'\x07J\x88\xb0fH\x00\xe5\xe2\xb1\x91\x03\x18\...,1111,124aa0d6-a851-6f95-79c8-ffcda39d2ba2,1,high,2000,0,100,100,0,0,KEEP,0,0,all,2025-06-17 10:53:12,2025-06-17 10:53:16,com.popoworld.backend.invest.entity.InvestHistory,0
5,b'\xc8I\xab\x14\x84\x0c&\xe9\x97\xb6\xe6\x88\x...,b'\x07J\x88\xb0fH\x00\xe5\xe2\xb1\x91\x03\x18\...,1111,124aa0d6-a851-6f95-79c8-ffcda39d2ba2,2,high,866,100,92,100,6,0,BUY,6,0,high,2025-06-17 10:53:16,2025-06-17 10:53:30,com.popoworld.backend.invest.entity.InvestHistory,6
8,b'\x1bB\\\x12\xfa\xf8W-;\xf9e\xde\xaa\xe2\x18\...,b'\x07J\x88\xb0fH\x00\xe5\xe2\xb1\x91\x03\x18\...,1111,124aa0d6-a851-6f95-79c8-ffcda39d2ba2,3,high,187,92,77,100,6,0,KEEP,0,0,mid,2025-06-17 10:53:30,2025-06-17 10:53:41,com.popoworld.backend.invest.entity.InvestHistory,0
9,b'\xaaL&\x8dv\x99s\xc3\xe2\xa1-\xa5\xf1Lw\x98',b'\x07J\x88\xb0fH\x00\xe5\xe2\xb1\x91\x03\x18\...,1111,124aa0d6-a851-6f95-79c8-ffcda39d2ba2,4,high,8,77,78,100,7,0,BUY,1,0,all,2025-06-17 10:53:41,2025-06-17 10:53:53,com.popoworld.backend.invest.entity.InvestHistory,1
12,b'\x1cNjWCC\xb5e\xe4\xa46_[\xeb\x9b\xbf',b'\x07J\x88\xb0fH\x00\xe5\xe2\xb1\x91\x03\x18\...,1111,124aa0d6-a851-6f95-79c8-ffcda39d2ba2,5,high,356,78,87,100,3,-12,SELL,0,4,high,2025-06-17 10:53:53,2025-06-17 10:54:01,com.popoworld.backend.invest.entity.InvestHistory,-4


# PostgreSQL에서 DataFrame으로 가져오기

In [110]:
import pandas as pd
from dotenv import load_dotenv
import os
import psycopg2

load_dotenv(override=True)

# PostgreSQL 연결 정보
conn = psycopg2.connect(
    host=os.getenv("DB_HOST"),
    port=os.getenv("DB_PORT"),          
    dbname=os.getenv("DB_NAME"), 
    user=os.getenv("DB_USER"),
    password=os.getenv("DB_PASSWORD") 
)

# SQL 쿼리 실행 후 DataFrame으로 불러오기
s_query = "SELECT chapter_id, seed_money FROM invest_chapter;"
c_query = "SELECT user_id, age, sex FROM users;"
seed = pd.read_sql(s_query, conn)
child = pd.read_sql(c_query, conn)

# 연결 종료
conn.close()

# 결과 확인
print(seed.head())
print(child.head())

  chapter_id  seed_money
0       1111         700
1       2222        2000
2       3333        5000
3       4444       10000
                                user_id   age sex
0  969e5004-542c-47c1-b906-d81b907c5d08    39   F
1  d97a07eb-8dd9-4cbe-a4c4-8c61017ada71     6   F
2  5bda4ade-431a-47ef-a62c-7bd895ece820  1234   M
3  c1194c9c-1ca4-435b-950e-ad68be4ba92f    22   M
4  74f9a8f9-4f95-4759-ba5f-106040e2993a    24   M


  seed = pd.read_sql(s_query, conn)
  child = pd.read_sql(c_query, conn)


In [111]:
# PostgreSQL과 MongoDB의 변수명 통일
seed.rename(columns={'chapter_id': 'chapterId'}, inplace=True)
seed.rename(columns={'seed_money': 'seedMoney'}, inplace=True)
print(seed.head())

# PostgreSQL과 MongoDB의 변수명 통일
child.rename(columns={'user_id':'userId'}, inplace=True)
child.head()

  chapterId  seedMoney
0      1111        700
1      2222       2000
2      3333       5000
3      4444      10000


Unnamed: 0,userId,age,sex
0,969e5004-542c-47c1-b906-d81b907c5d08,39,F
1,d97a07eb-8dd9-4cbe-a4c4-8c61017ada71,6,F
2,5bda4ade-431a-47ef-a62c-7bd895ece820,1234,M
3,c1194c9c-1ca4-435b-950e-ad68be4ba92f,22,M
4,74f9a8f9-4f95-4759-ba5f-106040e2993a,24,M


# 거래참여 비율 (tradingTurn / investSessionId + childId)

In [113]:
# 거래참여 턴 비율
def trading_turn(df):
    tpt = df[['investSessionId', 
          'userId',
          'turn',
          'riskLevel',
          'transactionType']]
    
    hold_flags = tpt.groupby(['investSessionId', 'userId', 'turn'])['transactionType'].apply(lambda x: int((x != "KEEP").any())).reset_index(name="notAllKeep")
    tradingTurn = hold_flags.groupby(['investSessionId', 'userId'])['notAllKeep'].mean().reset_index(name="avgNotKeep")
    
    return tradingTurn


tradingTurn = trading_turn(df)
tradingTurn.head()

Unnamed: 0,investSessionId,userId,avgNotKeep
0,b'\x07J\x88\xb0fH\x00\xe5\xe2\xb1\x91\x03\x18\...,124aa0d6-a851-6f95-79c8-ffcda39d2ba2,0.833333
1,b'\x17C}\xc0\x8f\x8bO\xfd\xca\x0eZ\xa0\xe7\xd9...,f0220d43-513a-4619-973d-4ed84a42bf6a,0.833333
2,"b'6NY\xb0^\x89,d\x1b:\xda\xaa\x94H\xc4\xa7'",956f51a8-d6a0-4a12-a22b-9da3cdffc879,0.833333
3,b'{FL|O@\xc1\x165\xa5\xff\x99\xfb_\xc8\x89',8a4e5eed-f77f-9da7-830d-232c0b5b4ebe,1.0
4,b'\xb4@B\x98\xc0\xbdlc\x9f\xf9\xd7*<\xb6\xeb\xa4',8a4e5eed-f77f-9da7-830d-232c0b5b4ebe,1.0


# 거래횟수 (transactionNum / investSessionId)

In [164]:
# 거래 횟수
def transaction_num(df):
    nt = df[['investSessionId',
             'userId' ,
          'turn',
          'riskLevel',
          'plusClick',
          'minusClick']].copy()

    nt['click'] = nt['plusClick'] + nt['minusClick']
    transactionNum = nt.groupby(['investSessionId','userId'])[['click']].mean().reset_index()

    return transactionNum

transactionNum = transaction_num(df)

In [165]:
transactionNum.head()

Unnamed: 0,investSessionId,userId,click
0,b'\x07J\x88\xb0fH\x00\xe5\xe2\xb1\x91\x03\x18\...,124aa0d6-a851-6f95-79c8-ffcda39d2ba2,1.555556
1,b'\x17C}\xc0\x8f\x8bO\xfd\xca\x0eZ\xa0\xe7\xd9...,f0220d43-513a-4619-973d-4ed84a42bf6a,1.055556
2,"b'6NY\xb0^\x89,d\x1b:\xda\xaa\x94H\xc4\xa7'",956f51a8-d6a0-4a12-a22b-9da3cdffc879,1.111111
3,b'{FL|O@\xc1\x165\xa5\xff\x99\xfb_\xc8\x89',8a4e5eed-f77f-9da7-830d-232c0b5b4ebe,1.444444
4,b'\xb4@B\x98\xc0\xbdlc\x9f\xf9\xd7*<\xb6\xeb\xa4',8a4e5eed-f77f-9da7-830d-232c0b5b4ebe,1.777778


# 평균 현금잔여율 (averageCashRatio / investSessionId)

In [116]:
merged = pd.merge(df, seed, on="chapterId", how="left")

merged.head()

Unnamed: 0,_id,investSessionId,chapterId,userId,turn,riskLevel,currentPoint,beforeValue,currentValue,initialValue,...,income,transactionType,plusClick,minusClick,newsTag,startedAt,endedAt,_class,deltaShares,seedMoney
0,"b""\tJS+\xad\xe3\xb5\x07\xcce'\x91j\xd3\xa6\xb9""",b'\x07J\x88\xb0fH\x00\xe5\xe2\xb1\x91\x03\x18\...,1111,124aa0d6-a851-6f95-79c8-ffcda39d2ba2,1,high,2000,0,100,100,...,0,KEEP,0,0,all,2025-06-17 10:53:12,2025-06-17 10:53:16,com.popoworld.backend.invest.entity.InvestHistory,0,700
1,b'\xc8I\xab\x14\x84\x0c&\xe9\x97\xb6\xe6\x88\x...,b'\x07J\x88\xb0fH\x00\xe5\xe2\xb1\x91\x03\x18\...,1111,124aa0d6-a851-6f95-79c8-ffcda39d2ba2,2,high,866,100,92,100,...,0,BUY,6,0,high,2025-06-17 10:53:16,2025-06-17 10:53:30,com.popoworld.backend.invest.entity.InvestHistory,6,700
2,b'\x1bB\\\x12\xfa\xf8W-;\xf9e\xde\xaa\xe2\x18\...,b'\x07J\x88\xb0fH\x00\xe5\xe2\xb1\x91\x03\x18\...,1111,124aa0d6-a851-6f95-79c8-ffcda39d2ba2,3,high,187,92,77,100,...,0,KEEP,0,0,mid,2025-06-17 10:53:30,2025-06-17 10:53:41,com.popoworld.backend.invest.entity.InvestHistory,0,700
3,b'\xaaL&\x8dv\x99s\xc3\xe2\xa1-\xa5\xf1Lw\x98',b'\x07J\x88\xb0fH\x00\xe5\xe2\xb1\x91\x03\x18\...,1111,124aa0d6-a851-6f95-79c8-ffcda39d2ba2,4,high,8,77,78,100,...,0,BUY,1,0,all,2025-06-17 10:53:41,2025-06-17 10:53:53,com.popoworld.backend.invest.entity.InvestHistory,1,700
4,b'\x1cNjWCC\xb5e\xe4\xa46_[\xeb\x9b\xbf',b'\x07J\x88\xb0fH\x00\xe5\xe2\xb1\x91\x03\x18\...,1111,124aa0d6-a851-6f95-79c8-ffcda39d2ba2,5,high,356,78,87,100,...,-12,SELL,0,4,high,2025-06-17 10:53:53,2025-06-17 10:54:01,com.popoworld.backend.invest.entity.InvestHistory,-4,700


In [145]:
def avg_cash_ratio(df):
    acr = df[['investSessionId', 
              'userId',
              'seedMoney',
            'chapterId',
            'turn',
            'currentPoint']].copy()

    acr = acr.groupby(['investSessionId', 'userId', 'seedMoney', 'chapterId'])['currentPoint'].mean().reset_index(name='avgCurrentValue')
    acr['diff'] = acr['seedMoney'] - acr['avgCurrentValue']
    acr["avgCashRatio"] = acr["diff"] / acr["seedMoney"]

    acr = acr[["investSessionId","userId","avgCashRatio"]]
        
    return acr

avgCashRatio = avg_cash_ratio(merged)
avgCashRatio.head()

Unnamed: 0,investSessionId,userId,avgCashRatio
0,b'\x07J\x88\xb0fH\x00\xe5\xe2\xb1\x91\x03\x18\...,124aa0d6-a851-6f95-79c8-ffcda39d2ba2,0.180238
1,b'\x17C}\xc0\x8f\x8bO\xfd\xca\x0eZ\xa0\xe7\xd9...,f0220d43-513a-4619-973d-4ed84a42bf6a,-0.026905
2,"b'6NY\xb0^\x89,d\x1b:\xda\xaa\x94H\xc4\xa7'",956f51a8-d6a0-4a12-a22b-9da3cdffc879,-0.476905
3,b'{FL|O@\xc1\x165\xa5\xff\x99\xfb_\xc8\x89',8a4e5eed-f77f-9da7-830d-232c0b5b4ebe,-0.105476
4,b'\xb4@B\x98\xc0\xbdlc\x9f\xf9\xd7*<\xb6\xeb\xa4',8a4e5eed-f77f-9da7-830d-232c0b5b4ebe,0.142381


# 평균 턴 체류시간 (avgStayTime / investSessionId)

In [146]:
# 평균 턴 체류시간
def avg_stay_time(df):
    turn = df[['investSessionId',
               'userId',
                'turn',
                'startedAt',
                'endedAt']].copy()
    
    avgStayTime = turn.drop_duplicates(subset=['investSessionId', 'turn']).copy()
    avgStayTime['stayTime'] = (avgStayTime['endedAt'] - avgStayTime['startedAt']).dt.total_seconds()
    avgStayTime.drop(columns=['startedAt', 'endedAt'], inplace=True)
    avgStayTime = avgStayTime.groupby(['investSessionId', 'userId'])['stayTime'].mean().reset_index(name='avgStayTime')

    return avgStayTime

avgStayTime = avg_stay_time(df)
avgStayTime.head()

Unnamed: 0,investSessionId,userId,avgStayTime
0,b'\x07J\x88\xb0fH\x00\xe5\xe2\xb1\x91\x03\x18\...,124aa0d6-a851-6f95-79c8-ffcda39d2ba2,9.333333
1,b'\x17C}\xc0\x8f\x8bO\xfd\xca\x0eZ\xa0\xe7\xd9...,f0220d43-513a-4619-973d-4ed84a42bf6a,5.666667
2,"b'6NY\xb0^\x89,d\x1b:\xda\xaa\x94H\xc4\xa7'",956f51a8-d6a0-4a12-a22b-9da3cdffc879,126.833333
3,b'{FL|O@\xc1\x165\xa5\xff\x99\xfb_\xc8\x89',8a4e5eed-f77f-9da7-830d-232c0b5b4ebe,16.166667
4,b'\xb4@B\x98\xc0\xbdlc\x9f\xf9\xd7*<\xb6\xeb\xa4',8a4e5eed-f77f-9da7-830d-232c0b5b4ebe,5.5


# 각 종목 별 구매/판매 비율 (avgTradeRatio / InvestSessionId)

In [147]:
import numpy as np
# 각 위험 별 구매/판매 비율
def avg_trade_ratio(df):
    # 구매 데이터 생성
    transcation_df = df[['investSessionId',
                        'userId',
                        'turn',
                        'riskLevel',
                        'numberOfShares',
                        'deltaShares']]
    
    # 변화량에 따라 BUY/SELL 구분하기
    transcation_df['is_buy'] = np.where(
        transcation_df['deltaShares'] > 0, 
        transcation_df['deltaShares'], 
        0
    )

    transcation_df['is_sell'] = np.where(
        transcation_df['deltaShares'] < 0, 
        -transcation_df['deltaShares'],
        0
    )

    # 각 investSessionId, riskLevel 별로 buy/sell 개수 집계
    buy_sell_counts = transcation_df.groupby(['investSessionId', 'riskLevel'])[['is_buy', 'is_sell']].sum().reset_index()
    buy_sell_counts.rename(columns={'is_buy': 'buyCount', 'is_sell': 'sellCount'}, inplace=True)


    # investSessionId별 전체 buy_count, sell_count 구하기
    total_buy = buy_sell_counts.groupby('investSessionId')['buyCount'].sum().reset_index(name='totalBuyCount')
    total_sell = buy_sell_counts.groupby('investSessionId')['sellCount'].sum().reset_index(name='totalSellCount')

    # 원본과 병합
    buy_sell_counts = buy_sell_counts.merge(total_buy, on='investSessionId')
    buy_sell_counts = buy_sell_counts.merge(total_sell, on='investSessionId')

    buy_sell_counts['buyRatio'] = buy_sell_counts['buyCount'] / buy_sell_counts['totalBuyCount']
    buy_sell_counts['sellRatio'] = buy_sell_counts['sellCount'] / buy_sell_counts['totalSellCount']

    buy_sell_counts.head()

    # buy_ratio_pivot: 열로 riskLevel을 펼치기  
    buy_ratio_pivot = buy_sell_counts.pivot(index='investSessionId', columns='riskLevel', values='buyRatio')
    buy_ratio_pivot = buy_ratio_pivot.fillna(0)
    buy_ratio_pivot.columns = [f"{level}BuyRatio" for level in buy_ratio_pivot.columns]
    buy_ratio_pivot.reset_index(inplace=True)

    # sell_ratio pivot: 열로 riskLevel을 펼치기
    sell_ratio_pivot = buy_sell_counts.pivot(index='investSessionId', columns='riskLevel', values='sellRatio')
    sell_ratio_pivot = sell_ratio_pivot.fillna(0)
    sell_ratio_pivot.columns = [f"{level}SellRatio" for level in sell_ratio_pivot.columns]
    sell_ratio_pivot.reset_index(inplace=True)

    # merge 두 pivot
    avgTradeRatio = pd.merge(buy_ratio_pivot, sell_ratio_pivot, on='investSessionId')

    user_info = transcation_df.groupby('investSessionId')[['userId']].first().reset_index()
    avgTradeRatio = avgTradeRatio.merge(user_info, on='investSessionId', how='left')

    return avgTradeRatio

avgTradeRatio = avg_trade_ratio(df)
avgTradeRatio.head()

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  transcation_df['is_buy'] = np.where(
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  transcation_df['is_sell'] = np.where(


Unnamed: 0,investSessionId,highBuyRatio,lowBuyRatio,midBuyRatio,highSellRatio,lowSellRatio,midSellRatio,userId
0,b'\x07J\x88\xb0fH\x00\xe5\xe2\xb1\x91\x03\x18\...,0.291667,0.166667,0.541667,1.0,0.0,0.0,124aa0d6-a851-6f95-79c8-ffcda39d2ba2
1,b'\x17C}\xc0\x8f\x8bO\xfd\xca\x0eZ\xa0\xe7\xd9...,0.052632,0.578947,0.368421,0.0,0.0,0.0,f0220d43-513a-4619-973d-4ed84a42bf6a
2,"b'6NY\xb0^\x89,d\x1b:\xda\xaa\x94H\xc4\xa7'",0.15,0.5,0.35,0.0,0.0,0.0,956f51a8-d6a0-4a12-a22b-9da3cdffc879
3,b'{FL|O@\xc1\x165\xa5\xff\x99\xfb_\xc8\x89',0.4,0.15,0.45,0.0,0.0,0.0,8a4e5eed-f77f-9da7-830d-232c0b5b4ebe
4,b'\xb4@B\x98\xc0\xbdlc\x9f\xf9\xd7*<\xb6\xeb\xa4',0.416667,0.166667,0.416667,1.0,0.0,0.0,8a4e5eed-f77f-9da7-830d-232c0b5b4ebe


# 평균 tag 뉴스 발생 턴 체류 시간 (tagAvgStayTime / investSessionId)

In [156]:
# 평균 tag 뉴스 발생 턴 체류 시간
def tag_avg_stay_time(df):
    tag_turn_df = df[['investSessionId',
                        'userId',
                        'turn',
                        'newsTag',
                        'startedAt',
                        'endedAt']].drop_duplicates()

    tag_turn_df = tag_turn_df[tag_turn_df['newsTag'] != "all"]
    tag_turn_df["turnDuration"] = tag_turn_df["endedAt"] - tag_turn_df["startedAt"]
    tagAvgStayTime = tag_turn_df.groupby(["investSessionId", 'userId'])["turnDuration"].mean().reset_index().rename(columns={"turnDuration":"tagTrunDuraion"})

    return tagAvgStayTime

tagAvgStayTime = tag_avg_stay_time(df)
tagAvgStayTime.head()

Unnamed: 0,investSessionId,userId,tagTrunDuraion
0,b'\x07J\x88\xb0fH\x00\xe5\xe2\xb1\x91\x03\x18\...,124aa0d6-a851-6f95-79c8-ffcda39d2ba2,0 days 00:00:10
1,b'\x17C}\xc0\x8f\x8bO\xfd\xca\x0eZ\xa0\xe7\xd9...,f0220d43-513a-4619-973d-4ed84a42bf6a,0 days 00:00:06.250000
2,"b'6NY\xb0^\x89,d\x1b:\xda\xaa\x94H\xc4\xa7'",956f51a8-d6a0-4a12-a22b-9da3cdffc879,0 days 00:03:06
3,b'{FL|O@\xc1\x165\xa5\xff\x99\xfb_\xc8\x89',8a4e5eed-f77f-9da7-830d-232c0b5b4ebe,0 days 00:00:07.750000
4,b'\xb4@B\x98\xc0\xbdlc\x9f\xf9\xd7*<\xb6\xeb\xa4',8a4e5eed-f77f-9da7-830d-232c0b5b4ebe,0 days 00:00:06


# 위험 감수율 (betMidShares, betHighShares / investSessionId)
- 중위험 tag 발생 시 중위험 종목 구매 수량
- 고위험 tag 발생 시 고위험 종목 구매 수량

In [168]:
def bet_shares(df):
    bet_data = df[["investSessionId","turn","userId","newsTag","riskLevel","numberOfShares","deltaShares", "transactionType", "beforeValue","currentValue","income"]].copy()

    # newsTag 발생 시 해당 종목을 구매한 경우
    bet_buy = bet_data.loc[(bet_data["newsTag"] == bet_data["riskLevel"]) & (bet_data["transactionType"]=="BUY")]

    bet_mid = bet_buy[bet_buy["riskLevel"]=="mid"].groupby(["investSessionId","userId"])["deltaShares"].sum().reset_index().rename(columns={"deltaShares":"betMidShares"})
    bet_high = bet_buy[bet_buy["riskLevel"]=="high"].groupby(["investSessionId","userId"])["deltaShares"].sum().reset_index().rename(columns={"deltaShares":"betHighShares"})

    risk_taking = pd.merge(bet_mid, bet_high, on=["investSessionId","userId"], how="outer")

    # nan 인 값들은 0으로 채우기
    risk_taking = risk_taking.fillna(0)

    return risk_taking

betShares = bet_shares(df)
betShares.head()

Unnamed: 0,investSessionId,userId,betMidShares,betHighShares
0,b'\x07J\x88\xb0fH\x00\xe5\xe2\xb1\x91\x03\x18\...,124aa0d6-a851-6f95-79c8-ffcda39d2ba2,10,6.0
1,b'\x17C}\xc0\x8f\x8bO\xfd\xca\x0eZ\xa0\xe7\xd9...,f0220d43-513a-4619-973d-4ed84a42bf6a,3,0.0
2,"b'6NY\xb0^\x89,d\x1b:\xda\xaa\x94H\xc4\xa7'",956f51a8-d6a0-4a12-a22b-9da3cdffc879,1,0.0
3,b'{FL|O@\xc1\x165\xa5\xff\x99\xfb_\xc8\x89',8a4e5eed-f77f-9da7-830d-232c0b5b4ebe,4,3.0
4,b'\xb4@B\x98\xc0\xbdlc\x9f\xf9\xd7*<\xb6\xeb\xa4',8a4e5eed-f77f-9da7-830d-232c0b5b4ebe,8,6.0


# 베팅 성공률 (betBuyRatio, betSellRatio / investSessionId)
: 베팅 성공률
1) 구매 베팅 성공 = tag뉴스 발생 턴에서 해당 종목 구매 후 다음 턴에서 가격이 증가한 횟수 / tag 뉴스 발생턴에서 해당 종목을 구매한 횟수
2) 판매 베팅 성공 = tag뉴스 발생 턴에서 해당 종목 판매 후 다음 턴에서 가격이 감소한 횟수 / tag 뉴스 발생턴에서 해당 종목을 판매한 횟수


In [149]:
def bet_buy_ratio(df):
    # 구매 베팅 성공
    bet_win = df[["investSessionId",
                  "userId",
                  "turn",
                  "newsTag",
                  "riskLevel",
                  "beforeValue",
                  "currentValue", 
                  "transactionType"]].copy()

    # 다음 턴의 value 컬럼 구하기
    bet_win.sort_values(by=["investSessionId","riskLevel","turn"], inplace=True)
    bet_win["nextValue"] = bet_win["currentValue"].shift(-1)

    # tag 뉴스 턴에서 해당 종목을 구매한 횟수
    bet_buy = bet_win.loc[(bet_win["newsTag"]==bet_win["riskLevel"]) & (bet_win["transactionType"]=="BUY")].copy()
    bet_buy_total = bet_buy.groupby("investSessionId")["nextValue"].count().reset_index().rename(columns={"nextValue":"bet_buy_total"})

    # tag 뉴스 턴에서 해당 종목을 구매하고 다음 턴에서 가격이 오른 횟수
    bet_buy["value_diff"] = bet_buy["nextValue"] - bet_buy["currentValue"]
    bet_buy_win = bet_buy[bet_buy["value_diff"]>0]
    bet_buy_win = bet_buy_win.groupby("investSessionId")["value_diff"].count().reset_index().rename(columns={"value_diff":"bet_buy_win"})

    # 성공 비율 계산
    bet_buy_df = pd.merge(bet_buy_total, bet_buy_win, on="investSessionId", how="left")
    bet_buy_df["betBuyRatio"] = bet_buy_df["bet_buy_win"] / bet_buy_df["bet_buy_total"]

    # nan값 0으로 채우기
    bet_buy_df = bet_buy_df.fillna(0) # 데이터가 없어서 nan으로 출력됨..! 0으로 채워주기기

    # 필요없는 컬럼 삭제
    bet_buy_df.drop(columns=["bet_buy_total","bet_buy_win"], inplace=True)

    user_info = df.groupby("investSessionId")[["userId"]].first().reset_index()
    bet_buy_df = pd.merge(bet_buy_df, user_info, on="investSessionId", how="left")
    
    return bet_buy_df

betBuyRatio = bet_buy_ratio(df)
betBuyRatio.head()

Unnamed: 0,investSessionId,betBuyRatio,userId
0,b'\x07J\x88\xb0fH\x00\xe5\xe2\xb1\x91\x03\x18\...,0.333333,124aa0d6-a851-6f95-79c8-ffcda39d2ba2
1,b'\x17C}\xc0\x8f\x8bO\xfd\xca\x0eZ\xa0\xe7\xd9...,1.0,f0220d43-513a-4619-973d-4ed84a42bf6a
2,"b'6NY\xb0^\x89,d\x1b:\xda\xaa\x94H\xc4\xa7'",0.0,956f51a8-d6a0-4a12-a22b-9da3cdffc879
3,b'{FL|O@\xc1\x165\xa5\xff\x99\xfb_\xc8\x89',0.5,8a4e5eed-f77f-9da7-830d-232c0b5b4ebe
4,b'\xb4@B\x98\xc0\xbdlc\x9f\xf9\xd7*<\xb6\xeb\xa4',0.333333,8a4e5eed-f77f-9da7-830d-232c0b5b4ebe


In [150]:
def bet_sell_ratio(df):
    # 판매 베팅 성공
    bet_win = df[["investSessionId",
                  "userId",
                  "turn",
                  "newsTag",
                  "riskLevel",
                  "beforeValue",
                  "currentValue", 
                  "transactionType"]].copy()

    # 다음 턴의 value 컬럼 구하기
    bet_win.sort_values(by=["investSessionId","riskLevel","turn"], inplace=True)
    bet_win["nextValue"] = bet_win["currentValue"].shift(-1)

    # tag 뉴스 턴에서 해당 종목을 판매한 횟수
    bet_sell = bet_win.loc[(bet_win["newsTag"]==bet_win["riskLevel"]) & (bet_win["transactionType"]=="SELL")].copy()
    bet_sell_total = bet_sell.groupby("investSessionId")["nextValue"].count().reset_index().rename(columns={"nextValue":"bet_sell_total"})

    # tag 뉴스 턴에서 해당 종목을 판매하고 다음 턴에서 가격이 떨어진 횟수
    bet_sell["value_diff"] = bet_sell["nextValue"] - bet_sell["currentValue"]
    bet_sell_win = bet_sell[bet_sell["value_diff"]<0]
    bet_sell_win = bet_sell_win.groupby("investSessionId")["value_diff"].count().reset_index().rename(columns={"value_diff":"bet_sell_win"})

    # 성공 비율 계산
    bet_sell_df = pd.merge(bet_sell_total, bet_sell_win, on="investSessionId", how="left")
    bet_sell_df["betSellRatio"] = bet_sell_df["bet_sell_win"] / bet_sell_df["bet_sell_total"]


    # nan값 0으로 채우기
    bet_sell_df = bet_sell_df.fillna(0) # 데이터가 없어서 nan으로 출력됨..! 0으로 채워주기기

    # 필요없는 컬럼 삭제
    bet_sell_df.drop(columns=["bet_sell_total","bet_sell_win"], inplace=True)

    user_info = df.groupby("investSessionId")[["userId"]].first().reset_index()
    bet_sell_df = pd.merge(bet_sell_df, user_info, on="investSessionId", how="left")
    
    return bet_sell_df

betSellRatio = bet_sell_ratio(df)
betSellRatio.head()

Unnamed: 0,investSessionId,betSellRatio,userId
0,b'\x07J\x88\xb0fH\x00\xe5\xe2\xb1\x91\x03\x18\...,0.0,124aa0d6-a851-6f95-79c8-ffcda39d2ba2
1,b'\xb4@B\x98\xc0\xbdlc\x9f\xf9\xd7*<\xb6\xeb\xa4',0.0,8a4e5eed-f77f-9da7-830d-232c0b5b4ebe


# 데이터 병합

In [170]:
# 병합할 데이터프레임 리스트
dataframes_to_merge = [
    tradingTurn,
    transactionNum,
    avgCashRatio,
    avgStayTime,
    avgTradeRatio,
    tagAvgStayTime,
    betShares,
    betBuyRatio,
    betSellRatio
]

# 첫 번째 데이터프레임을 기준으로 시작
merged_final = dataframes_to_merge[0]

# 나머지 데이터프레임을 순회하며 outer merge 수행
for i in range(1, len(dataframes_to_merge)):
    merged_final = pd.merge(merged_final, dataframes_to_merge[i],
                            on=["investSessionId", "userId"],
                            how="outer")

merged_final.head()

Unnamed: 0,investSessionId,userId,avgNotKeep,click,avgCashRatio,avgStayTime,highBuyRatio,lowBuyRatio,midBuyRatio,highSellRatio,lowSellRatio,midSellRatio,tagTrunDuraion,betMidShares,betHighShares,betBuyRatio,betSellRatio
0,b'\x07J\x88\xb0fH\x00\xe5\xe2\xb1\x91\x03\x18\...,124aa0d6-a851-6f95-79c8-ffcda39d2ba2,0.833333,1.555556,0.180238,9.333333,0.291667,0.166667,0.541667,1.0,0.0,0.0,0 days 00:00:10,10,6.0,0.333333,0.0
1,b'\x17C}\xc0\x8f\x8bO\xfd\xca\x0eZ\xa0\xe7\xd9...,f0220d43-513a-4619-973d-4ed84a42bf6a,0.833333,1.055556,-0.026905,5.666667,0.052632,0.578947,0.368421,0.0,0.0,0.0,0 days 00:00:06.250000,3,0.0,1.0,
2,"b'6NY\xb0^\x89,d\x1b:\xda\xaa\x94H\xc4\xa7'",956f51a8-d6a0-4a12-a22b-9da3cdffc879,0.833333,1.111111,-0.476905,126.833333,0.15,0.5,0.35,0.0,0.0,0.0,0 days 00:03:06,1,0.0,0.0,
3,b'{FL|O@\xc1\x165\xa5\xff\x99\xfb_\xc8\x89',8a4e5eed-f77f-9da7-830d-232c0b5b4ebe,1.0,1.444444,-0.105476,16.166667,0.4,0.15,0.45,0.0,0.0,0.0,0 days 00:00:07.750000,4,3.0,0.5,
4,b'\xb4@B\x98\xc0\xbdlc\x9f\xf9\xd7*<\xb6\xeb\xa4',8a4e5eed-f77f-9da7-830d-232c0b5b4ebe,1.0,1.777778,0.142381,5.5,0.416667,0.166667,0.416667,1.0,0.0,0.0,0 days 00:00:06,8,6.0,0.333333,0.0
