### merge 과정에서 컬럼의 Data Type이 의도치 않게 float64로 바뀌는 문제  

In [26]:
# Demonstrates why an integer column can come out of a merge as float64, and
# three ways to deal with it.
# Cause: the merged column contains NaN values. A NumPy-backed int64 column
# cannot hold NaN, so pandas upcasts it to float64.
import pandas as pd

user_data = {
    "user_id": ["0001", "0002", "0003", "0004", "0005"],
    "name": ["john", "mike", "merry", "poppins", "jenny"],
    "age": [20, 30, 40, 50, 60],
}

purchase_history = {
    "user_id": ["0001", "0002", "0001", "0004", "0005", "0006"],
    "item_id": ["IT01", "IT33", "IT13", "IT99", "IT62", "IT43"],
    "price": [3000, 52000, 950, 4350, 800, 5550],
}

user_df = pd.DataFrame(user_data)
purchase_history_df = pd.DataFrame(purchase_history)

# DataFrame.info() prints its report itself and returns None, so it is called
# directly; wrapping it in print() would append a stray "None" line.
user_df.info()
print("\n")
purchase_history_df.info()

# Purchases on the left: user "0006" has no row in user_df, so `name` and
# `age` are NaN for that purchase and `age` goes from int64 to float64.
merge_pu = purchase_history_df.merge(user_df, on="user_id", how="left")
print("\n")
merge_pu.info()

# Users on the left: user "0003" has no purchase, so `item_id` and `price`
# are NaN for that user and `price` goes from int64 to float64.
merge_up = user_df.merge(purchase_history_df, on="user_id", how="left")
print("\n")
merge_up.info()

# Fix (1): check that the merge is configured as intended (merge key, etc.).
# Fix (2): handle the NaN values, then cast the column back to int.
merge_pu = purchase_history_df.merge(user_df, on="user_id", how="left")
# 99 is the placeholder this demo uses for a missing age; with no NaN left,
# the cast to int64 succeeds.
merge_pu["age"] = merge_pu["age"].fillna(99).astype("int64")
print("\n")
merge_pu.info()

# Fix (3): keep the column as the nullable "Int64" extension dtype (capital
# I), so missing values do not force a float64 upcast.
# Note: this converts user_df["age"] in place.
user_df["age"] = user_df["age"].astype("Int64")
merge_pu = purchase_history_df.merge(user_df, on="user_id", how="left")
print("\n")
merge_pu.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 5 entries, 0 to 4
Data columns (total 3 columns):
 #   Column   Non-Null Count  Dtype 
---  ------   --------------  ----- 
 0   user_id  5 non-null      object
 1   name     5 non-null      object
 2   age      5 non-null      int64 
dtypes: int64(1), object(2)
memory usage: 252.0+ bytes
None


<class 'pandas.core.frame.DataFrame'>
RangeIndex: 6 entries, 0 to 5
Data columns (total 3 columns):
 #   Column   Non-Null Count  Dtype 
---  ------   --------------  ----- 
 0   user_id  6 non-null      object
 1   item_id  6 non-null      object
 2   price    6 non-null      int64 
dtypes: int64(1), object(2)
memory usage: 276.0+ bytes
None


<class 'pandas.core.frame.DataFrame'>
RangeIndex: 6 entries, 0 to 5
Data columns (total 5 columns):
 #   Column   Non-Null Count  Dtype  
---  ------   --------------  -----  
 0   user_id  6 non-null      object 
 1   item_id  6 non-null      object 
 2   price    6 non-null      int64  
 3   name     5 

Unnamed: 0,user_id,item_id,price
0,1,IT01,3000
1,2,IT33,52000
2,1,IT13,950
3,4,IT99,4350
4,5,IT62,800
5,6,IT43,5550
