In [None]:
# Mount Google Drive
# from google.colab import drive
# drive.mount('/content/drive')

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


In [None]:
import pandas as pd

# Location of the cleaned dataset, relative to the notebook's working directory.
path = "../data/processed/cleaned_data_final.csv"
# Load the CSV into the working frame `df`; the feature-engineering cells below add columns to it.
df = pd.read_csv(path)


In [None]:
# Preview the first rows of the freshly loaded data as a sanity check.
df.head()

Unnamed: 0,customerID,customerType,riskLevel,investmentCapacity,account_creation_date,ISIN,transactionID,transactionType,transaction_date,totalValue,...,assetSubCategory,sector,industry,asset_description,exchangeID,name,market_description,country,tradingHours,marketClass
0,00017496858921195E5A,Professional,Aggressive,CAP_GT300K,2021-03-19,GRS434003000,7590224,Buy,2020-03-27,11000.0,...,Large Cap,Utilities,Utilities - Renewable,Public Power Corporation S.A. (PPC SA) is a ma...,ATHEX,Athens Exchange S.A. Cash Market,The Athens Stock Exchange (ASE or ATHEX) is th...,Greece,08:15-15:20,Public Securities
1,00017496858921195E5A,Professional,Aggressive,CAP_GT300K,2021-03-19,GRS434003000,7607029,Sell,2020-04-06,12080.0,...,Large Cap,Utilities,Utilities - Renewable,Public Power Corporation S.A. (PPC SA) is a ma...,ATHEX,Athens Exchange S.A. Cash Market,The Athens Stock Exchange (ASE or ATHEX) is th...,Greece,08:15-15:20,Public Securities
2,00017496858921195E5A,Professional,Aggressive,CAP_GT300K,2021-03-19,GRS434003000,7634872,Buy,2020-04-24,13400.0,...,Large Cap,Utilities,Utilities - Renewable,Public Power Corporation S.A. (PPC SA) is a ma...,ATHEX,Athens Exchange S.A. Cash Market,The Athens Stock Exchange (ASE or ATHEX) is th...,Greece,08:15-15:20,Public Securities
3,00017496858921195E5A,Professional,Aggressive,CAP_GT300K,2021-03-19,GRS434003000,7652627,Sell,2020-05-07,12700.0,...,Large Cap,Utilities,Utilities - Renewable,Public Power Corporation S.A. (PPC SA) is a ma...,ATHEX,Athens Exchange S.A. Cash Market,The Athens Stock Exchange (ASE or ATHEX) is th...,Greece,08:15-15:20,Public Securities
4,00017496858921195E5A,Professional,Aggressive,CAP_GT300K,2021-03-19,GRS434003000,7664807,Buy,2020-05-15,12150.0,...,Large Cap,Utilities,Utilities - Renewable,Public Power Corporation S.A. (PPC SA) is a ma...,ATHEX,Athens Exchange S.A. Cash Market,The Athens Stock Exchange (ASE or ATHEX) is th...,Greece,08:15-15:20,Public Securities


# Transaction Level Feature Engineering

1. Transaction Recency (days since transaction)

    ✅ Useful if applying recency weights (recent trades matter more).

    🚫 If not using time decay, it may add noise → safe to drop.

2. Transaction Value per Unit

    ✅ Useful for grouping customers who favor higher-priced vs lower-priced assets.

    Also helps distinguish investment style (e.g., penny stocks vs blue-chips).

3. Buy/Sell Encoding

    ✅ Crucial for recommendation, because buy = positive signal and sell = negative (or down-weighted).

    Prevents recommending assets that a user just sold off.

4. Channel Encoding

    ✅ Useful for segmenting customer behavior:

    Internet Banking users may be more active/digital-savvy.

    Branch/Phone users may be conservative or less frequent traders.

    Adds value in hybrid recommenders where user behavior is considered.

5. Account Age at Transaction

    ✅ Useful proxy for customer maturity:

    New accounts → safer, diversified recommendations.

    Long-tenured accounts → more advanced/diverse asset recommendations.

    Helps mitigate cold start for new users.

6. Transaction frequency on the daily, monthly and yearly scale:
    
    ✅ Useful indicator of engagement and activity level:

    High-frequency → active trader, needs frequent updates.

    Low-frequency → passive investor, prefers long-term strategies.
    
    Helps distinguish churn risk from loyal, engaged customers.

In [None]:
import datetime as dt
import pandas as pd

# Parse both date columns once so the `.dt` accessor and the date arithmetic below work.
for date_col in ("transaction_date", "account_creation_date"):
    df[date_col] = pd.to_datetime(df[date_col])

# 1. Recency: whole days between each transaction and the newest transaction in the data
max_date = df["transaction_date"].max()
txn_age = max_date - df["transaction_date"]
df["days_since_txn"] = txn_age.dt.days

# 2. Value per unit traded
# NOTE(review): a row with units == 0 would yield inf/NaN here — confirm none exist upstream.
df["value_per_unit"] = df["totalValue"].div(df["units"])

# 3. Buy / sell indicator columns (1 when the transaction type matches, otherwise 0)
txn_type = df["transactionType"]
df["is_buy"] = txn_type.eq("Buy").astype(int)
df["is_sell"] = txn_type.eq("Sell").astype(int)

# 4. Integer code per distinct channel value
channel_as_category = df["channel"].astype("category")
df["channel_encoded"] = channel_as_category.cat.codes

# 5. Account age in whole days at the time of the transaction.
# The previewed rows contain trades dated before account_creation_date, so this can be negative.
df["account_age_days"] = df["transaction_date"].sub(df["account_creation_date"]).dt.days

# 6. Number of transactions per customer at daily, monthly and yearly granularity
def _count_txns_per_customer(period_key, count_name):
    """Count transactionID values per (customerID, period_key) group.

    period_key is a Series aligned with `df` that labels each row's period;
    count_name is the name given to the resulting count column.
    """
    return (
        df
        .groupby(['customerID', period_key])['transactionID']
        .count()
        .reset_index(name=count_name)
    )


txn_dt = df['transaction_date'].dt

# daily: one row per customer and calendar date
daily_txn = _count_txns_per_customer(txn_dt.date, 'daily_txn_count')

# monthly: one row per customer and calendar month
monthly_txn = _count_txns_per_customer(txn_dt.to_period('M'), 'monthly_txn_count')

# yearly: one row per customer and calendar year
yearly_txn = _count_txns_per_customer(txn_dt.year, 'yearly_txn_count')

# Customer Features
1. Number of Unique Assets (n_unique_assets)

    ✅ Measures portfolio diversification.
    Customers with broader diversification may prefer diversified recommendations (ETFs, balanced funds), while highly concentrated investors may be interested in assets similar to their core holdings.

2. Number of Unique Sectors (n_unique_sectors)

    ✅ Captures breadth of exposure across industries.
    Helps recommend assets in underrepresented sectors to improve
    diversification, or reinforce existing sector preferences.

3. Number of Unique Markets (n_unique_markets)

    ✅ Identifies geographic diversification.
    Investors trading across multiple markets may be open to international recommendations; single-market users may prefer local suggestions.

4. Total Invested (total_invested)

    ✅ Proxy for customer scale and seriousness as an investor.
    High-value investors might need premium or complex products, while low-value investors may be more risk-averse.

5. Average Transaction Value (avg_txn_value)

    ✅ Reflects typical trade size.
    Useful for matching customers with assets in the same “ticket size” range.

6. Median Units (median_units)

    ✅ Captures typical lot size preference.
    Helps group customers who prefer smaller vs larger positions.

7. Number of Transactions (n_transactions)

    ✅ Indicates activity level.
    More active traders should be recommended frequently traded assets; less active users may be better matched with long-term products.

8. Number of Buys / Sells (n_buys, n_sells)

    ✅ Provides balance of entry vs exit behavior.
    High sell ratios might indicate churn or rebalancing needs; high buy ratios are stronger preference signals.

9. Buy Ratio (buy_ratio)

    ✅ Normalized indicator of trading orientation.
    Helps distinguish accumulators from divestors.

10. Channel Preference (channel_pref)

    ✅ Behavioral segmentation feature.
    Internet Banking–dominated customers may be younger and more tech-savvy; branch/phone users may prefer conservative, stable recommendations.

11. Days Since Last Transaction (days_since_last_txn)

    ✅ Recency indicator at customer level.
    Active customers are more likely to engage with recommendations; inactive customers may need reactivation campaigns.

12. Risk Level (risk_level)

    ✅ Core alignment feature.
    Ensures recommended assets match (or cautiously stretch) a customer’s risk tolerance.

13. Investment Capacity (invest_capacity)

    ✅ Proxy for affordability.
    Prevents recommending assets outside the customer’s capacity range.

14. Portfolio Concentration (portfolio_concentration, HHI)

    ✅ Diversification vs concentration metric.
    High HHI = concentrated → recommend diversification.
    Low HHI = diversified → recommend similar, low-correlation assets.

15. Total invested amount per customer per asset

    ✅ Helps detect overexposure in single assets and adjust recommendations.

16. Seasonality (day of week, month, quarter)

    ✅ Reveals behavioral cycles & market timing:
    End-of-month/quarter surges → salary-based investments.
    Day-of-week patterns ‚Üí habitual trading preferences.
    Useful for predicting demand spikes and optimizing alerts.

In [None]:
# Per-customer diversification and size metrics; one row per customerID.
customer_agg_spec = {
    "n_unique_assets": pd.NamedAgg(column="ISIN", aggfunc="nunique"),        # distinct assets traded
    "n_unique_sectors": pd.NamedAgg(column="sector", aggfunc="nunique"),     # distinct sectors traded
    "n_unique_markets": pd.NamedAgg(column="marketID", aggfunc="nunique"),   # distinct markets traded
    "total_invested": pd.NamedAgg(column="totalValue", aggfunc="sum"),       # sum of totalValue over all rows
    "avg_txn_value": pd.NamedAgg(column="totalValue", aggfunc="mean"),       # mean totalValue per transaction
    "median_units": pd.NamedAgg(column="units", aggfunc="median"),           # median units per transaction
}
customer_features = df.groupby("customerID", as_index=False).agg(**customer_agg_spec)


In [None]:
def _most_common(values):
    """Return the most frequent non-null entry of `values`, or None when there is none.

    `Series.mode()` returns an empty Series when every entry is missing, so
    indexing it with [0] would raise; this guard keeps such groups as missing.
    """
    modes = values.mode()
    return modes.iat[0] if not modes.empty else None


# Per-customer trading activity; one row per customerID.
trading_behavior = df.groupby("customerID").agg(
    n_transactions=("transactionID", "count"),
    n_buys=("is_buy", "sum"),
    n_sells=("is_sell", "sum"),
    buy_ratio=("is_buy", "mean"),                # fraction of trades that are buys
    channel_pref=("channel", _most_common),      # most common channel (None if all missing)
    last_txn_date=("transaction_date", "max")
).reset_index()

# Days between each customer's last trade and the newest transaction in the data
max_date = df["transaction_date"].max()
trading_behavior["days_since_last_txn"] = (max_date - trading_behavior["last_txn_date"]).dt.days


In [None]:
# Ordinal encoding of the risk profile; "Predicted_*" labels share the code of their base label.
risk_map = {
    "Conservative": 0, "Income": 1, "Balanced": 2, "Aggressive": 3,
    "Predicted_Conservative": 0, "Predicted_Income": 1,
    "Predicted_Balanced": 2, "Predicted_Aggressive": 3
}
df["risk_encoded"] = df["riskLevel"].map(risk_map)

# Representative amount assigned to each investment-capacity bucket.
capacity_map = {
    "CAP_LT_30K": 15000, "CAP_30K_80K": 55000,
    "CAP_80K_300K": 190000, "CAP_GT300K": 400000,
    "Predicted_CAP_LT_30K": 15000, "Predicted_CAP_30K_80K": 55000,
    "Predicted_CAP_80K_300K": 190000, "Predicted_GT300K": 400000,
    # The original key "Predicted_GT300K" lacks the "CAP_" prefix used by every other
    # label; accept the consistently prefixed spelling as well.
    "Predicted_CAP_GT300K": 400000,
    # NOTE(review): the data spells the top bucket "CAP_GT300K" (no underscore after GT),
    # so the bottom bucket is presumably "CAP_LT30K" — accept both spellings; confirm against the CSV.
    "CAP_LT30K": 15000, "Predicted_CAP_LT30K": 15000,
}
df["capacity_numeric"] = df["investmentCapacity"].map(capacity_map)

# `.map` silently turns unknown labels into NaN; surface them so key typos are noticed.
for source_col, encoded_col in (("riskLevel", "risk_encoded"),
                                ("investmentCapacity", "capacity_numeric")):
    unmapped_labels = df.loc[df[encoded_col].isna() & df[source_col].notna(), source_col].unique()
    if len(unmapped_labels):
        print(f"Warning: unmapped {source_col} labels left as NaN: {list(unmapped_labels)}")

# Collapse to one row per customer; max() picks a single value if a customer has several.
risk_capacity = df.groupby("customerID").agg(
    risk_level=("risk_encoded", "max"),
    invest_capacity=("capacity_numeric", "max")
).reset_index()


In [None]:
def _sum_of_squares(shares):
    """Sum of squared values of `shares`."""
    return (shares ** 2).sum()


# Total value per (customer, asset) pair
portfolio_share = df.groupby(["customerID", "ISIN"], as_index=False).agg(
    asset_value=pd.NamedAgg(column="totalValue", aggfunc="sum")
)

# Each asset's fraction of the customer's summed value
customer_total = portfolio_share.groupby("customerID")["asset_value"].transform("sum")
portfolio_share["total_portfolio"] = customer_total
portfolio_share["share"] = portfolio_share["asset_value"].div(portfolio_share["total_portfolio"])

# Herfindahl-Hirschman index: sum of squared shares per customer
hhi = portfolio_share.groupby("customerID", as_index=False).agg(
    portfolio_concentration=pd.NamedAgg(column="share", aggfunc=_sum_of_squares)
)


In [None]:
from functools import reduce

def _left_join_on_customer(accumulated, next_table):
    """Left-join `next_table` onto `accumulated` using the customerID column."""
    return accumulated.merge(next_table, how="left", on="customerID")


# Fold the per-customer tables into one wide table, starting from customer_features.
dfs = [customer_features, trading_behavior, risk_capacity, hhi]
customer_all = reduce(_left_join_on_customer, dfs)


In [None]:
# Broadcast the per-customer features onto every transaction row (left join on customerID).
# This reassigns `df`, so re-running the cell on an already-merged frame merges the same columns again.
df = df.merge(customer_all, on="customerID", how="left")


In [None]:
# Summed totalValue for every (customer, asset) pair, kept as a separate table.
cust_asset_invested = (
    df
    .groupby(['customerID', 'ISIN'], as_index=False)['totalValue']
    .sum()
    .rename(columns={'totalValue': 'total_invested_per_asset'})
)

In [None]:
# Calendar features derived from the transaction date
txn_when = df['transaction_date'].dt
df['day_of_week'] = txn_when.day_name()   # weekday name as text
df['month'] = txn_when.month
df['quarter'] = txn_when.quarter

## Asset-Level Feature Engineering

1. Number of Unique Investors (n_unique_investors)

    ✅ Measures asset popularity.
    Highly held assets can be recommended as “safe” or “mainstream” choices; niche assets with few investors may appeal to specialized users.

2. Total Traded Value (total_traded_value)

    ✅ Proxy for liquidity and demand.
    High traded value assets are easier to buy/sell (liquid), making them safer for recommendation; low traded value assets may be riskier or illiquid.

3. Average Traded Value (avg_traded_value)

    ✅ Reflects the average transaction size per trade.
    Useful for matching assets to customer investment styles — e.g., retail-friendly small trades vs institutional-sized transactions.

4. Number of Transactions (n_transactions_asset)

    ✅ Indicates trading frequency.
    Assets with high transaction counts are “active” and suitable for frequent traders; low counts may be more buy-and-hold type products.

5. Number of Buys / Sells (n_buys_asset, n_sells_asset)

    ✅ Shows sentiment and demand direction.
    Assets with more buys than sells are growing in favor; assets with more sells may be losing appeal.

6. Buy Ratio (buy_ratio_asset)

    ✅ Normalized demand indicator.
    Helps identify “in-demand” vs “out-of-favor” assets to guide recommendation scoring.

7. Last Trade Date (last_trade_date_asset)

    ✅ Recency measure at the asset level.
    Shows how recently the market has been active in this asset — useful for filtering out stale or inactive products.

8. Days Since Last Trade (days_since_last_trade_asset)

    ✅ Indicator of current asset momentum.
    Assets traded recently are more relevant for recommendation; long-inactive assets may be excluded.

9. Holding Duration (holding_duration_days)

    ✅ Captures the asset’s typical investment horizon.

    Short holding duration = speculative or trading asset.

    Long holding duration = stable, long-term holding.
    Useful for matching assets to customer trading style.

10. Static Metadata (sector, industry, assetCategory, assetSubCategory, marketClass, country)

    ✅ Essential for content-based recommendations.
    Allows suggesting “similar assets” in the same sector/industry/country, or diversifying into under-represented categories.

In [None]:
# One row per asset (ISIN): popularity, traded value and buy/sell mix.
# n_transactions, n_buys, n_sells and buy_ratio share their names with customer-level
# columns already on `df`; pandas disambiguates them with suffixes when this table is merged back.
asset_agg_spec = {
    "n_unique_investors": ("customerID", "nunique"),   # distinct customers that traded the asset
    "total_traded_value": ("totalValue", "sum"),       # summed totalValue
    "avg_traded_value": ("totalValue", "mean"),        # mean totalValue per transaction
    "n_transactions": ("transactionID", "count"),      # number of trades
    "n_buys": ("is_buy", "sum"),
    "n_sells": ("is_sell", "sum"),
    "buy_ratio": ("is_buy", "mean"),                   # fraction of trades that are buys
    "last_trade_date": ("transaction_date", "max"),
}
asset_features = df.groupby("ISIN", as_index=False).agg(**asset_agg_spec)

# Recency at asset level: days between the asset's last trade and the newest transaction overall
max_date = df["transaction_date"].max()
since_last_trade = max_date - asset_features["last_trade_date"]
asset_features["days_since_last_trade"] = since_last_trade.dt.days


In [None]:
# First and last transaction date observed for each asset
asset_dates = (
    df.groupby("ISIN")["transaction_date"]
    .agg(first_trade_date="min", last_trade_date="max")
    .reset_index()
)

# As computed, this is the span in days between an asset's first and last trade in the data.
trade_span = asset_dates["last_trade_date"] - asset_dates["first_trade_date"]
asset_dates["holding_duration_days"] = trade_span.dt.days


In [None]:
# Add the per-asset duration to the asset feature table (left join on ISIN).
holding_duration = asset_dates.loc[:, ["ISIN", "holding_duration_days"]]
asset_all = pd.merge(asset_features, holding_duration, how="left", on="ISIN")


In [None]:
# Broadcast the per-asset features onto every transaction row (left join on ISIN).
# asset_all reuses the names n_transactions, n_buys, n_sells and buy_ratio that the
# customer-level merge already put on `df`. With pandas' default suffixes these would
# come out as opaque *_x / *_y pairs, so keep the customer-level names untouched and
# tag the asset-level ones with "_asset" (n_transactions_asset, buy_ratio_asset, ...).
df = df.merge(asset_all, on="ISIN", how="left", suffixes=("", "_asset"))


# Interaction Level Features

1. Customer–Asset Total Value (cust_asset_total_value)

    ✅ Captures how much money a customer has allocated to a specific asset.
    Higher values mean stronger conviction, so these assets (and similar ones) should rank higher in recommendations.

2. Customer–Asset Number of Transactions (cust_asset_n_txn)

    ✅ Shows intensity of interaction with the asset.
    Frequent trades suggest active interest, while single trades may indicate experimentation.

3. Customer–Asset Average Transaction Value (cust_asset_avg_value)

    ✅ Reflects how large each typical trade is for this customer–asset pair.
    Helps differentiate between small trial trades vs consistent large commitments.

4. Customer–Asset Buy Ratio (cust_asset_buy_ratio)

    ✅ Signals sentiment toward the asset.

    High buy ratio → customer is accumulating.

    High sell ratio → customer is offloading → down-weight recommendations for this asset.

5. Last Customer–Asset Transaction (last_cust_asset_txn)

    ✅ Records the most recent trade for this user–asset pair.
    Useful for understanding whether the customer’s relationship with the asset is current or outdated.

6. Days Since Last Customer–Asset Transaction (cust_asset_days_since_last_txn)

    ✅ Recency indicator of the specific relationship.
    Helps apply time-decay weights so recent interests are prioritized.

7. Customer–Asset Portfolio Share (cust_asset_portfolio_share)

    ✅ Measures how important an asset is within the customer’s portfolio.

    High share = core holding → recommend similar assets to reinforce.

    Low share = peripheral holding → recommend as exploratory or complementary.

In [None]:
# One row per (customer, asset) pair describing how that customer traded that asset.
pair_agg_spec = {
    "cust_asset_total_value": ("totalValue", "sum"),     # summed totalValue for the pair
    "cust_asset_n_txn": ("transactionID", "count"),      # number of transactions for the pair
    "cust_asset_avg_value": ("totalValue", "mean"),      # mean totalValue per transaction
    "cust_asset_buy_ratio": ("is_buy", "mean"),          # fraction of the pair's trades that are buys
    "last_cust_asset_txn": ("transaction_date", "max"),  # most recent transaction date for the pair
}
interaction_features = df.groupby(["customerID", "ISIN"], as_index=False).agg(**pair_agg_spec)


In [None]:
# Days between each pair's last transaction and the newest transaction in the data
max_date = df["transaction_date"].max()
since_last_pair_txn = max_date - interaction_features["last_cust_asset_txn"]
interaction_features["cust_asset_days_since_last_txn"] = since_last_pair_txn.dt.days


In [None]:
# Summed totalValue per customer, used as the denominator of the share
cust_totals = (
    df.groupby("customerID", as_index=False)["totalValue"]
    .sum()
    .rename(columns={"totalValue": "cust_total_value"})
)

# Bring the customer total onto each (customer, asset) row
interaction_features = pd.merge(interaction_features, cust_totals, how="left", on="customerID")

# Fraction of the customer's summed value that belongs to this asset
pair_value = interaction_features["cust_asset_total_value"]
interaction_features["cust_asset_portfolio_share"] = pair_value.div(
    interaction_features["cust_total_value"]
)


In [None]:
# Broadcast the customer–asset interaction features onto every transaction row
# (left join on the customerID/ISIN pair).
df = df.merge(interaction_features, on=["customerID", "ISIN"], how="left")


In [None]:
# Preview the first rows of the fully engineered frame.
df.head()

Unnamed: 0,customerID,customerType,riskLevel,investmentCapacity,account_creation_date,ISIN,transactionID,transactionType,transaction_date,totalValue,...,days_since_last_trade,holding_duration_days,cust_asset_total_value,cust_asset_n_txn,cust_asset_avg_value,cust_asset_buy_ratio,last_cust_asset_txn,cust_asset_days_since_last_txn,cust_total_value,cust_asset_portfolio_share
0,00017496858921195E5A,Professional,Aggressive,CAP_GT300K,2021-03-19,GRS434003000,7590224,Buy,2020-03-27,11000.0,...,0,1793,229180.0,12,19098.333333,0.5,2021-05-19,560,728451.013,0.314613
1,00017496858921195E5A,Professional,Aggressive,CAP_GT300K,2021-03-19,GRS434003000,7607029,Sell,2020-04-06,12080.0,...,0,1793,229180.0,12,19098.333333,0.5,2021-05-19,560,728451.013,0.314613
2,00017496858921195E5A,Professional,Aggressive,CAP_GT300K,2021-03-19,GRS434003000,7634872,Buy,2020-04-24,13400.0,...,0,1793,229180.0,12,19098.333333,0.5,2021-05-19,560,728451.013,0.314613
3,00017496858921195E5A,Professional,Aggressive,CAP_GT300K,2021-03-19,GRS434003000,7652627,Sell,2020-05-07,12700.0,...,0,1793,229180.0,12,19098.333333,0.5,2021-05-19,560,728451.013,0.314613
4,00017496858921195E5A,Professional,Aggressive,CAP_GT300K,2021-03-19,GRS434003000,7664807,Buy,2020-05-15,12150.0,...,0,1793,229180.0,12,19098.333333,0.5,2021-05-19,560,728451.013,0.314613


In [None]:
# Print out all the column names to check which features ended up on the frame
# (merge suffixes such as _x/_y indicate name collisions between merged tables).
print(df.columns)

Index(['customerID', 'customerType', 'riskLevel', 'investmentCapacity',
       'account_creation_date', 'ISIN', 'transactionID', 'transactionType',
       'transaction_date', 'totalValue', 'units', 'channel', 'marketID',
       'assetName', 'assetCategory', 'assetSubCategory', 'sector', 'industry',
       'asset_description', 'exchangeID', 'name', 'market_description',
       'country', 'tradingHours', 'marketClass', 'days_since_txn',
       'value_per_unit', 'is_buy', 'is_sell', 'channel_encoded',
       'account_age_days', 'risk_encoded', 'capacity_numeric',
       'n_unique_assets', 'n_unique_sectors', 'n_unique_markets',
       'total_invested', 'avg_txn_value', 'median_units', 'n_transactions_x',
       'n_buys_x', 'n_sells_x', 'buy_ratio_x', 'channel_pref', 'last_txn_date',
       'days_since_last_txn', 'risk_level', 'invest_capacity',
       'portfolio_concentration', 'day_of_week', 'month', 'quarter',
       'n_unique_investors', 'total_traded_value', 'avg_traded_value',
      

In [None]:
# Write the engineered frame to CSV without the row index.
# NOTE(review): this is an absolute Colab/Google Drive path, while the input is read from a
# relative path and the drive-mount cell is commented out — confirm the target exists where this runs.
out_path = "/content/drive/MyDrive/NUSFintech/src/engineered_features.csv"
df.to_csv(out_path, index=False)
print("Saved to", out_path)

Saved to /content/drive/MyDrive/NUSFintech/src/engineered_features.csv
