# Feature Engineering

We start by loading and preprocessing the data, as done in the data_preprocessing notebook:

In [7]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns

plt.rcParams["figure.figsize"] = (20, 13)
%matplotlib inline
%config InlineBackend.figure_format = "retina"

interactions = pd.read_csv("data_final_project/KuaiRec 2.0/data/small_matrix.csv")
user_features = pd.read_csv("data_final_project/KuaiRec 2.0/data/user_features.csv")
item_daily_features = pd.read_csv("data_final_project/KuaiRec 2.0/data/item_daily_features.csv")
item_categories = pd.read_csv("data_final_project/KuaiRec 2.0/data/item_categories.csv")
social_network = pd.read_csv("data_final_project/KuaiRec 2.0/data/social_network.csv")

pd.set_option('display.max_columns', None)

item_daily_features = item_daily_features.dropna(subset=['video_tag_name', 'video_duration'])

We begin by removing columns that are not useful: either they give no relevant information for content-based filtering, or they are hard to use directly (such as IDs, which we will use in the next step):

In [8]:
import ast
from sklearn.preprocessing import MultiLabelBinarizer

# Normalize headers: strip stray whitespace around the column names of every
# frame used below.
for frame in (item_categories, item_daily_features, interactions):
    frame.columns = frame.columns.str.strip()

# Columns that are not used as features in the rest of this notebook.
interaction_cols_to_drop = ['play_duration', 'video_duration', 'time', 'date', 'timestamp']
item_daily_cols_to_drop = [
    'author_id', 'music_id', 'video_tag_id', 'date', 'video_type',
    'video_tag_name', 'upload_dt', 'upload_type', 'visible_status',
    'video_width', 'video_height',
    'delete_comment_cnt', 'delete_comment_user_num',
    'reduce_similar_cnt', 'reduce_similar_user_num',
    'collect_cnt', 'collect_user_num',
    'cancel_collect_cnt', 'cancel_collect_user_num',
]

# Remove the unused columns (raises KeyError if one of them is missing,
# e.g. when this cell is run twice on the same kernel).
interactions = interactions.drop(columns=interaction_cols_to_drop)
item_daily_features = item_daily_features.drop(columns=item_daily_cols_to_drop)

Now, let's merge every video feature with the small_matrix:

In [9]:
# Parse the 'feat' column: each cell is the string form of a list of category
# ids, which ast.literal_eval turns back into a Python list.
item_categories_copy = item_categories.copy()
item_categories_copy['feat'] = item_categories_copy['feat'].apply(ast.literal_eval)

# Multi-hot encode the category lists: one 0/1 column per distinct category id.
mlb = MultiLabelBinarizer()
one_hot = pd.DataFrame(
    mlb.fit_transform(item_categories_copy['feat']),
    columns=[f'category_{cls}' for cls in mlb.classes_],
    index=item_categories_copy.index
)

# Combine video_id with the encoded categories
item_categories_encoded = pd.concat([item_categories_copy[['video_id']], one_hot], axis=1)

# Attach the category indicators to every interaction (left join keeps all
# interactions, even for videos without category information).
merged_df = pd.merge(interactions, item_categories_encoded, on='video_id', how='left')

# Collapse the daily statistics to one row per video.
# Daily counters are summed over all days, but video_duration is not an
# additive counter: summing it would inflate the duration by the number of
# daily rows available for the video, so the first value is kept instead.
# NOTE(review): assumes video_duration is constant across a video's daily rows
# -- confirm against the dataset description.
daily_agg_spec = {
    col: 'sum' for col in item_daily_features.columns if col != 'video_id'
}
if 'video_duration' in daily_agg_spec:
    daily_agg_spec['video_duration'] = 'first'
item_daily_agg = (
    item_daily_features
    .groupby('video_id')
    .agg(daily_agg_spec)
    .reset_index()
)

# Attach the aggregated item statistics to every interaction
final_df = pd.merge(merged_df, item_daily_agg, on=['video_id'], how='left')

# Display first few rows
print(final_df.head())

   user_id  video_id  watch_ratio  category_0  category_1  category_2  \
0       14       148     0.722103           0           0           0   
1       14       183     1.907377           0           0           0   
2       14      3649     2.063311           0           0           0   
3       14      5262     0.566388           0           0           0   
4       14      8234     0.418364           0           0           0   

   category_3  category_4  category_5  category_6  category_7  category_8  \
0           0           0           0           0           0           0   
1           0           0           0           0           0           0   
2           0           0           0           0           0           0   
3           0           0           0           0           0           0   
4           0           0           0           1           0           0   

   category_9  category_10  category_11  category_12  category_13  \
0           0            0   

We can then add the `liked` column: if a user's watch_ratio for a video is greater than 2, we consider that they liked the video.

In [10]:
# Binary label: 1 when the user's watch_ratio for the video is strictly above 2, else 0.
final_df['liked'] = final_df['watch_ratio'].gt(2).astype(int)

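Then we perform a PCA (Principal Component Analysis) to see which features carry the most variance in the data. Note that PCA is unsupervised: it does not use the `liked` label, so the PC1 weights below show which features dominate the overall variance, not directly which ones determine whether a video is liked.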

In [None]:
from sklearn.decomposition import PCA
from sklearn.preprocessing import StandardScaler

# Work on a separate copy of the merged frame.
final_df_pca = final_df.copy()

# Keep only fully populated rows (any NaN removes the row).
df_clean = final_df_pca.dropna()

# Identifiers, the raw watch ratio and the derived label are not features.
non_feature_cols = ['user_id', 'video_id', 'watch_ratio', 'liked']
feature_cols = [name for name in df_clean.columns if name not in non_feature_cols]
X = df_clean[feature_cols]

# Standardize the features so that every column is on a comparable scale
print("Standardize feature")
scaler = StandardScaler()
X_scaled = scaler.fit(X).transform(X)

# Fit a PCA with as many components as there are features
print("Apply PCA")
pca = PCA(n_components=len(feature_cols))
X_pca = pca.fit_transform(X_scaled)

# Weights (loadings) of the first principal component, one per feature
pc1 = pca.components_[0]

# Map each feature to its PC1 weight and to the magnitude of that weight
pca_importance = pd.DataFrame(
    {'feature': feature_cols, 'pc1_weight': pc1}
).assign(abs_weight=lambda frame: frame['pc1_weight'].abs())

# Largest absolute weight first
pca_importance_sorted = pca_importance.sort_values(by='abs_weight', ascending=False)

# Display the 10 features with the largest absolute PC1 weight
print("Top 10 features contributing to PC1 (most variance):")
print(pca_importance_sorted.head(10)[['feature', 'pc1_weight']])