<a href="https://colab.research.google.com/github/theresaskruzna/riiid_knowledge_tracing/blob/main/02_Feature_Engineering.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# Feature engineering

## Feature creation

One-hot encoding: Converting categorical variables to binary columns

In [None]:
categorical_cols = ['part', 'type_of']  # Columns for one-hot encoding

# One-hot encode all listed columns in a single call.
# - `columns=` names the columns explicitly, so they are encoded even if they
#   hold numeric codes (without it pandas only encodes object/string/category
#   columns and the two-item `prefix` list can fail to line up).
# - the original columns are removed from the result by get_dummies itself,
#   so no per-column concat/drop loop is needed.
# - `dtype=int` produces 0/1 indicator columns instead of booleans.
# `lectures_df` itself is left untouched; get_dummies returns a new frame.
lectures_encoded = pd.get_dummies(
    lectures_df,
    columns=categorical_cols,
    prefix=['part', 'type'],  # one prefix per entry of categorical_cols, same order
    dtype=int,
)

lectures_encoded.head()

In [None]:
categorical_cols = ['correct_answer', 'part']  # Columns for one-hot encoding

# One-hot encode all listed columns in a single call.
# - `columns=` names the columns explicitly, so they are encoded even if they
#   hold numeric codes (without it pandas only encodes object/string/category
#   columns).
# - the default prefix is the source column name, giving `correct_answer_*`
#   and `part_*` indicator columns.
# - the original columns are removed from the result by get_dummies itself,
#   so no per-column concat/drop loop is needed.
# - `dtype=int` produces 0/1 indicator columns instead of booleans.
# `questions_df` itself is left untouched; get_dummies returns a new frame.
questions_encoded = pd.get_dummies(
    questions_df,
    columns=categorical_cols,
    dtype=int,
)

questions_encoded.head()

Learning Progress Features

Knowledge growth rate: Change in correctness over time

In [None]:
# Order the interaction log chronologically within each user
train_sorted = train.sort_values(['user_id', 'timestamp'])

# Option 1: time-based bins -- 10 equal-width intervals spanning the full
# range of `timestamp` over all users (the bin edges are shared, not per user)
train_sorted['time_bin'] = pd.cut(train_sorted['timestamp'], bins=10)

# Option 2: sequence-based bins -- a running row counter per user, grouped in
# blocks of 10 rows. Kept as columns for reference; the aggregation below
# uses `time_bin` only.
train_sorted['question_seq'] = train_sorted.groupby('user_id').cumcount()
train_sorted['question_bin'] = train_sorted['question_seq'] // 10

# Mean correctness per (user, time bin), restricted to question rows.
# `time_bin` is categorical, and the default for `observed` differs between
# pandas versions. observed=False is passed explicitly so the result does not
# depend on the installed version: every user gets one row per bin, and bins
# without answers hold NaN.
growth_rate = (
    train_sorted.loc[train_sorted['content_type_id'] == False,
                     ['user_id', 'time_bin', 'answered_correctly']]
    .groupby(['user_id', 'time_bin'], observed=False)['answered_correctly']
    .mean()
    .rename('correctness_rate')
    .reset_index()
)

# Change in correctness between one bin and the previous bin of the same user.
# The first bin of each user, and any bin adjacent to an empty bin, yields NaN.
growth_rate_change = growth_rate.sort_values(['user_id', 'time_bin'])
growth_rate_change['previous_rate'] = (
    growth_rate_change.groupby('user_id')['correctness_rate'].shift(1)
)
growth_rate_change['growth_rate'] = (
    growth_rate_change['correctness_rate'] - growth_rate_change['previous_rate']
)

# Average growth per user (NaN entries are skipped by mean)
user_avg_growth = growth_rate_change.groupby('user_id')['growth_rate'].mean()

The average correctness rate for each user (what percentage of questions they answered correctly)

In [None]:
# Per-user mean of `answered_correctly`, computed over question rows only.
# Named aggregation yields the final column name directly.
results_u_final = (
    train.loc[train['content_type_id'].eq(False), ['user_id', 'answered_correctly']]
    .groupby('user_id')
    .agg(answered_correctly_user=('answered_correctly', 'mean'))
)

In [None]:
 # NOTE(review): this cell only evaluates a tuple of two strings and assigns it
 # to nothing. It looks like a placeholder for feature names that are still to
 # be implemented -- confirm, then implement or remove.
 'avg_questions', 'avg_questions_seen'

The average rate at which users received explanations for prior questions

In [None]:
# Per-user mean of `prior_question_had_explanation`, computed over question
# rows only. Named aggregation yields the final column name directly.
results_u2_final = (
    train.loc[train['content_type_id'].eq(False), ['user_id', 'prior_question_had_explanation']]
    .groupby('user_id')
    .agg(explanation_mean_user=('prior_question_had_explanation', 'mean'))
)

In [None]:
# Summary statistics of the per-user explanation rate
results_u2_final['explanation_mean_user'].describe()

How many times each unique question has been answered, and the percentage of correct answers for each question

# Merging datasets

The goal is to merge the train, questions, and lectures datasets for model building.

question_id is a foreign key for the train/test content_id column, when the content type is question (0)

lecture_id is a foreign key for the train/test content_id column, when the content type is lecture (1)

In [None]:
import pandas as pd

def merge_datasets(train_df, questions_df, lectures_df):
    """
    Merge the interaction log with question and lecture metadata.

    `content_id` in train_df refers to a question when `content_type_id` is 0
    and to a lecture when it is 1. Question ids and lecture ids can overlap,
    so after the two left joins the metadata that does not belong to a row's
    content type is blanked out (set to missing).

    Parameters:
    - train_df: DataFrame containing 'content_id' as the key column. If it also
      has 'content_type_id' (0/False = question, 1/True = lecture), that column
      is used to keep only the matching metadata per row; without it, both
      joins are applied to every row unchanged.
    - questions_df: DataFrame containing 'question_id' that links to 'content_id' in train_df
    - lectures_df: DataFrame containing 'lecture_id' that links to 'content_id' in train_df

    Returns:
    - merged_df: The complete merged DataFrame. For every non-key column name
      shared by questions_df and lectures_df (e.g. 'part' or the one-hot
      'part_1' ... 'part_7'), a 'combined_<name>' column holds the question
      value where present and the lecture value otherwise.
    """
    train_cols = set(train_df.columns)

    # Merge train_df with questions_df
    # Use a left merge to keep all rows from train_df
    questions_merged = train_df.merge(
        questions_df,
        left_on='content_id',
        right_on='question_id',
        how='left',
        suffixes=('', '_question')
    )
    # Columns contributed by questions_df (suffixed ones included)
    question_cols = [c for c in questions_merged.columns if c not in train_cols]
    cols_after_questions = set(questions_merged.columns)

    # Merge the resulting DataFrame with lectures_df
    # Use a left merge to keep all rows from our previous merge
    final_merged = questions_merged.merge(
        lectures_df,
        left_on='content_id',
        right_on='lecture_id',
        how='left',
        suffixes=('', '_lecture')
    )
    # Columns contributed by lectures_df (suffixed ones included)
    lecture_cols = [c for c in final_merged.columns if c not in cols_after_questions]

    # Both joins key on content_id alone, so a lecture whose id equals some
    # question_id picks up that question's metadata (and vice versa).
    # Blank out the metadata that does not match the row's content type.
    if 'content_type_id' in final_merged.columns:
        is_lecture = final_merged['content_type_id'].astype(bool)
        for col in question_cols:
            final_merged[col] = final_merged[col].where(~is_lecture)
        for col in lecture_cols:
            final_merged[col] = final_merged[col].where(is_lecture)

    # Coalesce columns present in both lookup tables into 'combined_<name>'.
    # The lecture copy always carries the '_lecture' suffix; the question copy
    # keeps the bare name unless train_df already used it.
    key_cols = ('question_id', 'lecture_id')
    shared_cols = [
        c for c in questions_df.columns
        if c in lectures_df.columns and c not in key_cols
    ]
    for col in shared_cols:
        question_col = f'{col}_question' if col in train_cols else col
        lecture_col = f'{col}_lecture'
        if question_col not in final_merged.columns or lecture_col not in final_merged.columns:
            continue

        # Coalesce logic: take the question value, fall back to the lecture value
        final_merged[f'combined_{col}'] = final_merged[question_col].combine_first(
            final_merged[lecture_col]
        )

    return final_merged

# Example usage
# merged_df = merge_datasets(train_df, questions_df, lectures_df)

In [None]:
# Build the modelling table from the interaction log and both lookup tables.
# NOTE(review): earlier cells call the interaction log `train`; confirm that
# `train_df` is defined upstream. The raw lookup tables are passed here, not
# the one-hot encoded `questions_encoded` / `lectures_encoded` built earlier --
# confirm that is intended.
merged_df = merge_datasets(
    train_df=train_df,
    questions_df=questions_df,
    lectures_df=lectures_df,
)

### Scaling

In [None]:
# Ensure same scale for all columns from 0-1
scaler = MinMaxScaler()

# fit_transform returns a plain array by default, which loses the labels.
# Rebuild the DataFrame with the column names AND the original row index, so
# X_scaled stays aligned with X (and anything else indexed like X).
X_scaled = pd.DataFrame(
    scaler.fit_transform(X),
    columns=X.columns,
    index=X.index,
)

X_scaled.head()

In [None]:
# NOTE(review): `maths_encoded` is not defined in the visible part of this
# notebook -- confirm it exists upstream or point this cell at the intended frame.

# Standardizer applied to the numeric columns below
scaler = StandardScaler()

# Labels of all numeric columns
num_col = maths_encoded.select_dtypes(include='number').columns

# Work on a copy so the source frame keeps its raw values, then overwrite the
# numeric columns with their standardized counterparts
maths_encoded_standardized = maths_encoded.copy()
maths_encoded_standardized[num_col] = scaler.fit_transform(maths_encoded[num_col])

# Preview the standardized data
maths_encoded_standardized.head()