<a href="https://colab.research.google.com/github/theresaskruzna/riiid_knowledge_tracing/blob/main/06_Extra_Code.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# Load original data

In [None]:
%%time

dtypes = {
    "row_id": "int32",
    "timestamp": "int32",
    "user_id": "int32",
    "content_id": "int16",
    "content_type_id": "boolean",
    "task_container_id": "int16",
    "user_answer": "int8",
    "answered_correctly": "int8",
    "prior_question_elapsed_time": "float32",
    "prior_question_had_explanation": "boolean"
}

data = pd.read_csv("riiid_data/train.csv", dtype=dtypes)

# Convert to boolean after loading
data['user_answer'] = data['user_answer'] == 1
data['answered_correctly'] = data['answered_correctly'] == 1

print("Train size:", data.shape)

In [None]:
!pip install datatable
import datatable as dt

In [None]:
# One-off conversion (first session only): parse the raw CSV with datatable,
# turn it into a pandas DataFrame, and cache that frame on disk as a pickle.
# NOTE(review): pandas infers compression from the file suffix, and ".gzip" is
# not among the documented suffixes (".gz", ".bz2", ".zip", ".xz", ...), so the
# pickle is presumably written uncompressed despite its name - confirm.
raw_csv = "riiid_data/train.csv"
pickle_cache = "riiid_train.pkl.gzip"

data = dt.fread(raw_csv).to_pandas()
data.to_pickle(pickle_cache)

In [None]:
# Later sessions: restore the cached DataFrame from the pickle instead of
# re-parsing the CSV (the author notes this loads much faster).
pickle_cache = "riiid_train.pkl.gzip"
data = pd.read_pickle(pickle_cache)

In [None]:
import os
import subprocess

# Kaggle dataset slug and the single file to fetch from it
dataset = "rohanrao/riiid-train-data-multiple-formats"
filename = "riiid_train.pkl.gzip"
download_dir = "kaggle_data"

# Create the directory to store the file if it doesn't exist
os.makedirs(download_dir, exist_ok=True)

# Build the kaggle CLI call as an argument list: no shell is involved, so the
# dataset / file names are passed verbatim and never interpreted as shell syntax.
command = [
    "kaggle", "datasets", "download", dataset,
    "-f", filename,
    "-p", download_dir,
    "--unzip",
]

try:
    result = subprocess.run(command, capture_output=True, text=True)
except FileNotFoundError:
    # Without a shell, a missing executable surfaces as an exception instead
    # of a non-zero return code; report it in the same best-effort style.
    result = None
    print("Error downloading the file:")
    print("The `kaggle` command was not found; install the Kaggle CLI and configure its API token.")
else:
    filepath = os.path.join(download_dir, filename)
    if result.returncode != 0:
        print("Error downloading the file:")
        # Fall back to stdout in case the CLI reported the problem there.
        print(result.stderr or result.stdout)
    elif not os.path.exists(filepath):
        # Exit code 0 does not guarantee the file landed under the expected
        # name; avoid an unhandled FileNotFoundError from getsize().
        print(f"Download reported success but {filepath} was not found.")
        print(result.stdout)
    else:
        print(f"Successfully downloaded: {filepath}")
        print(f"File size: {os.path.getsize(filepath) / (1024 * 1024):.2f} MB")

In [None]:
# Summary statistics for the listed columns, transposed so that each
# described column becomes one row of the resulting table.
summary_columns = [
    'row_id',
    'timestamp',
    'user_id',
    'content_id',
    'content_type_id',
    'task_container_id',
    'user_answer',
    'answered_correctly',
    'prior_question_elapsed_time',
    'prior_question_had_explanation',
]
train_df[summary_columns].describe().T

Load in batches (chunks)

In [None]:
# Read the CSV piece by piece rather than in a single parse.
chunksize = 1_000_000  # rows handed back per chunk

# With chunksize set, read_csv returns an iterator of DataFrames.
chunk_reader = pd.read_csv(
    f"{path}/riiid_train.csv",
    chunksize=chunksize,
    dtype=dtype_mapping,
)

# Materialise every chunk, in file order.
df_list = list(chunk_reader)

# Stitch the chunks back together under one fresh 0..n-1 index.
train_df = pd.concat(df_list, ignore_index=True)

Drop row_id column

In [None]:
# Check memory usage before optimisation
def memory_usage(df):
    """Return the total memory footprint of ``df`` as a string such as "12.34 MB".

    The total is the sum of ``df.memory_usage(deep=True)`` (index included),
    converted from bytes to mebibytes (1024 ** 2 bytes) and formatted with
    two decimal places.
    """
    total_bytes = df.memory_usage(deep=True).sum()
    megabytes = total_bytes / 1024 ** 2
    return f"{megabytes:.2f} MB"

# Report the frame's dimensions and its memory footprint prior to any optimisation.
shape_before = train_df.shape
print(f"Dataset shape: {shape_before}")
usage_before = memory_usage(train_df)
print(f"Memory usage before optimisation: {usage_before}")

In [None]:
# Drop the row_id column to save memory.
# errors='ignore' makes the cell safe to re-run: once row_id is gone a second
# execution is a no-op instead of raising KeyError. The result is rebound to
# train_df rather than mutated with inplace=True.
train_df = train_df.drop(columns='row_id', errors='ignore')

# Check memory usage after dropping row_id
print(f"Memory usage after dropping row_id: {memory_usage(train_df)}")

Enable `low_memory=True` (note: this is already the default for `pandas.read_csv`)

In [None]:
# Load the whole CSV in one call with the explicit dtype mapping.
# NOTE: low_memory=True is read_csv's documented default, so spelling it out
# does not change how the file is parsed.
train_df = pd.read_csv(
    f"{path}/riiid_train.csv",
    dtype=dtype_mapping,
    low_memory=True,
)

# Data exploration

In [None]:
# Count rows flagged by DataFrame.duplicated(), i.e. rows whose values across
# all columns repeat an earlier row.
duplicates = (
    train_df
    .duplicated()
    .sum()
)
print(f'Duplicated values: {duplicates}')

In [None]:
# Cast the explanation flag to pandas' nullable 'boolean' dtype.
# The frame is referred to as train_df, matching every other cell in this
# notebook; the previous name `train` is not defined anywhere above this cell.
train_df['prior_question_had_explanation'] = train_df['prior_question_had_explanation'].astype('boolean')

# Per-column memory footprint in bytes (deep=True inspects object contents too)
train_df.memory_usage(deep=True)

In [None]:
# Display the column labels currently present in train_df
train_df.columns

In [None]:
# How many rows carry each distinct content_type_id value
type_counts = train_df["content_type_id"].value_counts()
print(type_counts)

# New section