# Apex Legends S15 Player Retention — Data Wrangling

**Goal:** Load the raw Season 15 match data, audit quality (missingness, dtypes, duplicates), drop/fix problematic columns, and export a clean CSV for modeling.


In [240]:
import pandas as pd
import numpy as np
from pathlib import Path

pd.set_option("display.max_columns", 120)

In [242]:
RAW_PATH = Path("Apex_Game_History_Season15S1.csv")

df = pd.read_csv(RAW_PATH, parse_dates=["date"])
print(f"Rows: {df.shape[0]:,} | Columns: {df.shape[1]}")
df.head()

Rows: 499 | Columns: 36


Unnamed: 0,date,game,map,match_type,my_duration,my_rank,rp_earned,premade_squad,voice_chat,squad_placed,teamate_count,my_quit,teamate_quit_count,my_legend,teamate_1_legend,teamate_2_legend,my_damage,teamate_1_damage,teamate_2_damage,my_kills,teamate_1_kills,teamate_2_kills,my_assists,teamate_1_assists,teamate_2_assists,my_knocks,teamate_1_knocks,teamate_2_knocks,my_revives,teamate_1_revives,teamate_2_revives,my_respawns,teamate_1_respawns,teamate_2_respawns,Unnamed: 34,Unnamed: 35
0,2022-11-03,1,broken moon,casual,846.0,S3,0.0,no,yes,5.0,1.0,0.0,1.0,Valkyrie,Wattson,quit,1268.0,,,5.0,4.0,0.0,0.0,0.0,0.0,5.0,4.0,0.0,1.0,0.0,0.0,1.0,1.0,0.0,,
1,2022-11-03,2,broken moon,casual,,S3,0.0,no,yes,12.0,2.0,0.0,0.0,Valkyrie,Lifeline,Pathfinder,178.0,102.0,173.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,,
2,2022-11-03,3,broken moon,casual,,S3,0.0,no,no,3.0,2.0,0.0,0.0,Valkyrie,Wraith,Lifeline,141.0,493.0,256.0,0.0,2.0,2.0,1.0,1.0,0.0,0.0,2.0,2.0,0.0,0.0,0.0,0.0,0.0,0.0,,
3,2022-11-03,4,broken moon,casual,,S3,0.0,no,no,4.0,2.0,0.0,0.0,Valkyrie,Mirage,Wattson,497.0,254.0,449.0,0.0,2.0,1.0,1.0,0.0,0.0,0.0,3.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,,
4,2022-11-03,5,broken moon,casual,,S3,0.0,no,no,8.0,2.0,0.0,0.0,Valkyrie,Lifeline,Wraith,792.0,115.0,758.0,1.0,2.0,2.0,4.0,1.0,2.0,2.0,2.0,3.0,0.0,1.0,0.0,0.0,0.0,1.0,,


In [244]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 499 entries, 0 to 498
Data columns (total 36 columns):
 #   Column              Non-Null Count  Dtype         
---  ------              --------------  -----         
 0   date                499 non-null    datetime64[ns]
 1   game                499 non-null    int64         
 2   map                 499 non-null    object        
 3   match_type          499 non-null    object        
 4   my_duration         265 non-null    float64       
 5   my_rank             498 non-null    object        
 6   rp_earned           487 non-null    float64       
 7   premade_squad       497 non-null    object        
 8   voice_chat          497 non-null    object        
 9   squad_placed        475 non-null    float64       
 10  teamate_count       314 non-null    float64       
 11  my_quit             317 non-null    float64       
 12  teamate_quit_count  306 non-null    float64       
 13  my_legend           287 non-null    object        

In [246]:
df.describe()

Unnamed: 0,date,game,my_duration,rp_earned,squad_placed,teamate_count,my_quit,teamate_quit_count,my_damage,teamate_1_damage,teamate_2_damage,my_kills,teamate_1_kills,teamate_2_kills,my_assists,teamate_1_assists,teamate_2_assists,my_knocks,teamate_1_knocks,teamate_2_knocks,my_revives,teamate_1_revives,teamate_2_revives,my_respawns,teamate_1_respawns,teamate_2_respawns,Unnamed: 34,Unnamed: 35
count,499,499.0,265.0,487.0,475.0,314.0,317.0,306.0,284.0,275.0,270.0,283.0,278.0,274.0,281.0,277.0,273.0,281.0,278.0,273.0,283.0,277.0,274.0,283.0,276.0,275.0,0.0,0.0
mean,2022-12-11 15:40:45.691382784,250.0,592.30566,12.25462,10.212632,1.964968,0.041009,0.071895,471.482394,464.152727,483.455556,0.968198,1.057554,1.051095,0.711744,0.66065,0.750916,1.259786,1.352518,1.362637,0.204947,0.241877,0.244526,0.141343,0.057971,0.054545,,
min,2022-11-03 00:00:00,1.0,46.0,-55.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,,
25%,2022-11-14 00:00:00,125.5,228.0,-44.5,5.0,2.0,0.0,0.0,150.75,143.5,148.5,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,,
50%,2022-12-08 00:00:00,250.0,498.0,-25.0,11.0,2.0,0.0,0.0,330.5,316.0,316.5,0.0,0.0,0.0,0.0,0.0,0.0,1.0,1.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,,
75%,2023-01-09 00:00:00,374.5,943.0,28.0,15.0,2.0,0.0,0.0,635.75,666.0,667.5,1.0,2.0,2.0,1.0,1.0,1.0,2.0,2.0,2.0,0.0,0.0,0.0,0.0,0.0,0.0,,
max,2023-01-15 00:00:00,499.0,1349.0,401.0,20.0,2.0,1.0,2.0,2426.0,2180.0,2590.0,7.0,8.0,9.0,6.0,7.0,6.0,7.0,10.0,10.0,3.0,3.0,3.0,2.0,2.0,2.0,,
std,,144.193157,393.549428,86.973583,5.732343,0.216084,0.198626,0.315804,429.3188,444.317929,478.563064,1.469204,1.492556,1.513413,1.142759,1.173532,1.186725,1.623527,1.666676,1.650645,0.491463,0.534309,0.607091,0.430842,0.249162,0.243018,,


In [307]:
df.dtypes

date                  datetime64[ns]
game                           int64
map                           object
match_type                    object
my_duration                  float64
my_rank                       object
premade_squad                 object
voice_chat                    object
squad_placed                 float64
teamate_count                float64
teamate_quit_count           float64
my_legend                     object
teamate_1_legend              object
teamate_2_legend              object
my_damage                    float64
teamate_1_damage             float64
teamate_2_damage             float64
my_kills                       int64
teamate_1_kills                int64
teamate_2_kills                int64
my_assists                     int64
teamate_1_assists              int64
teamate_2_assists              int64
my_knocks                      int64
teamate_1_knocks               int64
teamate_2_knocks               int64
my_revives                     int64
t

In [309]:
na = df.isna().sum()
na = na[na > 0].sort_values(ascending=False)

na_table = (na.to_frame("n_missing").assign(pct=lambda d: (d["n_missing"] / len(df) * 100).round(2)))

display(na_table)

Unnamed: 0,n_missing,pct


In [311]:
df.columns = df.columns.str.strip()

drop_cols = ["rp_bin", "rp_earned", "rp_delta", "rp_change", "quit_then_returned", "days_since_match", "session_num", "match_order", "my_quit", "teammate_quit_count", "match_id", "game_id"]

to_drop = [c for c in drop_cols if c in df.columns]
print("Dropping:", to_drop)
df.drop(columns=to_drop, inplace=True)
df.shape

Dropping: []


(499, 32)

In [313]:
unnamed_cols = [c for c in df.columns if c.startswith("Unnamed:")]
all_nan_cols = df.columns[df.isna().all()].tolist()
extra_drop = unnamed_cols + all_nan_cols
if extra_drop:
    print("Dropping extra junk columns:", extra_drop)

df.drop(columns=extra_drop, inplace=True)
print("After extra drops:", df.shape)

After extra drops: (499, 32)


In [315]:
def show_rows(stage):
    print(f"{stage}: {df.shape[0]} rows")

show_rows("After drops")

After drops: 499 rows


In [317]:
na_left = (df.isna().sum().loc[lambda s: s > 0].sort_values(ascending=False))
display(na_left.to_frame("n_missing").assign(pct=lambda d: (d["n_missing"] / len(df) * 100).round(2)))

Unnamed: 0,n_missing,pct


In [319]:
from pandas.api.types import is_numeric_dtype, is_bool_dtype

zero_fill_cols = [c for c in df.columns
                  if any(k in c for k in ["kills","assists","knocks","revives","respawns"])
                  and c.startswith(("my_", "teamate_", "teammate_"))]

median_fill_cols = [c for c in df.columns
                    if is_numeric_dtype(df[c]) and df[c].isna().any()]

mode_fill_cols = [c for c in df.columns
                  if (df[c].dtype == "object" or str(df[c].dtype).startswith("category") or is_bool_dtype(df[c]))
                  and df[c].isna().any()]

# my_rank is categorical, not numeric
if "my_rank" in median_fill_cols:
    median_fill_cols.remove("my_rank")
if "my_rank" in df.columns and df["my_rank"].isna().any() and "my_rank" not in mode_fill_cols:
    mode_fill_cols.append("my_rank")

In [321]:
for c in zero_fill_cols:
    df[c] = df[c].fillna(0)

for c in median_fill_cols:
    df[c] = df[c].fillna(df[c].median())

for c in mode_fill_cols:
    df[c] = df[c].fillna(df[c].mode()[0])

# Cast zero-fill cols back to int
int_cols = [c for c in zero_fill_cols if c in df.columns]
df[int_cols] = df[int_cols].astype(int)

print("Imputation done.")

Imputation done.


In [323]:
# Define churn/retention from recency
CUTOFF_DAYS = 7
latest_date = df["date"].max()
df["days_since_match"] = (latest_date - df["date"]).dt.days
df["retained"] = (df["days_since_match"] <= CUTOFF_DAYS).astype(int)

df.drop(columns=["days_since_match"], inplace=True)

In [325]:
na_left = (df.isna().sum().loc[lambda s: s > 0].sort_values(ascending=False))

display(na_left.to_frame("n_missing").assign(pct=lambda d: (d["n_missing"] / len(df) * 100).round(2)))

if na_left.empty:
    print("No NaNs left.")
else:
    print("Still have NaNs in:", na_left.index.tolist())

Unnamed: 0,n_missing,pct


No NaNs left.


In [327]:
dupes = df.duplicated().sum()
print(f"Duplicate rows: {dupes}")
if dupes:
    df = df.drop_duplicates().copy()
    print("Duplicates removed. New shape:", df.shape)

Duplicate rows: 0


In [329]:
df.shape

(499, 33)

In [331]:
df.head()

Unnamed: 0,date,game,map,match_type,my_duration,my_rank,premade_squad,voice_chat,squad_placed,teamate_count,teamate_quit_count,my_legend,teamate_1_legend,teamate_2_legend,my_damage,teamate_1_damage,teamate_2_damage,my_kills,teamate_1_kills,teamate_2_kills,my_assists,teamate_1_assists,teamate_2_assists,my_knocks,teamate_1_knocks,teamate_2_knocks,my_revives,teamate_1_revives,teamate_2_revives,my_respawns,teamate_1_respawns,teamate_2_respawns,retained
0,2022-11-03,1,broken moon,casual,846.0,S3,no,yes,5.0,1.0,1.0,Valkyrie,Wattson,quit,1268.0,316.0,316.5,5,4,0,0,0,0,5,4,0,1,0,0,1,1,0,0
1,2022-11-03,2,broken moon,casual,498.0,S3,no,yes,12.0,2.0,0.0,Valkyrie,Lifeline,Pathfinder,178.0,102.0,173.0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0
2,2022-11-03,3,broken moon,casual,498.0,S3,no,no,3.0,2.0,0.0,Valkyrie,Wraith,Lifeline,141.0,493.0,256.0,0,2,2,1,1,0,0,2,2,0,0,0,0,0,0,0
3,2022-11-03,4,broken moon,casual,498.0,S3,no,no,4.0,2.0,0.0,Valkyrie,Mirage,Wattson,497.0,254.0,449.0,0,2,1,1,0,0,0,3,1,0,0,0,0,0,0,0
4,2022-11-03,5,broken moon,casual,498.0,S3,no,no,8.0,2.0,0.0,Valkyrie,Lifeline,Wraith,792.0,115.0,758.0,1,2,2,4,1,2,2,2,3,0,1,0,0,0,1,0


In [333]:
assert "rp_bin" not in df.columns
assert df.isna().sum().sum() == 0
assert "retained" in df.columns
assert set(df["retained"].unique()) <= {0,1}

print("Clean dataset with target created.")
print(f"Final shape: {df.shape[0]:,} rows x {df.shape[1]} cols")

CLEAN_PATH = Path("ApexPlayerRetention_DataWrangling.csv")
df.to_csv(CLEAN_PATH, index=False)
print("Saved:", CLEAN_PATH.resolve())

Clean dataset with target created.
Final shape: 499 rows x 33 cols
Saved: /Users/torribrigola/Documents/Capstone 3/ApexPlayerRetention_DataWrangling.csv
