# **Imports**

In [1]:
import os
import numpy as np
import pandas as pd

# from sklearn.experimental import enable_iterative_imputer
# from sklearn.impute import IterativeImputer

# **Paths & Global Variables**

In [None]:
ROOT_PATH = r"C:\Users\mario\OneDrive\Documents\Work\Side Hustles\Kaggle\titanic"

import sys

sys.path.append(ROOT_PATH)

from titanic.config import RAW_DATA_DIR, INTERIM_DATA_DIR

[32m2025-02-22 22:58:09.051[0m | [1mINFO    [0m | [36mtitanic.config[0m:[36m<module>[0m:[36m9[0m - [1mPROJ_ROOT path is: C:\Users\mario\OneDrive\Documents\Work\Side Hustles\Kaggle\titanic[0m


# **Reading Data**

In [3]:
os.chdir(RAW_DATA_DIR)
train_df = pd.read_csv("train.csv")
test_df = pd.read_csv("test.csv")

In [4]:
train_df["split"] = "train"
test_df["split"] = "test"
test_df["Survived"] = 0

raw_df = pd.concat([train_df, test_df], axis=0)

# **Data Wrangling**

## **Data Diagnosis**

In [5]:
raw_df.head()

Unnamed: 0,PassengerId,Survived,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked,split
0,1,0,3,"Braund, Mr. Owen Harris",male,22.0,1,0,A/5 21171,7.25,,S,train
1,2,1,1,"Cumings, Mrs. John Bradley (Florence Briggs Th...",female,38.0,1,0,PC 17599,71.2833,C85,C,train
2,3,1,3,"Heikkinen, Miss. Laina",female,26.0,0,0,STON/O2. 3101282,7.925,,S,train
3,4,1,1,"Futrelle, Mrs. Jacques Heath (Lily May Peel)",female,35.0,1,0,113803,53.1,C123,S,train
4,5,0,3,"Allen, Mr. William Henry",male,35.0,0,0,373450,8.05,,S,train


In [6]:
raw_df.shape

(1309, 13)

In [7]:
raw_df.columns

Index(['PassengerId', 'Survived', 'Pclass', 'Name', 'Sex', 'Age', 'SibSp',
       'Parch', 'Ticket', 'Fare', 'Cabin', 'Embarked', 'split'],
      dtype='object')

## **Column Name Consistency**

In [8]:
raw_df.columns = map(str.lower, raw_df.columns)

In [9]:
# axis=1` refers to the columns, `axis=0` would refer to the rows
column_new_names_dict = {
    "passengerid": "passenger_id",
    "pclass": "passenger_class",
    "name": "full_name",
    "sibsp": "siblings_spouses",
    "parch": "parents_children",
}

raw_df = raw_df.rename(column_new_names_dict, axis=1)

In [10]:
raw_df.columns

Index(['passenger_id', 'survived', 'passenger_class', 'full_name', 'sex',
       'age', 'siblings_spouses', 'parents_children', 'ticket', 'fare',
       'cabin', 'embarked', 'split'],
      dtype='object')

## **Column Profiling**

In [11]:
raw_df.dtypes

passenger_id          int64
survived              int64
passenger_class       int64
full_name            object
sex                  object
age                 float64
siblings_spouses      int64
parents_children      int64
ticket               object
fare                float64
cabin                object
embarked             object
split                object
dtype: object

In [12]:
raw_df.nunique()

passenger_id        1309
survived               2
passenger_class        3
full_name           1307
sex                    2
age                   98
siblings_spouses       7
parents_children       8
ticket               929
fare                 281
cabin                186
embarked               3
split                  2
dtype: int64

In [13]:
raw_df.isna().sum()

passenger_id           0
survived               0
passenger_class        0
full_name              0
sex                    0
age                  263
siblings_spouses       0
parents_children       0
ticket                 0
fare                   1
cabin               1014
embarked               2
split                  0
dtype: int64

In [14]:
raw_df.info()

<class 'pandas.core.frame.DataFrame'>
Index: 1309 entries, 0 to 417
Data columns (total 13 columns):
 #   Column            Non-Null Count  Dtype  
---  ------            --------------  -----  
 0   passenger_id      1309 non-null   int64  
 1   survived          1309 non-null   int64  
 2   passenger_class   1309 non-null   int64  
 3   full_name         1309 non-null   object 
 4   sex               1309 non-null   object 
 5   age               1046 non-null   float64
 6   siblings_spouses  1309 non-null   int64  
 7   parents_children  1309 non-null   int64  
 8   ticket            1309 non-null   object 
 9   fare              1308 non-null   float64
 10  cabin             295 non-null    object 
 11  embarked          1307 non-null   object 
 12  split             1309 non-null   object 
dtypes: float64(2), int64(5), object(6)
memory usage: 143.2+ KB


## **Data Types**

### **Numerical Variables**

#### **Discrete Variables**

In [15]:
print(raw_df.passenger_id.nunique())
print(raw_df.passenger_id.unique())

1309
[   1    2    3 ... 1307 1308 1309]


In [16]:
print(raw_df.age.nunique())
print(raw_df.age.unique())

98
[22.   38.   26.   35.     nan 54.    2.   27.   14.    4.   58.   20.
 39.   55.   31.   34.   15.   28.    8.   19.   40.   66.   42.   21.
 18.    3.    7.   49.   29.   65.   28.5   5.   11.   45.   17.   32.
 16.   25.    0.83 30.   33.   23.   24.   46.   59.   71.   37.   47.
 14.5  70.5  32.5  12.    9.   36.5  51.   55.5  40.5  44.    1.   61.
 56.   50.   36.   45.5  20.5  62.   41.   52.   63.   23.5   0.92 43.
 60.   10.   64.   13.   48.    0.75 53.   57.   80.   70.   24.5   6.
  0.67 30.5   0.42 34.5  74.   22.5  18.5  67.   76.   26.5  60.5  11.5
  0.33  0.17 38.5 ]


In [17]:
print(raw_df.siblings_spouses.nunique())
print(raw_df.siblings_spouses.unique())

7
[1 0 3 4 2 5 8]


In [18]:
print(raw_df.parents_children.nunique())
print(raw_df.parents_children.unique())

8
[0 1 2 5 3 4 6 9]


#### **Continuous Variables**

In [19]:
print(raw_df.fare.nunique())
print(raw_df.fare.unique())

281
[  7.25    71.2833   7.925   53.1      8.05     8.4583  51.8625  21.075
  11.1333  30.0708  16.7     26.55    31.275    7.8542  16.      29.125
  13.      18.       7.225   26.       8.0292  35.5     31.3875 263.
   7.8792   7.8958  27.7208 146.5208   7.75    10.5     82.1708  52.
   7.2292  11.2417   9.475   21.      41.5792  15.5     21.6792  17.8
  39.6875   7.8     76.7292  61.9792  27.75    46.9     80.      83.475
  27.9     15.2458   8.1583   8.6625  73.5     14.4542  56.4958   7.65
  29.      12.475    9.       9.5      7.7875  47.1     15.85    34.375
  61.175   20.575   34.6542  63.3583  23.      77.2875   8.6542   7.775
  24.15     9.825   14.4583 247.5208   7.1417  22.3583   6.975    7.05
  14.5     15.0458  26.2833   9.2167  79.2      6.75    11.5     36.75
   7.7958  12.525   66.6      7.3125  61.3792   7.7333  69.55    16.1
  15.75    20.525   55.      25.925   33.5     30.6958  25.4667  28.7125
   0.      15.05    39.      22.025   50.       8.4042   6.4958  10.4625

### **Categorical Variables**

#### **Binary Variables**

In [20]:
print(raw_df.survived.nunique())
print(raw_df.survived.unique())

2
[0 1]


#### **Ordinal Variables**

In [21]:
print(raw_df.passenger_class.nunique())
print(raw_df.passenger_class.unique())

3
[3 1 2]


#### **Nominal Variables**

In [22]:
print(raw_df.sex.nunique())
print(raw_df.sex.unique())

2
['male' 'female']


In [23]:
print(raw_df.embarked.nunique())
print(raw_df.embarked.unique())

3
['S' 'C' 'Q' nan]


In [24]:
print(raw_df.cabin.nunique())
print(raw_df.cabin.unique())

186
[nan 'C85' 'C123' 'E46' 'G6' 'C103' 'D56' 'A6' 'C23 C25 C27' 'B78' 'D33'
 'B30' 'C52' 'B28' 'C83' 'F33' 'F G73' 'E31' 'A5' 'D10 D12' 'D26' 'C110'
 'B58 B60' 'E101' 'F E69' 'D47' 'B86' 'F2' 'C2' 'E33' 'B19' 'A7' 'C49'
 'F4' 'A32' 'B4' 'B80' 'A31' 'D36' 'D15' 'C93' 'C78' 'D35' 'C87' 'B77'
 'E67' 'B94' 'C125' 'C99' 'C118' 'D7' 'A19' 'B49' 'D' 'C22 C26' 'C106'
 'C65' 'E36' 'C54' 'B57 B59 B63 B66' 'C7' 'E34' 'C32' 'B18' 'C124' 'C91'
 'E40' 'T' 'C128' 'D37' 'B35' 'E50' 'C82' 'B96 B98' 'E10' 'E44' 'A34'
 'C104' 'C111' 'C92' 'E38' 'D21' 'E12' 'E63' 'A14' 'B37' 'C30' 'D20' 'B79'
 'E25' 'D46' 'B73' 'C95' 'B38' 'B39' 'B22' 'C86' 'C70' 'A16' 'C101' 'C68'
 'A10' 'E68' 'B41' 'A20' 'D19' 'D50' 'D9' 'A23' 'B50' 'A26' 'D48' 'E58'
 'C126' 'B71' 'B51 B53 B55' 'D49' 'B5' 'B20' 'F G63' 'C62 C64' 'E24' 'C90'
 'C45' 'E8' 'B101' 'D45' 'C46' 'D30' 'E121' 'D11' 'E77' 'F38' 'B3' 'D6'
 'B82 B84' 'D17' 'A36' 'B102' 'B69' 'E49' 'C47' 'D28' 'E17' 'A24' 'C50'
 'B42' 'C148' 'B45' 'B36' 'A21' 'D34' 'A9' 'C31' 'B61'

In [25]:
print(raw_df.full_name.nunique())
print(raw_df.full_name.unique())

1307
['Braund, Mr. Owen Harris'
 'Cumings, Mrs. John Bradley (Florence Briggs Thayer)'
 'Heikkinen, Miss. Laina' ... 'Saether, Mr. Simon Sivertsen'
 'Ware, Mr. Frederick' 'Peter, Master. Michael J']


In [26]:
print(raw_df.ticket.nunique())
print(raw_df.ticket.unique())

929
['A/5 21171' 'PC 17599' 'STON/O2. 3101282' '113803' '373450' '330877'
 '17463' '349909' '347742' '237736' 'PP 9549' '113783' 'A/5. 2151'
 '347082' '350406' '248706' '382652' '244373' '345763' '2649' '239865'
 '248698' '330923' '113788' '347077' '2631' '19950' '330959' '349216'
 'PC 17601' 'PC 17569' '335677' 'C.A. 24579' 'PC 17604' '113789' '2677'
 'A./5. 2152' '345764' '2651' '7546' '11668' '349253' 'SC/Paris 2123'
 '330958' 'S.C./A.4. 23567' '370371' '14311' '2662' '349237' '3101295'
 'A/4. 39886' 'PC 17572' '2926' '113509' '19947' 'C.A. 31026' '2697'
 'C.A. 34651' 'CA 2144' '2669' '113572' '36973' '347088' 'PC 17605' '2661'
 'C.A. 29395' 'S.P. 3464' '3101281' '315151' 'C.A. 33111' 'S.O.C. 14879'
 '2680' '1601' '348123' '349208' '374746' '248738' '364516' '345767'
 '345779' '330932' '113059' 'SO/C 14885' '3101278' 'W./C. 6608'
 'SOTON/OQ 392086' '343275' '343276' '347466' 'W.E.P. 5734' 'C.A. 2315'
 '364500' '374910' 'PC 17754' 'PC 17759' '231919' '244367' '349245'
 '349215' '3528

## **Removing Duplicates**

In [27]:
INDEX_COLS_LIST = ["passenger_id"]

In [28]:
duplicates = raw_df[raw_df[INDEX_COLS_LIST].duplicated()]
duplicates

Unnamed: 0,passenger_id,survived,passenger_class,full_name,sex,age,siblings_spouses,parents_children,ticket,fare,cabin,embarked,split


In [29]:
print(raw_df.shape)
raw_df.drop_duplicates(subset=INDEX_COLS_LIST, keep="first", inplace=True)
print(raw_df.shape)

(1309, 13)
(1309, 13)


## **Indexing**


In [30]:
raw_df.set_index(INDEX_COLS_LIST, inplace=True)
raw_df.head(1)

Unnamed: 0_level_0,survived,passenger_class,full_name,sex,age,siblings_spouses,parents_children,ticket,fare,cabin,embarked,split
passenger_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1
1,0,3,"Braund, Mr. Owen Harris",male,22.0,1,0,A/5 21171,7.25,,S,train


## **Missing Values**

### **Deletion**

#### **Listwise Deletion**

In [31]:
# raw_df.dropna(inplace=True)

#### **Pairwise Deletion**

In [32]:
raw_df.dropna(subset=["embarked"], inplace=True)

#### **Dropping Columns**

### **Single Imputation**

In [33]:
raw_df.fillna(value={"cabin": "Unknown"}, inplace=True)

raw_df["age"] = raw_df.groupby(["passenger_class", "sex", "embarked"])["age"].transform(
    lambda x: x.fillna(x.mean())
)

In [34]:
raw_df["fare"] = raw_df.groupby(["passenger_class", "sex", "embarked"])[
    "fare"
].transform(lambda x: x.fillna(x.mean()))

### **Multiple Imputation** 🔴

## **Splitting Columns**

In [35]:
raw_df["surname"] = raw_df["full_name"].str.split(",").str[0]
raw_df["surname"].unique()

array(['Braund', 'Cumings', 'Heikkinen', 'Futrelle', 'Allen', 'Moran',
       'McCarthy', 'Palsson', 'Johnson', 'Nasser', 'Sandstrom', 'Bonnell',
       'Saundercock', 'Andersson', 'Vestrom', 'Hewlett', 'Rice',
       'Williams', 'Vander Planke', 'Masselmani', 'Fynney', 'Beesley',
       'McGowan', 'Sloper', 'Asplund', 'Emir', 'Fortune', "O'Dwyer",
       'Todoroff', 'Uruchurtu', 'Spencer', 'Glynn', 'Wheadon', 'Meyer',
       'Holverson', 'Mamee', 'Cann', 'Nicola-Yarred', 'Ahlin', 'Turpin',
       'Kraeff', 'Laroche', 'Devaney', 'Rogers', 'Lennon', "O'Driscoll",
       'Samaan', 'Arnold-Franchi', 'Panula', 'Nosworthy', 'Harper',
       'Faunthorpe', 'Ostby', 'Woolner', 'Rugg', 'Novel', 'West',
       'Goodwin', 'Sirayanian', 'Harris', 'Skoog', 'Stewart', 'Moubarek',
       'Nye', 'Crease', 'Kink', 'Jenkin', 'Hood', 'Chronopoulos', 'Bing',
       'Moen', 'Staneff', 'Moutal', 'Caldwell', 'Dowdell', 'Waelens',
       'Sheerlinck', 'McDermott', 'Carrau', 'Ilett', 'Backstrom', 'Ford',
     

In [36]:
raw_df["title"] = raw_df["full_name"].str.split(",").str[1].str.split(".").str[0]
raw_df["title"] = raw_df["title"].str.strip()
raw_df["title"].unique()

array(['Mr', 'Mrs', 'Miss', 'Master', 'Don', 'Rev', 'Dr', 'Mme', 'Ms',
       'Major', 'Lady', 'Sir', 'Mlle', 'Col', 'Capt', 'the Countess',
       'Jonkheer', 'Dona'], dtype=object)

In [37]:
raw_df["first_name"] = raw_df["full_name"].str.split(",").str[1].str.split(".").str[1]
raw_df["first_name"] = raw_df["first_name"].str.strip()
raw_df["first_name"].unique()

array(['Owen Harris', 'John Bradley (Florence Briggs Thayer)', 'Laina',
       ..., 'Fermina', 'Simon Sivertsen', 'Michael J'], dtype=object)

## **String Parsing** 🔴

In [38]:
raw_df["list_cabins"] = raw_df["cabin"].str.split(r"\s")

# Could be Feature Engineering

raw_df["num_rooms"] = raw_df["list_cabins"].apply(
    lambda x: len(x) if x[0] != "Unknown" else 0
)
raw_df["num_rooms"] = raw_df["num_rooms"].astype(int)
raw_df["num_rooms"].unique()

array([0, 1, 3, 2, 4])

In [39]:
raw_df.drop(columns=["list_cabins"], inplace=True)

In [40]:
raw_df["cabin_level_1"] = raw_df["cabin"].str[0]
raw_df["cabin_level_1"].unique()

array(['U', 'C', 'E', 'G', 'D', 'A', 'B', 'F', 'T'], dtype=object)

In [41]:
# Define a function to replace 'U' with the mode of the group
def replace_U_with_mode(group):
    mode_value = group[~group.isin(["U", "T"])].mode()
    if not mode_value.empty:
        group[group.isin(["U", "T"])] = mode_value[0]
    return group


# Apply the function to each group
raw_df["cabin_level_1"] = raw_df.groupby(["passenger_class", "embarked"])[
    "cabin_level_1"
].transform(replace_U_with_mode)
raw_df.cabin_level_1.value_counts()

cabin_level_1
G    488
F    455
C    162
D     70
B     63
E     47
A     22
Name: count, dtype: int64

In [42]:
raw_df["cabin_level_2"] = raw_df["cabin"].str[1:]
raw_df["cabin_level_2"] = raw_df["cabin_level_2"].str.extract(r"(\d+)")
raw_df["cabin_level_2"] = raw_df["cabin_level_2"].fillna(0)
raw_df["cabin_level_2"] = raw_df["cabin_level_2"].astype(int)
raw_df["cabin_level_2"].unique()

array([  0,  85, 123,  46,   6, 103,  56,  23,  78,  33,  30,  52,  83,
        73,  31,   5,  10,  26, 110,  58, 101,  69,  47,  86,   2,  19,
         7,  49,   4,  32,  80,  36,  15,  93,  35,  87,  77,  67,  94,
       125,  99, 118,  22, 106,  65,  54,  57,  34,  18, 124,  91,  40,
       128,  37,  50,  82,  96,  44, 104, 111,  92,  38,  21,  12,  63,
        14,  20,  79,  25,  95,  39,  70,  16,  68,  41,   9,  48, 126,
        71,  51,  62,  24,  90,  45,   8, 121,  11,   3,  17, 102,  28,
        42, 148,  61,  53,  43, 130, 132,  55, 116,  29,  97,  89,  60,
       105])

## **Data Type Transformations**

In [43]:
raw_df.info()

<class 'pandas.core.frame.DataFrame'>
Index: 1307 entries, 1 to 1309
Data columns (total 18 columns):
 #   Column            Non-Null Count  Dtype  
---  ------            --------------  -----  
 0   survived          1307 non-null   int64  
 1   passenger_class   1307 non-null   int64  
 2   full_name         1307 non-null   object 
 3   sex               1307 non-null   object 
 4   age               1307 non-null   float64
 5   siblings_spouses  1307 non-null   int64  
 6   parents_children  1307 non-null   int64  
 7   ticket            1307 non-null   object 
 8   fare              1307 non-null   float64
 9   cabin             1307 non-null   object 
 10  embarked          1307 non-null   object 
 11  split             1307 non-null   object 
 12  surname           1307 non-null   object 
 13  title             1307 non-null   object 
 14  first_name        1307 non-null   object 
 15  num_rooms         1307 non-null   int64  
 16  cabin_level_1     1307 non-null   object 
 17  

In [44]:
raw_df.columns

Index(['survived', 'passenger_class', 'full_name', 'sex', 'age',
       'siblings_spouses', 'parents_children', 'ticket', 'fare', 'cabin',
       'embarked', 'split', 'surname', 'title', 'first_name', 'num_rooms',
       'cabin_level_1', 'cabin_level_2'],
      dtype='object')

### **Discrete Variables**

In [45]:
DISCRETE_VARS_LIST = ["siblings_spouses", "parents_children", "num_rooms"]

for col in DISCRETE_VARS_LIST:
    raw_df[col] = raw_df[col].astype("int")

### **Continuous Variables**

Continuous (numerical) variables should usually be stored as the `float` data type because they allow us to store decimal values.

In [46]:
CONT_VARS_LIST = ["age", "fare"]

for col in CONT_VARS_LIST:
    raw_df[col] = raw_df[col].astype("float")

### **Binary Variables**

In [47]:
BOOL_VARS_LIST = ["survived"]
for col in BOOL_VARS_LIST:
    raw_df[col] = raw_df[col].astype("bool")

### **Ordinal Variables**

In [48]:
ORD_VARS_DICT = {"passenger_class": {3: "Lower", 2: "Middle", 1: "Upper"}}

for col in ORD_VARS_DICT.keys():
    raw_df[col] = raw_df[col].astype("category")
    print(raw_df[col].unique())
    print(raw_df[col].sort_values().head())

[3, 1, 2]
Categories (3, int64): [1, 2, 3]
passenger_id
843     1
988     1
1144    1
582     1
584     1
Name: passenger_class, dtype: category
Categories (3, int64): [1, 2, 3]


In [49]:
for col, ordered_dict in ORD_VARS_DICT.items():
    print(col)
    raw_df[col] = pd.Categorical(raw_df[col], ordered_dict.keys(), ordered=True)
    raw_df[col] = raw_df[col].cat.rename_categories(ordered_dict.values())
    print(raw_df[col].unique())
    print(raw_df[col].sort_values().head())

passenger_class
['Lower', 'Upper', 'Middle']
Categories (3, object): ['Lower' < 'Middle' < 'Upper']
passenger_id
1      Lower
732    Lower
736    Lower
737    Lower
739    Lower
Name: passenger_class, dtype: category
Categories (3, object): ['Lower' < 'Middle' < 'Upper']


### **Nominal Variables**

In [50]:
NOM_VARS_LIST = [
    "full_name",
    "surname",
    "title",
    "first_name",
    "cabin",
    "ticket",
    "cabin_level_2",
]

for col in NOM_VARS_LIST:
    raw_df[col] = raw_df[col].astype("string")

In [51]:
NOM_CAT_VARS_LIST = [
    "sex",
    "embarked",
    "cabin_level_1",
]

for col in NOM_CAT_VARS_LIST:
    raw_df[col] = raw_df[col].astype("category")

In [52]:
raw_df.dtypes

survived                      bool
passenger_class           category
full_name           string[python]
sex                       category
age                        float64
siblings_spouses             int64
parents_children             int64
ticket              string[python]
fare                       float64
cabin               string[python]
embarked                  category
split                       object
surname             string[python]
title               string[python]
first_name          string[python]
num_rooms                    int64
cabin_level_1             category
cabin_level_2       string[python]
dtype: object

## **Reorder Columns**

In [53]:
new_col_order = []
feature_types = ["int", "float", np.bool, "category", "string", "object"]

for feature_type in feature_types:
    new_col_order += raw_df.select_dtypes(include=[feature_type]).columns.to_list()

raw_df = raw_df[new_col_order]
raw_df.info()

<class 'pandas.core.frame.DataFrame'>
Index: 1307 entries, 1 to 1309
Data columns (total 18 columns):
 #   Column            Non-Null Count  Dtype   
---  ------            --------------  -----   
 0   siblings_spouses  1307 non-null   int64   
 1   parents_children  1307 non-null   int64   
 2   num_rooms         1307 non-null   int64   
 3   age               1307 non-null   float64 
 4   fare              1307 non-null   float64 
 5   survived          1307 non-null   bool    
 6   passenger_class   1307 non-null   category
 7   sex               1307 non-null   category
 8   embarked          1307 non-null   category
 9   cabin_level_1     1307 non-null   category
 10  full_name         1307 non-null   string  
 11  ticket            1307 non-null   string  
 12  cabin             1307 non-null   string  
 13  surname           1307 non-null   string  
 14  title             1307 non-null   string  
 15  first_name        1307 non-null   string  
 16  cabin_level_2     1307 non-nu

In [54]:
os.chdir(INTERIM_DATA_DIR)
raw_df.to_parquet("cleaned.parquet")