## Exercise 02 - Preprocessing


In [4]:
import pandas as pd


## Load data and set index


In [5]:
# Load the raw table with the ID column as the row index, renaming the
# model column on the way in so its name matches the exercise text.
df = (
    pd.read_csv('../data/auto.csv', index_col='ID')
    .rename(columns={'Make_n_model': 'Make_n_Model'})
)

# Non-null count per column, kept for comparison with later steps.
initial_counts = df.count()
initial_counts


CarNumber       931
Make_n_Model    931
Refund          914
Fines           869
History          82
dtype: int64

## Drop duplicates on CarNumber, Make_n_Model, Fines (keep last)


In [6]:
# Rows that agree on all three key columns count as duplicates; only the
# last occurrence of each such group is retained.
dedup_keys = ['CarNumber', 'Make_n_Model', 'Fines']
df = df.drop_duplicates(subset=dedup_keys, keep='last')

# Non-null count per column after de-duplication.
after_dedup_counts = df.count()
after_dedup_counts


CarNumber       725
Make_n_Model    725
Refund          713
Fines           665
History          65
dtype: int64

## Missing values: inspect and clean


In [7]:
# Number of missing entries in each column before any cleaning.
missing_before = df.isnull().sum()
missing_before


CarNumber         0
Make_n_Model      0
Refund           12
Fines            60
History         660
dtype: int64

In [8]:
# A column survives only if it holds at least (row count - 500) non-null
# values, i.e. it has no more than 500 missing entries.
min_non_null = len(df) - 500
df = df.dropna(axis='columns', thresh=min_non_null)

# Missing entries per remaining column.
missing_after_drop = df.isnull().sum()
missing_after_drop


CarNumber        0
Make_n_Model     0
Refund          12
Fines           60
dtype: int64

In [9]:
# Fill missing Refund values with the previous value in the column
# (forward fill). Series.ffill() is used instead of fillna(method='ffill')
# because the `method` argument of fillna is deprecated and emits a
# FutureWarning on current pandas releases.
# Note: a missing value in the very first row has no predecessor and
# therefore stays missing.
if 'Refund' in df.columns:
    df['Refund'] = df['Refund'].ffill()

missing_after_refund = df.isna().sum()
missing_after_refund


  df['Refund'] = df['Refund'].fillna(method='ffill')


CarNumber        0
Make_n_Model     0
Refund           0
Fines           60
dtype: int64

In [10]:
# Impute missing fines with the mean of the observed fines.
if 'Fines' in df:
    fines_column = df['Fines']
    fines_mean = fines_column.mean()
    df['Fines'] = fines_column.fillna(fines_mean)

# Missing entries per column after the imputation.
missing_after_fines = df.isnull().sum()
missing_after_fines


CarNumber       0
Make_n_Model    0
Refund          0
Fines           0
dtype: int64

## Split Make_n_Model into Make and Model


In [11]:
def extract_make(value):
    """Return the make: the text before the first space of ``value``.

    Parameters
    ----------
    value : object
        A combined "make model" entry. Non-string values are converted
        with ``str()`` before splitting.

    Returns
    -------
    str or None
        The part before the first space (the whole string when it has no
        space), or ``None`` when ``value`` is missing (``None``/``NaN``).
    """
    # Without this guard str(value) would turn a missing entry into the
    # literal make 'None' or 'nan'.
    if pd.isna(value):
        return None
    # split(' ', 1) always yields at least one element, so index 0 is safe.
    return str(value).split(' ', 1)[0]

def extract_model(value):
    """Return the text after the first space of ``str(value)``.

    ``None`` is returned when the string contains no space at all.
    """
    _, separator, remainder = str(value).partition(' ')
    if not separator:
        return None
    return remainder

# Derive the two new columns from the combined column, then remove the
# combined column now that its content lives in Make and Model.
make_and_model = df['Make_n_Model']
df['Make'] = make_and_model.apply(extract_make)
df['Model'] = make_and_model.apply(extract_model)

df = df.drop('Make_n_Model', axis='columns')
df.head()


Unnamed: 0_level_0,CarNumber,Refund,Fines,Make,Model
ID,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
0,Y163O8161RUS,2.0,3200.0,Ford,Focus
1,E432XX77RUS,1.0,6500.0,Toyota,Camry
2,7184TT36RUS,1.0,2100.0,Ford,Focus
3,X582HE161RUS,2.0,2000.0,Ford,Focus
5,92918M178RUS,1.0,5700.0,Ford,Focus


## Save to JSON


In [12]:
# Pin the exported columns and their order, then write the table to disk
# in the 'records' JSON orientation and preview what was saved.
export_columns = ['CarNumber', 'Refund', 'Fines', 'Make', 'Model']
df_to_save = df[export_columns]
df_to_save.to_json('../data/auto.json', orient='records')
df_to_save.head()


Unnamed: 0_level_0,CarNumber,Refund,Fines,Make,Model
ID,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
0,Y163O8161RUS,2.0,3200.0,Ford,Focus
1,E432XX77RUS,1.0,6500.0,Toyota,Camry
2,7184TT36RUS,1.0,2100.0,Ford,Focus
3,X582HE161RUS,2.0,2000.0,Ford,Focus
5,92918M178RUS,1.0,5700.0,Ford,Focus
