## Preprocessing

In [1]:
import pandas as pd

In [2]:
# Read the raw records; the file's "ID" column becomes the row index.
df = pd.read_csv(
    filepath_or_buffer="data/auto.csv",
    index_col="ID",
)
df

Unnamed: 0_level_0,CarNumber,Make_n_model,Refund,Fines,History
ID,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
0,Y163O8161RUS,Ford Focus,2.0,3200.0,
1,E432XX77RUS,Toyota Camry,1.0,6500.0,
2,7184TT36RUS,Ford Focus,1.0,2100.0,
3,X582HE161RUS,Ford Focus,2.0,2000.0,
4,E34877152RUS,Ford Focus,2.0,6100.0,
...,...,...,...,...,...
926,Y163O8161RUS,Ford Focus,2.0,1600.0,
927,M0309X197RUS,Ford Focus,1.0,22300.0,
928,O673E8197RUS,Ford Focus,2.0,600.0,
929,8610T8154RUS,Ford Focus,1.0,2000.0,


### Count number of observations

In [3]:
# Non-null entries per column (the same figures DataFrame.count() reports).
df.notna().sum()

CarNumber       931
Make_n_model    931
Refund          914
Fines           869
History          82
dtype: int64

### Drop duplicates based on CarNumber, Make_n_model, Fines, keeping the last

In [4]:
# Rows that agree on plate number, make/model and fine amount count as one record;
# of each such group only the last occurrence is retained.
df = df.drop_duplicates(
    keep="last",
    subset=["CarNumber", "Make_n_model", "Fines"],
)
df.notna().sum()

CarNumber       725
Make_n_model    725
Refund          713
Fines           665
History          65
dtype: int64

### Check how many values are missing from each column

In [5]:
# Missing entries per column.
df.isnull().sum()

CarNumber         0
Make_n_model      0
Refund           12
Fines            60
History         660
dtype: int64

### Drop columns with more than 500 NaN values

In [6]:
# `thresh` is the minimum number of non-null entries a column needs in order to
# survive, so len(df) - 500 removes every column holding more than 500 NaN values.
min_non_null = len(df) - 500
df = df.dropna(axis="columns", thresh=min_non_null)
df.isnull().sum()

CarNumber        0
Make_n_model     0
Refund          12
Fines           60
dtype: int64

### Fill NaN values in the Refund column with the previous value

In [None]:
# Forward-fill: each missing Refund takes the value of the nearest preceding row
# that has one.
refund_filled = df["Refund"].ffill()
df["Refund"] = refund_filled
df.isnull().sum()


CarNumber        0
Make_n_model     0
Refund           0
Fines           60
dtype: int64

### Fill NaN values in the Fines column with the mean

In [13]:
# Impute missing fines with the column mean. Series.mean() skips NaN by default,
# so the mean is taken over the observed values only.
mean_fines = df['Fines'].mean()
df['Fines'] = df['Fines'].fillna(mean_fines)
# Verify the imputation explicitly: a bare `df.isna().sum()` in the middle of a cell
# is computed and then discarded, because only the last expression is rendered.
assert df['Fines'].notna().all(), "Fines still contains NaN after mean imputation"
# The value that was used as the fill, shown as the cell's output.
mean_fines

np.float64(8594.586466165412)

### Split the Make_n_model column into two: Make and Model

In [9]:
# Split on the first space only, so everything after the make stays together in Model.
# The vectorised .str accessor replaces a row-wise apply that constructed a pd.Series
# for every record; astype(str) mirrors the original str(x) coercion of each cell.
df[['Make', 'Model']] = (
    df['Make_n_model']
    .astype(str)
    .str.split(' ', n=1, expand=True)
)
df['Make']

ID
0        Ford
1      Toyota
2        Ford
3        Ford
5        Ford
        ...  
926      Ford
927      Ford
928      Ford
929      Ford
930    Toyota
Name: Make, Length: 725, dtype: object

In [10]:
# Inspect the Model column.
df.loc[:, "Model"]

ID
0        Focus
1        Camry
2        Focus
3        Focus
5        Focus
        ...   
926      Focus
927      Focus
928      Focus
929      Focus
930    Corolla
Name: Model, Length: 725, dtype: object

### Remove the Make_n_model column

In [11]:
# Discard the combined make/model column from the frame.
df = df.drop("Make_n_model", axis="columns")
df

Unnamed: 0_level_0,CarNumber,Refund,Fines,Make,Model
ID,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
0,Y163O8161RUS,2.0,3200.000000,Ford,Focus
1,E432XX77RUS,1.0,6500.000000,Toyota,Camry
2,7184TT36RUS,1.0,2100.000000,Ford,Focus
3,X582HE161RUS,2.0,2000.000000,Ford,Focus
5,92918M178RUS,1.0,5700.000000,Ford,Focus
...,...,...,...,...,...
926,Y163O8161RUS,2.0,1600.000000,Ford,Focus
927,M0309X197RUS,1.0,22300.000000,Ford,Focus
928,O673E8197RUS,2.0,600.000000,Ford,Focus
929,8610T8154RUS,1.0,2000.000000,Ford,Focus


### Save to JSON

In [12]:
# Serialise the frame as a JSON array with one object per row, indented by 4 spaces.
# NOTE(review): orient="records" does not write the ID index — confirm that is intended.
df.to_json(
    path_or_buf="auto.json",
    indent=4,
    orient="records",
)