## Exercise 03 - Selects and aggregations


In [1]:
import pandas as pd


## Load data and set index


In [2]:
# Read the JSON dataset and use the CarNumber column as the row index (one chained step).
df = pd.read_json('../data/auto.json').set_index('CarNumber')

# Preview the first rows of the indexed frame.
df.head()


Unnamed: 0_level_0,Refund,Fines,Make,Model
CarNumber,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
Y163O8161RUS,2,3200.0,Ford,Focus
E432XX77RUS,1,6500.0,Toyota,Camry
7184TT36RUS,1,2100.0,Ford,Focus
X582HE161RUS,2,2000.0,Ford,Focus
92918M178RUS,1,5700.0,Ford,Focus


## Selections


In [3]:
# Rows whose fine is strictly greater than 2100
sel_fines_gt_2100 = df.loc[df['Fines'].gt(2100)]
sel_fines_gt_2100


Unnamed: 0_level_0,Refund,Fines,Make,Model
CarNumber,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
Y163O8161RUS,2,3200.000000,Ford,Focus
E432XX77RUS,1,6500.000000,Toyota,Camry
92918M178RUS,1,5700.000000,Ford,Focus
H234YH197RUS,2,6000.000000,Ford,Focus
E40577152RUS,1,8594.586466,Ford,Focus
...,...,...,...,...
O718MM163RUS,2,8594.586466,Ford,Focus
7065C8197RUS,2,11400.000000,Volkswagen,Passat
O22097197RUS,1,24300.000000,Ford,Focus
M0309X197RUS,1,22300.000000,Ford,Focus


In [4]:
# Rows whose fine is strictly greater than 2100 and whose refund equals 2
sel_fines_gt_2100_refund_2 = df.loc[
    df['Fines'].gt(2100) & df['Refund'].eq(2)
]
sel_fines_gt_2100_refund_2


Unnamed: 0_level_0,Refund,Fines,Make,Model
CarNumber,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
Y163O8161RUS,2,3200.000000,Ford,Focus
H234YH197RUS,2,6000.000000,Ford,Focus
707987163RUS,2,2200.000000,Ford,Focus
K330T8197RUS,2,8200.000000,Skoda,Octavia
M592CH197RUS,2,8594.586466,Skoda,Octavia
...,...,...,...,...
O136HO197RUS,2,7800.000000,Toyota,Corolla
O68897197RUS,2,12300.000000,Ford,Focus
O718MM163RUS,2,8594.586466,Ford,Focus
7065C8197RUS,2,11400.000000,Volkswagen,Passat


In [5]:
# Rows whose model is one of the models listed below
models_list = ['Focus', 'Corolla']
sel_models_in_list = df.loc[df['Model'].isin(models_list)]
sel_models_in_list


Unnamed: 0_level_0,Refund,Fines,Make,Model
CarNumber,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
Y163O8161RUS,2,3200.000000,Ford,Focus
7184TT36RUS,1,2100.000000,Ford,Focus
X582HE161RUS,2,2000.000000,Ford,Focus
92918M178RUS,1,5700.000000,Ford,Focus
H234YH197RUS,2,6000.000000,Ford,Focus
...,...,...,...,...
Y163O8161RUS,2,1600.000000,Ford,Focus
M0309X197RUS,1,22300.000000,Ford,Focus
O673E8197RUS,2,600.000000,Ford,Focus
8610T8154RUS,1,2000.000000,Ford,Focus


In [6]:
# Rows whose car number (the frame's index) appears in the list below
carnumbers_list = [
    'Y7689C197RUS',
    '92928M178RUS',
    '7788KT197RUS',
    'H115YO163RUS',
    'X758HY197RUS',
]
# Boolean mask over the index keeps the rows in their original order.
sel_carnumbers_in_list = df.loc[df.index.isin(carnumbers_list)]
sel_carnumbers_in_list


Unnamed: 0_level_0,Refund,Fines,Make,Model
CarNumber,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
92928M178RUS,1,8594.586466,Ford,Focus
H115YO163RUS,1,2200.0,Ford,Focus
7788KT197RUS,2,12000.0,Ford,Focus
X758HY197RUS,2,24200.0,Ford,Focus
X758HY197RUS,2,72600.0,Ford,Focus
Y7689C197RUS,1,27000.0,Ford,Focus
92928M178RUS,1,600.0,Ford,Focus
H115YO163RUS,1,8594.586466,Ford,Focus
H115YO163RUS,2,1100.0,Ford,Focus
7788KT197RUS,2,8594.586466,Ford,Focus


## Aggregations by make and model


In [7]:
# Median fine per make
median_fines_by_make = (
    df.groupby('Make')['Fines']
    .median()
)
median_fines_by_make


Make
Audi          4200.0
BMW           6500.0
Ford          3500.0
Skoda         3250.0
Toyota        7700.0
Volkswagen    4300.0
Volvo         8500.0
Name: Fines, dtype: float64

In [8]:
# Median fine per (make, model) pair
# NOTE(review): groupby excludes rows whose key is missing (dropna=True by default),
# so rows without a Model value would not contribute to this result - confirm that
# this is intended.
median_fines_by_make_model = (
    df.groupby(['Make', 'Model'])['Fines']
    .median()
)
median_fines_by_make_model


Make        Model  
Ford        Focus      3500.0
            Mondeo     7650.0
Skoda       Octavia    3250.0
Toyota      Camry      7700.0
            Corolla    7700.0
Volkswagen  Golf       4800.0
            Jetta      2800.0
            Passat     3500.0
            Touareg    5800.0
Name: Fines, dtype: float64

In [9]:
# Number of non-null fines per (make, model) pair
# NOTE(review): groupby excludes rows whose key is missing (dropna=True by default),
# so rows without a Model value would not be counted here - confirm that this is intended.
count_fines_by_make_model = (
    df.groupby(['Make', 'Model'])['Fines']
    .count()
)
count_fines_by_make_model


Make        Model  
Ford        Focus      575
            Mondeo       6
Skoda       Octavia     48
Toyota      Camry       16
            Corolla     18
Volkswagen  Golf        20
            Jetta        6
            Passat      22
            Touareg      5
Name: Fines, dtype: int64

In [10]:
# Smallest and largest fine per (make, model) pair, one column per statistic
min_max_fines_by_make_model = (
    df.groupby(['Make', 'Model'])['Fines']
    .agg(['min', 'max'])
)
min_max_fines_by_make_model


Unnamed: 0_level_0,Unnamed: 1_level_0,min,max
Make,Model,Unnamed: 2_level_1,Unnamed: 3_level_1
Ford,Focus,100.0,180000.0
Ford,Mondeo,1100.0,46200.0
Skoda,Octavia,300.0,145000.0
Toyota,Camry,500.0,22400.0
Toyota,Corolla,900.0,34300.0
Volkswagen,Golf,200.0,168000.0
Volkswagen,Jetta,500.0,46000.0
Volkswagen,Passat,100.0,29700.0
Volkswagen,Touareg,500.0,8594.586466


In [11]:
# Standard deviation of fines per (make, model) pair
std_fines_by_make_model = (
    df.groupby(['Make', 'Model'])['Fines']
    .std()
)
std_fines_by_make_model


Make        Model  
Ford        Focus      15041.269437
            Mondeo     18987.329108
Skoda       Octavia    24339.742174
Toyota      Camry       6410.250654
            Corolla     9629.325617
Volkswagen  Golf       36950.839950
            Jetta      17743.026799
            Passat      6969.739135
            Touareg     3461.778173
Name: Fines, dtype: float64

## Aggregations by car number


In [12]:
# Number of fines per car number, largest count first
fines_count_by_car = (
    df.groupby(level='CarNumber')['Fines']
    .count()
    .sort_values(ascending=False)
)
fines_count_by_car


CarNumber
Y7689C197RUS    4
7788KT197RUS    4
92928M178RUS    4
Y7129Y50RUS     3
X758HY197RUS    3
               ..
Y967O8197RUS    1
Y965O8197RUS    1
Y965EE197RUS    1
Y964O8197RUS    1
Y964EE197RUS    1
Name: Fines, Length: 531, dtype: int64

In [13]:
# Zoom in: all rows for top-1 car number by count of fines
# Several car numbers can share the highest count; index[0] takes whichever one the
# descending sort placed first.
top1_car_by_count = fines_count_by_car.index[0] if not fines_count_by_car.empty else None
# Select with a one-element list so the result is always a DataFrame: a bare label
# makes .loc return a Series when that car number occurs on a single row.
top1_car_by_count_rows = df.loc[[top1_car_by_count]] if top1_car_by_count is not None else None
top1_car_by_count, top1_car_by_count_rows


('Y7689C197RUS',
               Refund    Fines  Make  Model
 CarNumber                                 
 Y7689C197RUS       1  27000.0  Ford  Focus
 Y7689C197RUS       2   9000.0  Ford  Focus
 Y7689C197RUS       2  45000.0  Ford  Focus
 Y7689C197RUS       1  36000.0  Ford  Focus)

In [14]:
# Total amount of fines per car number, largest total first
fines_sum_by_car = (
    df.groupby(level='CarNumber')['Fines']
    .sum()
    .sort_values(ascending=False)
)
fines_sum_by_car


CarNumber
X758HY197RUS    242000.0
9020YC197RUS    217500.0
M0279X197RUS    216000.0
Y352O8197RUS    207200.0
Y778EE197RUS    192000.0
                  ...   
Y166O8161RUS       100.0
K326T8197RUS       100.0
Y195O8161RUS       100.0
C58078163RUS       100.0
705787163RUS       100.0
Name: Fines, Length: 531, dtype: float64

In [15]:
# Zoom in: all rows for top-1 car number by sum of fines
top1_car_by_sum = fines_sum_by_car.index[0] if not fines_sum_by_car.empty else None
# Select with a one-element list so the result is always a DataFrame: a bare label
# makes .loc return a Series when that car number occurs on a single row (for example
# a car whose one large fine puts it at the top of the ranking).
top1_car_by_sum_rows = df.loc[[top1_car_by_sum]] if top1_car_by_sum is not None else None
top1_car_by_sum, top1_car_by_sum_rows


('X758HY197RUS',
               Refund     Fines  Make  Model
 CarNumber                                  
 X758HY197RUS       2   24200.0  Ford  Focus
 X758HY197RUS       2   72600.0  Ford  Focus
 X758HY197RUS       2  145200.0  Ford  Focus)

## Car numbers connected to different models


In [16]:
# Count the distinct models recorded under each car number and keep only the
# car numbers associated with more than one model.
models_per_car = df.groupby(level='CarNumber')['Model'].nunique()
cars_with_multiple_models = models_per_car.loc[models_per_car.gt(1)]

# Fetch the matching rows; fall back to an empty frame with the same columns
# when no car number qualifies.
if cars_with_multiple_models.empty:
    cars_with_multiple_models_rows = df.head(0)
else:
    cars_with_multiple_models_rows = df.loc[cars_with_multiple_models.index]

cars_with_multiple_models, cars_with_multiple_models_rows


(Series([], Name: Model, dtype: int64),
 Empty DataFrame
 Columns: [Refund, Fines, Make, Model]
 Index: [])