In [291]:
import pandas as pd
import numpy as np
import requests

### 0. Read the JSON file that you saved in ex02

In [292]:
# Render every float with exactly two decimals in displayed output.
pd.set_option('display.float_format', '{:.2f}'.format)
# Load the dataset that the previous exercise exported to JSON.
df = pd.read_json('../ex02/auto.json')
df

Unnamed: 0,CarNumber,Refund,Fines,Make,Model
0,Y163O8161RUS,2,3200.00,Ford,Focus
1,E432XX77RUS,1,6500.00,Toyota,Camry
2,7184TT36RUS,1,2100.00,Ford,Focus
3,X582HE161RUS,2,2000.00,Ford,Focus
4,92918M178RUS,1,5700.00,Ford,Focus
...,...,...,...,...,...
720,Y163O8161RUS,2,1600.00,Ford,Focus
721,M0309X197RUS,1,22300.00,Ford,Focus
722,O673E8197RUS,2,600.00,Ford,Focus
723,8610T8154RUS,1,2000.00,Ford,Focus


In [293]:
# Summary statistics of the fine amounts (count, mean, std, min, quartiles, max).
df['Fines'].describe()

count      725.00
mean      8516.01
std      16173.34
min        100.00
25%       1300.00
50%       4000.00
75%       8594.59
max     180000.00
Name: Fines, dtype: float64

### 1. Enrich the dataframe using a sample from that dataframe

In [294]:
# Draw a reproducible 200-row sample and renumber its index from zero.
sample = df.sample(n=200, random_state=21).reset_index(drop=True)
sample

Unnamed: 0,CarNumber,Refund,Fines,Make,Model
0,M0299X197RUS,2,19200.00,Ford,Focus
1,83298C154RUS,2,8594.59,Ford,Focus
2,H957HY161RUS,1,2000.00,Ford,Focus
3,T941CC96RUS,1,2000.00,Ford,Focus
4,H966HY161RUS,1,500.00,Ford,Focus
...,...,...,...,...,...
195,8182XX154RUS,1,200.00,Ford,Focus
196,X796TH96RUS,1,500.00,Ford,Focus
197,T011MY163RUS,2,4000.00,Ford,Focus
198,T341CC96RUS,2,1000.00,Volkswagen,Passat


> Change the `np.random.seed()` value so that the generated values differ from those obtained with seed 21

In [295]:
# Reseed NumPy's global generator, then draw one value from {1, 2} per row
# (the upper bound of randint is exclusive). The old value is ignored.
np.random.seed(42)
sample['Refund'] = sample['Refund'].map(lambda _old: np.random.randint(1, 3))
sample

Unnamed: 0,CarNumber,Refund,Fines,Make,Model
0,M0299X197RUS,1,19200.00,Ford,Focus
1,83298C154RUS,2,8594.59,Ford,Focus
2,H957HY161RUS,1,2000.00,Ford,Focus
3,T941CC96RUS,1,2000.00,Ford,Focus
4,H966HY161RUS,1,500.00,Ford,Focus
...,...,...,...,...,...
195,8182XX154RUS,2,200.00,Ford,Focus
196,X796TH96RUS,2,500.00,Ford,Focus
197,T011MY163RUS,2,4000.00,Ford,Focus
198,T341CC96RUS,1,1000.00,Volkswagen,Passat


In [296]:
# Replace each sampled fine with a random whole amount in [1300, 8594)
# (randint's upper bound is exclusive). The draws continue from the current
# state of NumPy's global generator, so the result depends on the seed set
# before this cell runs.
# The value is stored as a float rather than a pre-formatted string so that
# 'Fines' keeps a numeric dtype; two-decimal rendering is a display concern
# handled by pandas' float_format option, not by the data itself.
sample['Fines'] = sample['Fines'].apply(lambda _old: float(np.random.randint(1300, 8594)))
sample

Unnamed: 0,CarNumber,Refund,Fines,Make,Model
0,M0299X197RUS,1,2163.00,Ford,Focus
1,83298C154RUS,2,4090.00,Ford,Focus
2,H957HY161RUS,1,1863.00,Ford,Focus
3,T941CC96RUS,1,6416.00,Ford,Focus
4,H966HY161RUS,1,5491.00,Ford,Focus
...,...,...,...,...,...
195,8182XX154RUS,2,5733.00,Ford,Focus
196,X796TH96RUS,2,5077.00,Ford,Focus
197,T011MY163RUS,2,4169.00,Ford,Focus
198,T341CC96RUS,1,4534.00,Volkswagen,Passat


In [297]:
# Stack the modified sample underneath the original rows and rebuild a
# continuous 0..n-1 index for the combined frame.
concat_rows = pd.concat((df, sample), axis=0, ignore_index=True)
concat_rows

Unnamed: 0,CarNumber,Refund,Fines,Make,Model
0,Y163O8161RUS,2,3200.00,Ford,Focus
1,E432XX77RUS,1,6500.00,Toyota,Camry
2,7184TT36RUS,1,2100.00,Ford,Focus
3,X582HE161RUS,2,2000.00,Ford,Focus
4,92918M178RUS,1,5700.00,Ford,Focus
...,...,...,...,...,...
920,8182XX154RUS,2,5733.00,Ford,Focus
921,X796TH96RUS,2,5077.00,Ford,Focus
922,T011MY163RUS,2,4169.00,Ford,Focus
923,T341CC96RUS,1,4534.00,Volkswagen,Passat


### 2. Enrich the dataframe concat_rows by a new column with the data generated

In [298]:
# Add a 'Year' column of random integers in [1980, 2020) (upper bound
# exclusive), reproducible thanks to the fixed seed.
# The number of draws is taken from the frame itself instead of a hard-coded
# row count, so the assignment cannot fail with a length mismatch if the
# upstream data or sample size changes.
np.random.seed(21)
concat_rows['Year'] = np.random.randint(1980, 2020, size=len(concat_rows))
concat_rows

Unnamed: 0,CarNumber,Refund,Fines,Make,Model,Year
0,Y163O8161RUS,2,3200.00,Ford,Focus,1989
1,E432XX77RUS,1,6500.00,Toyota,Camry,1995
2,7184TT36RUS,1,2100.00,Ford,Focus,1984
3,X582HE161RUS,2,2000.00,Ford,Focus,2015
4,92918M178RUS,1,5700.00,Ford,Focus,2014
...,...,...,...,...,...,...
920,8182XX154RUS,2,5733.00,Ford,Focus,1981
921,X796TH96RUS,2,5077.00,Ford,Focus,1992
922,T011MY163RUS,2,4169.00,Ford,Focus,2007
923,T341CC96RUS,1,4534.00,Volkswagen,Passat,2005


### 3. Enrich the dataframe with the data from another dataframe

* Create a new dataframe with the car numbers and their owners

In [299]:
# Load the surname table. Its first row holds the real column labels, so it is
# promoted to the header and then excluded from the data rows.
surnames_raw = pd.read_json('surname.json')
surnames_raw.columns = surnames_raw.iloc[0]
# Keep only the name column, exposed under the label 'SURNAME'.
surnames = (
    surnames_raw.iloc[1:]
    .drop(columns=['COUNT', 'RANK'])
    .rename(columns={'NAME': 'SURNAME'})
)
surnames

Unnamed: 0,SURNAME
1,ADAMS
2,ALLEN
3,ALVAREZ
4,ANDERSON
5,BAILEY
...,...
96,WILLIAMS
97,WILSON
98,WOOD
99,WRIGHT


> Surnames should not contain special characters such as commas, brackets, etc.

In [300]:
# Select surnames containing any character outside A-Z / a-z; an empty result
# means every surname consists of plain letters only.
has_non_letter = surnames['SURNAME'].str.contains('[^a-zA-Z]')
surnames[has_non_letter]

Unnamed: 0,SURNAME


> Unique Car Numbers in source table

In [301]:
# Number of distinct (non-null) car numbers in the source table.
uniq_nums = df.CarNumber.nunique(dropna=True)
uniq_nums

531

In [302]:
# One row per distinct car number, in order of first appearance.
distinct_cars = df.drop_duplicates('CarNumber', ignore_index=True)['CarNumber']
# Draw one surname per car; sampling with replacement allows the surname list
# to be shorter than the number of cars.
drawn_surnames = surnames.sample(n=uniq_nums, replace=True, random_state=21, ignore_index=True)['SURNAME']
# Both series carry a fresh 0..n-1 index, so they pair up row by row.
owners_df = pd.DataFrame({'CarNumber': distinct_cars, 'SURNAME': drawn_surnames})
owners_df

Unnamed: 0,CarNumber,SURNAME
0,Y163O8161RUS,RICHARDSON
1,E432XX77RUS,ROSS
2,7184TT36RUS,MORGAN
3,X582HE161RUS,BAILEY
4,92918M178RUS,LOPEZ
...,...,...
526,O136HO197RUS,CAMPBELL
527,O22097197RUS,HALL
528,M0309X197RUS,BAKER
529,O673E8197RUS,DIAZ


In [303]:
# Five hand-written observations, written row by row for readability and then
# transposed into the column-oriented mapping that the DataFrame is built from.
extra_columns = ['CarNumber', 'Refund', 'Fines', 'Make', 'Model', 'Year']
extra_rows = [
    ('M777EP777RUS', 0, 0, 'Aurus', 'Senat', 2018),
    ('L007OX07RUS', 3, 20000, 'Nissan', 'Juke', 2010),
    ('G040AY300RUS', 3, 50000, 'MINI', 'Cooper', 2001),
    ('A802YE666RUS', 3, 6666, 'Daewoo', 'Matiz', 1998),
    ('X017CM69RUS', 3, 17.69, 'Mazda', 'RX-7', 2002),
]
tmp5 = pd.DataFrame({label: list(values)
                     for label, values in zip(extra_columns, zip(*extra_rows))})
tmp5

Unnamed: 0,CarNumber,Refund,Fines,Make,Model,Year
0,M777EP777RUS,0,0.0,Aurus,Senat,2018
1,L007OX07RUS,3,20000.0,Nissan,Juke,2010
2,G040AY300RUS,3,50000.0,MINI,Cooper,2001
3,A802YE666RUS,3,6666.0,Daewoo,Matiz,1998
4,X017CM69RUS,3,17.69,Mazda,RX-7,2002


In [304]:
# Append the hand-written observations to the enriched rows and renumber the
# index so it runs continuously across both parts.
fines_df = pd.concat((concat_rows, tmp5), axis=0, ignore_index=True)
fines_df

Unnamed: 0,CarNumber,Refund,Fines,Make,Model,Year
0,Y163O8161RUS,2,3200.00,Ford,Focus,1989
1,E432XX77RUS,1,6500.00,Toyota,Camry,1995
2,7184TT36RUS,1,2100.00,Ford,Focus,1984
3,X582HE161RUS,2,2000.00,Ford,Focus,2015
4,92918M178RUS,1,5700.00,Ford,Focus,2014
...,...,...,...,...,...,...
925,M777EP777RUS,0,0.00,Aurus,Senat,2018
926,L007OX07RUS,3,20000.00,Nissan,Juke,2010
927,G040AY300RUS,3,50000.00,MINI,Cooper,2001
928,A802YE666RUS,3,6666.00,Daewoo,Matiz,1998


In [305]:
# Drop the last 20 owners and append three hand-made ones, so that the owners
# and fines frames no longer cover exactly the same set of car numbers.
extra_owners = pd.DataFrame({'CarNumber': ['B004KO21RUS', 'O010AO702RUS', 'K007XP21RUS'],
                             'SURNAME': ['USTAL', 'PRIDUMIVAT', 'NOMERA']})
# This cell rebuilds owners_df from itself. Without a guard, every re-run would
# cut another 20 rows and append the extra owners again. Skip the update when
# the extra car numbers are already present so the cell is safe to re-execute.
if not owners_df['CarNumber'].isin(extra_owners['CarNumber']).any():
    owners_df = pd.concat([owners_df.iloc[:-20], extra_owners], ignore_index=True)
owners_df

Unnamed: 0,CarNumber,SURNAME
0,Y163O8161RUS,RICHARDSON
1,E432XX77RUS,ROSS
2,7184TT36RUS,MORGAN
3,X582HE161RUS,BAILEY
4,92918M178RUS,LOPEZ
...,...,...
509,O50197197RUS,WRIGHT
510,7608EE777RUS,HILL
511,B004KO21RUS,USTAL
512,O010AO702RUS,PRIDUMIVAT


Create 4 dataframes:
* the new dataframe should have only the car numbers that exist in both dataframes
* the new dataframe should have all the car numbers that exist in either dataframe (the union of both)
* the new dataframe should have only the car numbers from the fines dataframe
* the new dataframe should have only the car numbers from the owners dataframe

1. New dataframe 1 (car numbers that exist in both dataframes)

In [306]:
# Inner join: keep only rows whose car number appears in both frames.
new_df1 = pd.merge(fines_df, owners_df, how='inner', on='CarNumber')
new_df1

Unnamed: 0,CarNumber,Refund,Fines,Make,Model,Year,SURNAME
0,Y163O8161RUS,2,3200.00,Ford,Focus,1989,RICHARDSON
1,Y163O8161RUS,2,1600.00,Ford,Focus,1980,RICHARDSON
2,E432XX77RUS,1,6500.00,Toyota,Camry,1995,ROSS
3,E432XX77RUS,2,13000.00,Toyota,Camry,2018,ROSS
4,7184TT36RUS,1,2100.00,Ford,Focus,1984,MORGAN
...,...,...,...,...,...,...,...
894,E41977152RUS,2,2400.00,Ford,Focus,1989,BAKER
895,9464EX178RUS,2,2100.00,Ford,Focus,1988,MARTIN
896,O50197197RUS,2,7800.00,Ford,Focus,1992,WRIGHT
897,7608EE777RUS,1,4000.00,Skoda,Octavia,2000,HILL


2. New dataframe 2 (all the car numbers that exist in either dataframe)

In [307]:
# Outer join: keep every car number from either frame; columns coming from the
# side where a car number is missing are filled with NaN.
new_df2 = pd.merge(fines_df, owners_df, how='outer', on='CarNumber')
new_df2

Unnamed: 0,CarNumber,Refund,Fines,Make,Model,Year,SURNAME
0,Y163O8161RUS,2.00,3200.00,Ford,Focus,1989.00,RICHARDSON
1,Y163O8161RUS,2.00,1600.00,Ford,Focus,1980.00,RICHARDSON
2,E432XX77RUS,1.00,6500.00,Toyota,Camry,1995.00,ROSS
3,E432XX77RUS,2.00,13000.00,Toyota,Camry,2018.00,ROSS
4,7184TT36RUS,1.00,2100.00,Ford,Focus,1984.00,MORGAN
...,...,...,...,...,...,...,...
928,A802YE666RUS,3.00,6666.00,Daewoo,Matiz,1998.00,
929,X017CM69RUS,3.00,17.69,Mazda,RX-7,2002.00,
930,B004KO21RUS,,,,,,USTAL
931,O010AO702RUS,,,,,,PRIDUMIVAT


3. New dataframe 3 (only the car numbers from the fines dataframe)

In [308]:
# Left join: keep every row of the fines frame; SURNAME is NaN where the car
# number has no matching owner.
new_df3 = pd.merge(fines_df, owners_df, how='left', on='CarNumber')
new_df3

Unnamed: 0,CarNumber,Refund,Fines,Make,Model,Year,SURNAME
0,Y163O8161RUS,2,3200.00,Ford,Focus,1989,RICHARDSON
1,E432XX77RUS,1,6500.00,Toyota,Camry,1995,ROSS
2,7184TT36RUS,1,2100.00,Ford,Focus,1984,MORGAN
3,X582HE161RUS,2,2000.00,Ford,Focus,2015,BAILEY
4,92918M178RUS,1,5700.00,Ford,Focus,2014,LOPEZ
...,...,...,...,...,...,...,...
925,M777EP777RUS,0,0.00,Aurus,Senat,2018,
926,L007OX07RUS,3,20000.00,Nissan,Juke,2010,
927,G040AY300RUS,3,50000.00,MINI,Cooper,2001,
928,A802YE666RUS,3,6666.00,Daewoo,Matiz,1998,


4. New dataframe 4 (only the car numbers from the owners dataframe)

In [309]:
# Right join: keep every car number of the owners frame; the fines columns are
# NaN where an owner's car has no recorded fine.
new_df4 = pd.merge(fines_df, owners_df, how='right', on='CarNumber')
new_df4

Unnamed: 0,CarNumber,Refund,Fines,Make,Model,Year,SURNAME
0,Y163O8161RUS,2.00,3200.00,Ford,Focus,1989.00,RICHARDSON
1,Y163O8161RUS,2.00,1600.00,Ford,Focus,1980.00,RICHARDSON
2,E432XX77RUS,1.00,6500.00,Toyota,Camry,1995.00,ROSS
3,E432XX77RUS,2.00,13000.00,Toyota,Camry,2018.00,ROSS
4,7184TT36RUS,1.00,2100.00,Ford,Focus,1984.00,MORGAN
...,...,...,...,...,...,...,...
897,7608EE777RUS,1.00,4000.00,Skoda,Octavia,2000.00,HILL
898,7608EE777RUS,1.00,2948.00,Skoda,Octavia,1991.00,HILL
899,B004KO21RUS,,,,,,USTAL
900,O010AO702RUS,,,,,,PRIDUMIVAT


### 4. Create a pivot table from the fines dataframe

In [310]:
# Display the fines dataframe that the pivot table below is built from.
fines_df

Unnamed: 0,CarNumber,Refund,Fines,Make,Model,Year
0,Y163O8161RUS,2,3200.00,Ford,Focus,1989
1,E432XX77RUS,1,6500.00,Toyota,Camry,1995
2,7184TT36RUS,1,2100.00,Ford,Focus,1984
3,X582HE161RUS,2,2000.00,Ford,Focus,2015
4,92918M178RUS,1,5700.00,Ford,Focus,2014
...,...,...,...,...,...,...
925,M777EP777RUS,0,0.00,Aurus,Senat,2018
926,L007OX07RUS,3,20000.00,Nissan,Juke,2010
927,G040AY300RUS,3,50000.00,MINI,Cooper,2001
928,A802YE666RUS,3,6666.00,Daewoo,Matiz,1998


In [317]:
# Make sure 'Fines' is numeric before aggregating (values stored as text are
# parsed; anything unparseable raises). The column is deliberately left as
# float64 instead of being downcast: float32 carries only about 7 significant
# digits, which is not enough to keep the cents exact once amounts are summed
# into the hundreds of thousands.
fines_df['Fines'] = pd.to_numeric(fines_df['Fines'])

In [318]:
# Total fines per (Make, Model) row and per Year column; combinations with no
# fines show as NaN. The aggregation is named by the string 'sum' because
# passing the NumPy callable np.sum is deprecated in recent pandas versions and
# emits a FutureWarning.
pd.pivot_table(fines_df, values='Fines', index=['Make', 'Model'], columns='Year', aggfunc='sum')

Unnamed: 0_level_0,Year,1980,1981,1982,1983,1984,1985,1986,1987,1988,1989,...,2010,2011,2012,2013,2014,2015,2016,2017,2018,2019
Make,Model,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1,Unnamed: 22_level_1
Aurus,Senat,,,,,,,,,,,...,,,,,,,,,0.0,
Daewoo,Matiz,,,,,,,,,,,...,,,,,,,,,,
Ford,Focus,64798.59,394317.19,144899.77,68842.0,95294.59,125887.76,91150.59,92759.0,102105.59,68625.0,...,122427.17,95426.17,98373.0,167018.59,123250.59,214765.0,84630.59,263641.0,274406.59,71347.0
Ford,Mondeo,,,,,,,,,,8600.0,...,,,34400.0,,,,46200.0,,,
MINI,Cooper,,,,,,,,,,,...,,,,,,,,,,
Mazda,RX-7,,,,,,,,,,,...,,,,,,,,,,
Nissan,Juke,,,,,,,,,,,...,20000.0,,,,,,,,,
Skoda,Octavia,6131.0,,14816.0,11594.59,,10294.59,600.0,5200.0,7319.0,91400.0,...,3100.0,500.0,500.0,12594.59,300.0,46394.59,300.0,6985.0,156200.0,9500.0
Toyota,Camry,12000.0,8594.59,,7200.0,,,,,,22400.0,...,,,8594.59,,6834.0,,,,19579.0,18100.0
Toyota,Corolla,,,2000.0,,,,,12604.0,,4000.0,...,24000.0,8594.59,4996.0,,,,4927.0,9600.0,4882.0,


In [319]:
# Persist both frames as CSV files in the working directory, without writing
# the row index as a column.
for frame, csv_path in ((fines_df, 'fines.csv'), (owners_df, 'owners.csv')):
    frame.to_csv(csv_path, index=False)