In [1]:
import pandas as pd
import numpy as np
import requests

## read the JSON file that you saved in ex02

In [2]:
df = pd.read_json("../data/auto.json", orient="records")
df

Unnamed: 0,CarNumber,Refund,Fines,Make,Model
0,Y163O8161RUS,2,3200.000000,Ford,Focus
1,E432XX77RUS,1,6500.000000,Toyota,Camry
2,7184TT36RUS,1,2100.000000,Ford,Focus
3,X582HE161RUS,2,2000.000000,Ford,Focus
4,92918M178RUS,1,5700.000000,Ford,Focus
...,...,...,...,...,...
720,Y163O8161RUS,2,1600.000000,Ford,Focus
721,M0309X197RUS,1,22300.000000,Ford,Focus
722,O673E8197RUS,2,600.000000,Ford,Focus
723,8610T8154RUS,1,2000.000000,Ford,Focus


In [3]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 725 entries, 0 to 724
Data columns (total 5 columns):
 #   Column     Non-Null Count  Dtype  
---  ------     --------------  -----  
 0   CarNumber  725 non-null    object 
 1   Refund     725 non-null    int64  
 2   Fines      725 non-null    float64
 3   Make       725 non-null    object 
 4   Model      716 non-null    object 
dtypes: float64(1), int64(1), object(3)
memory usage: 28.4+ KB


## enrich the dataframe using a sample from that dataframe

In [4]:
pd.options.display.float_format = "{:.2f}".format
df

Unnamed: 0,CarNumber,Refund,Fines,Make,Model
0,Y163O8161RUS,2,3200.00,Ford,Focus
1,E432XX77RUS,1,6500.00,Toyota,Camry
2,7184TT36RUS,1,2100.00,Ford,Focus
3,X582HE161RUS,2,2000.00,Ford,Focus
4,92918M178RUS,1,5700.00,Ford,Focus
...,...,...,...,...,...
720,Y163O8161RUS,2,1600.00,Ford,Focus
721,M0309X197RUS,1,22300.00,Ford,Focus
722,O673E8197RUS,2,600.00,Ford,Focus
723,8610T8154RUS,1,2000.00,Ford,Focus


In [5]:
df_sample = df[["CarNumber", "Make", "Model"]].sample(200, random_state=21)
df_sample["Refund"] = df["Refund"].sample(200, random_state=22).values
df_sample["Fines"] = df["Fines"].sample(200, random_state=23).values
df_sample

Unnamed: 0,CarNumber,Make,Model,Refund,Fines
445,M0299X197RUS,Ford,Focus,1,8594.59
22,83298C154RUS,Ford,Focus,2,4500.00
93,H957HY161RUS,Ford,Focus,1,500.00
173,T941CC96RUS,Ford,Focus,2,1000.00
697,H966HY161RUS,Ford,Focus,1,900.00
...,...,...,...,...,...
14,8182XX154RUS,Ford,Focus,2,300.00
623,X796TH96RUS,Ford,Focus,2,2000.00
498,T011MY163RUS,Ford,Focus,1,8594.59
536,T341CC96RUS,Volkswagen,Passat,1,2900.00


In [6]:
concat_rows = pd.concat([df, df_sample])
concat_rows.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 925 entries, 0 to 520
Data columns (total 5 columns):
 #   Column     Non-Null Count  Dtype  
---  ------     --------------  -----  
 0   CarNumber  925 non-null    object 
 1   Refund     925 non-null    int64  
 2   Fines      925 non-null    float64
 3   Make       925 non-null    object 
 4   Model      914 non-null    object 
dtypes: float64(1), int64(1), object(3)
memory usage: 43.4+ KB


## enrich the dataframe concat_rows by a new column with the data generated

In [7]:
np.random.seed(21) 

In [8]:
Year = pd.Series(np.random.randint(1980, 2020, len(concat_rows)), name="Year")
Year

0      1989
1      1995
2      1984
3      2015
4      2014
       ... 
920    1981
921    1992
922    2007
923    2005
924    1997
Name: Year, Length: 925, dtype: int64

In [9]:
fines = concat_rows.copy()
fines["Year"] = Year
fines

Unnamed: 0,CarNumber,Refund,Fines,Make,Model,Year
0,Y163O8161RUS,2,3200.00,Ford,Focus,1989
1,E432XX77RUS,1,6500.00,Toyota,Camry,1995
2,7184TT36RUS,1,2100.00,Ford,Focus,1984
3,X582HE161RUS,2,2000.00,Ford,Focus,2015
4,92918M178RUS,1,5700.00,Ford,Focus,2014
...,...,...,...,...,...,...
14,8182XX154RUS,2,300.00,Ford,Focus,2017
623,X796TH96RUS,2,2000.00,Ford,Focus,2009
498,T011MY163RUS,1,8594.59,Ford,Focus,2002
536,T341CC96RUS,1,2900.00,Volkswagen,Passat,1999


## enrich the dataframe with the data from another dataframe

In [10]:
df_surname = pd.read_json("../data/surname.json")
df_surname

Unnamed: 0,0,1,2
0,NAME,COUNT,RANK
1,ADAMS,427865,42
2,ALLEN,482607,33
3,ALVAREZ,233983,92
4,ANDERSON,784404,15
...,...,...,...
96,WILLIAMS,1625252,3
97,WILSON,801882,14
98,WOOD,250715,84
99,WRIGHT,458980,35


In [11]:
df_surname.loc[0]

0     NAME
1    COUNT
2     RANK
Name: 0, dtype: object

In [12]:
df_surname.columns = df_surname.loc[0]
df_surname

Unnamed: 0,NAME,COUNT,RANK
0,NAME,COUNT,RANK
1,ADAMS,427865,42
2,ALLEN,482607,33
3,ALVAREZ,233983,92
4,ANDERSON,784404,15
...,...,...,...
96,WILLIAMS,1625252,3
97,WILSON,801882,14
98,WOOD,250715,84
99,WRIGHT,458980,35


In [13]:
df_surname.drop(0, inplace=True)
df_surname

Unnamed: 0,NAME,COUNT,RANK
1,ADAMS,427865,42
2,ALLEN,482607,33
3,ALVAREZ,233983,92
4,ANDERSON,784404,15
5,BAILEY,277845,72
...,...,...,...
96,WILLIAMS,1625252,3
97,WILSON,801882,14
98,WOOD,250715,84
99,WRIGHT,458980,35


In [14]:
df_surname["COUNT"] = df_surname["COUNT"].astype(int)
df_surname["RANK"] = df_surname["RANK"].astype(int)

In [15]:
df_surname.sort_values("COUNT", inplace=True, ascending=False)
df_surname.reset_index(inplace=True, drop=True)
df_surname

Unnamed: 0,NAME,COUNT,RANK
0,SMITH,2442977,1
1,JOHNSON,1932812,2
2,WILLIAMS,1625252,3
3,BROWN,1437026,4
4,JONES,1425470,5
...,...,...,...
95,MYERS,229895,96
96,LONG,229374,97
97,ROSS,229368,98
98,FOSTER,227764,99


In [16]:
df_surname[df_surname["NAME"].str.isalpha()==True]

Unnamed: 0,NAME,COUNT,RANK
0,SMITH,2442977,1
1,JOHNSON,1932812,2
2,WILLIAMS,1625252,3
3,BROWN,1437026,4
4,JONES,1425470,5
...,...,...,...
95,MYERS,229895,96
96,LONG,229374,97
97,ROSS,229368,98
98,FOSTER,227764,99


In [17]:
len(fines["CarNumber"].unique())

531

In [18]:
fines["CarNumber"].value_counts()

7788KT197RUS    7
K2797K154RUS    6
H115YO163RUS    6
O718MM163RUS    5
X256HE161RUS    5
               ..
Y207O8161RUS    1
M578CH197RUS    1
M302CH161RUS    1
9671EC178RUS    1
8610T8154RUS    1
Name: CarNumber, Length: 531, dtype: int64

In [19]:
fines["CarNumber"].nunique()

531

In [20]:
surnames = df_surname["NAME"].sample(len(fines["CarNumber"].unique()), replace=True, random_state=21)\
                .reset_index(drop=True)
surnames.name = "SURNAME"
surnames

0           KELLY
1      RICHARDSON
2            CRUZ
3           JONES
4          CARTER
          ...    
526      MARTINEZ
527         YOUNG
528        GARCIA
529           LEE
530          CRUZ
Name: SURNAME, Length: 531, dtype: object

In [21]:
owners = pd.concat([pd.Series(fines["CarNumber"].unique(), name="CarNumber").reset_index(drop=True), surnames], axis=1)
owners

Unnamed: 0,CarNumber,SURNAME
0,Y163O8161RUS,KELLY
1,E432XX77RUS,RICHARDSON
2,7184TT36RUS,CRUZ
3,X582HE161RUS,JONES
4,92918M178RUS,CARTER
...,...,...
526,O136HO197RUS,MARTINEZ
527,O22097197RUS,YOUNG
528,M0309X197RUS,GARCIA
529,O673E8197RUS,LEE


In [22]:
fines_5 = pd.DataFrame({'CarNumber': ["a000aa16RUS", "a111aa16RUS","a222aa16RUS","a333aa16RUS","a444aa16RUS"]})
fines_5

Unnamed: 0,CarNumber
0,a000aa16RUS
1,a111aa16RUS
2,a222aa16RUS
3,a333aa16RUS
4,a444aa16RUS


In [23]:
fines.count()

CarNumber    925
Refund       925
Fines        925
Make         925
Model        914
Year         925
dtype: int64

In [24]:
for col in fines.columns[1:]:
    fines_5[col] = fines[col].sample(5, random_state=21).reset_index(drop=True)
fines_5   

Unnamed: 0,CarNumber,Refund,Fines,Make,Model,Year
0,a000aa16RUS,1,500.0,Ford,Focus,2014
1,a111aa16RUS,2,15200.0,Ford,Focus,2016
2,a222aa16RUS,1,500.0,Ford,Focus,1994
3,a333aa16RUS,2,8594.59,Ford,Focus,1982
4,a444aa16RUS,2,600.0,Ford,Focus,1987


In [25]:
fines = fines.append(fines_5).reset_index(drop=True)
fines

Unnamed: 0,CarNumber,Refund,Fines,Make,Model,Year
0,Y163O8161RUS,2,3200.00,Ford,Focus,1989
1,E432XX77RUS,1,6500.00,Toyota,Camry,1995
2,7184TT36RUS,1,2100.00,Ford,Focus,1984
3,X582HE161RUS,2,2000.00,Ford,Focus,2015
4,92918M178RUS,1,5700.00,Ford,Focus,2014
...,...,...,...,...,...,...
925,a000aa16RUS,1,500.00,Ford,Focus,2014
926,a111aa16RUS,2,15200.00,Ford,Focus,2016
927,a222aa16RUS,1,500.00,Ford,Focus,1994
928,a333aa16RUS,2,8594.59,Ford,Focus,1982


In [26]:
owners = owners.head(len(owners) - 20)
owners

Unnamed: 0,CarNumber,SURNAME
0,Y163O8161RUS,KELLY
1,E432XX77RUS,RICHARDSON
2,7184TT36RUS,CRUZ
3,X582HE161RUS,JONES
4,92918M178RUS,CARTER
...,...,...
506,T914CT197RUS,KING
507,E41977152RUS,GARCIA
508,9464EX178RUS,ROBERTS
509,O50197197RUS,FOSTER


In [27]:
fines 

Unnamed: 0,CarNumber,Refund,Fines,Make,Model,Year
0,Y163O8161RUS,2,3200.00,Ford,Focus,1989
1,E432XX77RUS,1,6500.00,Toyota,Camry,1995
2,7184TT36RUS,1,2100.00,Ford,Focus,1984
3,X582HE161RUS,2,2000.00,Ford,Focus,2015
4,92918M178RUS,1,5700.00,Ford,Focus,2014
...,...,...,...,...,...,...
925,a000aa16RUS,1,500.00,Ford,Focus,2014
926,a111aa16RUS,2,15200.00,Ford,Focus,2016
927,a222aa16RUS,1,500.00,Ford,Focus,1994
928,a333aa16RUS,2,8594.59,Ford,Focus,1982


In [28]:
owners_3 = pd.DataFrame({'CarNumber': ["b000bb16RUS", "b111bb16RUS","b222bb16RUS"]})
for col in owners.columns[1:]:
    owners_3[col] = owners[col].sample(3, random_state=21).reset_index(drop=True)
owners_3 

Unnamed: 0,CarNumber,SURNAME
0,b000bb16RUS,SMITH
1,b111bb16RUS,ROBERTS
2,b222bb16RUS,COX


In [29]:
owners = owners.append(owners_3).reset_index(drop=True)

In [30]:
owners.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 514 entries, 0 to 513
Data columns (total 2 columns):
 #   Column     Non-Null Count  Dtype 
---  ------     --------------  ----- 
 0   CarNumber  514 non-null    object
 1   SURNAME    514 non-null    object
dtypes: object(2)
memory usage: 8.2+ KB


In [31]:
fines.merge(owners, how="inner", on="CarNumber")

Unnamed: 0,CarNumber,Refund,Fines,Make,Model,Year,SURNAME
0,Y163O8161RUS,2,3200.00,Ford,Focus,1989,KELLY
1,Y163O8161RUS,2,1600.00,Ford,Focus,1980,KELLY
2,E432XX77RUS,1,6500.00,Toyota,Camry,1995,RICHARDSON
3,E432XX77RUS,2,13000.00,Toyota,Camry,2018,RICHARDSON
4,7184TT36RUS,1,2100.00,Ford,Focus,1984,CRUZ
...,...,...,...,...,...,...,...
894,E41977152RUS,2,2400.00,Ford,Focus,1989,GARCIA
895,9464EX178RUS,2,2100.00,Ford,Focus,1988,ROBERTS
896,O50197197RUS,2,7800.00,Ford,Focus,1992,FOSTER
897,7608EE777RUS,1,4000.00,Skoda,Octavia,2000,WRIGHT


In [32]:
fines.merge(owners, how="outer", on="CarNumber")

Unnamed: 0,CarNumber,Refund,Fines,Make,Model,Year,SURNAME
0,Y163O8161RUS,2.00,3200.00,Ford,Focus,1989.00,KELLY
1,Y163O8161RUS,2.00,1600.00,Ford,Focus,1980.00,KELLY
2,E432XX77RUS,1.00,6500.00,Toyota,Camry,1995.00,RICHARDSON
3,E432XX77RUS,2.00,13000.00,Toyota,Camry,2018.00,RICHARDSON
4,7184TT36RUS,1.00,2100.00,Ford,Focus,1984.00,CRUZ
...,...,...,...,...,...,...,...
928,a333aa16RUS,2.00,8594.59,Ford,Focus,1982.00,
929,a444aa16RUS,2.00,600.00,Ford,Focus,1987.00,
930,b000bb16RUS,,,,,,SMITH
931,b111bb16RUS,,,,,,ROBERTS


In [33]:
fines.merge(owners, how="left", on="CarNumber")

Unnamed: 0,CarNumber,Refund,Fines,Make,Model,Year,SURNAME
0,Y163O8161RUS,2,3200.00,Ford,Focus,1989,KELLY
1,E432XX77RUS,1,6500.00,Toyota,Camry,1995,RICHARDSON
2,7184TT36RUS,1,2100.00,Ford,Focus,1984,CRUZ
3,X582HE161RUS,2,2000.00,Ford,Focus,2015,JONES
4,92918M178RUS,1,5700.00,Ford,Focus,2014,CARTER
...,...,...,...,...,...,...,...
925,a000aa16RUS,1,500.00,Ford,Focus,2014,
926,a111aa16RUS,2,15200.00,Ford,Focus,2016,
927,a222aa16RUS,1,500.00,Ford,Focus,1994,
928,a333aa16RUS,2,8594.59,Ford,Focus,1982,


In [34]:
fines.merge(owners, how="right", on="CarNumber")

Unnamed: 0,CarNumber,Refund,Fines,Make,Model,Year,SURNAME
0,Y163O8161RUS,2.00,3200.00,Ford,Focus,1989.00,KELLY
1,Y163O8161RUS,2.00,1600.00,Ford,Focus,1980.00,KELLY
2,E432XX77RUS,1.00,6500.00,Toyota,Camry,1995.00,RICHARDSON
3,E432XX77RUS,2.00,13000.00,Toyota,Camry,2018.00,RICHARDSON
4,7184TT36RUS,1.00,2100.00,Ford,Focus,1984.00,CRUZ
...,...,...,...,...,...,...,...
897,7608EE777RUS,1.00,4000.00,Skoda,Octavia,2000.00,WRIGHT
898,7608EE777RUS,2.00,3000.00,Skoda,Octavia,2000.00,WRIGHT
899,b000bb16RUS,,,,,,SMITH
900,b111bb16RUS,,,,,,ROBERTS


##  create a pivot table from the fines dataframe, it should look like this (the values are the sums of the fines), but with all the years

In [35]:
pivot = pd.pivot_table(fines, values="Fines", index=["Make", "Model"], columns="Year", aggfunc="sum")
pivot

Unnamed: 0_level_0,Year,1980,1981,1982,1983,1984,1985,1986,1987,1988,1989,...,2010,2011,2012,2013,2014,2015,2016,2017,2018,2019
Make,Model,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1,Unnamed: 22_level_1
Ford,Focus,61894.59,458483.76,159372.93,54500.0,110794.59,138383.76,111294.59,73900.0,96094.59,96900.0,...,106589.17,111789.17,98100.0,173489.17,263389.17,223200.0,93889.17,299394.59,276183.76,75700.0
Ford,Mondeo,,,,,,,,,,8600.0,...,,,34400.0,,,,46200.0,,,
Skoda,Octavia,1900.0,,6900.0,19694.59,,28294.59,600.0,23300.0,,96200.0,...,6100.0,3500.0,500.0,12594.59,300.0,46394.59,300.0,,185394.59,9500.0
Toyota,Camry,12000.0,10194.59,,7200.0,,,,,,24000.0,...,,,8594.59,,,,,,13000.0,18100.0
Toyota,Corolla,,,2000.0,,,,,8000.0,,4000.0,...,24000.0,8594.59,,,,,,9600.0,,
Volkswagen,Golf,38100.0,,,8594.59,300.0,24000.0,,9300.0,,5800.0,...,,1300.0,,,,2300.0,,,,
Volkswagen,Jetta,,,,,,,,,,,...,,,,,,,,,,
Volkswagen,Passat,,5800.0,,5700.0,10000.0,5000.0,15300.0,12300.0,,,...,2800.0,,,,,600.0,2100.0,,,
Volkswagen,Touareg,,,,,,5800.0,,,,,...,6300.0,,,,1300.0,500.0,,,,


## save both the fines and owners dataframes to CSV files without an index

In [36]:
fines.to_csv("../data/fines.csv", index=False)
owners.to_csv("../data/owners.csv", index=False)