In [1]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
%matplotlib inline
import statsmodels.api as sm
from scipy import stats
import seaborn as sns
import os
import glob
import datetime

### Combining 2016 Data

In [2]:
#read the path
file_path_2016 = "../data/parking-tickets-2016"
#list all the files from the directory
file_list_2016 = os.listdir(file_path_2016)

In [3]:
df_2016 = pd.DataFrame()

# Append all files together
for file in file_list_2016:
    df_temp = pd.read_csv(f'../data/parking-tickets-2016/{file}')
    df_2016 = pd.concat([df_2016, df_temp], ignore_index=True)

### Combining 2017 Data

In [4]:
#read the path
file_path_2017 = "../data/parking-tickets-2017"
#list all the files from the directory
file_list_2017 = os.listdir(file_path_2017)

In [5]:
df_2017 = pd.DataFrame()

# Append all files together
for file in file_list_2017:
    df_temp = pd.read_csv(f'../data/parking-tickets-2017/{file}')
    df_2017 = pd.concat([df_2017, df_temp], ignore_index=True)

### Combining 2018 Data

In [6]:
#read the path
file_path_2018 = "../data/parking-tickets-2018"
#list all the files from the directory
file_list_2018 = os.listdir(file_path_2018)

In [7]:
df_2018 = pd.DataFrame()

# Append all files together
for file in file_list_2018:
    df_temp = pd.read_csv(f'../data/parking-tickets-2018/{file}')
    df_2018 = pd.concat([df_2018, df_temp], ignore_index=True)

### Combining 2019 Data

In [8]:
#read the path
file_path_2019 = "../data/parking-tickets-2019"
#list all the files from the directory
file_list_2019 = os.listdir(file_path_2019)

In [9]:
df_2019 = pd.DataFrame()

# Append all files together
for file in file_list_2019:
    df_temp = pd.read_csv(f'../data/parking-tickets-2019/{file}')
    df_2019 = pd.concat([df_2019, df_temp], ignore_index=True)

### Combining 2020 Data

In [10]:
#read the path
file_path_2020 = "../data/parking-tickets-2020"
#list all the files from the directory
file_list_2020 = os.listdir(file_path_2020)

In [11]:
df_2020 = pd.DataFrame()

# Append all files together
for file in file_list_2020:
    df_temp = pd.read_csv(f'../data/parking-tickets-2020/{file}')
    df_2020 = pd.concat([df_2020, df_temp], ignore_index=True)

### Combining 2021 Data

In [12]:
#read the path
file_path_2021 = "../data/parking-tickets-2021"
#list all the files from the directory
file_list_2021 = os.listdir(file_path_2021)

In [13]:
df_2021 = pd.DataFrame()

# Append all files together
for file in file_list_2021:
    df_temp = pd.read_csv(f'../data/parking-tickets-2021/{file}')
    df_2021 = pd.concat([df_2021, df_temp], ignore_index=True)

### Combining 2022 Data

In [14]:
#read the path
file_path_2022 = "../data/parking-tickets-2022"
#list all the files from the directory
file_list_2022 = os.listdir(file_path_2022)

In [15]:
df_2022 = pd.DataFrame()

# Append all files together
for file in file_list_2022:
    df_temp = pd.read_csv(f'../data/parking-tickets-2022/{file}')
    df_2022 = pd.concat([df_2022, df_temp], ignore_index=True)

### Merging Data Frames

In [16]:
frames = [df_2016, df_2017, df_2018, df_2019, df_2020, df_2021, df_2022]
df_merged = pd.concat(frames)

In [17]:
df_merged.info()

<class 'pandas.core.frame.DataFrame'>
Index: 13364473 entries, 0 to 1821886
Data columns (total 11 columns):
 #   Column                  Dtype  
---  ------                  -----  
 0   tag_number_masked       object 
 1   date_of_infraction      int64  
 2   infraction_code         float64
 3   infraction_description  object 
 4   set_fine_amount         int64  
 5   time_of_infraction      float64
 6   location1               object 
 7   location2               object 
 8   location3               object 
 9   location4               object 
 10  province                object 
dtypes: float64(2), int64(2), object(7)
memory usage: 1.2+ GB


In [18]:
df_merged.shape

(13364473, 11)

#### Going to check what each column means, starting with `tag_number_masked`. Seems like they are the id number for each ticket made. Let's see if there are any unique values.

In [19]:
df_merged['tag_number_masked'].nunique()

100022

In [20]:
df_merged['tag_number_masked'].nunique()/df_merged.shape[0]*100

0.7484170905953418

#### There's 100k unique values of almost 13 million entries. Not sure how this column can help us, or what findings I may have, but it is best to leave it alone until I find out the best use for this column.

#### Next is `date_of_infraction`, we can change this to a datetime64[ns] format. But I noticed another column that gives me the time, `time_of_infraction` displayed in minutes. We can change both of these into datetime by combining them together into a new column. 

Checking for info or null values first. 

In [21]:
df_merged['date_of_infraction'].isna().sum()

0

We have no null values for `date_of_infraction` but we do for `time_of_infraction`. So once we combine them we will get 9337 null values. Let's get the average time for each day of the year and use that value to fill in the null values. Let's move on to changing its `dtype`.

In [22]:
df_merged['date_of_infraction'].head()

0    20161230
1    20161230
2    20161230
3    20161230
4    20161230
Name: date_of_infraction, dtype: int64

In [23]:
df_merged['date_of_infraction'] = pd.to_datetime(df_merged['date_of_infraction'], format='%Y%m%d')
df_merged['date_of_infraction'].head()

0   2016-12-30
1   2016-12-30
2   2016-12-30
3   2016-12-30
4   2016-12-30
Name: date_of_infraction, dtype: datetime64[ns]

In [24]:
df_merged['time_of_infraction'].head()

0    1637.0
1    1637.0
2    1637.0
3    1637.0
4    1637.0
Name: time_of_infraction, dtype: float64

I cannot just change this by making unit into minutes, otherwise 1637 will give me a combination of minutes instead of separating hours and minutes into 16:37:00, 
let's do a format by breaking it down to hours and minutes.

In [25]:
df_merged['hours'] = df_merged['time_of_infraction'] // 100
df_merged['minutes'] = df_merged['time_of_infraction'] % 100

In [26]:
df_merged['hours'] = pd.to_timedelta(df_merged['hours'], unit='h')
df_merged['minutes'] = pd.to_timedelta(df_merged['minutes'], unit='m')

In [27]:
# Combine date and time columns into a single datetime column
df_merged['time_of_infraction'] = (df_merged['hours'] + df_merged['minutes']).astype(str)
df_merged['time_of_infraction'] = df_merged['time_of_infraction'].astype(str).str.split().str[-1]

In [28]:
df_merged['time_of_infraction']

0          16:37:00
1          16:37:00
2          16:37:00
3          16:37:00
4          16:37:00
             ...   
1821882    09:46:00
1821883    09:47:00
1821884    09:47:00
1821885    09:47:00
1821886    09:47:00
Name: time_of_infraction, Length: 13364473, dtype: object

#### Since I have date and time, I don't think I need a necessary datetime of both, so I shall leave that separate for now, unless necessary. 

In [29]:
#not running, non existent
df_merged['datetime_of_infraction'] = df_merged['date_of_infraction'] + df_merged['hours'] + df_merged['minutes']

Now we can see this as its own column. Best to remove `hours` and `minutes` columns.

In [30]:
df_merged.drop(columns = ['hours', 'minutes'], inplace=True)

As mentioned before, we'll have some null values in `time_of_infraction` we'd have to look into. 9337 time entries. Let's see if we can get average time by day. 

In [31]:
df_merged[df_merged['time_of_infraction'].astype(str) == '00:00:00'].count()

tag_number_masked         2996
date_of_infraction        2996
infraction_code           2996
infraction_description    2996
set_fine_amount           2996
time_of_infraction        2996
location1                 2621
location2                 2995
location3                  248
location4                  250
province                  2996
datetime_of_infraction    2996
dtype: int64

This gives us the NaT values that we need to fill, which is different from the '00:00:00' entries we've seen. This is good because we don't need to worry if the conversion changed null values changed to 0, they simply changed to NaT.

In [32]:
df_merged[df_merged['time_of_infraction'].str.contains('NaT') == True].count()

tag_number_masked         9337
date_of_infraction        9337
infraction_code           9337
infraction_description    9337
set_fine_amount           9337
time_of_infraction        9337
location1                 1525
location2                 9185
location3                  107
location4                  465
province                  9337
datetime_of_infraction       0
dtype: int64

Now that I know there's NaT values to look out for and not the 0 times. I can fill the NaT values with the average of each offence. 

Let's look into `infraction_description` and group by that with NaT values.

In [33]:
df_merged['infraction_description'].nunique()

291

In [34]:
times_per_infraction = df_merged.groupby('infraction_description')['time_of_infraction'].apply(lambda x: np.array(x))
descriptions = times_per_infraction.index
times_per_infraction

infraction_description
ANGLE PARK-METERED SPACE-FRONT                                 [17:55:00, 14:56:00]
ANGLE PARK-TOO FAR FROM METER                                  [11:23:00, 10:23:00]
ANGLE PARK-TOO FAR FROM METER                                            [11:33:00]
FAIL ANGLE PARK/STOP AT 45 DEG    [08:59:00, 08:59:00, 21:00:00, 21:02:00, 16:01...
FAIL PARK/STOP PAR RT HAND LTD                                           [14:43:00]
                                                        ...                        
STOP/STAND/PARK - NO VEND ZONE                                 [13:39:00, 20:37:00]
STOP/STAND/PARK DESIGNATE AREA                                 [23:19:00, 08:20:00]
STOP/STAND/PARK NOT DESIG AREA                                           [04:26:00]
STOP/STAND/PARK VEND CONT ZONE                                           [20:06:00]
STOP/STAND/PARK VEND NO PERMIT    [14:30:00, 15:30:00, 15:40:00, 17:07:00, 16:48...
Name: time_of_infraction, Length: 291, dtype: object

### Test for Normality

I am not sure if I need to get the mean or median of the times based on distribution. Therefore, I need to check the time's histogram to see if it's normally distributed or skewed. The issue is, I cannot go through every infraction, they have 291 unique values. Let's see if I can do a statistical test and depending on the p-value, I can take the mean or the median of the values. 

Let's get mean for now. 

In [35]:
df_merged['time_of_infraction'] = pd.to_datetime(df_merged['time_of_infraction'])

# Group by infraction_description and calculate the mean time
average_times = df_merged.groupby('infraction_description')['time_of_infraction'].apply(lambda x: np.mean(x)).reset_index()

average_times['time_of_infraction'] = average_times['time_of_infraction'].apply(lambda x: x.round('T'))

print(average_times)

  df_merged['time_of_infraction'] = pd.to_datetime(df_merged['time_of_infraction'])


             infraction_description  time_of_infraction
0    ANGLE PARK-METERED SPACE-FRONT 2024-03-03 16:26:00
1     ANGLE PARK-TOO FAR FROM METER 2024-03-03 10:53:00
2    ANGLE PARK-TOO FAR FROM METER  2024-03-03 11:33:00
3    FAIL ANGLE PARK/STOP AT 45 DEG 2024-03-03 13:47:00
4    FAIL PARK/STOP PAR RT HAND LTD 2024-03-03 14:43:00
..                              ...                 ...
286  STOP/STAND/PARK - NO VEND ZONE 2024-03-03 17:08:00
287  STOP/STAND/PARK DESIGNATE AREA 2024-03-03 15:50:00
288  STOP/STAND/PARK NOT DESIG AREA 2024-03-03 04:26:00
289  STOP/STAND/PARK VEND CONT ZONE 2024-03-03 20:06:00
290  STOP/STAND/PARK VEND NO PERMIT 2024-03-03 15:00:00

[291 rows x 2 columns]


Let's do a sanity check to see if there's no null values.


#### Now let's apply these times to the NaT values based on the infraction description. 

In [36]:
nat_counts = df_merged['time_of_infraction'].isna()
nat_counts

0          False
1          False
2          False
3          False
4          False
           ...  
1821882    False
1821883    False
1821884    False
1821885    False
1821886    False
Name: time_of_infraction, Length: 13364473, dtype: bool

In [37]:
#info on itterrows - https://pandas.pydata.org/pandas-docs/stable/reference/api/pandas.DataFrame.iterrows.html

for index, row in df_merged[nat_counts].iterrows():
    infraction_description = row['infraction_description']
        
    average_time = average_times[average_times['infraction_description'] == infraction_description]['time_of_infraction'].values[0]
    
    df_merged.at[index, 'time_of_infraction'] = average_time

In [38]:
df_merged['time_of_infraction'].isna().sum()

15

In [39]:
rows_with_nan = df_merged[df_merged['time_of_infraction'].isna()]

# Display the rows with NaN values
print(rows_with_nan)

        tag_number_masked date_of_infraction  infraction_code  \
259120           ***15741         2016-10-06             15.0   
1743918          ***12465         2016-02-09             28.0   
2089001          ***54675         2016-04-05              7.0   
259120           ***51151         2017-02-11             18.0   
1743918          ***78426         2017-06-28              5.0   
2089001          ***21138         2017-08-25              5.0   
259120           ***38476         2018-11-10            207.0   
1743918          ***61354         2018-03-25            207.0   
259120           ***66808         2019-04-08             77.0   
1743918          ***80112         2019-10-21              5.0   
2089001          ***09600         2019-09-27              5.0   
259120           ***57030         2020-11-23              5.0   
259120           ***21491         2021-02-02            134.0   
259120           ***95569         2022-06-08              4.0   
1743918          ***79165