# Data Validation Inclass activity 

In [1]:
#reading the csv file

In [2]:
import pandas as pd 
import numpy as np
df = pd.read_csv("Hwy26Crashes2019_S23.csv")
df.head()

Unnamed: 0,Crash ID,Record Type,Vehicle ID,Participant ID,Participant Display Seq#,Vehicle Coded Seq#,Participant Vehicle Seq#,Serial #,Crash Month,Crash Day,...,Participant Cause 2 Code,Participant Cause 3 Code,Participant Event 1 Code,Participant Event 2 Code,Participant Event 3 Code,BAC Test Results Code,Alcohol Use Reported,Drug Use Reported,Participant Marijuana Use Reported,Participant Striker Flag
0,1809119,1,,,,,,99992.0,3.0,14.0,...,,,,,,,,,,
1,1809119,2,3409578.0,,,1.0,,,,,...,,,,,,,,,,
2,1809119,3,3409578.0,3887884.0,1.0,1.0,1.0,,,,...,,0.0,9.0,0.0,0.0,,,,,
3,1809119,2,3409579.0,,,2.0,,,,,...,,,,,,,,,,
4,1809119,3,3409579.0,3887885.0,2.0,2.0,1.0,,,,...,,,9.0,9.0,0.0,,,,,


In [3]:
a = list(df.columns)
a

['Crash ID',
 'Record Type',
 'Vehicle ID',
 'Participant ID',
 'Participant Display Seq#',
 'Vehicle Coded Seq#',
 'Participant Vehicle Seq#',
 'Serial #',
 'Crash Month',
 'Crash Day',
 'Crash Year',
 'Week Day Code',
 'Crash Hour',
 'County Code',
 'City Section ID',
 'Urban Area Code',
 'Functional Class Code',
 'NHS Flag',
 'Highway Number',
 'Highway Suffix',
 'Roadway Number',
 'Highway Component',
 'Mileage Type',
 'Connection Number',
 'Linear Reference System (LRS)',
 'Latitude Degrees',
 'Latitude Minutes',
 'Latitude Seconds',
 'Longitude Degrees',
 'Longitude Minutes',
 'Longitude Seconds',
 'Latitude (Decimal Degrees)',
 'Longitude (Decimal Degrees)',
 'Special Jurisdiction',
 'Jurisdiction Group',
 'Street Number',
 'Nearest Intersecting Street Number',
 'Intersection Sequence Number',
 'Distance from Intersection',
 'Direction From Intersection',
 'Milepoint',
 'Posted Speed Limit',
 'Road Character',
 'Off Roadway Flag',
 'Intersection Type',
 'Intersection Related Fla

# Existence Assertion

In [4]:
#Created a date column from other 3 columns.(Day,Month,Year)

df['date'] = pd.to_datetime(dict(year=df["Crash Year"], month=df["Crash Month"], day=df["Crash Day"]))

# check that the 'date' column does not have any null values
assert pd.notnull(df['date']).all(), "Not every crash occurred on a date"

AssertionError: Not every crash occurred on a date

In [5]:
null_count_date = df['date'].isnull().sum()
null_count_date

2231

In [6]:
# Every crash record should have a Crash ID
assert pd.notnull(df['Crash ID']).all(), "Some crash records do not have a Crash ID"

# Limit assertions

1

In [7]:
# check that all dates are in year 2019
assert (df['date'].dt.year == 2019).all(), "Not every crash occurred during year 2019"

AssertionError: Not every crash occurred during year 2019

In [8]:
count = len(df[~df['Crash Year'].isin([2019, np.nan])])
print(f"Number of values in 'year' column that are not 2019 or null: {count}")

Number of values in 'year' column that are not 2019 or null: 0


2

In [34]:
# The Latitude and Longitude coordinates for all crashes should fall within a certain range
#I filtered out record type because that is necessary for logitude and latitude data 
assert df.loc[df['Record Type'] == 1, 'Longitude Degrees'].between(-180, 180).all(), "Some crash records have longitude coordinates outside the specified range"
assert df.loc[df['Record Type'] == 1, 'Latitude Degrees'].between(-180, 180).all(), "Some crash records have latitude coordinates outside the specified range"

In [35]:
df[(df['Longitude Degrees'].between(-180, 180)) & (df['Record Type'] == 2) & (df['Record Type'] == 3)]


Unnamed: 0,Crash ID,Record Type,Vehicle ID,Participant ID,Participant Display Seq#,Vehicle Coded Seq#,Participant Vehicle Seq#,Serial #,Crash Month,Crash Day,...,Participant Cause 3 Code,Participant Event 1 Code,Participant Event 2 Code,Participant Event 3 Code,BAC Test Results Code,Alcohol Use Reported,Drug Use Reported,Participant Marijuana Use Reported,Participant Striker Flag,date


# intra-record assertions

In [67]:
# If a crash record has a Participant ID, it should also have a Participant Display Seq
assert df[df['Participant ID'].notnull()]['Participant Display Seq#'].notnull().all(), "Some crash records with a Participant ID do not have a Participant Type Code"

In [66]:
#If a crash record has a latitude coordinate, then it should also have a longitude coordinate
assert df['Latitude Degrees'].notnull().all() == df['Longitude Degrees'].notnull().all(), "Latitude and longitude coordinates should have corresponding values"


# inter-record assertions

In [46]:
# check that all vehicle IDs in the 'vehicle_id' column are present in the 'crash_id' column
assert set(df['Vehicle ID']).issubset(set(df['Crash ID'])), "Not every vehicle listed in the crash data was part of a known crash"


AssertionError: Not every vehicle listed in the crash data was part of a known crash

In [48]:
# Every Vehicle ID in the dataset should be associated with a valid Crash ID
vehicle_ids = df['Vehicle ID'].unique()
for vehicle_id in vehicle_ids:
    crash_ids = df[df['Vehicle ID'] == vehicle_id]['Crash ID'].unique()
    assert len(crash_ids) == 1, f"Vehicle ID {vehicle_id} is associated with {len(crash_ids)} Crash IDs"



AssertionError: Vehicle ID nan is associated with 0 Crash IDs

In [51]:
# Every Participant ID in the dataset should be associated with a valid Vehicle ID
participant_ids = df['Participant ID'].unique()
for participant_id in participant_ids:
    vehicle_ids = df[df['Participant ID'] == participant_id]['Vehicle ID'].unique()
    assert len(vehicle_ids) == 1, f"Participant ID {participant_id} is associated with {len(vehicle_ids)} Vehicle IDs"

AssertionError: Participant ID nan is associated with 0 Vehicle IDs

# summary assertions

In [49]:
# check that the number of rows in the dataframe is greater than 1000 but less than 1000000
assert 1000 <= df.shape[0] < 1000000, "The number of crashes is not in the expected range"


# statistical distribution assertions

In [50]:
# check that the number of crashes in each month is roughly the same
counts = df['date'].dt.month.value_counts()
mean_count = counts.mean()
std_count = counts.std()
assert all((counts >= mean_count - std_count) & (counts <= mean_count + std_count)), "Crashes are not evenly distributed throughout the months of the year"


AssertionError: Crashes are not evenly distributed throughout the months of the year

In [68]:
df1 = df[df['Record Type'] == 1]
df1 = df1.dropna(axis=1)
df1.to_csv('crashes.csv', index=False)

In [69]:
df2 = df[df['Record Type'] == 2]
df2 = df2.dropna(axis=1)
df2.to_csv('vehicles.csv', index=False)

In [70]:
df3 = df[df['Record Type'] == 3]
df3 = df3.dropna(axis=1)
df3.to_csv('participants.csv', index=False)