In [1]:
import os

import numpy as np
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler

In [2]:
# Location of the raw traffic-stop CSV. The default is a machine-specific
# absolute path; set the TRAFFIC_STOPS_CSV environment variable to point the
# notebook at a copy of the file elsewhere without editing this cell.
data_path = os.environ.get(
    'TRAFFIC_STOPS_CSV',
    '/Users/xenokian/Desktop/ma_statewide_2020_04_01.csv',
)

In [3]:
# Read the whole file in a single pass so pandas infers one dtype per column.
# NOTE(review): the previous run echoed this line in its output, which looks
# like a mixed-dtype (DtypeWarning) message from chunked type inference --
# confirm; low_memory=False is the documented way to avoid it.
df = pd.read_csv(data_path, low_memory=False)

  df = pd.read_csv(data_path)


In [4]:
#The dataset here contains information on traffic stops in the state of Massachusetts (my home state) from December 2006 to December 2015.
#The data includes the date of the stop, the location, the subject's age, race, and sex, and whether or not a citation was issued.
#My hypothesis is that being issued a citation correlates with race; the corresponding null hypothesis is that citation issuance is independent of race.

In [5]:
# Preview the first five rows (head() defaults to n=5).
df.head()

Unnamed: 0,raw_row_number,date,location,county_name,subject_age,subject_race,subject_sex,type,arrest_made,citation_issued,...,contraband_weapons,contraband_alcohol,contraband_other,frisk_performed,search_conducted,search_basis,reason_for_stop,vehicle_type,vehicle_registration_state,raw_Race
0,1,2007-06-06,MIDDLEBOROUGH,Plymouth County,33.0,white,male,vehicular,False,True,...,,False,,,False,,Speed,Passenger,MA,White
1,2,2007-06-07,SEEKONK,Bristol County,36.0,white,male,vehicular,False,False,...,False,False,False,False,True,other,,Commercial,MA,White
2,3,2007-06-07,MEDFORD,Middlesex County,56.0,white,female,vehicular,False,False,...,,False,,,False,,,Passenger,MA,White
3,4,2007-06-07,MEDFORD,Middlesex County,37.0,white,male,vehicular,False,False,...,,False,,,False,,,Commercial,MA,White
4,5,2007-06-07,EVERETT,Middlesex County,22.0,hispanic,female,vehicular,False,True,...,,False,,,False,,,Commercial,MA,Hispanic


In [6]:
def preprocess_traffic_stops(df, test_size=0.2, random_state=42):
    """Build standardized train/test feature matrices for predicting ``citation_issued``.

    Steps, in order:
      1. Fill missing categorical values with the literal category ``'Unknown'``.
      2. Treat a missing ``citation_issued`` as False and encode the target as 0/1.
      3. One-hot encode the categorical columns and append ``subject_age``.
      4. Make a stratified train/test split.
      5. Impute missing ages with the *training* median.
      6. Fit a ``StandardScaler`` on the *training* rows only and apply it to both
         splits, so no statistic of the test set leaks into training.

    Parameters
    ----------
    df : pandas.DataFrame
        Must contain ``subject_race``, ``subject_sex``, ``vehicle_type``,
        ``reason_for_stop``, ``subject_age`` and ``citation_issued``.
        The input frame is not modified.
    test_size : float, default 0.2
        Fraction of rows placed in the test split.
    random_state : int, default 42
        Seed forwarded to ``train_test_split`` for a reproducible split.

    Returns
    -------
    dict
        ``X_train`` / ``X_test`` (scaled DataFrames that keep the row index of
        ``df``), ``y_train`` / ``y_test`` (0/1 Series), ``scaler`` (the fitted
        ``StandardScaler``) and ``feature_names`` (column Index of the features).

    Raises
    ------
    KeyError
        If any of the required columns is missing from ``df``.
    """
    categorical_cols = ['subject_race', 'subject_sex', 'vehicle_type', 'reason_for_stop']
    numerical_cols = ['subject_age']
    target_col = 'citation_issued'

    # Copy only the columns that are used, so the caller's frame is untouched
    # and the unused columns are not duplicated in memory.
    df = df[categorical_cols + numerical_cols + [target_col]].copy()

    df[categorical_cols] = df[categorical_cols].fillna('Unknown')

    # A missing citation flag is treated as "no citation"; the bool -> int
    # conversion happens after the NaNs are gone.
    y = df[target_col].fillna(False).astype(bool).astype(int)

    X = pd.concat(
        [
            pd.get_dummies(df[categorical_cols], prefix=categorical_cols),
            df[numerical_cols],
        ],
        axis=1,
    )

    # Split BEFORE computing any data-dependent statistic (median, mean, std).
    X_train, X_test, y_train, y_test = train_test_split(
        X, y, test_size=test_size, random_state=random_state, stratify=y
    )

    # Impute numeric gaps with medians learned from the training rows only.
    train_medians = X_train[numerical_cols].median()
    X_train = X_train.fillna(train_medians)
    X_test = X_test.fillna(train_medians)

    # Fit the scaler on the training rows only, then reuse it for the test rows.
    # The original row index is preserved so X and y stay label-aligned.
    scaler = StandardScaler()
    X_train = pd.DataFrame(
        scaler.fit_transform(X_train), columns=X.columns, index=X_train.index
    )
    X_test = pd.DataFrame(
        scaler.transform(X_test), columns=X.columns, index=X_test.index
    )

    return {
        'X_train': X_train,
        'X_test': X_test,
        'y_train': y_train,
        'y_test': y_test,
        'scaler': scaler,
        'feature_names': X.columns
    }

In [7]:
def generate_preprocessing_summary(data_dict):
    """Summarize the output of the preprocessing step.

    Parameters
    ----------
    data_dict : dict
        Mapping with the keys ``X_train``, ``X_test``, ``feature_names``,
        ``y_train`` and ``y_test``.

    Returns
    -------
    dict
        Row counts of both splits, the number of features, and the normalized
        class frequencies of the training and test targets.
    """
    def _class_shares(labels):
        # Relative frequency of each class label.
        return pd.Series(labels).value_counts(normalize=True)

    n_train = len(data_dict['X_train'])
    n_test = len(data_dict['X_test'])
    n_features = len(data_dict['feature_names'])
    train_shares = _class_shares(data_dict['y_train'])
    test_shares = _class_shares(data_dict['y_test'])

    return dict(
        train_size=n_train,
        test_size=n_test,
        feature_count=n_features,
        class_distribution_train=train_shares,
        class_distribution_test=test_shares,
    )

In [8]:
# Run the preprocessing function on the loaded frame; the result is a dict holding
# the train/test splits, the scaler, and the feature names.
processed_data = preprocess_traffic_stops(df)

In [9]:
# Collect split sizes, feature count, and per-split class proportions.
summary = generate_preprocessing_summary(processed_data)

In [10]:
# Print each summary entry under its own heading, separated by a blank line.
print("Preprocessing Summary:")
for metric_name in summary:
    print(f"\n{metric_name}:", summary[metric_name], sep="\n")

Preprocessing Summary:

train_size:
2732990

test_size:
683248

feature_count:
27

class_distribution_train:
citation_issued
1    0.635577
0    0.364423
Name: proportion, dtype: float64

class_distribution_test:
citation_issued
1    0.635577
0    0.364423
Name: proportion, dtype: float64


In [11]:
#summary:
#as we can see, the class distribution is identical between the training and test sets, indicating a successful stratified split.
#citations were issued in roughly 2/3 of stops (about 64%). this is a moderate class imbalance, but the matching class proportions
#across splits suggest that the data is ready for modeling. i followed all steps of the rubric: i created dummy features,
#standardized the features, and split the data into training and test sets.