# Austin Traffic

## Imports

In [1]:
# ! pip install scikit-learn --user

In [2]:
import pandas as pd
import matplotlib.pyplot as plt
import matplotlib.patches as mpatches
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import classification_report
%matplotlib inline

## Basic Loading

In [3]:
df = pd.read_csv("./data/Real-Time_Traffic_Incident_Reports_20250818.csv")
df.head()

Unnamed: 0,Traffic Report ID,Published Date,Issue Reported,Location,Latitude,Longitude,Address,Status,Status Date,Agency
0,F350D780EA8AAA48030B4DB64F790C14DBCD757F_17096...,03/06/2024 01:29:39 AM +0000,Stalled Vehicle,POINT (-97.705874 30.32358),30.32358,-97.705874,E 290 Svrd Wb To Ih 35 Nb Ramp / N Ih 35 Svrd ...,ARCHIVED,03/06/2024 02:10:12 AM +0000,AUSTIN PD
1,042338D46C2028808F298DEC735DC7ADF0C162F6_17096...,03/06/2024 01:26:42 AM +0000,Crash Urgent,POINT (-97.698567 30.342505),30.342505,-97.698567,8024 N Ih 35 Svrd Sb,ARCHIVED,03/06/2024 02:45:13 AM +0000,AUSTIN PD
2,BE73E73B642ECB6C6EDB4A52875A8017F69DFE5E_17080...,02/15/2024 10:17:48 PM +0000,LOOSE LIVESTOCK,POINT (-97.717911 30.0578),30.0578,-97.717911,12400 Glass Rd,ARCHIVED,02/15/2024 11:33:08 PM +0000,TRAVIS COUNTY SHERIF
3,A5909010064EF753C5F090A07CFCDDFBF25E38F3_15865...,04/10/2020 09:31:11 PM +0000,TRFC HAZD/ DEBRIS,POINT (-97.866676 30.140878),30.140878,-97.866676,S Sh 45 W Eb & S Sh 45 W Eb To Bliss Spillar Ramp,ARCHIVED,04/10/2020 09:55:03 PM +0000,
4,4A6C39763AD109EB75CE31C1EBFD53A0E883EE68_16543...,06/04/2022 05:51:05 PM +0000,Traffic Hazard,POINT (-97.825791 30.455444),30.455444,-97.825791,Anderson Mill Rd & N Fm 620 Rd,ARCHIVED,06/04/2022 06:20:03 PM +0000,


In [4]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 434151 entries, 0 to 434150
Data columns (total 10 columns):
Traffic Report ID    434151 non-null object
Published Date       434151 non-null object
Issue Reported       434151 non-null object
Location             433869 non-null object
Latitude             434072 non-null float64
Longitude            434072 non-null float64
Address              434151 non-null object
Status               432458 non-null object
Status Date          434151 non-null object
Agency               77567 non-null object
dtypes: float64(2), object(8)
memory usage: 33.1+ MB


## Feature Engineering

### Cleaning

In [5]:
df = df[df['Longitude'] != 0]
df = df[df['Latitude'] != 0]
df = df[df['Latitude'] <= 35]

### Feature Creation

In [6]:
df['Date'] = pd.to_datetime(df['Published Date'])

In [7]:
# Highway keywords
highway_keywords = ["I-35", "IH", "US", "Mopac", "Loop 1", "SH", "RESEARCH BLVD", "BASTROP HWY", "BEN WHITE"]

# Classify using simple filter
df['road_type'] = df['Address'].fillna("").str.upper().apply(
    lambda x: "Highway" if any(k.upper() in x for k in highway_keywords) else "Local"
)

print(df['road_type'].value_counts())

Local      290619
Highway    142433
Name: road_type, dtype: int64


## Show Result

In [8]:
df.describe().T

Unnamed: 0,count,mean,std,min,25%,50%,75%,max
Latitude,433052.0,30.301428,0.095707,30.000064,30.231034,30.295088,30.374668,31.077333
Longitude,433052.0,-97.73003,0.090787,-98.816154,-97.776315,-97.731784,-97.681224,-97.108986


In [9]:
# colors = ['red', 'cyan', 'green', 'blue', 'yellow']
# top_issues = df['Issue Reported'].value_counts().head(5).index.tolist()
# df_top_issues = df[df['Issue Reported'].isin(top_issues)].copy()

# issue_colors = {
#     top_issues[0]: colors[0],
#     top_issues[1]: colors[1],
#     top_issues[2]: colors[2],
#     top_issues[3]: colors[3],
#     top_issues[4]: colors[4],
# }
# df_top_issues['Color'] = df_top_issues['Issue Reported'].apply(lambda x: issue_colors[x] )
# plt.figure(figsize=(10,6))
# plt.scatter(df_top_issues['Longitude'], df_top_issues['Latitude'], c=df_top_issues['Color'], cmap='vidris', alpha=0.2, s=.01)
# plt.title('Traffic Incidents in Austin')
# plt.xlabel('Longitude')
# plt.ylabel('Latitude')
# plt.xlim((-98.20,-97.30))
# plt.ylim((30., 30.6))
# legend_handles = [mpatches.Patch(color=color, label=issue) for issue, color in issue_colors.items()]
# plt.legend(handles=legend_handles, title="Issue Reported", loc='lower left')
# plt.grid(True)
# plt.show()

## Scikit Learn Time

In [10]:
# Features and labels
X = df['Address'].fillna("")
y = (df['road_type'] == "Highway").astype(int)

# Text to numeric features
vectorizer = CountVectorizer()
X_vec = vectorizer.fit_transform(X)

# Train/test split
X_train, X_test, y_train, y_test = train_test_split(X_vec, y, test_size=0.2, random_state=42)

# Train Logistic Regression
clf = LogisticRegression(max_iter=200)
clf.fit(X_train, y_train)

# Evaluate
y_pred = clf.predict(X_test)
print(classification_report(y_test, y_pred))

              precision    recall  f1-score   support

           0       1.00      1.00      1.00     58051
           1       1.00      0.99      1.00     28560

    accuracy                           1.00     86611
   macro avg       1.00      1.00      1.00     86611
weighted avg       1.00      1.00      1.00     86611



In [14]:
# Features and labels
subdataset = df[df['Issue Reported'] == "COLLISION WITH INJURY"] 
subdataset["road_type"].value_counts().tolist()

[8288, 3184]

In [25]:
def percentage(df, x_label, y_label):
    labels = df[y_label].unique()
    for x in df[x_label].unique().tolist():
        subdataset = df[df[x_label] == x]
        totals = subdataset[y_label].value_counts().tolist()
        results = list(map(lambda x: x/sum(totals) ,totals))
        if len(totals) != 2:
            continue
        print(f"{x:27} | {labels[0]:7}: {results[0]:.3f} | {labels[1]:5}: {results[1]:.3f}")
        
percentage(df, "Issue Reported", "road_type")

Stalled Vehicle             | Highway: 0.541 | Local: 0.459
Crash Urgent                | Highway: 0.690 | Local: 0.310
LOOSE LIVESTOCK             | Highway: 0.823 | Local: 0.177
TRFC HAZD/ DEBRIS           | Highway: 0.649 | Local: 0.351
Traffic Hazard              | Highway: 0.613 | Local: 0.387
COLLISION                   | Highway: 0.683 | Local: 0.317
COLLISION/PRIVATE PROPERTY  | Highway: 0.803 | Local: 0.197
Crash Service               | Highway: 0.733 | Local: 0.267
COLLISION WITH INJURY       | Highway: 0.722 | Local: 0.278
COLLISN/ LVNG SCN           | Highway: 0.721 | Local: 0.279
zSTALLED VEHICLE            | Highway: 0.518 | Local: 0.482
BOAT ACCIDENT               | Highway: 0.985 | Local: 0.015
VEHICLE FIRE                | Highway: 0.654 | Local: 0.346
BLOCKED DRIV/ HWY           | Highway: 0.921 | Local: 0.079
FLEET ACC/ INJURY           | Highway: 0.733 | Local: 0.267
TRAFFIC FATALITY            | Highway: 0.546 | Local: 0.454
AUTO/ PED                   | Highway: 0

In [None]:
percentage(df, "", "road_type")