In [121]:
import pandas as pd
from sklearn.preprocessing import MinMaxScaler
from scipy.spatial.distance import pdist, squareform
from sklearn.metrics.pairwise import cosine_similarity

Importing the datasets.
Only the first 10,000 rows of the accidents dataset are loaded, to keep the analysis manageable.

In [122]:
# Number of accident records to read. The accidents file is sampled by taking
# only its first rows; raise this value (or pass None to read_csv) for a full run.
ACCIDENTS_NROWS = 10000

# Load the source tables from CSV files in the working directory.
pollution = pd.read_csv("pollution_2000_2023.csv")
nba = pd.read_csv("nba_elo.csv")
accidents = pd.read_csv("US_Accidents_March23.csv", nrows=ACCIDENTS_NROWS)
teams = pd.read_csv("team.csv")


In [123]:
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.metrics import jaccard_score
# Normalise city names in both tables (upper-case, no surrounding whitespace)
# so that the same city compares equal across datasets.
accidents["City"] = accidents["City"].str.upper().str.strip()
pollution["City"] = pollution["City"].str.upper().str.strip()

accident_cities = accidents["City"].dropna().unique()
pollution_cities = pollution["City"].dropna().unique()

# The vectorizer is used only for its analyzer (lower-casing + word tokenisation).
# Tokenising each city once replaces the previous per-pair fit_transform call,
# which refit the model for every (accident city, pollution city) combination.
vectorizer = CountVectorizer(binary=True)
tokenize = vectorizer.build_analyzer()


def _token_jaccard(tokens_a, tokens_b):
    """Return |A ∩ B| / |A ∪ B| for two token sets, or 0.0 when both are empty.

    This equals jaccard_score on the binary presence vectors built over the
    union vocabulary of the two names. The empty-union guard covers names that
    produce no tokens at all, which previously raised an "empty vocabulary"
    error when fitting the vectorizer.
    """
    union_size = len(tokens_a | tokens_b)
    if union_size == 0:
        return 0.0
    return len(tokens_a & tokens_b) / union_size


accident_tokens = [(city, frozenset(tokenize(city))) for city in accident_cities]
pollution_tokens = [(city, frozenset(tokenize(city))) for city in pollution_cities]

# Same pair order as a nested loop: accident city outer, pollution city inner.
similarity_scores = [
    (city_a, city_b, _token_jaccard(tokens_a, tokens_b))
    for city_a, tokens_a in accident_tokens
    for city_b, tokens_b in pollution_tokens
]

# Convert to DataFrame
similarity_df = pd.DataFrame(similarity_scores, columns=["Accident_City", "Pollution_City", "Jaccard_Similarity"])
print(similarity_df.sort_values(by="Jaccard_Similarity", ascending=False).head(50))

             Accident_City Pollution_City  Jaccard_Similarity
24756            PITTSBURG      PITTSBURG            1.000000
11034               NEWARK         NEWARK            1.000000
959              JOHNSTOWN      JOHNSTOWN            1.000000
13615            CUPERTINO      CUPERTINO            1.000000
14553              CONCORD        CONCORD            1.000000
13076              VALLEJO        VALLEJO            1.000000
22955            SAN PABLO      SAN PABLO            1.000000
7059               BURBANK        BURBANK            1.000000
13763             SAN JOSE       SAN JOSE            1.000000
26800             BERKELEY       BERKELEY            1.000000
60625            DAVENPORT      DAVENPORT            1.000000
32541              JACKSON        JACKSON            1.000000
37751              BENICIA        BENICIA            1.000000
13970        SAN FRANCISCO  SAN FRANCISCO            1.000000
5222            WILMINGTON     WILMINGTON            1.000000
32028   

The resulting DataFrame displays the top 50 city name pairs from the accidents and pollution datasets with the highest Jaccard similarity scores.
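The similarity is computed on the words in each name rather than on individual characters: a score of 1 means the two names consist of exactly the same set of words (e.g., "SAN JOSE" vs. "SAN JOSE"), while lower scores mean only some words are shared (e.g., "SAN JOSE" vs. "SAN PABLO" share one of three distinct words, giving 1/3).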
These results are useful for identifying and potentially aligning city names that refer to the same place but may have been written differently.

In [124]:
# Keep only accident rows where both severity and temperature are present,
# then min-max scale the two columns so they are on a comparable scale.
features = accidents.loc[:, ["Severity", "Temperature(F)"]].dropna()
scaler = MinMaxScaler()
scaled_features = scaler.fit(features).transform(features)

# pdist yields the condensed pairwise distances; squareform expands them
# into the full symmetric row-by-row matrix.
condensed_distances = pdist(scaled_features, metric="euclidean")
distance_matrix = squareform(condensed_distances)

# Label both axes with the original accident row index for readability.
euclidean_df = pd.DataFrame(
    data=distance_matrix,
    index=features.index,
    columns=features.index,
)
print(euclidean_df.iloc[0:5, 0:5])

          0         1         2        3         4
0  0.000000  0.333477  0.333450  0.01763  0.333450
1  0.333477  0.000000  0.018609  0.33446  0.018609
2  0.333450  0.018609  0.000000  0.33345  0.000000
3  0.017630  0.334460  0.333450  0.00000  0.333450
4  0.333450  0.018609  0.000000  0.33345  0.000000


The resulting DataFrame displays the pairwise Euclidean distances between accident records based on their normalized 'Severity' and 'Temperature(F)' values.
A smaller distance (closer to 0) indicates a higher similarity between two events in terms of accident severity and the temperature at which they occurred.
For example, a distance of 0.017 means that the two incidents occurred under nearly identical conditions.
These results can be useful for identifying clusters or patterns in accidents that share similar characteristics.

In [125]:
# Pollutant measurements; rows with any missing value are dropped.
pollution_features = pollution[["O3 Mean", "SO2 Mean", "CO Mean", "NO2 Mean"]].dropna()

# Weather readings attached to each accident record; rows with any missing value are dropped.
weather_features = accidents[["Temperature(F)", "Humidity(%)", "Wind_Speed(mph)", "Pressure(in)"]].dropna()

# Truncate both tables to the same number of rows. cosine_similarity only
# requires the two inputs to have the same number of columns, so this step
# mainly bounds the size of the resulting min_len x min_len matrix.
min_len = min(len(pollution_features), len(weather_features))
pollution_features = pollution_features.iloc[:min_len]
weather_features = weather_features.iloc[:min_len]

# Normalize both datasets (the scaler is refit for each table, so every
# column is scaled against its own minimum and maximum).
scaler = MinMaxScaler()
pollution_scaled = scaler.fit_transform(pollution_features)
weather_scaled = scaler.fit_transform(weather_features)

# Compute Cosine Similarity: rows are pollution records, columns are weather records.
cos_sim_matrix = cosine_similarity(pollution_scaled, weather_scaled)

# Take the labels from exactly the rows that survived dropna() and truncation.
# Slicing the first min_len rows of the unfiltered tables would attach the
# wrong Date / Start_Time to a row whenever an earlier record had been dropped.
pollution_ids = pollution.loc[pollution_features.index, "Date"].astype(str).values
weather_ids = accidents.loc[weather_features.index, "Start_Time"].astype(str).values

# Create labeled DataFrame
cos_sim_df = pd.DataFrame(cos_sim_matrix, index=pollution_ids, columns=weather_ids)

# Show the top left corner
print(cos_sim_df.iloc[:5, :5])



            2016-02-08 05:46:00  2016-02-08 06:07:59  2016-02-08 06:49:27  \
2000-01-01             0.708563             0.717924             0.721959   
2000-01-02             0.699723             0.715317             0.721630   
2000-01-03             0.831177             0.847566             0.842922   
2000-01-04             0.860439             0.873117             0.867923   
2000-01-05             0.818031             0.833741             0.822241   

            2016-02-08 07:23:34  2016-02-08 07:39:07  
2000-01-01             0.720223             0.700232  
2000-01-02             0.712014             0.692894  
2000-01-03             0.835503             0.829900  
2000-01-04             0.865262             0.857588  
2000-01-05             0.818917             0.818355  


The resulting DataFrame displays the pairwise cosine similarities between pollution records (rows) and weather condition records (columns) based on their normalized values for pollutants (O3 Mean, SO2 Mean, CO Mean, NO2 Mean) and weather metrics (Temperature(F), Humidity(%), Wind_Speed(mph), Pressure(in)).

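Each cell compares one pollution record (row, labelled by its measurement date) with one accident weather record (column, labelled by the accident start time). A value closer to 1 indicates a stronger alignment in the pattern of normalized values across the two four-element vectors, even if their absolute magnitudes differ. Note that the two vectors hold different quantities, so the features are paired purely by position (O3 Mean with Temperature(F), SO2 Mean with Humidity(%), CO Mean with Wind_Speed(mph), NO2 Mean with Pressure(in)); the score depends on that column order and should be read as an exploratory measure rather than a physical relationship.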

For instance, a cosine similarity of 0.94 means that the pollution and weather values on those specific days followed a very similar directional trend across all measured features. This allows you to detect consistent environmental profiles or match pollution conditions with real accident-time weather data.

