In [None]:
import pandas as pd
import copy
# Load the merged street-sign CSV; the path is relative to the kernel's working directory.
street_signs = pd.read_csv('MergedData/intersections_collision_streetlights_streets_street_sign.csv')

## functions for linear regression and random forest

In [None]:
import pandas as pd
import numpy as np
import statsmodels.api as sm
from sklearn.linear_model import LinearRegression
from sklearn.model_selection import train_test_split, cross_val_score
from sklearn.metrics import mean_squared_error
from sklearn.preprocessing import OneHotEncoder
from sklearn.compose import ColumnTransformer
import shap

def linear_regression(data, Y_name):
    """Fit and report a linear model of ``Y_name`` on the intersection features.

    The function prints, in order:

    1. the mean 5-fold cross-validated MSE of a scikit-learn
       ``LinearRegression`` fitted on all rows,
    2. the correlation of every numeric feature with the target,
    3. the statsmodels OLS p-values, sorted ascending,

    and finally draws a SHAP summary plot for the scikit-learn model that was
    fitted on an 80/20 train/test split.

    Parameters
    ----------
    data : pandas.DataFrame
        Must contain ``Y_name``, ``SIGNAL_TYPE`` and every column listed in
        ``numeric_features`` below. Other columns are ignored.
    Y_name : str
        Name of the target column.

    Returns
    -------
    None
        All results are printed or plotted.
    """
    X = data.drop(Y_name, axis=1)
    y = data[Y_name]

    categorical_features = ['SIGNAL_TYPE']
    numeric_features = ['COUNT_RELATED_STREETS', 'AVG_PVMT_CONDITION', 'AVG_SPEEDLIMIT', 'TRAFFIC_CIRCLE_EXISTS', 'COUNT_STOPSIGNS', 'COUNT_YEILDSIGNS', 'LIGHTPOLE_COUNT', 'LIGHTPOLE_AVG_HEIGHT']

    # sparse_threshold=0 guarantees a dense ndarray. With the default (0.3) the
    # transformer may hand back a scipy sparse matrix, which sm.add_constant and
    # sm.OLS below are not written for.
    preprocessor = ColumnTransformer(
        transformers=[
            ('num', 'passthrough', numeric_features),
            ('cat', OneHotEncoder(), categorical_features)],
        sparse_threshold=0)

    # Booleans / ints / one-hot columns are all cast to float for the models.
    X_preprocessed = preprocessor.fit_transform(X).astype(float)

    # Column labels in the same order the ColumnTransformer emits them.
    feature_names = numeric_features + preprocessor.named_transformers_['cat'].get_feature_names_out(categorical_features).tolist()

    model = LinearRegression()

    # Split the dataset into train and test sets
    X_train, X_test, y_train, y_test = train_test_split(X_preprocessed, y, test_size=0.2, random_state=42)

    # Fit the model with the training data (this fitted model feeds SHAP below)
    model.fit(X_train, y_train)

    # cross_val_score clones the estimator, so the fit above is unaffected.
    scores = cross_val_score(model, X_preprocessed, y, cv=5, scoring='neg_mean_squared_error')
    mean_score = np.mean(scores)
    print('Mean Cross-Validated Mean Squared Error:', -mean_score)

    # has_constant='add' always prepends the intercept column. The default
    # ('skip') silently adds nothing when some feature is already constant
    # (e.g. a subset with a single SIGNAL_TYPE level), which would shift every
    # p-value onto the wrong label in the report below.
    X_with_const = sm.add_constant(X_preprocessed, has_constant='add')

    # NOTE(review): a full one-hot encoding plus an intercept is rank-deficient
    # (dummy-variable trap), so the intercept / SIGNAL_TYPE p-values should be
    # read with caution.
    sm_model = sm.OLS(y, X_with_const)
    results = sm_model.fit()

    X_numeric = data[numeric_features]
    correlations = X_numeric.corrwith(data[Y_name])
    print('\nFeature Correlations:')
    print(correlations)

    p_values = results.pvalues

    # 'Intercept' first, matching the prepended constant column.
    column_names = ['Intercept'] + feature_names

    feature_p_values = dict(zip(column_names, p_values))

    sorted_features = sorted(feature_p_values.items(), key=lambda x: x[1])

    print('\nSorted Features by P-values:')
    for feature, p_value in sorted_features:
        print(f"{feature}: {p_value}")

    # Create a SHAP explainer object for the fitted linear model,
    # using the training rows as the background data
    explainer = shap.Explainer(model, X_train)

    # Calculate SHAP values for the test set
    shap_values = explainer(X_test)

    # Generate the summary plot (no intercept among the plotted features)
    shap.summary_plot(shap_values, X_test, feature_names=feature_names)

In [None]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split, cross_val_score
from sklearn.ensemble import RandomForestRegressor
from sklearn.metrics import mean_squared_error
import shap

def random_forest(data,Y_name):
    """Train a random-forest regressor for ``Y_name`` and report its error.

    ``data`` must contain ``Y_name``, ``TRAFFIC_CIRCLE_EXISTS`` and
    ``SIGNAL_TYPE``; every remaining column is used as a feature. The
    function prints the 5-fold cross-validated MSE on the training split and
    the MSE on the 20 % hold-out split, then draws a SHAP summary plot for
    the hold-out rows. Nothing is returned.
    """
    features = data.drop(columns=[Y_name])
    target = data[Y_name]

    # The traffic-circle flag becomes 0/1.
    features['TRAFFIC_CIRCLE_EXISTS'] = features['TRAFFIC_CIRCLE_EXISTS'].astype(int)

    # Expand SIGNAL_TYPE into indicator columns, then make sure any boolean
    # columns that remain are stored as integers as well.
    features = pd.get_dummies(features, columns=['SIGNAL_TYPE'])
    flag_columns = features.select_dtypes(include='bool').columns
    features = features.astype({name: int for name in flag_columns})

    # 80/20 split with a fixed seed.
    X_train, X_test, y_train, y_test = train_test_split(
        features, target, test_size=0.2, random_state=42
    )

    forest = RandomForestRegressor(n_estimators=100, random_state=42)

    # Cross-validation works on clones, so it is run before the real fit.
    fold_scores = cross_val_score(
        forest, X_train, y_train, cv=5, scoring='neg_mean_squared_error'
    )

    forest.fit(X_train, y_train)
    holdout_mse = mean_squared_error(y_test, forest.predict(X_test))

    # The scorer yields negated MSE; flip the sign for display.
    print("Cross-validation scores:", -fold_scores)
    print("Mean squared error on test set:", holdout_mse)

    # SHAP: training rows are the background data, explanations are computed
    # for the hold-out rows with the additivity check switched off.
    explainer = shap.Explainer(forest, X_train)
    shap_values = explainer(X_test, check_additivity=False)
    shap.summary_plot(shap_values, X_test)

In [None]:
# Keep only the records whose related_street_num equals 1 and display them.
rows_with_1 = street_signs.loc[street_signs["related_street_num"].eq(1)]
rows_with_1

## Part I street_signs modeling
### import data and pre-process

In [None]:
# Preview the first ten rows of the raw street-sign table.
street_signs.head(10)

In [None]:
# collision rate = collision_count / (ARTERIALCLASSCD + 1); the two source
# columns are removed once the rate has been derived.
# NOTE: the new column name ends with a space ("collision_rate "); the later
# cells look it up under exactly that name.
street_signs = (
    street_signs
    .assign(**{
        "collision_rate ": lambda frame: frame["collision_count"] / (frame["ARTERIALCLASSCD"] + 1)
    })
    .drop(columns=["ARTERIALCLASSCD", "collision_count"])
)

street_signs.head(10)

In [None]:
# Remove the X, Y and UNITDESC columns before modelling, then preview the result.
street_signs = street_signs.drop(["X", "Y", "UNITDESC"], axis="columns")

street_signs.head(10)

In [None]:
# Column dtypes and non-null counts of the table that goes into the model.
street_signs.info()

### Use cross validation+random forest to do the modelling

In [None]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split, cross_val_score
from sklearn.ensemble import RandomForestRegressor
from sklearn.metrics import mean_squared_error

# Target: the derived rate column (its name ends with a space).
# Features: every other column, with non-numeric ones one-hot encoded.
y = street_signs["collision_rate "]
X = pd.get_dummies(street_signs.drop(columns=["collision_rate "]))

# 80/20 train/test split with a fixed seed.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=42
)

# 100-tree forest; cross-validation runs on clones of the unfitted estimator,
# then the estimator itself is fitted on the whole training split.
rf = RandomForestRegressor(n_estimators=100, random_state=42)
cv_scores = cross_val_score(
    rf, X_train, y_train, scoring='neg_mean_squared_error', cv=5
)
rf.fit(X_train, y_train)

# Hold-out error.
y_pred = rf.predict(X_test)
mse = mean_squared_error(y_test, y_pred)

# The scorer returns negated MSE, so the sign is flipped for display.
print("Cross-validation scores:", -cv_scores)
print("Mean squared error on test set:", mse)

In [None]:
# Mean of the target column, printed as a scale reference for the MSE values above.
print(f"Mean of collision_rate: {street_signs['collision_rate '].mean()}")

### ChatGPT's interpretation of the results
##### The training results show the cross-validation scores and the mean squared error on the test set when modeling the data using a random forest regressor.

##### Cross-Validation Scores:
##### The cross-validation scores are a list containing 5 values, each representing the performance of the model on a different training/validation data split. scikit-learn's `neg_mean_squared_error` scorer returns the negated mean squared error, and the code above flips the sign back before printing, so the values listed below are ordinary (positive) mean squared errors. A lower value, i.e. one closer to 0, indicates better model performance.

##### 1st cross-validation score: 0.08300897
##### 2nd cross-validation score: 0.08858813
##### 3rd cross-validation score: 0.08617691
##### 4th cross-validation score: 0.08685302
##### 5th cross-validation score: 0.08686454
##### Test Set Mean Squared Error:
##### The mean squared error (MSE) on the test set is a single value that represents the error the model produces when predicting unseen data. The MSE is a commonly used metric for measuring prediction accuracy. A lower value indicates better performance on the test set.

##### Test set mean squared error: 0.08771023589859456

In [None]:
import shap

# Build a SHAP explainer for the fitted forest `rf`, with the training rows as
# background data. `rf`, `X_train` and `X_test` are whatever the most recently
# run modelling cell left in the kernel (here: the street_signs model).
explainer = shap.Explainer(rf, X_train)

# Compute SHAP values for the hold-out rows; the additivity check is disabled.
shap_values = explainer(X_test,check_additivity=False)

# Summary plot of the SHAP values for the hold-out rows.
shap.summary_plot(shap_values, X_test)

## Part II traffic_signals modeling
### import data and pre-process

In [None]:
# Load the merged traffic-signal CSV (path relative to the kernel's working directory).
traffic_signals = pd.read_csv('MergedData/intersections_collision_streetlights_streets_traffic_signals.csv')

# Print the first record vertically, one "column: value" pair per line.
# iloc is zero-based, so the first row is position 0 (position 1 is the second
# row and does not exist in a one-row frame).
first_row = traffic_signals.iloc[0]
for column_name, value in first_row.items():
    print(f"{column_name}: {value}")

In [None]:
# collision rate = collision_count / (ARTERIALCLASSCD + 1). The source columns,
# INT_SIGNAL_TYPE_CD and the X / Y / UNITDESC columns are dropped afterwards.
# NOTE: the new column name ends with a space ("collision_rate "); the later
# cells look it up under exactly that name.
traffic_signals = (
    traffic_signals
    .assign(**{
        "collision_rate ": lambda frame: frame["collision_count"] / (frame["ARTERIALCLASSCD"] + 1)
    })
    .drop(columns=[
        "ARTERIALCLASSCD", "collision_count",
        "INT_SIGNAL_TYPE_CD",
        "X", "Y", "UNITDESC",
    ])
)

In [None]:
# Print the first record of the pre-processed table vertically.
# iloc is zero-based, so the first row is position 0, not 1.
first_row = traffic_signals.iloc[0]
for column_name, value in first_row.items():
    print(f"{column_name}: {value}")

In [None]:
# Column dtypes and non-null counts of the pre-processed traffic-signal table.
traffic_signals.info()


### Use cross validation+random forest to do the modelling

In [None]:
# List the columns that will be used as features / target below.
print(traffic_signals.columns)

In [None]:

# Features are every column except the target (whose name ends with a space).
X = traffic_signals.drop(columns=["collision_rate "])
y = traffic_signals["collision_rate "]

# One-hot encode categorical variables
X = pd.get_dummies(X)

# Split the dataset into train and test sets.
# A 20 % hold-out is used, as in the other splits in this notebook. The previous
# value of 0.001 left roughly one row per thousand for testing, so the test MSE
# and the SHAP summary that follows were computed on almost no data.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Create a random forest regressor
rf = RandomForestRegressor(n_estimators=100, random_state=42)

# Evaluate model performance using cross-validation (runs on clones of rf)
cv_scores = cross_val_score(rf, X_train, y_train, cv=5, scoring='neg_mean_squared_error')

# Train the model
rf.fit(X_train, y_train)

# Make predictions using the trained model
y_pred = rf.predict(X_test)

# Calculate mean squared error on the test set
mse = mean_squared_error(y_test, y_pred)

# The scorer returns negated MSE, so the sign is flipped for display.
print("Cross-validation scores:", -cv_scores)
print("Mean squared error on test set:", mse)


In [None]:
# Mean of the target column, printed as a scale reference for the MSE values above.
print(f"Mean of collision_rate: {traffic_signals['collision_rate '].mean()}")

In [None]:
import shap

# Build a SHAP explainer for the fitted forest `rf`, with the training rows as
# background data. `rf`, `X_train` and `X_test` are whatever the most recently
# run modelling cell left in the kernel (here: the traffic_signals model).
explainer = shap.Explainer(rf, X_train)

# Compute SHAP values for the hold-out rows; the additivity check is disabled.
shap_values = explainer(X_test,check_additivity=False)

# Summary plot of the SHAP values for the hold-out rows.
shap.summary_plot(shap_values, X_test)

# Part III try Minimal data set

In [None]:
import pandas as pd
import copy
# Load the joined intersections CSV (path relative to the kernel's working directory).
MVP = pd.read_csv('data/JOINED_INTERSECTIONS_WO_NULLS.csv')

In [None]:
# Column dtypes and non-null counts before any rows are dropped.
MVP.info()

In [None]:
# Drop every row that has at least one missing value (rebinds MVP to the filtered frame).
MVP = MVP.dropna()

In [None]:
# Same summary after dropna(), to see how many rows remain.
MVP.info()

In [None]:
# Distinct arterial-classification codes present in the data.
MVP["ARTERIAL_CLASSIFICATION"].unique()

## Try ARTERIAL_CLASSIFICATION as traffic flow

In [None]:
#5 - Interstate Freeway
#4 - State Highway
#1 - Principal Arterial
#2 - Minor Arterial
#3 - Collector Arterial
#0 - Not Designated (not an arterial) 

In [None]:
# NOTE: this binds a second name to the same DataFrame object; it is not a copy.
# The column assignment below therefore also changes MVP, and later cells read
# the recoded values from MVP.
MVP_art = MVP
# Recode the classification into the range 1..6 (labels taken from the legend
# in the cell above). The smallest new value is 1, so the column can be used as
# a divisor without dividing by zero.
# NOTE: because MVP itself is modified, running this cell a second time without
# reloading MVP would recode the already-recoded values again.
MVP_art['ARTERIAL_CLASSIFICATION'] = MVP_art['ARTERIAL_CLASSIFICATION'].replace({
    5: 6,  # Interstate Freeway
    4: 5,  # State Highway
    1: 4,  # Principal Arterial
    2: 3,  # Minor Arterial
    3: 2,  # Collector Arterial
    0: 1   # Not Designated (not an arterial)
})

In [None]:
# Collision rate = collision count divided by the recoded arterial class.
# NOTE: when this runs right after the recoding cell, MVP_art is still the same
# object as MVP, so RATE_COLLISIONS is added to MVP too; the per-class cells
# further down drop "RATE_COLLISIONS" from subsets of MVP.
MVP_art["RATE_COLLISIONS"] = MVP_art["COUNT_COLLISIONS"] / MVP_art["ARTERIAL_CLASSIFICATION"]

# drop() returns a new frame, so from here on MVP_art no longer aliases MVP.
# First the two columns the rate was derived from, then the X / Y, name, key
# and sub-area columns.
MVP_art = MVP_art.drop(columns=["COUNT_COLLISIONS", "ARTERIAL_CLASSIFICATION"])
MVP_art = MVP_art.drop(columns=["X", "Y", "INTERSECTION_NAME","INT_KEY","INTERSECTION_SUBAREA"])

In [None]:
# Columns that remain as model inputs, plus the RATE_COLLISIONS target.
MVP_art.info()

### Linear regression

In [None]:
# Linear-regression report on the full data set, with RATE_COLLISIONS as the target.
data = MVP_art
linear_regression(data,"RATE_COLLISIONS")

In [None]:
# Random-forest report on the full data set, with RATE_COLLISIONS as the target.
data = MVP_art
random_forest(data,"RATE_COLLISIONS")

# Model collision counts separately for each arterial classification

In [None]:
# Re-inspect MVP. NOTE(review): if the recoding and rate cells above have run,
# MVP is expected to hold the recoded classes and a RATE_COLLISIONS column,
# because those cells modified it through the MVP_art alias.
MVP.info()
MVP["ARTERIAL_CLASSIFICATION"].unique()

In [None]:
# Alias of MVP (same object, not a copy).
MVP_all= MVP

# Original classification codes in the source data:
#5 - Interstate Freeway
#4 - State Highway
#1 - Principal Arterial
#2 - Minor Arterial
#3 - Collector Arterial

# Group the rows by arterial classification.
subsets = MVP_all.groupby('ARTERIAL_CLASSIFICATION')

# Each subset is fetched by its group key.
# NOTE(review): the keys 2..6 used here look like the recoded values written by
# the earlier replace() cell (3->2, 2->3, 1->4, 4->5, 5->6), not the original
# codes listed above. get_group raises KeyError if a key is absent from the data.
Collector_Arterial = subsets.get_group(2)
Minor_Arterial = subsets.get_group(3)
Principal_Arterial = subsets.get_group(4)
State_Highway = subsets.get_group(5)
Interstate_Freeway = subsets.get_group(6)



In [None]:
# Every per-class subset loses the same columns: the grouping key, the derived
# rate, and the X / Y, name, key and sub-area columns. COUNT_COLLISIONS stays
# as the target.
_subset_drop_columns = [
    "ARTERIAL_CLASSIFICATION", "RATE_COLLISIONS",
    "X", "Y", "INTERSECTION_NAME", "INT_KEY", "INTERSECTION_SUBAREA",
]

(
    Collector_Arterial,
    Minor_Arterial,
    Principal_Arterial,
    State_Highway,
    Interstate_Freeway,
) = [
    class_subset.drop(columns=_subset_drop_columns)
    for class_subset in (
        Collector_Arterial,
        Minor_Arterial,
        Principal_Arterial,
        State_Highway,
        Interstate_Freeway,
    )
]

### Collector_Arterial

In [None]:
# Linear-regression report for the Collector_Arterial subset; target is COUNT_COLLISIONS.
data = Collector_Arterial
linear_regression(data,"COUNT_COLLISIONS")

In [None]:
# Random-forest report for the Collector_Arterial subset; target is COUNT_COLLISIONS.
data = Collector_Arterial
random_forest(data,"COUNT_COLLISIONS")

### Minor_Arterial

In [None]:
# Linear-regression report for the Minor_Arterial subset; target is COUNT_COLLISIONS.
data = Minor_Arterial
linear_regression(data,"COUNT_COLLISIONS")

In [None]:
# Random-forest report for the Minor_Arterial subset; target is COUNT_COLLISIONS.
data = Minor_Arterial
random_forest(data,"COUNT_COLLISIONS")

### Principal_Arterial

In [None]:
# Linear-regression report for the Principal_Arterial subset; target is COUNT_COLLISIONS.
data = Principal_Arterial
linear_regression(data,"COUNT_COLLISIONS")

In [None]:
# Random-forest report for the Principal_Arterial subset; target is COUNT_COLLISIONS.
data = Principal_Arterial
random_forest(data,"COUNT_COLLISIONS")

### State_Highway

In [None]:
# Linear-regression report for the State_Highway subset; target is COUNT_COLLISIONS.
data = State_Highway
linear_regression(data,"COUNT_COLLISIONS")

In [None]:
# Random-forest report for the State_Highway subset; target is COUNT_COLLISIONS.
data = State_Highway
random_forest(data,"COUNT_COLLISIONS")

### Interstate_Freeway

In [None]:
# Linear-regression report for the Interstate_Freeway subset; target is COUNT_COLLISIONS.
data = Interstate_Freeway
linear_regression(data,"COUNT_COLLISIONS")

In [None]:
# Random-forest report for the Interstate_Freeway subset; target is COUNT_COLLISIONS.
data = Interstate_Freeway
random_forest(data,"COUNT_COLLISIONS")