In [13]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from sklearn.model_selection import train_test_split
from sklearn import metrics
data=pd.read_excel("Data Sheet.xlsx")

In [14]:
data.dtypes

Crash_Severity             object
Vehicle_Speed               int64
Crash_Time                  int64
Age                         int64
Gender                     object
Vehicle_Type               object
Number_of_Lanes             int64
Lane_Width                float64
Road_Type                  object
Alcohol_Consumption        object
Crash_Type                 object
Seatbelt_Usage             object
Speed_Limit                 int64
Road_Surface_Condition     object
dtype: object

In [5]:
data.isna().sum()

Crash_Severity            0
Vehicle_Speed             0
Crash_Time                0
Age                       0
Gender                    0
Vehicle_Type              0
Number_of_Lanes           0
Lane_Width                0
Road_Type                 0
Alcohol_Consumption       0
Crash_Type                0
Seatbelt_Usage            0
Speed_Limit               0
Road_Surface_Condition    0
dtype: int64

In [15]:
data

Unnamed: 0,Crash_Severity,Vehicle_Speed,Crash_Time,Age,Gender,Vehicle_Type,Number_of_Lanes,Lane_Width,Road_Type,Alcohol_Consumption,Crash_Type,Seatbelt_Usage,Speed_Limit,Road_Surface_Condition
0,Minor injury,107,11,27,Male,Heavy Vehicle,2,3.484386,Urban,Yes,Rear-end,No,30,Icy
1,Minor injury,27,16,39,Male,Car,2,3.293091,Rural,Yes,Rear-end,Yes,110,Dry
2,Minor injury,87,14,42,Female,Car,3,3.218911,Urban,No,Rear-end,No,59,Dry
3,Minor injury,43,3,60,Female,Heavy Vehicle,2,3.113012,Rural,No,Rear-end,No,73,Wet
4,Minor injury,72,8,70,Male,T.W,3,3.106580,Urban,Yes,Rear-end,Yes,42,Wet
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
295,Fatal crash,112,4,30,Male,Car,1,3.248476,Urban,No,Head-on,No,118,Icy
296,Fatal crash,87,21,52,Female,Heavy Vehicle,2,3.037739,Urban,Yes,Rear-end,No,38,Icy
297,Fatal crash,85,20,79,Female,Heavy Vehicle,2,3.350668,Rural,Yes,Rear-end,No,80,Wet
298,Fatal crash,44,5,23,Female,Car,3,3.419304,Urban,Yes,Rear-end,Yes,47,Wet


In [8]:
data.columns.unique()

Index(['Crash_Severity', 'Vehicle_Speed', 'Crash_Time', 'Age', 'Gender',
       'Vehicle_Type', 'Number_of_Lanes', 'Lane_Width', 'Road_Type',
       'Alcohol_Consumption', 'Crash_Type', 'Seatbelt_Usage', 'Speed_Limit',
       'Road_Surface_Condition'],
      dtype='object')

In [11]:
{col: len(data[col].unique()) for col in data.columns if data.dtypes[col]=='object'}

{'Crash_Severity': 3,
 'Gender': 2,
 'Vehicle_Type': 3,
 'Road_Type': 2,
 'Alcohol_Consumption': 2,
 'Crash_Type': 2,
 'Seatbelt_Usage': 2,
 'Road_Surface_Condition': 3}

In [17]:
def onehot_encode(df,columns,prefixes):
    df=df.copy()
    for column,prefix in zip(columns,prefixes):
        dummies=pd.get_dummies(df[column],prefix=prefix)
        df=pd.concat([df,dummies],axis=1)
        df=df.drop(column,axis=1)
    return df

In [18]:
data=onehot_encode(
    data,
    columns=['Crash_Severity', 'Gender',
       'Vehicle_Type', 'Road_Type',
       'Alcohol_Consumption', 'Crash_Type', 'Seatbelt_Usage',
       'Road_Surface_Condition'],
    prefixes=['cs','g','vt','rt','ac','ct','su','rsc']

)

In [28]:
y=data[['cs_Fatal crash',	'cs_Major injury',	'cs_Minor injury']].copy()

In [32]:
x=data.drop(['cs_Fatal crash',	'cs_Major injury',	'cs_Minor injury'],axis=1).copy()

In [41]:
x=x.astype(np.float64)

In [42]:
x

Unnamed: 0,Vehicle_Speed,Crash_Time,Age,Number_of_Lanes,Lane_Width,Speed_Limit,g_Female,g_Male,vt_Car,vt_Heavy Vehicle,...,rt_Urban,ac_No,ac_Yes,ct_Head-on,ct_Rear-end,su_No,su_Yes,rsc_Dry,rsc_Icy,rsc_Wet
0,107.0,11.0,27.0,2.0,3.484386,30.0,0.0,1.0,0.0,1.0,...,1.0,0.0,1.0,0.0,1.0,1.0,0.0,0.0,1.0,0.0
1,27.0,16.0,39.0,2.0,3.293091,110.0,0.0,1.0,1.0,0.0,...,0.0,0.0,1.0,0.0,1.0,0.0,1.0,1.0,0.0,0.0
2,87.0,14.0,42.0,3.0,3.218911,59.0,1.0,0.0,1.0,0.0,...,1.0,1.0,0.0,0.0,1.0,1.0,0.0,1.0,0.0,0.0
3,43.0,3.0,60.0,2.0,3.113012,73.0,1.0,0.0,0.0,1.0,...,0.0,1.0,0.0,0.0,1.0,1.0,0.0,0.0,0.0,1.0
4,72.0,8.0,70.0,3.0,3.106580,42.0,0.0,1.0,0.0,0.0,...,1.0,0.0,1.0,0.0,1.0,0.0,1.0,0.0,0.0,1.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
295,112.0,4.0,30.0,1.0,3.248476,118.0,0.0,1.0,1.0,0.0,...,1.0,1.0,0.0,1.0,0.0,1.0,0.0,0.0,1.0,0.0
296,87.0,21.0,52.0,2.0,3.037739,38.0,1.0,0.0,0.0,1.0,...,1.0,0.0,1.0,0.0,1.0,1.0,0.0,0.0,1.0,0.0
297,85.0,20.0,79.0,2.0,3.350668,80.0,1.0,0.0,0.0,1.0,...,0.0,0.0,1.0,0.0,1.0,1.0,0.0,0.0,0.0,1.0
298,44.0,5.0,23.0,3.0,3.419304,47.0,1.0,0.0,1.0,0.0,...,1.0,0.0,1.0,0.0,1.0,0.0,1.0,0.0,0.0,1.0


In [48]:
from sklearn.preprocessing import StandardScaler
scaler=StandardScaler()
X=scaler.fit_transform(x)

In [53]:
X.shape

(300, 22)

In [74]:
x_train,x_test,y_train,y_test=train_test_split(X,y,train_size=0.85,random_state=100)

In [47]:
import tensorflow as tf

In [80]:
inputs=tf.keras.Input(shape=(X.shape[1],))
x=tf.keras.layers.Dense(8,activation='relu')(inputs)
x=tf.keras.layers.Dense(8,activation='relu')(x)
outputs=tf.keras.layers.Dense(3,activation='softmax')(x)
model=tf.keras.Model(inputs,outputs)
model.compile(
    optimizer='adam',
    loss='categorical_crossentropy',
    metrics=['accuracy']
)
batch_size=10
epochs=20
history=model.fit(
    x_train,
    y_train,
    validation_split=0.2,
    batch_size=batch_size,
    epochs=epochs,
    callbacks=[
        tf.keras.callbacks.ReduceLROnPlateau(),
        tf.keras.callbacks.EarlyStopping(
            monitor='val_loss',
            patience=3,
            restore_best_weights=True
        )
    ]
)

Epoch 1/20
[1m21/21[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m3s[0m 25ms/step - accuracy: 0.3724 - loss: 1.1521 - val_accuracy: 0.2941 - val_loss: 1.1549 - learning_rate: 0.0010
Epoch 2/20
[1m21/21[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 6ms/step - accuracy: 0.2855 - loss: 1.1571 - val_accuracy: 0.3137 - val_loss: 1.1424 - learning_rate: 0.0010
Epoch 3/20
[1m21/21[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 6ms/step - accuracy: 0.3818 - loss: 1.1091 - val_accuracy: 0.3137 - val_loss: 1.1412 - learning_rate: 0.0010
Epoch 4/20
[1m21/21[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 6ms/step - accuracy: 0.4135 - loss: 1.0862 - val_accuracy: 0.3333 - val_loss: 1.1360 - learning_rate: 0.0010
Epoch 5/20
[1m21/21[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 7ms/step - accuracy: 0.4452 - loss: 1.0875 - val_accuracy: 0.3529 - val_loss: 1.1344 - learning_rate: 0.0010
Epoch 6/20
[1m21/21[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 7ms/

In [81]:
y.shape

(300, 3)

In [82]:
y.ndim

2

In [63]:
x_train.ndim

2

In [83]:
print('test accuracy',model.evaluate(x_test,y_test,verbose=0)[1])

test accuracy 0.2666666805744171


In [87]:
from sklearn.utils.class_weight import compute_class_weight

# Ensure y_train is a NumPy array
if isinstance(y_train, pd.DataFrame):
    y_train = y_train.to_numpy()

# Convert one-hot encoded labels to class indices
y_train_classes = y_train.argmax(axis=1)

# Compute class weights
class_weights = compute_class_weight(
    'balanced',
    classes=np.unique(y_train_classes),
    y=y_train_classes
)
class_weights = dict(enumerate(class_weights))

# Use class_weights in the model training
history = model.fit(
    x_train,
    y_train,
    validation_split=0.2,
    batch_size=10,
    epochs=50,
    class_weight=class_weights,  # Apply class weights
    callbacks=[
        tf.keras.callbacks.ReduceLROnPlateau(),
        tf.keras.callbacks.EarlyStopping(monitor='val_loss', patience=5, restore_best_weights=True)
    ]
)


Epoch 1/50
[1m21/21[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 18ms/step - accuracy: 0.4143 - loss: 1.0681 - val_accuracy: 0.3333 - val_loss: 1.1325 - learning_rate: 0.0010
Epoch 2/50
[1m21/21[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 6ms/step - accuracy: 0.4128 - loss: 1.0731 - val_accuracy: 0.3137 - val_loss: 1.1325 - learning_rate: 0.0010
Epoch 3/50
[1m21/21[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 6ms/step - accuracy: 0.3959 - loss: 1.0733 - val_accuracy: 0.3333 - val_loss: 1.1342 - learning_rate: 0.0010
Epoch 4/50
[1m21/21[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 5ms/step - accuracy: 0.4286 - loss: 1.0627 - val_accuracy: 0.3333 - val_loss: 1.1345 - learning_rate: 0.0010
Epoch 5/50
[1m21/21[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 5ms/step - accuracy: 0.4605 - loss: 1.0445 - val_accuracy: 0.3137 - val_loss: 1.1349 - learning_rate: 0.0010
Epoch 6/50
[1m21/21[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 5ms/

In [88]:
print('test accuracy',model.evaluate(x_test,y_test,verbose=0)[1])

test accuracy 0.2666666805744171


In [89]:
# Handling imbalance (if necessary)
from sklearn.utils.class_weight import compute_class_weight
class_weights = compute_class_weight('balanced', classes=np.unique(y_train.argmax(axis=1)), y=y_train.argmax(axis=1))
class_weights = dict(enumerate(class_weights))

# Update model architecture
inputs = tf.keras.Input(shape=(X.shape[1],))
x = tf.keras.layers.Dense(32, activation='relu')(inputs)  # Increase neurons
x = tf.keras.layers.Dropout(0.5)(x)  # Add dropout
x = tf.keras.layers.Dense(16, activation='relu', kernel_regularizer=tf.keras.regularizers.l2(0.01))(x)  # Regularization
x = tf.keras.layers.Dense(8, activation='relu')(x)
outputs = tf.keras.layers.Dense(3, activation='softmax')(x)

model = tf.keras.Model(inputs, outputs)
model.compile(
    optimizer=tf.keras.optimizers.Adam(learning_rate=0.001),  # Adjust learning rate
    loss='categorical_crossentropy',
    metrics=['accuracy']
)

# Train with class weights
history = model.fit(
    x_train,
    y_train,
    validation_split=0.2,
    batch_size=10,
    epochs=50,  # Increase epochs
    class_weight=class_weights,  # Use class weights
    callbacks=[
        tf.keras.callbacks.ReduceLROnPlateau(),
        tf.keras.callbacks.EarlyStopping(monitor='val_loss', patience=5, restore_best_weights=True)
    ]
)


Epoch 1/50
[1m21/21[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m3s[0m 26ms/step - accuracy: 0.3968 - loss: 1.3490 - val_accuracy: 0.2941 - val_loss: 1.3528 - learning_rate: 0.0010
Epoch 2/50
[1m21/21[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 8ms/step - accuracy: 0.3588 - loss: 1.3324 - val_accuracy: 0.2745 - val_loss: 1.3281 - learning_rate: 0.0010
Epoch 3/50
[1m21/21[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 6ms/step - accuracy: 0.3526 - loss: 1.2990 - val_accuracy: 0.2157 - val_loss: 1.3115 - learning_rate: 0.0010
Epoch 4/50
[1m21/21[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 7ms/step - accuracy: 0.3743 - loss: 1.3035 - val_accuracy: 0.2353 - val_loss: 1.2984 - learning_rate: 0.0010
Epoch 5/50
[1m21/21[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 7ms/step - accuracy: 0.3434 - loss: 1.2696 - val_accuracy: 0.2941 - val_loss: 1.2865 - learning_rate: 0.0010
Epoch 6/50
[1m21/21[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 6ms/

In [90]:
print('test accuracy',model.evaluate(x_test,y_test,verbose=0)[1])

test accuracy 0.4000000059604645


In [91]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.ensemble import RandomForestClassifier
from sklearn.utils.class_weight import compute_class_weight
from imblearn.over_sampling import SMOTE
import tensorflow as tf

# Load data
data = pd.read_excel("Data Sheet.xlsx")

# Function to one-hot encode categorical columns
def onehot_encode(df, columns, prefixes):
    df = df.copy()
    for column, prefix in zip(columns, prefixes):
        dummies = pd.get_dummies(df[column], prefix=prefix)
        df = pd.concat([df, dummies], axis=1)
        df = df.drop(column, axis=1)
    return df

data = onehot_encode(
    data,
    columns=['Crash_Severity', 'Gender', 'Vehicle_Type', 'Road_Type', 'Alcohol_Consumption',
             'Crash_Type', 'Seatbelt_Usage', 'Road_Surface_Condition'],
    prefixes=['cs', 'g', 'vt', 'rt', 'ac', 'ct', 'su', 'rsc']
)

# Separate features (X) and target variable (y)
X = data.drop(['Target_Column'], axis=1)  # Replace 'Target_Column' with your target column name
y = pd.get_dummies(data['Target_Column']).values  # Assuming one-hot encoding for target

# Feature scaling
scaler = StandardScaler()
X = scaler.fit_transform(X)

# Train-test split
x_train, x_test, y_train, y_test = train_test_split(X, y, train_size=0.85, random_state=100)

# Handle imbalance using SMOTE
smote = SMOTE(random_state=42)
x_train, y_train_classes = smote.fit_resample(x_train, y_train.argmax(axis=1))

# Convert back to one-hot encoding for TensorFlow
y_train = tf.keras.utils.to_categorical(y_train_classes)

# Compute class weights for imbalance
class_weights = compute_class_weight(
    'balanced',
    classes=np.unique(y_train_classes),
    y=y_train_classes
)
class_weights = dict(enumerate(class_weights))

# Visualize feature importance (using Random Forest)
rf = RandomForestClassifier(random_state=42)
rf.fit(x_train, y_train_classes)
importance = rf.feature_importances_

plt.figure(figsize=(10, 6))
plt.bar(range(len(importance)), importance)
plt.title("Feature Importance")
plt.show()

# Neural Network model
inputs = tf.keras.Input(shape=(X.shape[1],))
x = tf.keras.layers.Dense(64, activation='relu')(inputs)
x = tf.keras.layers.BatchNormalization()(x)
x = tf.keras.layers.Dropout(0.3)(x)
x = tf.keras.layers.Dense(32, activation='relu')(x)
x = tf.keras.layers.BatchNormalization()(x)
x = tf.keras.layers.Dropout(0.3)(x)
x = tf.keras.layers.Dense(16, activation='relu')(x)
outputs = tf.keras.layers.Dense(3, activation='softmax')(x)  # Adjust '3' to the number of classes
model = tf.keras.Model(inputs, outputs)

model.compile(
    optimizer=tf.keras.optimizers.Adam(learning_rate=0.001),
    loss='categorical_crossentropy',
    metrics=['accuracy']
)

# Train the model
history = model.fit(
    x_train,
    y_train,
    validation_split=0.2,
    batch_size=10,
    epochs=50,
    class_weight=class_weights,
    callbacks=[
        tf.keras.callbacks.ReduceLROnPlateau(),
        tf.keras.callbacks.EarlyStopping(
            monitor='val_loss',
            patience=5,
            restore_best_weights=True
        )
    ]
)

# Evaluate on test set
test_loss, test_accuracy = model.evaluate(x_test, y_test, verbose=0)
print(f"Test Accuracy: {test_accuracy:.2f}")

# Plot training and validation accuracy
plt.figure(figsize=(8, 6))
plt.plot(history.history['accuracy'], label='Train Accuracy')
plt.plot(history.history['val_accuracy'], label='Validation Accuracy')
plt.title("Model Accuracy")
plt.xlabel("Epochs")
plt.ylabel("Accuracy")
plt.legend()
plt.show()


KeyError: "['Target_Column'] not found in axis"