# Classification Report on Traffic Congestion
- Logistic Regression
- KNN
- SVM
- Decision Trees

In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import plotly.express as px
from sklearn import preprocessing
from sklearn.model_selection import train_test_split
from sklearn.model_selection import RepeatedStratifiedKFold
from sklearn.model_selection import GridSearchCV
from sklearn.linear_model import LogisticRegression
from sklearn.neighbors import KNeighborsClassifier
from sklearn import svm
from sklearn.ensemble import BaggingClassifier
from sklearn.metrics import jaccard_score
from sklearn.metrics import f1_score
from sklearn.metrics import log_loss

In [2]:
df=pd.read_csv("freeway_i64.csv")
df=df.sample(frac=0.1, replace=True, random_state=1)

In [3]:
df.shape

(15376, 7)

# Import and Pre-process data

In [4]:
import pandas as pd
#df=pd.read_csv("freeway_i64.csv")
#df = df[['sd','bt','bti','pt','pti','tt','tti']]

#create congested category
df['tti'] = ['Congested' if x >=1.2 else 'Not Congested' for x in df['tti']]
#df.tti = df.tti.astype(int)
# define features and label
features = ['sd','bt','bti','pt','pti','tt']
label='tti'

X,y = df[features].values,df[label].values

#transform features
X = preprocessing.StandardScaler().fit(X).transform(X)

# split 
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=2)
print ('Train set:', X_train.shape,  y_train.shape)
print ('Test set:', X_test.shape,  y_test.shape)

Train set: (12300, 6) (12300,)
Test set: (3076, 6) (3076,)


In [5]:
df.value_counts('tti')

tti
Congested        11313
Not Congested     4063
dtype: int64

# Binary Logistic Regression

In [6]:
#LOGISTIC

# define models and parameters
model = LogisticRegression()
solvers = ['newton-cg', 'lbfgs', 'liblinear']
penalty = ['l2']
c_values = [100, 10, 1.0, 0.1, 0.01]

# define grid search
grid = dict(solver=solvers,penalty=penalty,C=c_values)
cv = RepeatedStratifiedKFold(n_splits=10, n_repeats=3, random_state=1)
grid_search = GridSearchCV(estimator=model, param_grid=grid, n_jobs=-1, cv=cv, scoring='accuracy',error_score=0)
logreg = grid_search.fit(X_train, y_train)

#predict test set
LR_yhat = logreg.predict(X_test)
LR_yhat_prob = logreg.predict_proba(X_test)

#metrics
lr_acc = logreg.best_score_*100
lr_js  = jaccard_score(y_test, LR_yhat,pos_label = "Congested")*100
lr_fs  = f1_score(y_test, LR_yhat, average='weighted')*100

# K-Nearest Neighbor(KNN)

In [7]:
# define models and parameters
model = KNeighborsClassifier()
n_neighbors = range(1, 21, 2)
weights = ['uniform', 'distance']
metric = ['euclidean', 'manhattan', 'minkowski']

# define grid search
grid = dict(n_neighbors=n_neighbors,weights=weights,metric=metric)
cv = RepeatedStratifiedKFold(n_splits=10, n_repeats=3, random_state=1)
grid_search = GridSearchCV(estimator=model, param_grid=grid, n_jobs=-1, cv=cv, scoring='accuracy',error_score=0)
knnmod = grid_search.fit(X_train, y_train)

#predict test set
knn_yhat = knnmod.predict(X_test)

#metrics
knn_acc = knnmod.best_score_*100
knn_js  = jaccard_score(y_test, knn_yhat,pos_label = "Congested")*100
knn_fs  = f1_score(y_test, knn_yhat, average='weighted')*100

# Support Vector Machines

In [None]:
# define model and parameters
model = svm.SVC()
kernel = ['poly', 'rbf', 'sigmoid']
C = [50, 10, 1.0, 0.1, 0.01]
gamma = ['scale']

# define grid search
grid = dict(kernel=kernel,C=C,gamma=gamma)
cv = RepeatedStratifiedKFold(n_splits=10, n_repeats=3, random_state=1)
grid_search = GridSearchCV(estimator=model, param_grid=grid, n_jobs=-1, cv=cv, scoring='accuracy',error_score=0)
svmmod = grid_search.fit(X_train, y_train)

#predict test set
svm_yhat = svmmod.predict(X_test)

#metrics
svm_acc = svmmod.best_score_*100
svm_js  = jaccard_score(y_test, svm_yhat,pos_label = "Congested")*100
svm_fs  = f1_score(y_test, svm_yhat, average='weighted')*100

# Bagged Decision Trees

In [None]:
# define models and parameters
model = BaggingClassifier()
n_estimators = [10, 100, 1000]

# define grid search
grid = dict(n_estimators=n_estimators)
cv = RepeatedStratifiedKFold(n_splits=10, n_repeats=3, random_state=1)
grid_search = GridSearchCV(estimator=model, param_grid=grid, n_jobs=-1, cv=cv, scoring='accuracy',error_score=0)
dtmod = grid_search.fit(X_train, y_train)

#predict test set
dt_yhat = dtmod.predict(X_test)

#metrics
dt_acc = dtmod.best_score_*100
dt_js  = jaccard_score(y_test, dt_yhat,pos_label = "Congested")*100
dt_fs  = f1_score(y_test, dt_yhat, average='weighted')*100

# Plot Performance Metrics

In [None]:
#create df for plotting metrics
df = {'Algorithm': ['KNN','Decision Tree','SVM','Logistic'],
        'Accuracy' : [knn_acc,dt_acc,svm_acc,lr_acc],
        'Jaccard'  : [knn_js,dt_js,svm_js,lr_js],
        'F1_Score' : [knn_fs,dt_fs,svm_fs,lr_fs]}
df=pd.DataFrame.from_dict(df)
df=pd.melt(df, id_vars='Algorithm', value_vars=['Accuracy', 'Jaccard', 'F1_Score'])

fig = px.bar(df, x="Algorithm", y="value",color='variable', barmode='group',height=400)
fig.show()

In [None]:
#best model confusion matrix= Logistic
from sklearn.metrics import confusion_matrix, ConfusionMatrixDisplay

cm = confusion_matrix(y_test, LR_yhat, labels=logreg.classes_)
disp = ConfusionMatrixDisplay(confusion_matrix=cm,display_labels=logreg.classes_)
disp.plot()
plt.show()