In [44]:
# Colab-only step: attach Google Drive so the dataset stored there can be read.
from google.colab import drive

DRIVE_MOUNT_POINT = '/content/drive'
drive.mount(DRIVE_MOUNT_POINT)

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


 Import necessary libraries

In [45]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.linear_model import LinearRegression
from sklearn.metrics import mean_squared_error, r2_score

Load the dataset

In [46]:
# Location of the cleaned traffic + weather CSV on the mounted Drive.
DATA_DIR = "/content/drive/MyDrive/Predicting City Traffic Flow Based on Weather and Events"
DATA_FILE = "cleaned_traffic_weather_data.csv"

df = pd.read_csv(f"{DATA_DIR}/{DATA_FILE}")

In [47]:
# Show the last five rows as a quick sanity check that the file loaded fully.
df.tail(5)

Unnamed: 0,_id_x,updated_at_x,segment_id,velocity,updated_at_30min,province,max,min,wind,wind_d,...,e_node_id,length,street_id,max_velocity,street_level,street_name,street_type,_id,long,lat
26157,88746,2021-04-14 00:14:59.294000+00:00,81334,30,2021-04-14,Ho Chi Minh City,34.0,27.0,12.0,SSE,...,5772136714,25,609378590,,4,Đường số 13,tertiary,5772136721,106.763374,10.865894
26158,89935,2021-04-21 23:53:39.289000+00:00,21263,1,2021-04-22,Ho Chi Minh City,34.0,28.0,8.0,S,...,4608690975,38,35114023,,4,Trương Định,tertiary,5444117929,106.696624,10.771206
26159,89936,2021-04-21 23:54:03.902000+00:00,21263,1,2021-04-22,Ho Chi Minh City,34.0,28.0,8.0,S,...,4608690975,38,35114023,,4,Trương Định,tertiary,5444117929,106.696624,10.771206
26160,89937,2021-04-22 00:04:32.099000+00:00,21263,1,2021-04-22,Ho Chi Minh City,34.0,28.0,8.0,S,...,4608690975,38,35114023,,4,Trương Định,tertiary,5444117929,106.696624,10.771206
26161,89938,2021-04-22 00:04:41.609000+00:00,21263,2,2021-04-22,Ho Chi Minh City,34.0,28.0,8.0,S,...,4608690975,38,35114023,,4,Trương Định,tertiary,5444117929,106.696624,10.771206



Display data types and summary information

In [48]:
# Print the column dtypes, the first rows and the descriptive statistics,
# each preceded by its heading.
for heading, summary in (
    ("Data Types:\n", df.dtypes),
    ("Display first 5 records:\n", df.head()),
    ("Stats Summary:\n", df.describe()),
):
    print(heading, summary)

Data Types:
 _id_x                 int64
updated_at_x         object
segment_id            int64
velocity              int64
updated_at_30min     object
province             object
max                 float64
min                 float64
wind                float64
wind_d               object
rain                float64
humidi              float64
cloud               float64
pressure            float64
date                 object
year                float64
month               float64
day                 float64
date_30min           object
_id_y                 int64
created_at           object
updated_at_y         object
s_node_id             int64
e_node_id             int64
length                int64
street_id             int64
max_velocity        float64
street_level          int64
street_name          object
street_type          object
_id                   int64
long                float64
lat                 float64
dtype: object
Display first 5 records:
    _id_x               

# Data Preprocessing

Select only relevant columns for modeling

In [49]:
# Modeling frame: traffic velocity is the target; the remaining columns are the
# numeric weather readings plus the road-segment length.
# `.copy()` makes the subset an independent frame, so later in-place edits
# (e.g. the mean imputation) do not raise SettingWithCopyWarning.
# NOTE: rebinding `df` discards the other loaded columns; re-run the load cell
# to get the full frame back.
df = df[['velocity', 'max', 'min', 'wind', 'rain', 'humidi', 'cloud', 'pressure', 'length']].copy()

Handle missing values by filling each column with its mean (for simplicity)

In [50]:
# Impute missing values with each column's mean and rebind `df` to the result
# instead of mutating in place: `df` was produced by a column selection, and
# in-place edits on such a frame can emit SettingWithCopyWarning.
# NOTE(review): the means are computed over the whole dataset, i.e. before the
# train/test split below, so test-set information leaks into the imputation.
# Consider imputing after the split, using statistics from the training rows only.
df = df.fillna(df.mean())

Define features (X) and target variable (y)

In [51]:
# Target is the observed velocity; every other remaining column is a feature.
TARGET = 'velocity'
y = df[TARGET]
X = df.drop(columns=[TARGET])

In [52]:
# Peek at the first rows of the feature matrix to confirm the target was removed.
feature_preview = X.head()
print(feature_preview)

    max   min  wind  rain  humidi  cloud  pressure  length
0  29.0  26.0  14.0  20.0    83.0   77.0    1006.0      14
1  29.0  26.0  14.0  20.0    83.0   77.0    1006.0     205
2  29.0  26.0  14.0  20.0    83.0   77.0    1006.0     120
3  29.0  26.0  14.0  20.0    83.0   77.0    1006.0     111
4  29.0  26.0  14.0  20.0    83.0   77.0    1006.0      81


Split data into training and testing sets, then standardize the features (the scaler is fitted on the training set only)

In [53]:
# Hold out 20% of the rows for testing; the fixed seed keeps the split reproducible.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=42,
)

# Learn the scaling statistics from the training rows only, then apply the same
# transformation to both partitions.
scaler = StandardScaler().fit(X_train)
X_train, X_test = scaler.transform(X_train), scaler.transform(X_test)

Initialize and train the Linear Regression model

In [54]:
# Fit a linear regression on the standardized training features.
# `fit` returns the estimator itself, so construction and fitting can be chained.
model = LinearRegression().fit(X_train, y_train)
model

Make predictions

In [55]:
# Predict velocities for the held-out (already scaled) test features.
y_pred = model.predict(X=X_test)

Evaluate the model

In [56]:
# Score the held-out predictions: R-squared and mean squared error.
r2 = r2_score(y_test, y_pred)
mse = mean_squared_error(y_test, y_pred)

print(f"Mean Squared Error: {mse}", f"R-squared: {r2}", sep="\n")

Mean Squared Error: 141.29103985244026
R-squared: 0.010010626587908411



Display the first few predictions and actual values for comparison

In [57]:
# Print the first five predictions followed by the first five true values.
for label, values in (
    ("Predicted values:", y_pred),
    ("Actual values:", y_test.values),
):
    print(label, values[:5])

Predicted values: [20.03354685 19.95195602 19.96112353 21.09465291 19.97029104]
Actual values: [ 1 12 35  1 28]
