In [6]:
# 🚦 Advanced Traffic Volume Estimation Project

# 1. Importing necessary libraries
import pandas as pd
import numpy as np
import seaborn as sns
from collections import Counter

from sklearn import linear_model, tree, ensemble, svm, metrics
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import train_test_split
import xgboost
import joblib  # For saving models

# 2. Importing the data
data = pd.read_csv(r"C:\Users\venka\Downloads\traffic_volume_project\traffic volume.csv")

# 3. Filling missing numerical values with mean
for col in ['temp', 'rain', 'snow']:
    data[col] = data[col].fillna(data[col].mean())

# 4. Filling missing categorical values in 'weather' with 'Clouds'
print("Weather value counts before filling nulls:\n", data['weather'].value_counts(dropna=False))
data['weather'] = data['weather'].fillna('Clouds')

# 5. Filling missing 'holiday' values with 'None' and encoding as category codes
data['holiday'] = data['holiday'].fillna('None')
data['holiday'] = data['holiday'].astype('category').cat.codes

# 6. Splitting date into day, month, year
date_split = data['date'].str.split("-", expand=True)
date_split.columns = ['day', 'month', 'year']
data = pd.concat([data, date_split], axis=1)

# 7. Splitting Time into hours, minutes, seconds
time_split = data['Time'].str.split(":", expand=True)
time_split.columns = ['hours', 'minutes', 'seconds']
data = pd.concat([data, time_split], axis=1)

# 8. Dropping original 'date' and 'Time' columns
data = data.drop(columns=['date', 'Time'])

# 9. Encoding categorical 'weather' column into numeric codes
data['weather'] = data['weather'].astype('category').cat.codes

# 10. Converting all columns to numeric types if necessary
data = data.apply(pd.to_numeric, errors='coerce')

# 11. Dropping any remaining rows with nulls after conversions
data = data.dropna()

# 12. Viewing processed data sample
print("\n✅ Processed data sample:\n", data.head())

# 13. Separating target and features
y = data['traffic_volume']
x = data.drop(columns=['traffic_volume'], axis=1)

# 14. Scaling features for better model performance using StandardScaler
scaler = StandardScaler()
x_scaled = scaler.fit_transform(x)
x = pd.DataFrame(x_scaled, columns=x.columns)

# Saving scaler for future use
joblib.dump(scaler, r"C:\Users\venka\Downloads\traffic_volume_project\scaler.pkl")

# 15. Splitting data into train-test sets
x_train, x_test, y_train, y_test = train_test_split(x, y, test_size=0.2, random_state=42)

# 16. Defining models
lin_reg = linear_model.LinearRegression()
Dtree = tree.DecisionTreeRegressor(random_state=42)
Rand = ensemble.RandomForestRegressor(random_state=42)
svr_model = svm.SVR()
XGB = xgboost.XGBRegressor(random_state=42)

# 17. Training models
lin_reg.fit(x_train, y_train)
Dtree.fit(x_train, y_train)
Rand.fit(x_train, y_train)
svr_model.fit(x_train, y_train)
XGB.fit(x_train, y_train)

# 18. Saving trained models for future deployment
joblib.dump(lin_reg, r"C:\Users\venka\Downloads\traffic_volume_project\lin_reg_model.pkl")
joblib.dump(Dtree, r"C:\Users\venka\Downloads\traffic_volume_project\decision_tree_model.pkl")
joblib.dump(Rand, r"C:\Users\venka\Downloads\traffic_volume_project\random_forest_model.pkl")
joblib.dump(svr_model, r"C:\Users\venka\Downloads\traffic_volume_project\svr_model.pkl")
joblib.dump(XGB, r"C:\Users\venka\Downloads\traffic_volume_project\xgboost_model.pkl")

print("✅ Models saved successfully in your project folder.")

# 19. Predicting on test data
predictions = {
    "Linear Regression": lin_reg.predict(x_test),
    "Decision Tree": Dtree.predict(x_test),
    "Random Forest": Rand.predict(x_test),
    "SVR": svr_model.predict(x_test),
    "XGBoost": XGB.predict(x_test)
}

# 20. Evaluating model performance using R^2 score
print("\n📊 Model R^2 Scores:")
for model_name, preds in predictions.items():
    score = metrics.r2_score(y_test, preds)
    print(f"{model_name}: {score:.4f}")


Weather value counts before filling nulls:
 weather
Clouds          15144
Clear           13383
Mist             5942
Rain             5665
Snow             2875
Drizzle          1818
Haze             1359
Thunderstorm     1033
Fog               912
NaN                49
Smoke              20
Squall              4
Name: count, dtype: int64

✅ Processed data sample:
    holiday    temp  rain  snow  weather  traffic_volume  day  month  year  \
0        7  288.28   0.0   0.0        1            5545    2     10  2012   
1        7  289.36   0.0   0.0        1            4516    2     10  2012   
2        7  289.58   0.0   0.0        1            4767    2     10  2012   
3        7  290.13   0.0   0.0        1            5026    2     10  2012   
4        7  291.14   0.0   0.0        1            4918    2     10  2012   

   hours  minutes  seconds  
0      9        0        0  
1     10        0        0  
2     11        0        0  
3     12        0        0  
4     13        0      

In [7]:
import pickle

# Assuming your scaler object is named 'scaler'
with open('scaler.pkl', 'wb') as f:
    pickle.dump(scaler, f)


In [8]:
with open('xgboost_model.pkl', 'wb') as f:
    pickle.dump(XGB, f)


In [9]:
with open('scaler.pkl', 'rb') as f:
    loaded_scaler = pickle.load(f)

print("Scaler loaded successfully.")


Scaler loaded successfully.


In [11]:
# save_model.py
import pandas as pd
import numpy as np
from sklearn import ensemble
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import train_test_split
import pickle

# Load data
data = pd.read_csv(r"C:\Users\venka\Downloads\traffic_volume_project\traffic volume.csv")

# Preprocessing
for col in ['temp', 'rain', 'snow']:
    data[col].fillna(data[col].mean(), inplace=True)
data['weather'].fillna('Clouds', inplace=True)
data['holiday'].fillna('None', inplace=True)

# Date split
date_split = data['date'].str.split("-", expand=True)
date_split.columns = ['day', 'month', 'year']
data = pd.concat([data, date_split], axis=1)

# Time split
time_split = data['Time'].str.split(":", expand=True)
time_split.columns = ['hours', 'minutes', 'seconds']
data = pd.concat([data, time_split], axis=1)

# Drop unused columns
data.drop(columns=['date', 'Time'], inplace=True)

# Encode categoricals
data['weather'] = data['weather'].astype('category').cat.codes
data['holiday'] = data['holiday'].astype('category').cat.codes

# Ensure all numeric
data = data.apply(pd.to_numeric, errors='coerce')
data.dropna(inplace=True)

# Features and target
y = data['traffic_volume']
x = data.drop(columns=['traffic_volume'], axis=1)

# Scaling
scaler = StandardScaler()
x_scaled = scaler.fit_transform(x)

# Train-test split
x_train, x_test, y_train, y_test = train_test_split(x_scaled, y, test_size=0.2, random_state=42)

# Model
Rand = ensemble.RandomForestRegressor(random_state=42)
Rand.fit(x_train, y_train)

# Save model and scaler
pickle.dump(Rand, open('RandomForest_model.pkl', 'wb'))
pickle.dump(scaler, open('scaler.pkl', 'wb'))

print("✅ Model and scaler saved successfully.")


The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values always behaves as a copy.

For example, when doing 'df[col].method(value, inplace=True)', try using 'df.method({col: value}, inplace=True)' or df[col] = df[col].method(value) instead, to perform the operation inplace on the original object.


  data[col].fillna(data[col].mean(), inplace=True)
The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values always behaves as a copy.

For example, when doing 'df[col].method(value, inplace=True)', try using 'df.method({col: value}, inplace=True)' or df[col] = df[col].method(value) instead, to perform the operation inplace on the original object.


  data['weather'].fillna('Clouds', inplace=True)
The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting valu

✅ Model and scaler saved successfully.
