In [3]:
# 5.1 Loading the Dataset
import pandas as pd

# Read the petrol consumption CSV into the DataFrame `data`.
# The path is relative, so the file is resolved against the current
# working directory.
data = pd.read_csv(filepath_or_buffer='petrol_consumption.csv')

# Preview the top five records as plain text.
print(data.head(n=5))


   Petrol_tax  Average_income  Paved_Highways  Population_Driver_licence(%)  \
0         9.0            3571            1976                         0.525   
1         9.0            4092            1250                         0.572   
2         9.0            3865            1586                         0.580   
3         7.5            4870            2351                         0.529   
4         8.0            4399             431                         0.544   

   Petrol_Consumption  
0                 541  
1                 524  
2                 561  
3                 414  
4                 410  


In [8]:
# 5.2 Exploring the Data
# Emit three reports in a single call, one per line: descriptive
# statistics from `describe()`, a divider, and the number of missing
# values found in each column.
print(
    data.describe(),
    "===========================================================",
    data.isna().sum(),
    sep="\n",
)


       Petrol_tax  Average_income  Paved_Highways  \
count   48.000000       48.000000       48.000000   
mean     7.668333     4241.833333     5565.416667   
std      0.950770      573.623768     3491.507166   
min      5.000000     3063.000000      431.000000   
25%      7.000000     3739.000000     3110.250000   
50%      7.500000     4298.000000     4735.500000   
75%      8.125000     4578.750000     7156.000000   
max     10.000000     5342.000000    17782.000000   

       Population_Driver_licence(%)  Petrol_Consumption  
count                     48.000000           48.000000  
mean                       0.570333          576.770833  
std                        0.055470          111.885816  
min                        0.451000          344.000000  
25%                        0.529750          509.500000  
50%                        0.564500          568.500000  
75%                        0.595250          632.750000  
max                        0.724000          968.000000  


In [5]:
# 5.3 Splitting the Data
from sklearn.model_selection import train_test_split

# Feature matrix: the four predictor columns.
X = data.loc[
    :,
    ['Petrol_tax', 'Average_income', 'Paved_Highways', 'Population_Driver_licence(%)'],
]

# Target vector: the quantity the model is asked to predict.
y = data.loc[:, 'Petrol_Consumption']

# Hold out 20% of the rows for testing; the fixed random_state makes the
# shuffle (and therefore the split) repeatable between runs.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=42,
)

# Report (rows, columns) of each feature partition as a sanity check.
print("Training data shape:", X_train.shape)
print("Testing data shape:", X_test.shape)


Training data shape: (38, 4)
Testing data shape: (10, 4)


In [9]:
# 5.4 Training the Multiple Linear Regression Model
from sklearn.linear_model import LinearRegression

# Build the estimator and fit it on the training partition in one step;
# fit() hands back the fitted estimator itself.
model = LinearRegression().fit(X_train, y_train)

# Show the learned parameters: one coefficient per feature, in the column
# order of X_train, followed by the intercept term.
print("Coefficients:", model.coef_)
print("Intercept:", model.intercept_)


Coefficients: [-3.69937459e+01 -5.65355145e-02 -4.38217137e-03  1.34686930e+03]
Intercept: 361.45087906653225


In [10]:
# 5.5 Making Predictions
# Run the fitted model over the held-out feature rows.
y_pred = model.predict(X=X_test)

# Put the true targets and the model's estimates side by side; the row
# labels come from y_test.
comparison = pd.DataFrame(
    {'Actual': y_test, 'Predicted': y_pred},
)
print(comparison)


    Actual   Predicted
27     631  606.692665
40     587  673.779442
26     577  584.991490
43     591  563.536910
24     460  519.058672
37     704  643.461003
12     525  572.897614
19     640  687.077036
4      410  547.609366
25     566  530.037630


In [11]:
# 5.6 Evaluating the Model
from sklearn.metrics import mean_absolute_error, r2_score

# MAE: average absolute gap between the actual and predicted test values.
mae = mean_absolute_error(y_true=y_test, y_pred=y_pred)

# R-squared: coefficient of determination of the predictions on the test set.
r2 = r2_score(y_true=y_test, y_pred=y_pred)

# Print both metrics, one per line.
print(
    f"Mean Absolute Error: {mae}",
    f"R-squared: {r2}",
    sep="\n",
)


Mean Absolute Error: 53.468541282916625
R-squared: 0.3913664001428886
