# Homework: Car Fuel Efficiency Regression Model

The goal is to create a regression model for predicting car fuel efficiency (column 'fuel_efficiency_mpg').

In [None]:
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import seaborn as sns
from scipy import stats

# Notebook-wide plot appearance: seaborn's white-grid theme and a 10x6 inch default figure
sns.set_style('whitegrid')
plt.rcParams.update({'figure.figsize': (10, 6)})

## 1. Preparing the Dataset

Load the data and select only the required columns:
- engine_displacement
- horsepower
- vehicle_weight
- model_year
- fuel_efficiency_mpg

In [None]:
# Read the raw CSV (path is relative to the notebook's working directory)
df = pd.read_csv('car_fuel_efficiency.csv')

# Report the size and the full column list before any column selection happens
raw_shape, raw_columns = df.shape, df.columns.tolist()
print(f"Original dataset shape: {raw_shape}")
print(f"\nAll columns: {raw_columns}")

In [None]:
# The four predictor columns followed by the prediction target (fuel_efficiency_mpg)
columns_to_use = [
    'engine_displacement', 'horsepower', 'vehicle_weight', 'model_year',
    'fuel_efficiency_mpg',
]

# Take an independent copy so later edits to df_selected never touch the raw frame
df_selected = df.loc[:, columns_to_use].copy()

selected_shape, selected_columns = df_selected.shape, df_selected.columns.tolist()
print(f"Selected dataset shape: {selected_shape}")
print(f"\nColumns: {selected_columns}")

In [None]:
# Preview the top five rows of the selected columns (rendered as the cell output)
df_selected.head(n=5)

In [None]:
# Summary statistics for the selected columns; the quartiles listed here are
# the ones pandas reports by default, spelled out for the reader
df_selected.describe(percentiles=[0.25, 0.5, 0.75])

In [None]:
# Count missing entries once per column, then reuse that Series for the grand total
missing_per_column = df_selected.isnull().sum()

print("Missing values per column:")
print(missing_per_column)
print(f"\nTotal missing values: {missing_per_column.sum()}")

## 2. EDA: Does fuel_efficiency_mpg have a long tail?

We'll examine the distribution of fuel_efficiency_mpg to determine if it has a long tail.

In [None]:
# Visualize the target's distribution: a histogram (overall shape / tail) next to
# a box plot (median, quartiles, outliers).
# Missing values are dropped once up front: boxplot does not skip NaNs and would
# otherwise draw an empty box if the column contained any.
mpg_values = df_selected['fuel_efficiency_mpg'].dropna()

# Explicit figure/axes objects instead of the implicit pyplot "current axes" state
fig, (ax_hist, ax_box) = plt.subplots(1, 2, figsize=(12, 5))

# Left panel: histogram
ax_hist.hist(mpg_values, bins=50, edgecolor='black', alpha=0.7)
ax_hist.set_xlabel('Fuel Efficiency (MPG)')
ax_hist.set_ylabel('Frequency')
ax_hist.set_title('Distribution of Fuel Efficiency')
ax_hist.grid(True, alpha=0.3)

# Right panel: box plot
ax_box.boxplot(mpg_values)
ax_box.set_ylabel('Fuel Efficiency (MPG)')
ax_box.set_title('Box Plot of Fuel Efficiency')
ax_box.grid(True, alpha=0.3)

fig.tight_layout()
plt.show()

In [None]:
# Quantify the tail with the sample skewness of the target column
skewness = df_selected['fuel_efficiency_mpg'].skew()
print(f"Skewness of fuel_efficiency_mpg: {skewness:.4f}")

# Reference legend for reading the number (plain strings: nothing to interpolate)
print("\nInterpretation:")
print("  - Skewness > 0: Right-skewed (long tail on the right)")
print("  - Skewness < 0: Left-skewed (long tail on the left)")
print("  - Skewness ≈ 0: Symmetric distribution")
print("\n  - |Skewness| < 0.5: Fairly symmetric")
print("  - 0.5 < |Skewness| < 1: Moderately skewed")
print("  - |Skewness| > 1: Highly skewed")

# Apply the thresholds above to the computed value so the cell states its own answer
if pd.isna(skewness):
    # skew() yields NaN when there are too few non-missing values to estimate it
    verdict = "undefined (not enough non-missing values)"
elif abs(skewness) < 0.5:
    verdict = "fairly symmetric"
else:
    strength = "moderately" if abs(skewness) <= 1 else "highly"
    side = "right" if skewness > 0 else "left"
    verdict = f"{strength} {side}-skewed"
print(f"\nResult: the distribution is {verdict}")

In [None]:
# Location, spread and extremes of the target, all read from one Series handle
mpg = df_selected['fuel_efficiency_mpg']
lowest, highest = mpg.min(), mpg.max()

print("Fuel Efficiency Statistics:")
print(f"Mean: {mpg.mean():.2f}")
print(f"Median: {mpg.median():.2f}")
# mode() returns every tied value; only the first one is reported
print(f"Mode: {mpg.mode()[0]:.2f}")
print(f"Std Dev: {mpg.std():.2f}")
print(f"\nMin: {lowest:.2f}")
print(f"Max: {highest:.2f}")
print(f"Range: {highest - lowest:.2f}")

In [None]:
# Q-Q plot: sample quantiles of the target against theoretical normal quantiles.
# The plot is drawn on an explicit Axes rather than through the pyplot module.
fig, ax = plt.subplots(figsize=(8, 6))
stats.probplot(df_selected['fuel_efficiency_mpg'], dist="norm", plot=ax)
# probplot sets its own title; override it with a descriptive one
ax.set_title('Q-Q Plot: Fuel Efficiency vs Normal Distribution')
ax.grid(True, alpha=0.3)
plt.show()

In [None]:
# Upper-tail percentiles of the target: large gaps between the top percentiles
# are a sign of a long right tail
percentiles = [25, 50, 75, 90, 95, 99]
print("Percentile Analysis:")
# A single vectorised call evaluates every requested percentile
percentile_values = np.percentile(df_selected['fuel_efficiency_mpg'], percentiles)
for p, value in zip(percentiles, percentile_values):
    print(f"{p}th percentile: {value:.2f} MPG")

### Conclusion

Based on the analysis above:
- The histogram shows the shape of the distribution
- The skewness value indicates whether there's asymmetry
- The Q-Q plot shows how the distribution compares to a normal distribution

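If the distribution has a long tail, we would expect to see:
- A skewness value well away from zero (|skewness| > 1 indicates a highly skewed distribution; a positive sign means the tail is on the right, a negative sign on the left)
- Points bending away from the diagonal line at one end of the Q-Q plot
- A histogram that extends much further in one direction, with the mean pulled away from the median toward the tail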