# Homework 02 - Regression

Goal: Create a regression model for predicting car fuel efficiency (column 'fuel_efficiency_mpg')

## 1. Data Preparation

In [None]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from scipy import stats

# Notebook-wide plot defaults: seaborn's white-grid theme and a 10x6
# default figure size for every figure that does not set its own.
sns.set_style(style='whitegrid')
plt.rcParams.update({'figure.figsize': (10, 6)})

In [None]:
# Read only the columns this homework works with; every other column in the
# CSV is skipped at parse time via `usecols`.
columns = [
    'engine_displacement', 'horsepower', 'vehicle_weight',
    'model_year', 'fuel_efficiency_mpg',
]
df = pd.read_csv(filepath_or_buffer='car_fuel_efficiency.csv', usecols=columns)

# Report (rows, columns), then show the first five rows as the cell output.
print(f"Dataset shape: {df.shape}")
df.head(n=5)

## 2. EDA - Basic Information

In [None]:
# Quick structural overview of the frame: the info() report, the number of
# missing values per column, and summary statistics as the cell's output.
print("Dataset Info:")
df.info()
# One print call, newline-separated, emits the header and the counts exactly
# as two consecutive prints would.
print("\nMissing values:", df.isna().sum(), sep="\n")
print("\nBasic statistics:")
df.describe()

## 3. EDA - fuel_efficiency_mpg Distribution Analysis

In [None]:
# Summary statistics of the target column, each printed with two decimals.
# The column is looked up once and reused for every statistic.
mpg = df['fuel_efficiency_mpg']

print("Fuel Efficiency MPG Statistics:")
# Location and spread
print(f"Mean:     {mpg.mean():.2f}")
print(f"Median:   {mpg.median():.2f}")
print(f"Std Dev:  {mpg.std():.2f}")
print(f"Min:      {mpg.min():.2f}")
print(f"Max:      {mpg.max():.2f}")
# Shape of the distribution (blank line separates it from the block above)
print(f"\nSkewness: {mpg.skew():.2f}")
print(f"Kurtosis: {mpg.kurtosis():.2f}")

In [None]:
# Visualizations: four views of the target distribution on a 2x2 grid
# (histogram, box plot, Q-Q plot against a normal, kernel density estimate).
fig, axes = plt.subplots(2, 2, figsize=(15, 10))

# Compute the non-null sample and its centre once instead of on every plot
# call. mean()/median() skip NaN by default, so the values are unchanged.
mpg_clean = df['fuel_efficiency_mpg'].dropna()
mpg_mean = mpg_clean.mean()
mpg_median = mpg_clean.median()

# Histogram
axes[0, 0].hist(mpg_clean, bins=50, edgecolor='black', alpha=0.7)
axes[0, 0].set_xlabel('Fuel Efficiency (MPG)')
axes[0, 0].set_ylabel('Frequency')
axes[0, 0].set_title('Histogram of Fuel Efficiency')

# Box plot. Vertical is matplotlib's default orientation; the explicit
# `vert=` keyword is deprecated in newer matplotlib releases in favour of
# `orientation=`, so it is omitted to stay warning-free across versions.
axes[0, 1].boxplot(mpg_clean)
axes[0, 1].set_ylabel('Fuel Efficiency (MPG)')
axes[0, 1].set_title('Box Plot of Fuel Efficiency')

# Q-Q plot against the normal distribution. The title is applied after
# probplot has drawn on the axes so that this title is the one shown.
stats.probplot(mpg_clean, dist="norm", plot=axes[1, 0])
axes[1, 0].set_title('Q-Q Plot')

# KDE plot
mpg_clean.plot(kind='kde', ax=axes[1, 1])
axes[1, 1].set_xlabel('Fuel Efficiency (MPG)')
axes[1, 1].set_title('Kernel Density Estimation')

# Mark mean and median on the two density-style panels. This runs after the
# data has been drawn so the reference lines are added on top of it.
for ax in (axes[0, 0], axes[1, 1]):
    ax.axvline(mpg_mean, color='red', linestyle='--', label='Mean')
    ax.axvline(mpg_median, color='green', linestyle='--', label='Median')
    ax.legend()

plt.tight_layout()
plt.show()

In [None]:
# Percentile analysis: print selected percentiles of the target column.
percentiles = [10, 25, 50, 75, 90, 95, 99]

# Ask pandas for all quantiles in a single call; quantile() takes fractions
# in [0, 1], hence the division by 100.
quantile_values = df['fuel_efficiency_mpg'].quantile([pct / 100 for pct in percentiles])

print("Percentile Analysis:")
for p, value in zip(percentiles, quantile_values):
    print(f"{p}th percentile: {value:.2f} MPG")

## Does fuel_efficiency_mpg have a long tail?

**Answer: NO**

The fuel_efficiency_mpg variable does NOT have a significant long tail.

**Evidence:**
- Skewness: -0.01 (essentially 0, indicating symmetry)
- Mean: 14.99 MPG
- Median: 15.01 MPG
- Kurtosis: 0.02 (excess kurtosis as reported by pandas; a value close to 0 means the tails are about as heavy as those of a normal distribution)

The distribution is approximately symmetric with mean ≈ median and skewness ≈ 0, indicating no significant long tail in either direction.