# Data Exploration

In [None]:
import pandas as pd
import numpy as np

# Load the raw dataset; every later cell reads from (and reassigns) this frame.
df = pd.read_csv(filepath_or_buffer="data.csv")

## Display Head

In [None]:
# Preview the first ten rows of the raw frame.
df.head(n=10)

## Description of Data

In [None]:
# Summary statistics for the columns describe() selects by default.
# Left as the cell's last expression so the notebook renders it as a rich
# table instead of the plain-text dump that print() produces.
df.describe()

## Data Info

In [None]:
# DataFrame.info() writes its report to stdout itself and returns None, so
# wrapping it in print() only appended a stray "None" line to the output.
df.info()

## Check for Nulls & Duplicates

### Check for Nulls

In [None]:
# Count missing values per column.
null_counts = df.isnull().sum()
print(null_counts)


### Check for Duplicates

In [None]:
# Count rows that are exact duplicates of an earlier row.
duplicate_row_count = df.duplicated().sum()
print(duplicate_row_count)

# Data Cleaning & Visualization

## Research Question 1

#### Assign Data Types

In [None]:
import pandas as pd

# Parse the two measurement columns into non-negative floats.
#
# The previous approach deleted every character outside [0-9.] and glued the
# remainder together. That merged digits from a unit suffix into the value
# (e.g. "5.6 l/100 km" -> "5.6100") and stripped the minus sign, so the
# "negative -> 0" step could never trigger for string input.

# Optional sign, digits, optional decimal part, anchored at the start of the
# cell. The look-ahead rejects tokens that continue as something other than a
# plain number (e.g. "06/2014" or "1.2.3").
_LEADING_NUMBER = r"^\s*([-+]?\d+(?:\.\d*)?)(?![\d./])"


def _parse_nonnegative_number(raw: pd.Series) -> pd.Series:
    """Convert a messy column into non-negative floats.

    Parameters
    ----------
    raw : pd.Series
        Column holding numbers, or strings that start with a number and may
        be followed by a unit or other text.

    Returns
    -------
    pd.Series
        Numeric series aligned with ``raw``. Cells that do not start with a
        number become NaN; negative values are clamped to 0; NaN stays NaN.
    """
    if pd.api.types.is_numeric_dtype(raw):
        # Already numeric (e.g. the cell was re-run): nothing to parse.
        numeric = pd.to_numeric(raw, errors="coerce")
    else:
        leading_number = (
            raw.astype(str)
            # NOTE(review): a comma is treated as a decimal separator here;
            # confirm the data never uses it as a thousands separator.
            .str.replace(",", ".", regex=False)
            .str.extract(_LEADING_NUMBER, expand=False)
        )
        numeric = pd.to_numeric(leading_number, errors="coerce")
    return numeric.clip(lower=0)


for column in ("fuel_consumption_l_100km", "power_kw"):
    df[column] = _parse_nonnegative_number(df[column])

# Price can be cleaned the same way if needed:
# df["price_in_euro"] = _parse_nonnegative_number(df["price_in_euro"])

In [None]:
# Missing values per column after type coercion (isna is an alias of isnull).
df.isna().sum()

##### Numerical Imputer

In [None]:
from sklearn.experimental import enable_iterative_imputer
from sklearn.impute import IterativeImputer

columns_to_impute = ["fuel_consumption_l_100km", "power_kw"]

# Fill missing values in both columns with scikit-learn's IterativeImputer.
# random_state is fixed so repeated runs give the same result.
numerical_imputer = IterativeImputer(max_iter=10, random_state=0)
df[columns_to_impute] = numerical_imputer.fit_transform(df[columns_to_impute])

# Drop rows whose transmission type is the literal "Unknown". The explicit
# .copy() makes df an independent frame, so the column assignments in later
# cells do not raise SettingWithCopyWarning on a filtered slice.
df = df[df["transmission_type"] != "Unknown"].copy()

In [None]:
# Missing values per column after imputation (isna is an alias of isnull).
df.isna().sum()

##### Handling Outliers

In [None]:
import pandas as pd
from scipy.stats import zscore

# Rows whose power_kw z-score exceeds this absolute value count as outliers.
zscore_threshold = 3
# Number of equal-width bins used to discretize power_kw.
num_bins = 10

df["power_kw_zscore"] = zscore(df["power_kw"])

# Keep only the rows within the threshold. .copy() gives filtered_df its own
# data, so adding a column below does not write into a slice of df (which
# triggers SettingWithCopyWarning).
filtered_df = df[df["power_kw_zscore"].abs() <= zscore_threshold].copy()

# Discretize the filtered power_kw column; labels=False yields integer bin codes.
filtered_df["power_kw_discretized"] = pd.cut(
    filtered_df["power_kw"], bins=num_bins, labels=False
)

##### Visualization

In [None]:
import matplotlib.pyplot as plt
import seaborn as sns

# NOTE(review): both plots read `df`, not the outlier-filtered `filtered_df`
# built in the previous cell - confirm which frame is intended.

# Scatter plot between power_kw and fuel_consumption_l_100km
fig, ax = plt.subplots(figsize=(5, 5))
sns.scatterplot(
    data=df,
    x="power_kw",
    y="fuel_consumption_l_100km",
    hue="transmission_type",
    palette="Set1",
    alpha=0.7,
    ax=ax,
)
ax.set_xlabel("Power (kW)")
ax.set_ylabel("Fuel Consumption (L/100km)")
ax.set_title("Scatter Plot: Fuel Consumption vs. Power by Transmission Type")
ax.legend(title="Transmission Type")
plt.show()

# Scatter plot between transmission_type and fuel_consumption_l_100km.
# No `palette` here: without a `hue` variable seaborn ignores it and warns.
fig, ax = plt.subplots(figsize=(5, 5))
sns.scatterplot(
    data=df,
    x="transmission_type",
    y="fuel_consumption_l_100km",
    alpha=0.7,
    ax=ax,
)
ax.set_xlabel("Transmission Type")
ax.set_ylabel("Fuel Consumption (L/100km)")
ax.set_title("Scatter Plot: Fuel Consumption vs. Transmission Type")
plt.show()


## Research Question 1:
 #### How does the fuel efficiency (fuel consumption in liters per 100 kilometers) of German used cars in 2023 vary based on their transmission type and power (measured in kilowatts)?


#### Power and Fuel Consumption Correlation:
##### Fuel usage and power seem to be negatively correlated in general. Powerful cars might have more fuel-efficient engines, which would reduce fuel usage. The engine's efficiency and design have a big impact on this relationship.


#### Relationship between Fuel Consumption and Gearbox Type:
##### Compared with manual gearboxes, automatic transmissions are frequently linked to somewhat higher fuel consumption. In some driving situations, automatic transmissions may not be as fuel-efficient because they can need more power to operate. Drivers with manual transmissions have more control over gear changes, which may result in better fuel economy.

