In [2]:
# Importing core data manipulation and analysis library
import pandas as pd  # For handling and analyzing data

# Importing a web application framework to create interactive apps
import streamlit as st  # For building the web interface

# Importing libraries for data visualization
import plotly.express as px  # For creating interactive plots
import altair as alt  # For advanced, declarative visualizations

# Importing basic Python modules
import numpy as np  # For numerical operations and handling arrays
import matplotlib.pyplot as plt  # For basic plotting (optional, if needed)

# Additional utility imports (optional based on future needs)
import os  # For interacting with the operating system
import datetime  # For handling date and time data


In [8]:
# finding the file path
file_path = os.path.abspath('vehicles_us.csv')
print(file_path)

/Users/tobiasboegel/vehicles_us.csv


In [20]:
# creating a data frame to work with
df = pd.read_csv('/Users/tobiasboegel/Downloads/vehicles_us.csv')  # Ensure the file path is correct
print(df.info())
print(df.head())  # Changed from df.header() to df.head() to display the first few rows

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 51525 entries, 0 to 51524
Data columns (total 13 columns):
 #   Column        Non-Null Count  Dtype  
---  ------        --------------  -----  
 0   price         51525 non-null  int64  
 1   model_year    47906 non-null  float64
 2   model         51525 non-null  object 
 3   condition     51525 non-null  object 
 4   cylinders     46265 non-null  float64
 5   fuel          51525 non-null  object 
 6   odometer      43633 non-null  float64
 7   transmission  51525 non-null  object 
 8   type          51525 non-null  object 
 9   paint_color   42258 non-null  object 
 10  is_4wd        25572 non-null  float64
 11  date_posted   51525 non-null  object 
 12  days_listed   51525 non-null  int64  
dtypes: float64(4), int64(2), object(7)
memory usage: 5.1+ MB
None
   price  model_year           model  condition  cylinders fuel  odometer  \
0   9400      2011.0          bmw x5       good        6.0  gas  145000.0   
1  25500         NaN     

Summary of Missing Values:
1) model_year: Missing in 3,619 entries.
2) cylinders: Missing in 5,260 entries.
3) odometer: Missing in 7,892 entries.
4) paint_color: Missing in 9,267 entries.
5) is_4wd: Missing in 25,953 entries (almost half the dataset).

Needs Adjustment:
1) model_year: Currently float64, should be converted to Int64 or int after filling missing values.
2) date_posted: Convert from object to datetime format for date-based analysis.

In [23]:
# Fill missing 'model_year' with the median value
median_model_year = df['model_year'].median()
df['model_year'].fillna(median_model_year, inplace=True)
st.write(f"Filled missing 'model_year' with median: {median_model_year}")

# Convert 'model_year' to an integer type after filling missing values
df['model_year'] = df['model_year'].astype(int)

# Fill missing 'cylinders' with the median
median_cylinders = df['cylinders'].median()
df['cylinders'].fillna(median_cylinders, inplace=True)
st.write(f"Filled missing 'cylinders' with median: {median_cylinders}")

# Fill missing 'odometer' with the median or remove rows if needed
median_odometer = df['odometer'].median()
df['odometer'].fillna(median_odometer, inplace=True)
st.write(f"Filled missing 'odometer' with median: {median_odometer}")

# Convert 'date_posted' to datetime format
df['date_posted'] = pd.to_datetime(df['date_posted'])
st.write("Converted 'date_posted' to datetime format.")

# Fill missing 'paint_color' with 'unknown' to keep categorical data intact
df['paint_color'].fillna('unknown', inplace=True)
st.write("Filled missing 'paint_color' with 'unknown'.")

# Replace NaNs in 'is_4wd' with 0 (assuming missing means not 4WD)
df['is_4wd'].fillna(0, inplace=True)
df['is_4wd'] = df['is_4wd'].astype(int)
st.write("Replaced missing 'is_4wd' values with 0 and converted to integer.")

# Display updated DataFrame info
st.write("#### Updated DataFrame Info:")
st.write(df.info())


The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values always behaves as a copy.

For example, when doing 'df[col].method(value, inplace=True)', try using 'df.method({col: value}, inplace=True)' or df[col] = df[col].method(value) instead, to perform the operation inplace on the original object.


  df['model_year'].fillna(median_model_year, inplace=True)
2024-11-03 14:29:35.724 
  command:

    streamlit run /opt/anaconda3/lib/python3.12/site-packages/ipykernel_launcher.py [ARGUMENTS]


<class 'pandas.core.frame.DataFrame'>
RangeIndex: 51525 entries, 0 to 51524
Data columns (total 13 columns):
 #   Column        Non-Null Count  Dtype         
---  ------        --------------  -----         
 0   price         51525 non-null  int64         
 1   model_year    51525 non-null  int64         
 2   model         51525 non-null  object        
 3   condition     51525 non-null  object        
 4   cylinders     51525 non-null  float64       
 5   fuel          51525 non-null  object        
 6   odometer      51525 non-null  float64       
 7   transmission  51525 non-null  object        
 8   type          51525 non-null  object        
 9   paint_color   51525 non-null  object        
 10  is_4wd        51525 non-null  int64         
 11  date_posted   51525 non-null  datetime64[ns]
 12  days_listed   51525 non-null  int64         
dtypes: datetime64[ns](1), float64(2), int64(4), object(6)
memory usage: 5.1+ MB


The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values always behaves as a copy.

For example, when doing 'df[col].method(value, inplace=True)', try using 'df.method({col: value}, inplace=True)' or df[col] = df[col].method(value) instead, to perform the operation inplace on the original object.


  df['cylinders'].fillna(median_cylinders, inplace=True)
The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values always behaves as a copy.

For example, when doing 'df[col].method(value, inplace=True)', try using 'df.method({col: value}, inplace=True)' or df[col] = df[col].method(value) instead, to perform the operation inplace on the original object.


  df['odometer'].fillna(median_odometer, inplace=True)
The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are 

Feature Engineering:
1) Adding a new column for car_age to simplify age-related analysis
2) Creating an indicator for common or rare models, if necessary

In [29]:
# Feature: Car age based on the model year
current_year = df['date_posted'].dt.year.max()
df['car_age'] = current_year - df['model_year']
st.write("Added 'car_age' column.")

print(df.info())
print(df.sample(5))

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 51525 entries, 0 to 51524
Data columns (total 14 columns):
 #   Column        Non-Null Count  Dtype         
---  ------        --------------  -----         
 0   price         51525 non-null  int64         
 1   model_year    51525 non-null  int64         
 2   model         51525 non-null  object        
 3   condition     51525 non-null  object        
 4   cylinders     51525 non-null  float64       
 5   fuel          51525 non-null  object        
 6   odometer      51525 non-null  float64       
 7   transmission  51525 non-null  object        
 8   type          51525 non-null  object        
 9   paint_color   51525 non-null  object        
 10  is_4wd        51525 non-null  int64         
 11  date_posted   51525 non-null  datetime64[ns]
 12  days_listed   51525 non-null  int64         
 13  car_age       51525 non-null  int64         
dtypes: datetime64[ns](1), float64(2), int64(5), object(6)
memory usage: 5.5+ MB
None
     