In [None]:
import pandas as pd
import plotly.express as px
import streamlit as st
import matplotlib.pyplot as plt



# Load the used-vehicle listings; the CSV is read relative to the working directory.
df = pd.read_csv("vehicles_us.csv")


# View general data set information: first ten rows, then the column summary.
print(df.head(10))
print()
# DataFrame.info() writes its summary straight to stdout and returns None,
# so it is called on its own; wrapping it in print() would emit a stray "None".
df.info()


   price  model_year           model  condition  cylinders fuel  odometer  \
0   9400      2011.0          bmw x5       good        6.0  gas  145000.0   
1  25500         NaN      ford f-150       good        6.0  gas   88705.0   
2   5500      2013.0  hyundai sonata   like new        4.0  gas  110000.0   
3   1500      2003.0      ford f-150       fair        8.0  gas       NaN   
4  14900      2017.0    chrysler 200  excellent        4.0  gas   80903.0   
5  14990      2014.0    chrysler 300  excellent        6.0  gas   57954.0   
6  12990      2015.0    toyota camry  excellent        4.0  gas   79212.0   
7  15990      2013.0     honda pilot  excellent        6.0  gas  109473.0   
8  11500      2012.0     kia sorento  excellent        4.0  gas  104174.0   
9   9200      2008.0     honda pilot  excellent        NaN  gas  147191.0   

  transmission    type paint_color  is_4wd date_posted  days_listed  
0    automatic     SUV         NaN     1.0  2018-06-23           19  
1    automat

After printing the general information for vehicles_us.csv, a dataset containing details of used cars for sale, we know that there are 13 columns and 51525 rows/entries. The respective columns are: price (in dollars), model_year, model (of the car), condition (e.g. excellent, good, fair), cylinders, fuel (e.g. gas), odometer, transmission (e.g. automatic), type (e.g. SUV, pickup), paint_color, is_4wd (1 = yes, 0 = no), date_posted, and days_listed. Some columns have missing entries: model_year, cylinders, odometer, is_4wd, and paint_color. We will need to drop the rows with missing data. 

In [None]:
# Check for fully duplicated rows; this cell only counts them, nothing is removed.
dup = df.duplicated()
number_dup = dup.sum()
print(number_dup)


0


We checked for duplicated rows in df and the count returned is 0, meaning there are no duplicated rows and we can proceed with further data clean-up.

In [None]:
# Count the missing values in each column, followed by a blank separator line.
missing_per_column = df.isna().sum()
print(missing_per_column, end="\n\n")




price               0
model_year       3619
model               0
condition           0
cylinders        5260
fuel                0
odometer         7892
transmission        0
type                0
paint_color      9267
is_4wd          25953
date_posted         0
days_listed         0
dtype: int64



We checked for missing values in each individual column and see that 5 columns have missing values, as mentioned previously, with the is_4wd column having the most missing entries. To proceed, we will remove the rows with missing values instead of filling them in: is_4wd holds Boolean values (1 = yes, 0 = no), so filling all of its missing values with 0 would affect the accuracy of the data set. 

In [None]:
# Drop every row that has a missing value in any column, then confirm none remain.
df = df.dropna(axis=0, how="any")
print(df.isna().sum())


price           0
model_year      0
model           0
condition       0
cylinders       0
fuel            0
odometer        0
transmission    0
type            0
paint_color     0
is_4wd          0
date_posted     0
days_listed     0
dtype: int64


There are no longer any missing values in any column, so the data set can now be used. 

In [None]:
# Page header
st.header("Used Vehicles for Sale")

# Section header for the data viewer
st.header("Data Viewer")
# NOTE(review): st.cache is deprecated in newer Streamlit releases in favour of
# st.cache_data / st.cache_resource -- confirm the installed version before changing.
@st.cache 
def load_data(): 
    """Return the module-level ``df`` DataFrame.

    NOTE(review): nothing in this file calls this function; ``main`` reads
    ``df`` directly, so the cache decorator currently has no effect.
    """
    return df

def main():
    """Write the module-level ``df`` DataFrame to the Streamlit page."""
    st.write(df)

# Run the data viewer when this file is executed as the main module.
if __name__ == "__main__": 
    main()    




In [None]:
# Creating plots and histograms

# Histogram of the price distribution in df: 70 bins, x-axis limited to 0-80,000.
hist_price = px.histogram(
    df,
    x="price",
    nbins=70,
    range_x=[0, 80000],
)
hist_price.show()

st.plotly_chart(hist_price)

DeltaGenerator()

We see that the majority of cars are priced between 0 and 20K, followed by a sharp decline in the number of cars priced between 20K and 60K.

In [None]:
# Histogram of the car model years in df
hist_model_year = px.histogram(
    df,
    x="model_year",
)
hist_model_year.show()

st.plotly_chart(hist_model_year)

DeltaGenerator()

Most of the cars for sale in the data set were manufactured after 1990, with the distribution concentrated toward the right side, in the 2010s. 

In [None]:
# Histogram of the car types in df
hist_type = px.histogram(
    df,
    x="type",
)
hist_type.show()

st.plotly_chart(hist_type)

DeltaGenerator()

Most of the cars for sale are larger vehicles, with trucks and SUVs being the two most common types.

In [None]:
# Scatter plot of price (x-axis) against vehicle condition (y-axis)
condition_price = px.scatter(
    df,
    x="price",
    y="condition",
)
condition_price.show()

st.plotly_chart(condition_price)


DeltaGenerator()

Visualizing the price range for each car condition.

In [None]:
# Scatter plot of price (x-axis) against vehicle type (y-axis).
# The figure gets its own name so it does not share the identifier
# `price_type` with the interactive-histogram function defined later in this file.
price_type_scatter = px.scatter(df, x="price", y="type")
price_type_scatter.show()

st.plotly_chart(price_type_scatter)


DeltaGenerator()

Visualizing the price range for each car type.

In [None]:
# Scatter plot of price (x-axis) against model year (y-axis)
model_year_price = px.scatter(
    df,
    x="price",
    y="model_year",
)
model_year_price.show()

st.plotly_chart(model_year_price)

DeltaGenerator()

Visualizing the price range for different car model years: the majority of the scatter plot lies in the upper-left quadrant, i.e. most cars cost under 50K and most model years are after 2000. 

In [None]:
# Interactive histograms

# Added to remove the warning that appeared on the web page after running app.py.
st.set_option("deprecation.showPyplotGlobalUse", False)

#price vs car types histogram
def price_type(df):
    """Render a sidebar-driven price histogram for one vehicle type.

    A sidebar select box lists the distinct values of ``df["type"]``. The
    prices of the rows matching the chosen type are drawn as a 20-bin
    histogram and shown on the page below a title and a short hint.

    Args:
        df: DataFrame with at least ``type`` and ``price`` columns.
    """
    # Sidebar widget: choose which vehicle type to plot.
    select_type = st.sidebar.selectbox("select vehicle types", df["type"].unique())

    # Keep only the listings of the selected type.
    filtered_type = df[df["type"] == select_type]

    # Draw on a dedicated figure instead of pyplot's implicit global one, so
    # the figure can be handed to st.pyplot explicitly and nothing is shared
    # with other plots rendered on the same page.
    fig, ax = plt.subplots()
    ax.hist(filtered_type["price"], bins=20, color="lightblue")
    ax.set_title(f"Price Distribution for {select_type}")
    ax.set_xlabel("Price in USD")
    ax.set_ylabel("Frequency")
    # Fixed x-range because the price distribution is right skewed.
    ax.set_xlim(0, 200000)

    # Title and usage hint shown above the chart.
    st.title("Vehicle Price VS. Vehicle Type")
    st.write("select different vehicle type to visualize price distribution")

    st.pyplot(fig)
    # Release the figure so figures do not accumulate across Streamlit reruns.
    plt.close(fig)

#interactive histogram of vehicle price for model year

def price_year(df):
    """Render a sidebar-driven price histogram for one model year.

    A sidebar select box lists the distinct values of ``df["model_year"]``
    and a sidebar checkbox switches the histogram between raw counts and a
    normalized (density) view. The prices of the rows matching the chosen
    year are drawn once as a 10-bin histogram and shown on the page below a
    title and a short hint.

    Args:
        df: DataFrame with at least ``model_year`` and ``price`` columns.
    """
    # Sidebar widget: choose which model year to plot.
    select_year = st.sidebar.selectbox("select a Model Year", df['model_year'].unique())

    # Keep only the listings whose model year matches the selection.
    filter_price_year = df[df["model_year"] == select_year]

    # Checkbox widget: when ticked, the histogram is normalized.
    show_normal_hist = st.sidebar.checkbox("Display as normalize histogram")

    # Draw exactly one histogram on a dedicated figure. Previously the data
    # was plotted twice on the global pyplot figure (overlaying two
    # histograms) and the checkbox never actually normalized anything.
    fig, ax = plt.subplots()
    ax.hist(
        filter_price_year["price"],
        bins=10,
        color="lightgreen",
        density=show_normal_hist,
    )

    ax.set_title(f"Price distribution for vehicles model year {select_year}")
    ax.set_xlabel("Price (in USD)")
    # With density=True the bar heights are densities, not counts.
    ax.set_ylabel("Density" if show_normal_hist else "Frequency")

    # Title and usage hint shown above the chart.
    st.title("Distribution of price for different Vehicle model year")
    st.write("select different years to visualize cost distribution for vehicle models of that year")

    st.pyplot(fig)
    # Release the figure so figures do not accumulate across Streamlit reruns.
    plt.close(fig)


if __name__ == "__main__":
    # Render both interactive histograms in order: price by type, then price by year.
    for render in (price_type, price_year):
        render(df)