In [107]:
#import pandas and matplotlib libraries
import pandas as pd
from matplotlib import pyplot as plt

# Load the vehicle listings from the CSV file into the vehicles_df dataframe.
# The comma delimiter is spelled out here; it is also read_csv's default.
vehicles_df = pd.read_csv(filepath_or_buffer='../vehicles_us.csv', sep=',')

# Print a summary of the dataframe: column names, non-null counts and dtypes.
vehicles_df.info()






<class 'pandas.core.frame.DataFrame'>
RangeIndex: 51525 entries, 0 to 51524
Data columns (total 13 columns):
 #   Column        Non-Null Count  Dtype  
---  ------        --------------  -----  
 0   price         51525 non-null  int64  
 1   model_year    47906 non-null  float64
 2   model         51525 non-null  object 
 3   condition     51525 non-null  object 
 4   cylinders     46265 non-null  float64
 5   fuel          51525 non-null  object 
 6   odometer      43633 non-null  float64
 7   transmission  51525 non-null  object 
 8   type          51525 non-null  object 
 9   paint_color   42258 non-null  object 
 10  is_4wd        25572 non-null  float64
 11  date_posted   51525 non-null  object 
 12  days_listed   51525 non-null  int64  
dtypes: float64(4), int64(2), object(7)
memory usage: 5.1+ MB


Based on the output of the `info()` method, several columns have fewer non-null values than the dataframe has rows, so there are missing values; there may also be duplicate rows. We'll investigate more closely:

In [112]:

# Count fully duplicated rows in the dataframe and report the total,
# followed by a blank line to separate it from the next report.
duplicate_count = vehicles_df.duplicated().sum()
print(f'There are {duplicate_count} duplicated rows in the dataframe.')
print()

# Report, column by column, how many values are missing in a dataframe.
def check_missing(df=None):
    """Print the number of missing values in each column of a dataframe.

    For every column that has at least one missing value, a line of the form
    ``Column <name> has <count> missing values.`` is printed. A summary line
    with the number of columns that have no missing values follows.

    Parameters
    ----------
    df : pandas.DataFrame, optional
        Dataframe to inspect. When omitted, the notebook-level
        ``vehicles_df`` is used, so existing ``check_missing()`` calls keep
        working unchanged.

    Returns
    -------
    None
        The report is written to standard output only.
    """
    if df is None:
        # Resolved at call time, so the report always reflects whatever
        # vehicles_df currently refers to in the notebook.
        df = vehicles_df

    # One vectorised pass over the frame instead of one isna() scan per column.
    missing_counts = df.isna().sum()

    none_missing = 0
    for column, missing in missing_counts.items():
        if missing > 0:
            print("Column", column, "has", missing, "missing values.")
        else:
            none_missing += 1

    print('')
    print('There are', none_missing, 'columns with no missing values.')
    print('')

#call check_missing to print the per-column missing-value counts for vehicles_df
check_missing()

There are 0 duplicated rows in the dataframe.

Column model_year has 3619 missing values.
Column cylinders has 5260 missing values.
Column odometer has 7892 missing values.
Column paint_color has 9267 missing values.
Column is_4wd has 25953 missing values.

There are 8 columns with no missing values.



We've found that five columns contain missing values, and that some numeric columns are stored as floats. In the following code block we fill the missing values and correct the column types:

In [114]:

# Clean vehicles_df: fill missing values, fix column types and rename is_4wd.
# The step only runs while the cleaned 'drivetrain' column does not exist yet,
# so re-running this cell is a no-op instead of a KeyError on 'is_4wd'.
if 'drivetrain' not in vehicles_df.columns:
    vehicles_df = (
        vehicles_df
        .assign(
            # Missing model years get the sentinel value 3000. Filling with a
            # number (not the string '3000') keeps the column numeric before
            # the integer cast.
            model_year=lambda df: df['model_year'].fillna(3000).astype('int'),
            # Missing cylinder counts and odometer readings get the sentinel 0.
            cylinders=lambda df: df['cylinders'].fillna(0).astype('int'),
            odometer=lambda df: df['odometer'].fillna(0).astype('int'),
            # Missing paint colors are labelled "unspecified" (string column).
            paint_color=lambda df: df['paint_color'].fillna('unspecified').astype('str'),
            # Assumes a value of 1 means the vehicle is four wheel drive; every
            # other value, including missing, is labelled '2wd' (two wheel
            # drive). Comparing numerically avoids depending on the float's
            # string representation ('1.0').
            is_4wd=lambda df: df['is_4wd'].eq(1).map({True: '4wd', False: '2wd'}),
        )
        # Rename is_4wd to something more intuitive - "drivetrain".
        .rename(columns={'is_4wd': 'drivetrain'})
    )

After modifying the dataframe, we call the `check_missing` function again to confirm that no missing values remain:

In [115]:
#call check_missing again to verify the cleaning step. The dataframe has 13 columns,
#so the report should show 13 columns with no missing values.
check_missing()


There are 13 columns with no missing values.

