In [36]:
# Import libraries
import pandas as pd
import numpy as np

# Name of the CSV file holding the car-sales data, resolved relative to the
# notebook's working directory.
# NOTE(review): despite the name, this is a file-name string, not a DataFrame.
data_frame = "car_sales.csv"

In [40]:
# Read the CSV file named by `data_frame` into a DataFrame.
df = pd.read_csv(data_frame)

# Normalise the column headers so every later cell can address columns by a
# single, predictable spelling.
df.columns = (
    df.columns
    .str.strip()  # remove leading/trailing whitespace
    .str.lower()  # lowercase everything
    # regex=False forces a literal space-to-underscore replacement; the
    # default value of `regex` differs between pandas versions.
    .str.replace(" ", "_", regex=False)
)

# NOTE(review): the preview row shows `engine` as 'DoubleÂ Overhead Camshaft',
# which looks like an encoding artifact in the data -- confirm the file's
# encoding before grouping or filtering on that column.
df.head(1)

Unnamed: 0,car_id,date,customer_name,gender,annual_income,dealer_name,company,model,engine,transmission,color,price_($),dealer_no,body_style,phone,dealer_region
0,C_CND_000001,1/2/2022,Geraldine,Male,13500,Buddy Storbeck's Diesel Service Inc,Ford,Expedition,DoubleÂ Overhead Camshaft,Auto,Black,26000,06457-3834,SUV,8264678,Middletown


In [38]:
# Give the price column a plain name without the currency symbol:
# "price_($)" -> "price_usd".
# The result is reassigned instead of using inplace=True; rename() ignores
# keys that are not present by default, so re-running this cell is harmless.
df = df.rename(columns={"price_($)": "price_usd"})

# Because a missing key is silently ignored, verify the column really exists
# afterwards (this catches the cell being run before the headers were
# standardised, or an unexpected input schema).
assert "price_usd" in df.columns, "expected a 'price_($)' or 'price_usd' column"

df.head(1)

Unnamed: 0,car_id,date,customer_name,gender,annual_income,dealer_name,company,model,engine,transmission,color,price_usd,dealer_no,body_style,phone,dealer_region
0,C_CND_000001,1/2/2022,Geraldine,Male,13500,Buddy Storbeck's Diesel Service Inc,Ford,Expedition,DoubleÂ Overhead Camshaft,Auto,Black,26000,06457-3834,SUV,8264678,Middletown


In [4]:
# Parse the 'date' column to datetime; values that cannot be parsed become NaT.
date_raw = df['date']
date_parsed = pd.to_datetime(date_raw, errors='coerce')

# Count only values that were present before parsing but are NaT afterwards,
# so entries that were already missing are not reported as parse failures.
n_unparsed = int((date_parsed.isna() & date_raw.notna()).sum())

df['date'] = date_parsed

# Warn, with a count, if any records could not be converted to datetime.
if n_unparsed > 0:
    print(f'Warning: {n_unparsed} date value(s) could not be parsed and have been set to NaT.')

# Display column info (non-null count and dtype) to confirm the conversion.
df['date'].info()

<class 'pandas.core.series.Series'>
RangeIndex: 23906 entries, 0 to 23905
Series name: date
Non-Null Count  Dtype         
--------------  -----         
23906 non-null  datetime64[ns]
dtypes: datetime64[ns](1)
memory usage: 186.9 KB


In [32]:
# Drop rows that are exact duplicates across every column. The result is
# reassigned rather than modified with inplace=True.
df = df.drop_duplicates()

# Per-column profile, indexed by column name:
#   UniqueVal  - number of distinct values (nunique() skips NaN by default)
#   MissingVal - number of null entries
#   DataType   - pandas dtype of the column
# Rows are ordered from the lowest to the highest number of distinct values.
df_info = (
    df.nunique()
    .to_frame(name="UniqueVal")
    .assign(MissingVal=df.isna().sum(), DataType=df.dtypes)
    .sort_values(by="UniqueVal")
)

# Bare last expression -> rich notebook display, matching the other cells.
df_info

               UniqueVal  MissingVal        DataType
gender                 2           0          object
engine                 2           0          object
transmission           2           0          object
color                  3           0          object
body_style             5           0          object
dealer_region          7           0          object
dealer_no              7           0          object
dealer_name           28           0          object
company               30           0          object
model                154           0          object
date                 612           0  datetime64[ns]
price_usd            870           0           int64
annual_income       2508           0           int64
customer_name       3021           1          object
phone              23804           0           int64
car_id             23906           0          object
