In [3]:
import pandas as pd
from sklearn.preprocessing import StandardScaler

# Load the raw car-sales records from the CSV file (path is relative to the
# working directory) into the notebook-level DataFrame `data`, which the
# later cells read and modify.
data = pd.read_csv("car_sales_data.csv")

In [4]:
# Preview the first rows to sanity-check the load.
# A single print with sep="\n" emits the heading and the table on separate
# lines, exactly as two consecutive print calls would.
print("\nFirst few rows of the dataset:", data.head(), sep="\n")


First few rows of the dataset:
         Date      Salesperson   Customer Name Car Make  Car Model  Car Year  \
0  2022-08-01  Monica Moore MD     Mary Butler   Nissan     Altima      2018   
1  2023-03-15     Roberto Rose  Richard Pierce   Nissan      F-150      2016   
2  2023-04-29     Ashley Ramos    Sandra Moore     Ford      Civic      2016   
3  2022-09-04   Patrick Harris    Johnny Scott     Ford     Altima      2013   
4  2022-06-16       Eric Lopez   Vanessa Jones    Honda  Silverado      2022   

   Sale Price  Commission Rate  Commission Earned  
0       15983         0.070495            1126.73  
1       38474         0.134439            5172.40  
2       33340         0.114536            3818.63  
3       41937         0.092191            3866.20  
4       20256         0.113490            2298.85  


In [5]:
# Print the frame's schema summary (index range, column dtypes, memory use)
# as loaded, before any cleaning step has run.
print("Initial data info:")
data.info()

Initial data info:
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 2500000 entries, 0 to 2499999
Data columns (total 9 columns):
 #   Column             Dtype  
---  ------             -----  
 0   Date               object 
 1   Salesperson        object 
 2   Customer Name      object 
 3   Car Make           object 
 4   Car Model          object 
 5   Car Year           int64  
 6   Sale Price         int64  
 7   Commission Rate    float64
 8   Commission Earned  float64
dtypes: float64(2), int64(2), object(5)
memory usage: 171.7+ MB


In [6]:
# Summary statistics for every column: include='all' reports the object
# columns (count/unique/top/freq) alongside the numeric ones (mean/std/quantiles).
data.describe(include='all')

Unnamed: 0,Date,Salesperson,Customer Name,Car Make,Car Model,Car Year,Sale Price,Commission Rate,Commission Earned
count,2500000,2500000,2500000,2500000,2500000,2500000.0,2500000.0,2500000.0,2500000.0
unique,366,518657,518251,5,5,,,,
top,2022-12-24,Michael Smith,Michael Smith,Honda,Silverado,,,,
freq,7144,1229,1167,500687,500842,,,,
mean,,,,,,2015.996,30012.18,0.09998766,3001.005
std,,,,,,3.739132,11545.14,0.02887202,1481.467
min,,,,,,2010.0,10000.0,0.05000014,501.34
25%,,,,,,2013.0,20019.0,0.0749645,1821.71
50%,,,,,,2016.0,30006.0,0.1000058,2741.91
75%,,,,,,2019.0,40022.0,0.1250065,3978.142


In [7]:
# Column-name cleanup: map badly formatted labels to tidy ones.
# Labels in the mapping that are absent from the frame are ignored by
# DataFrame.rename, so this is a no-op when '__year_resale_value' is missing.
# NOTE(review): the columns reported by data.info() for this dataset do not
# include '__year_resale_value' — confirm whether this cell is still needed.
data.rename(
    mapper={'__year_resale_value': 'Year_resale_value'},
    axis='columns',
    inplace=True,
)

In [8]:
# Parse the 'Latest_Launch' column as datetimes when the frame has one;
# values that cannot be parsed become NaT (errors='coerce').
# NOTE(review): this dataset's date column appears to be 'Date', which this
# guard never touches — confirm whether 'Date' should be converted instead.
if 'Latest_Launch' in data:  # membership on a DataFrame tests its column labels
    data['Latest_Launch'] = pd.to_datetime(
        data['Latest_Launch'],
        errors='coerce',
    )

In [9]:
# Tally NaN/None entries per column (isna is the canonical spelling of isnull).
missing_values = data.isna().sum()
# Two positional arguments: print joins them with one space, so the first
# row of the Series is printed with a leading space after the newline.
print("Missing or null values in each column:\n", missing_values)

Missing or null values in each column:
 Date                 0
Salesperson          0
Customer Name        0
Car Make             0
Car Model            0
Car Year             0
Sale Price           0
Commission Rate      0
Commission Earned    0
dtype: int64


In [None]:
# Median imputation for the float64 columns.
# The per-column medians are computed in one call and passed to
# DataFrame.fillna, which matches them to columns by label.
# The null check on this dataset reported zero missing values, so no value
# actually changes here.
numeric_columns = data.select_dtypes(include='float64').columns
data[numeric_columns] = data[numeric_columns].fillna(data[numeric_columns].median())

In [11]:
# Detecting and handling outliers using IQR
def handle_outliers(column, frame=None, factor=1.5):
    """Clip one column's values to its IQR fences, modifying the frame in place.

    The fences are ``Q1 - factor * IQR`` and ``Q3 + factor * IQR``, where Q1
    and Q3 are the column's 25th and 75th percentiles. Values outside the
    fences are replaced by the nearest fence; no rows are dropped.

    Args:
        column: Label of the numeric column to clip.
        frame: DataFrame to modify. When omitted, the notebook-level ``data``
            frame is used, which is what a one-argument call operates on.
        factor: Non-negative multiplier applied to the IQR when placing the
            fences. Defaults to 1.5.

    Returns:
        None. The clipped values are written back to ``frame[column]``.

    Raises:
        ValueError: If ``factor`` is negative (the fences could cross).
    """
    if factor < 0:
        raise ValueError("factor must be non-negative")

    # Resolve the target at call time so the global `data` is only needed
    # when no explicit frame is supplied.
    target = data if frame is None else frame
    values = target[column]

    q1 = values.quantile(0.25)
    q3 = values.quantile(0.75)
    spread = q3 - q1

    # Clip outliers to within bounds
    target[column] = values.clip(
        lower=q1 - factor * spread,
        upper=q3 + factor * spread,
    )

# Apply outlier handling to every column listed in numeric_columns.
# numeric_columns was selected with include='float64', so the int64 columns
# ('Car Year', 'Sale Price') are not clipped by this loop.
for column in numeric_columns:
    handle_outliers(column)

In [12]:
# Inspect the frame after cleaning: schema summary first, then a row sample.
print("\nCleaned data info:")
data.info()
# One print with sep="\n" puts the heading and the table on separate lines,
# matching the output of two back-to-back print calls.
print("\nFirst few rows of the cleaned dataset:", data.head(), sep="\n")


Cleaned data info:
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 2500000 entries, 0 to 2499999
Data columns (total 9 columns):
 #   Column             Dtype  
---  ------             -----  
 0   Date               object 
 1   Salesperson        object 
 2   Customer Name      object 
 3   Car Make           object 
 4   Car Model          object 
 5   Car Year           int64  
 6   Sale Price         int64  
 7   Commission Rate    float64
 8   Commission Earned  float64
dtypes: float64(2), int64(2), object(5)
memory usage: 171.7+ MB

First few rows of the cleaned dataset:
         Date      Salesperson   Customer Name Car Make  Car Model  Car Year  \
0  2022-08-01  Monica Moore MD     Mary Butler   Nissan     Altima      2018   
1  2023-03-15     Roberto Rose  Richard Pierce   Nissan      F-150      2016   
2  2023-04-29     Ashley Ramos    Sandra Moore     Ford      Civic      2016   
3  2022-09-04   Patrick Harris    Johnny Scott     Ford     Altima      2013   
4  2022-06-1

In [13]:
# Persist the cleaned frame to CSV in the working directory; index=False
# keeps the RangeIndex out of the file.
data.to_csv("clean_car_sales_data.csv", index=False)